In [None]:
# Mount the user's Google Drive inside the Colab VM at /content/gdrive.
# force_remount=True asks for the mount to be redone even if one already exists.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

# Create a folder on Drive for the course notebooks (-p: no error if it already
# exists) and clone the course materials repository into it.
# NOTE(review): git clone refuses to write into an existing non-empty directory,
# so on a second run this presumably prints an error and the earlier clone is
# reused as-is -- confirm that is the intended behaviour.
!mkdir -p '/content/gdrive/My Drive/colab-materials-data8-notebooks/'
!git clone https://github.com/data-8/materials-sp22-colab '/content/gdrive/My Drive/colab-materials-data8-notebooks/materials-sp22-colab/'

# Install a pinned otter-grader, then make the cloned repository the working
# directory so the relative CSV paths read later in the notebook resolve against it.
# NOTE(review): this path spells the Drive folder 'MyDrive' while the two shell
# commands above use 'My Drive', and it ends in a doubled slash -- confirm both
# spellings resolve to the same folder in the target Colab runtime.
%pip install otter-grader==4.4.1
%cd /content/gdrive/MyDrive/colab-materials-data8-notebooks/materials-sp22-colab//

from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import warnings
warnings.simplefilter(action='ignore',category=np.VisibleDeprecationWarning)

## Lecture 10 ##

## Prediction ##

In [None]:
# Load the family heights data. The cells below read its 'father', 'mother',
# 'child' and 'sex' columns.
families = Table.read_table('family_heights.csv')
families

In [None]:
# Midpoint of the father's and mother's heights, computed for every row at once.
parent_avgs = (families.column('father') + families.column('mother'))/2

In [None]:
# Collect only what the prediction needs into a fresh table: the parents'
# average, the child's height, and the child's sex.
heights = Table().with_columns(
    'Parent Average', parent_avgs,
    'Child', families.column('child'),
    'Sex', families.column('sex')
)
heights

In [None]:
# Parent average against child height, one point per row of `heights`.
heights.scatter('Parent Average', 'Child')

In [None]:
# Same scatter, with a red vertical line at each edge of the 67.5-68.5 band.
heights.scatter('Parent Average', 'Child')
for band_edge in (67.5, 68.5):
    plots.plot([band_edge, band_edge], [50, 85], color='red', lw=2)

In [None]:
# Keep only the rows whose parent average lies in the 67.5-68.5 band, then
# average the children's heights within that band.
nearby = heights.where('Parent Average', are.between(67.5, 68.5))
nearby_mean = np.average(nearby.column('Child'))
nearby_mean

In [None]:
# Scatter with the 67.5-68.5 band edges, plus a red dot at the band's centre
# (68) showing the average child height computed for that band.
heights.scatter('Parent Average', 'Child')
for band_edge in (67.5, 68.5):
    plots.plot([band_edge, band_edge], [50, 85], color='red', lw=2)
plots.scatter(68, nearby_mean, color='red', s=50);

In [None]:
def predict(h):
    """Predict a child's height from a parent-average height `h`.

    Selects the rows of the global `heights` table whose 'Parent Average'
    satisfies are.between(h - 1/2, h + 1/2) and returns the average of their
    'Child' values. `heights` is looked up each time the function is called.
    """
    half_width = 1/2
    window = are.between(h - half_width, h + half_width)
    close_rows = heights.where('Parent Average', window)
    child_heights = close_rows.column('Child')
    return np.average(child_heights)

In [None]:
# Spot-check the predictor at a few parent-average heights.
predict(68)

In [None]:
predict(70)

In [None]:
predict(73)

In [None]:
# Call predict once per row, passing that row's 'Parent Average' value.
predicted_heights = heights.apply(predict, 'Parent Average')

In [None]:
# Store the predictions alongside the data they were computed from.
heights = heights.with_column('Prediction', predicted_heights)

In [None]:
# Plot both 'Child' and 'Prediction' against 'Parent Average' on one set of axes.
heights.select('Parent Average', 'Child', 'Prediction').scatter('Parent Average')

## Prediction Accuracy ##

In [None]:
def difference(x, y):
    """Return `x` minus `y` (positive when `x` is the larger value)."""
    gap = x - y
    return gap

In [None]:
# Error of each prediction, computed as Prediction - Child (difference(x, y)
# returns x - y), so a positive error means the prediction was too high.
pred_errs = heights.apply(difference, 'Prediction', 'Child')
heights = heights.with_column('errors',pred_errs)
heights

In [None]:
# Distribution of the prediction errors over all rows.
heights.hist('errors')

In [None]:
# The same errors, drawn as one histogram per value of 'Sex'.
heights.hist('errors', group='Sex')

## Discussion Question ##

In [None]:
def predict_smarter(h, s):
    """Predict a child's height from parent-average height `h` and sex `s`.

    Selects the rows of the global `heights` table whose 'Parent Average'
    satisfies are.between(h - 1/2, h + 1/2), keeps those whose 'Sex' matches
    `s`, and returns the average of their 'Child' values. `heights` is looked
    up each time the function is called.
    """
    window = are.between(h - 1/2, h + 1/2)
    matches = heights.where('Parent Average', window).where('Sex', s)
    return np.average(matches.column('Child'))

In [None]:
# Same parent average, two different predictions depending on the child's sex.
predict_smarter(68, 'female')

In [None]:
predict_smarter(68, 'male')

In [None]:
# Call predict_smarter once per row with that row's 'Parent Average' and 'Sex',
# then store the results as a new column.
smarter_predicted_heights = heights.apply(predict_smarter, 'Parent Average', 'Sex')
heights = heights.with_column('Smarter Prediction', smarter_predicted_heights)

In [None]:
smarter_pred_errs = heights.apply(difference, 'Child', 'Smarter Prediction')
heights = heights.with_column('Smarter Errors', smarter_pred_errs)

In [None]:
heights.hist('Smarter Errors', group='Sex')

## Grouping by One Column ##

In [None]:
# Load the ice-cream cone data and discard its 'Color' column.
cones = Table.read_table('cones.csv').drop('Color')
cones

In [None]:
# Group rows by 'Flavor'. No aggregation function is given here, so per the
# datascience docs the result should hold a count of rows per flavor.
cones.group('Flavor')

In [None]:
# Group by 'Flavor' and aggregate the remaining columns within each group
# with np.average.
cones.group('Flavor', np.average)

In [None]:
# Same grouping, aggregating with np.min instead.
cones.group('Flavor', np.min)

## Grouping By One Column: Welcome Survey ##

In [None]:
# Load the welcome survey and preview its first three rows.
survey = Table.read_table('welcome_survey_sp22.csv')
survey.show(3)

In [None]:
# Distribution of the 'Extroversion' responses.
survey.hist('Extroversion')

In [None]:
# Group by 'Extroversion' and aggregate every other column with np.average.
by_extra = survey.group('Extroversion', np.average)
by_extra

In [None]:
# Plot the averaged textee counts against extroversion. The second label is
# the grouped table's column name for the averaged 'Number of textees' values.
by_extra.plot('Extroversion', 'Number of textees average')

In [None]:
# Group rows by 'Year' with no aggregation function.
survey.group("Year")

In [None]:
# Average 'Hours of sleep' per 'Year': keep just those two columns, group by
# the first of them (index 0, i.e. "Year"), then pick four rows by position.
# NOTE(review): the indices 1, 7, 8, 3 are positions in the grouped table and
# presumably select and order specific class years -- they depend on the set of
# 'Year' values present in the CSV; confirm against the data.
(survey
 .select("Year", "Hours of sleep")
 .group(0, np.average)
 .take(1, 7, 8, 3))

## Lists

In [None]:
# A list can hold values of different types side by side: int, str and float.
[1, 5, 'hello', 5.0]

In [None]:
# An element of a list can itself be an array built with make_array.
[1, 5, 'hello', 5.0, make_array(1,2,3)]

## Grouping by Two Columns ##

![Do right-handed people tend to sleep on their left side and left-handed people sleep on their right?](handed.png)

In [None]:
# Group by two columns at once by passing a list of labels, so each distinct
# ('Handedness', 'Sleep position') pair forms its own group, and display the result.
survey.group(['Handedness', 'Sleep position']).show()