In [None]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

!mkdir -p '/content/gdrive/My Drive/colab-materials-data8-notebooks/'
!git clone https://github.com/data-8/materials-sp22-colab '/content/gdrive/My Drive/colab-materials-data8-notebooks/materials-sp22-colab/'

%cd /content/gdrive/MyDrive/colab-materials-data8-notebooks/materials-sp22-colab/lectures/
from datascience import *
import numpy as np

import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
%matplotlib inline

np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)                 

In [None]:
# Load the exam scores and preview five rows with the column at index 2 left out
# (presumably 'Mentored', which is only used in the tutoring section — confirm).
scores = Table.read_table("scores.csv")
scores.drop(2).show(5)

In [None]:
# Pull each midterm's scores out as an array and report its average and
# standard deviation (np.std with its default arguments).
mt1 = scores.column('Midterm 1')
mt2 = scores.column('Midterm 2')
print('Midterm 1 avg:', np.average(mt1), 'std dev:', np.std(mt1))
print('Midterm 2 avg:', np.average(mt2), 'std dev:', np.std(mt2))

### Option 1: Scale Up

In [None]:
# Option 1: estimate the Midterm 2 score by rescaling the Midterm 1 score.
# NOTE(review): 40 and 50 look like the maximum points on each midterm — confirm.
mt1_actual = 21
mt2_estimate_1 = mt1_actual / 40 * 50
mt2_estimate_1

In [None]:
# Points by which the scaled-up estimate differs from the Midterm 2 average.
mt2_estimate_1 - np.average(mt2)

In [None]:
# The scaled-up estimate's distance from the Midterm 2 average, in Midterm 2 SDs.
(mt2_estimate_1 - np.average(mt2)) / np.std(mt2)

In [None]:
# The actual Midterm 1 score's distance from the Midterm 1 average, in Midterm 1 SDs.
(mt1_actual - np.average(mt1)) / np.std(mt1)

### Option 2: Z-Score

In [None]:
# Option 2: estimate Midterm 2 by placing the student at the same z-score
# (number of SDs from the average) they had on Midterm 1.
mt1_actual = 21
mt1_z = (mt1_actual - np.average(mt1)) / np.std(mt1)
mt2_estimate_2 = np.average(mt2) + mt1_z * np.std(mt2)
mt2_estimate_2

In [None]:
# Points by which the z-score estimate differs from the Midterm 2 average.
mt2_estimate_2 - np.average(mt2)

In [None]:
# The z-score estimate's distance from the Midterm 2 average, in Midterm 2 SDs.
(mt2_estimate_2 - np.average(mt2)) / np.std(mt2)

In [None]:
# Histogram of the Midterm 1 scores.
scores.hist('Midterm 1', unit='point')

In [None]:
# Histogram of the Midterm 2 scores.
scores.hist('Midterm 2', unit='point')

### Option 3: Percentile

In [None]:
# Option 3: percent of students whose Midterm 1 score is at or below mt1_actual.
mt1_actual = 21
mt1_percentile = sum(mt1 <= mt1_actual) / len(mt1) * 100
mt1_percentile

In [None]:
# Look up the Midterm 1 score that sits at that percentile.
percentile(mt1_percentile, mt1)

In [None]:
# Percentile-based estimate: the Midterm 2 score at the same percentile.
percentile(mt1_percentile, mt2)

In [None]:
# Midterm 2 scores (as counts, since normed=False) of the students whose
# Midterm 1 score equals mt1_actual. Using the variable rather than a literal 21
# keeps this plot in step with whatever value mt1_actual is set to.
scores.where('Midterm 1', mt1_actual).hist('Midterm 2', normed=False)

### Option 4: Linear Regression

In [None]:
def standard_units(arr):
    """Express each value of `arr` as a number of SDs above or below the average."""
    center = np.average(arr)
    spread = np.std(arr)
    deviations = arr - center
    return deviations / spread

def correlation(t, x, y):
    """Return the correlation between columns `x` and `y` of table `t`.

    `x` and `y` are column names; the result is the average of the products
    of the two columns after each is converted to standard units.
    """
    x_su = standard_units(t.column(x))
    y_su = standard_units(t.column(y))
    products = x_su * y_su
    return np.average(products)

def slope(t, x, y):
    """Return the regression-line slope for predicting column `y` from column `x` of table `t`.

    Computed as correlation * SD(y) / SD(x).
    """
    corr = correlation(t, x, y)
    sd_of_y = np.std(t.column(y))
    sd_of_x = np.std(t.column(x))
    # Multiply before dividing, matching the left-to-right order of r * SD(y) / SD(x).
    scaled = corr * sd_of_y
    return scaled / sd_of_x

def intercept(t, x, y):
    """Return the regression-line intercept for predicting column `y` from column `x` of table `t`.

    Computed as mean(y) - slope * mean(x).
    """
    mean_of_x = np.mean(t.column(x))
    mean_of_y = np.mean(t.column(y))
    slope_times_mean_x = slope(t, x, y) * mean_of_x
    return mean_of_y - slope_times_mean_x

def fitted_values(t, x, y):
    """Return the regression predictions of `y` at every value in column `x` of table `t`.

    The result is an array: slope * x + intercept for each entry of the x column.
    """
    line_slope = slope(t, x, y)
    line_intercept = intercept(t, x, y)
    x_values = t.column(x)
    return line_slope * x_values + line_intercept

In [None]:
# Correlation between the two midterms.
r = correlation(scores, 'Midterm 1', 'Midterm 2')
r

In [None]:
# Option 4: the regression estimate scales the Midterm 1 z-score by r before
# converting it back to Midterm 2 points.
# Stored under its own name so the Option 2 (z-score) estimate held in
# mt2_estimate_2 is not silently overwritten when this cell runs.
mt1_actual = 21
mt1_z = (mt1_actual - np.average(mt1)) / np.std(mt1)
mt2_estimate_4 = np.average(mt2) + mt1_z * r * np.std(mt2)
mt2_estimate_4

In [None]:
# Scatter plot of Midterm 2 against Midterm 1.
scores.scatter('Midterm 1', 'Midterm 2')

In [None]:
# a and b are the slope and intercept of the regression line for predicting
# Midterm 2 from Midterm 1; plot the fitted values alongside the remaining
# columns (the column at index 2 is dropped first) against Midterm 1.
a = slope(scores, 'Midterm 1', 'Midterm 2')
b = intercept(scores, 'Midterm 1', 'Midterm 2')
scores.drop(2).with_column('Fitted', a * mt1 + b ).scatter('Midterm 1')

In [None]:
# Residual = actual Midterm 2 score minus the regression line's prediction.
scores.with_column('Residual', mt2 - (a * mt1 + b)).scatter('Midterm 1', 'Residual')

In [None]:
# Midterm 2 distribution among students whose Midterm 1 score equals mt1_actual.
scores.where("Midterm 1", mt1_actual).hist('Midterm 2')

In [None]:
# Widen the group to students whose Midterm 1 score is within 2 points of mt1_actual.
scores.where("Midterm 1", are.between_or_equal_to(mt1_actual-2, mt1_actual+2)).hist('Midterm 2')

In [None]:
def avg_mt2(mt1):
    """Return the mean Midterm 2 score of students whose Midterm 1 score is within 2 points of `mt1`.

    Reads the notebook-level `scores` table. Note that the parameter `mt1` is a
    single score and, inside this function, shadows the notebook-level `mt1` array.
    """
    low, high = mt1 - 2, mt1 + 2
    nearby_students = scores.where("Midterm 1", are.between_or_equal_to(low, high))
    nearby_mt2 = nearby_students.column("Midterm 2")
    return nearby_mt2.mean()

# Nearest-neighbor style estimate for the example student.
avg_mt2(mt1_actual)

In [None]:
# Apply avg_mt2 to every student's Midterm 1 score.
mt2_avg = scores.apply(avg_mt2, 'Midterm 1')

In [None]:
# Plot the neighborhood averages alongside the remaining columns against Midterm 1
# (the column at index 2 is dropped first).
scores.drop(2).with_column('Avg', mt2_avg).scatter('Midterm 1')

## Tutoring

In [None]:
# Preview the first five rows again, this time with every column shown.
scores.show(5)

In [None]:
# Same scatter plot, with points grouped by the 'Mentored' column.
scores.scatter('Midterm 1', 'Midterm 2', group='Mentored')

In [None]:
# Midterm 1 counts per 'Mentored' group, in 5-point bins from 0 to 40.
scores.hist('Midterm 1', group='Mentored', bins=np.arange(0, 41, 5), normed=False)

In [None]:
# Midterm 2 counts per 'Mentored' group, in 5-point bins from 0 to 50.
scores.hist('Midterm 2', group='Mentored', bins=np.arange(0, 51, 5), normed=False)

In [None]:
# Rows where 'Mentored' is False; used below as the baseline for predictions.
no_mentor = scores.where("Mentored", False)

def avg_mt2_no_mentor(mt1):
    """Return the mean Midterm 2 score of non-mentored students whose Midterm 1 score is within 2 points of `mt1`.

    Reads the notebook-level `no_mentor` table. The parameter `mt1` is a single
    score and, inside this function, shadows the notebook-level `mt1` array.
    """
    low, high = mt1 - 2, mt1 + 2
    nearby_students = no_mentor.where("Midterm 1", are.between_or_equal_to(low, high))
    nearby_mt2 = nearby_students.column("Midterm 2")
    return nearby_mt2.mean()

# Predict a Midterm 2 score for every row of scores (mentored or not) from the
# non-mentored students with a similar Midterm 1 score.
predicted_mt2 = scores.apply(avg_mt2_no_mentor, "Midterm 1")

In [None]:
# Plot the predictions alongside the remaining columns against Midterm 1
# (the column at index 2 is dropped first).
scores.drop(2).with_column('Predicted Mt2', predicted_mt2).scatter('Midterm 1')

In [None]:
# Improvement = actual Midterm 2 score minus the prediction built from
# non-mentored students. `scores` is rebound to the table that carries this
# extra column, which the cells below rely on.
scores = scores.with_column("Improvement", scores.column('Midterm 2') - predicted_mt2)

# Compare the improvement distributions of the two 'Mentored' groups.
scores.hist("Improvement", bins=np.arange(-30, 31, 5), group="Mentored", unit="point")

In [None]:
def of_at_least_5(values):
    """Return the fraction of the entries in the array `values` that are 5 or more."""
    meets_threshold = values >= 5
    how_many = sum(meets_threshold)
    return how_many / len(values)

# For each 'Mentored' group, the share of students whose improvement was at
# least 5 points, formatted as a percentage.
scores.select('Mentored', 'Improvement').group('Mentored', of_at_least_5).set_format(1, PercentFormatter)

In [None]:
# Average of every other column within each 'Mentored' group.
scores.group("Mentored", np.mean)

In [None]:
def mean_ci(observations):
    """Print the mean "Improvement" of `observations` and a 95% interval for it.

    `observations` is a table with an "Improvement" column. The interval runs
    from the 2.5th to the 97.5th percentile of the means of 2000 resamples
    drawn with `observations.sample()`.
    NOTE(review): `sample()` with no arguments is presumably a same-size draw
    with replacement (a bootstrap resample) — confirm against the datascience docs.
    Nothing is returned; results are printed.
    """
    resampled_means = [
        observations.sample().column("Improvement").mean()
        for _ in range(2000)
    ]
    lower = percentile(2.5, resampled_means)
    upper = percentile(97.5, resampled_means)
    print("Mean improvement:", observations.column("Improvement").mean())
    print("95% CI of mean improvement:", lower, "to", upper)

# Interval for the mean improvement across all mentored students.
mentored = scores.where("Mentored", True)
mean_ci(mentored)

In [None]:
# Mentored students who scored below 20 on Midterm 1.
mean_ci(mentored.where("Midterm 1", are.below(20)))

In [None]:
# Mentored students who scored between 20 and 30 on Midterm 1.
# NOTE(review): are.between presumably includes 20 and excludes 30, so the three
# Midterm 1 ranges do not overlap — confirm against the datascience docs.
mean_ci(mentored.where("Midterm 1", are.between(20, 30)))

In [None]:
# Mentored students who scored 30 or more on Midterm 1.
mean_ci(mentored.where("Midterm 1", are.above_or_equal_to(30)))