In [None]:
from datascience import *
import numpy as np
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

In [None]:
def standard_units(any_numbers):
    """Convert a collection of numbers to standard units.

    Each value is re-expressed as the number of standard deviations it lies
    above (positive) or below (negative) the mean of the collection.

    any_numbers: a numpy array, or any sequence numpy can treat as one
        (for example a list or tuple of numbers).
    Returns the values in standard units, one per input value.

    Note: np.std is the population SD. If every value is the same, the SD
    is 0 and the division produces NaN rather than raising an error.
    """
    # np.subtract (rather than the `-` operator) lets plain lists and tuples
    # work too; for numpy arrays the result is the same as `any_numbers - mean`.
    deviations = np.subtract(any_numbers, np.mean(any_numbers))
    return deviations/np.std(any_numbers)

def correlation(t, x, y):
    """Return the correlation coefficient between columns x and y of table t.

    Computed as the mean of the products of the two columns after each has
    been converted to standard units.
    """
    x_in_su = standard_units(t.column(x))
    y_in_su = standard_units(t.column(y))
    return np.mean(x_in_su * y_in_su)

def slope(table, x, y):
    """Return the slope of the regression line for predicting column y from column x.

    The slope is r * SD(y) / SD(x), where r is the correlation of the two columns.
    """
    sd_x = np.std(table.column(x))
    sd_y = np.std(table.column(y))
    return correlation(table, x, y) * sd_y/sd_x

def intercept(table, x, y):
    """Return the intercept of the regression line for predicting column y from column x.

    The intercept is mean(y) - slope * mean(x).
    """
    mean_x = np.mean(table.column(x))
    mean_y = np.mean(table.column(y))
    return mean_y - slope(table, x, y) * mean_x

def fitted_values(table, x, y):
    """Return the regression estimate of column y for every value in column x."""
    x_values = table.column(x)
    return slope(table, x, y) * x_values + intercept(table, x, y)

def scatter_fit(table, x, y):
    """Draw a scatter plot of column y against column x with the regression line.

    Draws on the current matplotlib axes: the points, the fitted line in gold,
    and axis labels taken from the two column names.
    """
    x_values = table.column(x)
    plots.scatter(x_values, table.column(y), s=20)
    plots.plot(x_values, fitted_values(table, x, y), lw=2, color='gold')
    plots.xlabel(x)
    plots.ylabel(y)

In [None]:
def draw_and_compare(true_slope, true_int, sample_size, seed=None):
    """Simulate points around a known line and compare it with the regression line.

    Draws `sample_size` x values from a normal distribution (mean 50, SD 5),
    places each point on the line y = true_slope*x + true_int, and then adds
    normal noise (mean 0, SD 6) to y. Four stacked panels are drawn:

      1. the true line together with the generated points,
      2. the points alone (what an observer gets to see),
      3. the points with the regression line estimated from them,
      4. the regression line and the true line together.

    true_slope, true_int: slope and intercept of the true line.
    sample_size: number of points to generate.
    seed: optional integer seed. When given, the same points are generated
        on every call; when None (the default), numpy's global random
        generator is used, as in the unseeded version of this function.
    Returns nothing; the figure is drawn with matplotlib.
    """
    # A seeded RandomState keeps the run reproducible without touching the
    # global generator; with no seed, fall back to np.random itself.
    rng = np.random if seed is None else np.random.RandomState(seed)
    x = rng.normal(50, 5, sample_size)
    eps = rng.normal(0, 6, sample_size)
    y = (true_slope*x + true_int) + eps
    tyche = Table().with_columns(
        'x', x,
        'y', y
    )

    # End points of the true line, spanning the range of the simulated x values.
    # Computed once and reused by both panels that draw the true line.
    xlims = np.array([np.min(x), np.max(x)])
    true_line = true_slope*xlims + true_int

    plots.figure(figsize=(6, 16))
    plots.subplot(4, 1, 1)
    plots.scatter(tyche['x'], tyche['y'], s=20)
    plots.plot(xlims, true_line, lw=2, color='green')
    plots.title('True Line, and Points Created')

    plots.subplot(4, 1, 2)
    plots.scatter(tyche['x'], tyche['y'], s=20)
    plots.title('What We Get to See')

    plots.subplot(4, 1, 3)
    scatter_fit(tyche, 'x', 'y')
    # scatter_fit labels both axes; blank them out for this panel.
    plots.xlabel("")
    plots.ylabel("")
    plots.title('Regression Line: Estimate of True Line')

    plots.subplot(4, 1, 4)
    scatter_fit(tyche, 'x', 'y')
    # Only the y label is cleared here; the bottom panel keeps its x label.
    plots.ylabel("")
    plots.plot(xlims, true_line, lw=2, color='green')
    plots.title("Regression Line and True Line")

## Regression Recap ##

### The Model and Our Estimate ###

In [None]:
# Simulate points scattered around a known ("true") line, then compare
# that line with the regression line estimated from the points alone.

draw_and_compare(true_slope=4, true_int=-5, sample_size=50)

In [None]:
# Read galton.csv into a table; later cells use its 'gender', 'mother' and 'childHeight' columns.
heights = Table.read_table('galton.csv')

In [None]:
# Keep only the rows for female children, with the mother's height and the
# child's height (relabeled 'daughter').
# The table is built from the CSV rather than from `heights` itself, so this
# cell can be re-run safely: filtering `heights` in place would fail the second
# time because the 'gender' column has already been dropped.
heights = (
    Table.read_table('galton.csv')
    .where('gender', 'female')
    .select('mother', 'childHeight')
    .relabeled('childHeight', 'daughter')
)

In [None]:
# Display the table of mothers' and daughters' heights.
heights

In [None]:
# Correlation coefficient between mothers' and daughters' heights.
r = correlation(heights, 'mother', 'daughter')
r

In [None]:
# Scatter plot of daughter against mother, with a fitted line drawn through the points.
heights.scatter('mother', 'daughter', fit_line=True)

In [None]:
# Scatter plot with the fitted line, plus two extra lines overlaid for comparison.
heights.scatter('mother', 'daughter', fit_line=True)
average_daughter = np.average(heights.column('daughter'))
# Flat line at the daughters' average height, drawn from x = 58 to x = 72.
plots.plot([58, 72], [average_daughter, average_daughter])
# Straight line from (60, 58) to (70, 70).
# NOTE(review): these end points are hard-coded; presumably a candidate line chosen by eye — confirm.
plots.plot([60, 70], [58, 70]);

In [None]:
# Scatter plot of daughter against mother showing the fitted line only.
heights.scatter('mother', 'daughter', fit_line=True)

### Code to Compute Regression Estimates and Residuals ###

In [None]:
def standard_units(x):
    """Convert a collection of numbers to standard units.

    Each value becomes (value - average) / SD, i.e. how many standard
    deviations it lies above or below the average of the collection.

    x: a numpy array, or any sequence numpy can treat as one (for example a
        list or tuple of numbers).
    Returns the values in standard units, one per input value.

    Note: np.std is the population SD. If every value is the same, the SD
    is 0 and the division produces NaN rather than raising an error.
    """
    # np.subtract (rather than the `-` operator) lets plain lists and tuples
    # work too; for numpy arrays the result is the same as `x - average`.
    deviations = np.subtract(x, np.average(x))
    return deviations / np.std(x)

def correlation(table, x, y):
    """Return the correlation coefficient between columns x and y of table.

    This is the average of the products of the two columns in standard units.
    """
    su_products = standard_units(table.column(x)) * standard_units(table.column(y))
    return np.average(su_products)

def slope(table, x, y):
    """Return the regression slope for predicting column y from column x: r * SD(y) / SD(x)."""
    sd_x = np.std(table.column(x))
    sd_y = np.std(table.column(y))
    return correlation(table, x, y) * sd_y / sd_x

def intercept(table, x, y):
    """Return the regression intercept for predicting column y from column x.

    The intercept is average(y) - slope * average(x).
    """
    average_x = np.average(table.column(x))
    average_y = np.average(table.column(y))
    return average_y - slope(table, x, y)*average_x

def prediction_at(table, x, y, given_x):
    """Return the regression estimate of y at the value given_x.

    The regression line is fitted to columns x and y of table, then
    evaluated at given_x.
    """
    a = slope(table, x, y)
    b = intercept(table, x, y)
    return a*given_x + b

def fitted_values(table, x, y):
    """Return the regression estimate of column y for every value in column x."""
    a = slope(table, x, y)
    b = intercept(table, x, y)
    return a*table.column(x) + b

def residuals(table, x, y):
    """Return the residuals: observed values of column y minus their regression estimates."""
    observed = table.column(y)
    estimated = fitted_values(table, x, y)
    return observed - estimated

### Equation of the Regression Line ###

In [None]:
# Slope of the regression line for predicting daughter's height from mother's height.
best_slope = slope(heights, 'mother', 'daughter')
best_slope

In [None]:
# Intercept of the regression line for predicting daughter's height from mother's height.
best_intercept = intercept(heights, 'mother', 'daughter')
best_intercept

### `minimize` ###

In [None]:
# Mean squared error of predicting a daughter's height (d)
# from her mother's height (m) with the line a*m + b.

def mse_d_m(a, b):
    """Return the mean squared error of the line with slope a and intercept b.

    Reads the 'mother' and 'daughter' columns of the global `heights` table.
    """
    mothers = heights.column('mother')
    daughters = heights.column('daughter')
    errors = daughters - (a*mothers + b)
    return np.mean(errors**2)

In [None]:
# Search numerically for the slope and intercept that minimize mse_d_m.
minimize(mse_d_m)

In [None]:
# Slope and intercept from the regression formulas, to compare with the minimizer's result.
best_slope, best_intercept

In [None]:
# Mean squared error of the regression line.
mse_d_m(best_slope, best_intercept)

In [None]:
# Mean squared error of a different line (slope 0.3, intercept 44), for comparison.
mse_d_m(0.3, 44)

### Fitting a Non-Linear Function ###

In [None]:
# Read shotput.csv into a table; later cells use its 'Weight Lifted' and 'Shot Put Distance' columns.
shotput = Table.read_table('shotput.csv')

In [None]:
# Display the shot put table.
shotput

In [None]:
# Scatter plot using column index 0 and column index 1 of the table.
# NOTE(review): presumably 'Weight Lifted' on x and 'Shot Put Distance' on y — confirm the column order.
shotput.scatter(0, 1)

In [None]:
# Quadratic model: predict shot put distance from weight lifted,
# scored by mean squared error.
x = shotput.column('Weight Lifted')
y = shotput.column('Shot Put Distance')

def mse_quad(a, b, c):
    """Return the mean squared error of predicting y by a*x**2 + b*x + c.

    Uses the module-level arrays x (weight lifted) and y (shot put distance).
    """
    quadratic_term = a*(x**2)
    linear_term = b*x
    errors = y - (quadratic_term + linear_term + c)
    return np.mean(errors**2)

In [None]:
# Minimize the quadratic's mean squared error; `best` holds the fitted values of a, b and c.
best = minimize(mse_quad)
best

In [None]:
# Evaluate the fitted quadratic at every observed x.
quadratic_fit = best.item(0)*(x**2) + best.item(1)*x + best.item(2)

In [None]:
# Add the fitted values as an extra column and plot against column index 0.
# NOTE(review): scatter(0) with no y column presumably plots every other column against column 0 — confirm.
shotput.with_column('Fitted Values', quadratic_fit).scatter(0)

### The Rough Size of the Residuals ###

In [None]:
# Scatter plot with the fitted line, plus a flat line at the daughters' average height.
heights.scatter('mother', 'daughter', fit_line=True)
average_daughter = np.average(heights.column('daughter'))
# The flat line is drawn from x = 58 to x = 72.
plots.plot([58, 72], [average_daughter, average_daughter]);

In [None]:
# The daughters' average height.
average_daughter

In [None]:
# SD of the daughters' heights: the rms error of predicting every
# daughter's height by the flat line at average_daughter.

sd_daughters = np.std(heights.column('daughter'))
sd_daughters

In [None]:
# SD of residuals (errors in regression estimates),
# computed from the formula sqrt(1 - r**2) * SD of the daughters' heights.

r = correlation(heights, 'mother', 'daughter')
np.sqrt(1 - r**2) * sd_daughters

In [None]:
# SD of the residuals computed directly, to compare with sqrt(1 - r**2) * sd_daughters.
np.std(residuals(heights, 'mother', 'daughter'))

In [None]:
# Residual plot: each daughter's residual plotted against her mother's height.
resids = residuals(heights, 'mother', 'daughter')
heights.with_column('residual', resids).scatter('mother', 'residual')

### Prediction ###

In [None]:
# Regression estimate of a daughter's height when the mother's height is 66.
prediction_at(heights, 'mother', 'daughter', 66)