# Lecture 25: Linear Regression

In [None]:
from datascience import *
import numpy as np

import matplotlib.pyplot as plots
# Apply matplotlib's 'fivethirtyeight' style sheet to every figure drawn after this line.
plots.style.use('fivethirtyeight')
%matplotlib inline

## Review of correlation

In [None]:
# Load hybrid.csv into a Table and preview its first 3 rows.
hybrid = Table.read_table('hybrid.csv')
hybrid.show(3)

In [None]:
# Visualize: is there a linear relationship between two variables?
# fit_line=True overlays a fitted straight line on the scatter plot.
hybrid.scatter('acceleration', 'msrp', fit_line=True)

In [None]:
def standard_units(numbers):
    """Convert any array of numbers to standard units.

    Each value is re-expressed as its distance from the mean of `numbers`,
    measured in multiples of the standard deviation given by np.std.
    """
    center = np.mean(numbers)
    spread = np.std(numbers)
    return (numbers - center) / spread

def correlation(t, x, y):
    """Return the correlation coefficient (r) of two variables.

    t is a table; x and y are the labels of two of its columns.
    r is the mean of the element-wise products of the two columns after
    each has been converted to standard units.
    """
    x_su = standard_units(t.column(x))
    y_su = standard_units(t.column(y))
    return np.mean(x_su * y_su)
   
def standardize(t):
    """Convert table t to standard units.

    Returns a new table with one column per column of t, labeled
    '<original label> (su)', holding that column's values in standard units.
    """
    converted = Table()
    for name in t.labels:
        su_values = standard_units(t.column(name))
        converted = converted.with_column(name + ' (su)', su_values)
    return converted

In [None]:
# Quantify: how scattered are the points around a straight line?
# correlation() returns r: the mean product of the two columns in standard units.
correlation(hybrid, 'acceleration', 'msrp')

## Properties of correlation

**1. Unaffected by changing units**

In [None]:
# Keep only the two columns being compared and compute r in the original units.
accel_msrp = hybrid.select('acceleration', 'msrp')
correlation(accel_msrp, 'acceleration', 'msrp')

In [None]:
# Convert both columns to standard units (labels gain an ' (su)' suffix), then recompute r.
accel_msrp_su = standardize(accel_msrp)
correlation(accel_msrp_su, 'acceleration (su)', 'msrp (su)')

In [None]:
# Add a rescaled copy of msrp (divided by 1000) and recompute r against that column.
accel_msrp_1k = accel_msrp.with_column(
    'msrp ($k)', accel_msrp.column('msrp') / 1000
)
correlation(accel_msrp_1k, 'acceleration', 'msrp ($k)')

This is because $r$ measures the **relative** spread around a straight line, which does not depend on the units of either variable.

In [None]:
# The same data drawn on three scales: original units, standard units,
# and msrp divided by 1000.
accel_msrp.scatter('acceleration', 'msrp')
accel_msrp_su.scatter('acceleration (su)', 'msrp (su)')
accel_msrp_1k.scatter('acceleration', 'msrp ($k)')

**2. Unaffected by swapping variables**

In [None]:
# r with acceleration passed as x and msrp as y.
correlation(hybrid, 'acceleration', 'msrp')

In [None]:
# Same two columns with the x and y roles swapped.
correlation(hybrid, 'msrp', 'acceleration')

This is because the roles of $x$ and $y$ are symmetric in the formula for $r$, and swapping the axes only reflects the scatter plot without changing how tightly the points cluster around a line.

In [None]:
# Plot the same two columns twice, with the axes swapped in the second plot.
accel_msrp.scatter('acceleration', 'msrp')
accel_msrp.scatter('msrp', 'acceleration')

## A little justification for the correlation formula

In [None]:
# some toy data
t = Table().with_columns(
    'x', make_array(1,3,4,5,7),
    'y', make_array(5,9,7,1,13)
)
# A bare `t` here would not be rendered: only the last expression of a cell
# is displayed, so the table is shown explicitly before the plot.
t.show()
# s=80 sets the dot size for the five points.
t.scatter('x', 'y', s=80)

In [None]:
# r for the toy data.
correlation(t, 'x', 'y')

In [None]:
# let's plot in standard units
# recall it's just the axes that change, not the relative positions of the data points

# standardize() returns a new table whose columns are labeled 'x (su)' and 'y (su)'.
t_su = standardize(t)
t_su.scatter('x (su)', 'y (su)', s=80)

In [None]:
# Draw each point with a size proportional to |x (su) * y (su)| and label it
# with the signed product, i.e. that point's contribution to the formula for r.

t_su_prod = t_su.with_column(
    'product', t_su.column('x (su)') * t_su.column('y (su)')
)
t_su_prod = t_su_prod.with_column(
    'abs product', np.abs(t_su_prod.column('product'))
)
t_su_prod.scatter('x (su)', 'y (su)', sizes='abs product', labels='product', s=100)

# Fix both axes to [-3, 3] and draw dashed lines through the origin
# to separate the four quadrants.
plots.xlim(-3, 3);
plots.ylim(-3, 3);
plots.plot([-3, 3], [0, 0], color='blue', linestyle='dashed');
plots.plot([0, 0], [-3, 3], color='blue', linestyle='dashed');

# Mark each quadrant with the sign of the products that fall in it.
plots.text(2, 2, '+', fontsize=48);
plots.text(-2.5, 2, '-', fontsize=64);
plots.text(-2.75, -2.5, '+', fontsize=48);
plots.text(2.25, -2.5, '-', fontsize=64);

* Because the axes are in standard units, the origin is the point of averages: dots in the $+$ quadrants have positive products, and dots in the $-$ quadrants have negative products
* $r$ is the average of the products (the labels on the dots)
* More big dots in the $+$ quadrants bring $r$ closer to $1$
* More big dots in the $-$ quadrants bring $r$ closer to $-1$
* Dots spread relatively evenly across all four quadrants bring $r$ closer to $0$

## Prediction using correlation

In [None]:
# Load galton.csv, then keep only the two height columns under shorter labels.
galton = Table.read_table('galton.csv')
heights = (
    galton
    .select('midparentHeight', 'childHeight')
    .relabeled('midparentHeight', 'MidParent')
    .relabeled('childHeight', 'Child')
)
heights.show(3)

In [None]:
# Preview the full galton table, including the columns not kept in `heights`.
galton.show(3)

In [None]:
# Only the x column is named, so the remaining column ('Child') is plotted against it.
heights.scatter('MidParent')

In [None]:
# Same scatter plot with both columns converted to standard units.
heights_su = standardize(heights)
heights_su.scatter('MidParent (su)')

In [None]:
def predict_child_su(new_midparent_height):
    """Predict a child's height (su) as the mean 'Child (su)' of nearby rows.

    new_midparent_height is a midparent height in standard units. The
    nearby rows are those of the global heights_su table whose
    'MidParent (su)' value is selected by
    are.between(new_midparent_height - 0.5, new_midparent_height + 0.5).
    """
    lower = new_midparent_height - 0.5
    upper = new_midparent_height + 0.5
    nearby = heights_su.where('MidParent (su)', are.between(lower, upper))
    return nearby.column('Child (su)').mean()

# Compute a prediction for every row's midparent height and plot the predictions
# together with the actual child heights against 'MidParent (su)'.
heights_su_pred = heights_su.with_column(
    'Prediction (su)', heights_su.apply(predict_child_su, 'MidParent (su)')
)
heights_su_pred.scatter('MidParent (su)')

The yellow line is the *graph of averages*.

**Q:** What is the predicted height (in su) of a child whose parents have an average height?

A. -1  
B. 0  
C. 1  
D. Whatever $r$ is for this scatter plot  
E. Can't answer from this plot  

<br/><br/><br/><br/><br/><br/>

In [None]:
# r is kept as a global: the plotting cells and predict_with_r below read it.
r = correlation(heights, 'MidParent', 'Child')
r

In [None]:
# Same two height columns as `heights`, restricted to rows whose gender is 'female'.
galton_f = galton.where('gender', are.equal_to('female'))
heights_f = (
    galton_f
    .select('midparentHeight', 'childHeight')
    .relabeled('midparentHeight', 'MidParent')
    .relabeled('childHeight', 'Child')
)
heights_f.show(3)

In [None]:
# Scatter plot and r for the female-only subset.
heights_f.scatter('MidParent')
plots.ylim(55,80) # same as original plot
# Last expression in the cell, so r for the subset is displayed below the plot.
correlation(heights_f,'MidParent','Child')

## The graph of averages and $r$

In [None]:
# Redraw the data and the graph of averages in standard units.
heights_su_pred.scatter('MidParent (su)')

# plot the line "y = r * x"
# The endpoints x = -3 and x = 3.5 are hard-coded; presumably chosen to span
# the plotted data - confirm against the scatter plot.
plots.plot([-3, 3.5], [-3*r, 3.5*r], color='blue', lw=2);

The line $y = r \times x$ is a version of the graph of averages, smoothed to a line.

## Regression to the mean

In [None]:
heights_su.scatter('MidParent (su)')

# Plot the "SD line"
# In standard units this is the line y = x, drawn from x = -3 to x = 3.5.
plots.plot([-3, 3.5], [-3, 3.5], color='green', lw=2);

In [None]:
# Draw both lines on one plot so their slopes (1 versus r) can be compared.
heights_su.scatter('MidParent (su)')

# Plot the "SD line"
plots.plot([-3, 3.5], [-3, 3.5], color='green', lw=2);

# plot the regression line "y = r * x"
plots.plot([-3, 3.5], [-3*r, 3.5*r], color='blue', lw=2);

The regression line is flatter than the SD line: extreme values of $x$ are paired with less extreme predicted values of $y$.

## Prediction by linear regression

In [None]:
# Means and SDs of both columns in original units. These globals, together
# with r, are what predict_with_r uses to convert to and from standard units.
parent_mean = np.mean(heights.column('MidParent'))
parent_sd = np.std(heights.column('MidParent'))
child_mean = np.mean(heights.column('Child'))
child_sd = np.std(heights.column('Child'))
print('Parent: mean =', parent_mean, '; SD =', parent_sd)
print(' Child: mean =', child_mean, '; SD =', child_sd)
print('     r:', r)

In [None]:
def predict_with_r(parent):
    """Return a prediction of the height of a child whose parents
    have a midparent height of `parent`, using linear regression.

    The midparent height is converted to standard units, multiplied by r
    to give the predicted child height in standard units, and converted
    back to original units. Reads the globals parent_mean, parent_sd,
    child_mean, child_sd and r.
    """
    parent_z = (parent - parent_mean) / parent_sd
    return (r * parent_z) * child_sd + child_mean

In [None]:
def predict_with_average(parent):
    """Return a prediction of the height of a child whose parents
    have a midparent height of `parent`, using the average of nearby parents.

    The nearby rows are those of the global `heights` table whose
    'MidParent' value is selected by
    are.between(parent - 0.5, parent + 0.5); the prediction is the mean
    of their 'Child' values.
    """
    close_points = heights.where('MidParent',
                                 are.between(parent - 0.5,
                                             parent + 0.5))
    return close_points.column('Child').mean()

In [None]:
# Add both sets of predictions as columns and plot them, together with the
# actual child heights, against 'MidParent'.
heights.with_columns(
    'Prediction with averages', heights.apply(predict_with_average, 'MidParent'),
    'Prediction with r', heights.apply(predict_with_r, 'MidParent')
).scatter('MidParent')

## Slope and Intercept

In [None]:
def slope(t, x, y):
    """The slope of the regression line (original units).

    t is a table; x and y are column labels. The slope is
    r * SD(y) / SD(x), where r is the correlation of the two columns.
    """
    corr = correlation(t, x, y)
    sd_x = np.std(t.column(x))
    sd_y = np.std(t.column(y))
    return corr * sd_y / sd_x

def intercept(t, x, y):
    """The intercept of the regression line (original units).

    t is a table; x and y are column labels. The intercept is
    mean(y) - slope * mean(x).
    """
    mean_x = np.mean(t.column(x))
    mean_y = np.mean(t.column(y))
    return mean_y - slope(t, x, y) * mean_x

In [None]:
# Prediction for a midparent height of 74, computed via standard units and r.
predict_with_r(74)

In [None]:
def predict_with_slope_and_intercept(x):
    """Predict a child's height for midparent height x from the regression line.

    Evaluates slope * x + intercept. Both coefficients are computed from the
    'MidParent' and 'Child' columns of the global `heights` table on every call.
    """
    line_slope = slope(heights, 'MidParent', 'Child')
    line_intercept = intercept(heights, 'MidParent', 'Child')
    return line_slope * x + line_intercept

In [None]:
# Same input as predict_with_r(74), this time computed from the slope and intercept.
predict_with_slope_and_intercept(74)