# Lecture 32: Residuals

In [None]:
from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Non-linear regression

In [None]:
# Load the hybrid-car data from hybrid.csv and display the table.
hybrid = Table.read_table('hybrid.csv')
hybrid

In [None]:
# Keep only the two columns being related, then draw a scatter plot with
# 'mpg' on the horizontal axis and a fitted line overlaid (fit_line=True).
msrp_mpg = hybrid.select('mpg', 'msrp')
msrp_mpg.scatter('mpg', fit_line=True)

**Q:** Does linear regression look appropriate?

A.  Yes  
B.  No  
C.  I don't know  

In [None]:
def msrp_mpg_linear_rmse(any_slope, any_intercept):
    """Return the root mean squared error of a candidate line for msrp vs. mpg.

    The line predicts msrp as any_slope * mpg + any_intercept, using the
    'mpg' and 'msrp' columns of the msrp_mpg table.
    """
    mpg = msrp_mpg.column('mpg')
    msrp = msrp_mpg.column('msrp')
    errors = msrp - (any_slope*mpg + any_intercept)
    return np.sqrt(np.mean(errors ** 2))

In [None]:
# minimize comes from the datascience package: given a function of several
# numeric arguments it returns an array of argument values that minimizes it
# (here item 0 is the slope and item 1 is the intercept).
best_linear = minimize(msrp_mpg_linear_rmse)
best_linear

In [None]:
# RMSE achieved by the slope and intercept stored in best_linear.
msrp_mpg_linear_rmse(best_linear.item(0), best_linear.item(1))

In [None]:
def msrp_mpg_quadratic_mse(a, b, c):
    """Return the root mean squared error of a candidate quadratic for msrp vs. mpg.

    The curve predicts msrp as a*mpg**2 + b*mpg + c, using the 'mpg' and
    'msrp' columns of the msrp_mpg table.

    NOTE: despite the "mse" in the name, the value returned is the ROOT mean
    squared error (the square root is taken), the same quantity the linear
    counterpart msrp_mpg_linear_rmse returns. The name is kept so existing
    callers continue to work.
    """
    mpg = msrp_mpg.column('mpg')
    msrp = msrp_mpg.column('msrp')
    errors = msrp - (a*(mpg**2) + b*mpg + c)
    return np.sqrt(np.mean(errors ** 2))

In [None]:
# Search for the coefficients (a, b, c) that minimize the quadratic fit's error;
# the result holds them in that order.
best_quadratic = minimize(msrp_mpg_quadratic_mse)
best_quadratic

In [None]:
# Error achieved by the best quadratic; compare with the linear fit's RMSE.
msrp_mpg_quadratic_mse(best_quadratic.item(0), 
                       best_quadratic.item(1), 
                       best_quadratic.item(2))

In [None]:
# Evaluate the best quadratic a*x**2 + b*x + c at every observed mpg value.
x = msrp_mpg.column('mpg')
quadratic_curve = best_quadratic.item(0)*(x**2) \
                  + best_quadratic.item(1)*x \
                  + best_quadratic.item(2)
# Plot the observed msrp and the fitted curve against mpg on the same axes.
msrp_mpg.with_column(
    'Best Quadratic Curve', 
    quadratic_curve
).scatter('mpg')

## Residuals

In [None]:
# Load the Galton data from galton.csv.
galton = Table.read_table('galton.csv')

# Build a two-column table with shorter labels: the midparent height and
# the child's height taken from the galton table.
heights = Table().with_columns(
    'MidParent', galton.column('midparentHeight'),
    'Child', galton.column('childHeight')
)

heights

In [None]:
def standard_units(any_numbers):
    """Convert an array of numbers to standard units.

    Each value is expressed as the number of standard deviations it lies
    above (positive) or below (negative) the mean of the array.
    """
    center = np.mean(any_numbers)
    spread = np.std(any_numbers)
    return (any_numbers - center) / spread

# t is a table; x and y are column labels

def correlation(t, x, y):
    """Return the correlation coefficient r between columns x and y of table t.

    r is the mean of the products of the two columns after each has been
    converted to standard units.
    """
    x_su = standard_units(t.column(x))
    y_su = standard_units(t.column(y))
    return np.mean(x_su * y_su)

def slope(t, x, y):
    """Return the slope of the regression line for predicting column y from column x of t.

    The slope is r * SD(y) / SD(x), where r is the correlation of the two columns.
    """
    sd_x = np.std(t.column(x))
    sd_y = np.std(t.column(y))
    return correlation(t, x, y) * sd_y / sd_x

def intercept(t, x, y):
    """Return the intercept of the regression line for predicting column y from column x of t.

    The intercept is mean(y) - slope * mean(x).
    """
    mean_x = np.mean(t.column(x))
    mean_y = np.mean(t.column(y))
    return mean_y - slope(t, x, y) * mean_x

def prediction(t, x, y):
    """Return the regression-line predictions of column y for every row of t.

    The result is an array: slope * (column x) + intercept.
    """
    fitted_slope = slope(t, x, y)
    fitted_intercept = intercept(t, x, y)
    x_values = t.column(x)
    return fitted_slope*x_values + fitted_intercept

In [None]:
# Attach the regression prediction of Child from MidParent as a new column.
heights = heights.with_columns('Prediction', prediction(heights, 'MidParent', 'Child'))
heights

In [None]:
# Plot every other column of heights against MidParent on shared axes.
heights.scatter('MidParent')

**Q:** What are the minimum and maximum residuals for a midparent height of about 70?

A.  min 60, max 78   
B.  min 67, max 67  
C.  min 7, max 12  
D.  min -7, max 12  

<br/><br/><br/><br/><br/><br/>

In [None]:
def residuals(t, x, y):
    """Return the residuals (observed y minus regression prediction) for every row of t."""
    observed = t.column(y)
    fitted = prediction(t, x, y)
    return observed - fitted

In [None]:
# Attach the residual (Child minus predicted Child) as a new column.
heights = heights.with_columns('Residual', residuals(heights, 'MidParent', 'Child'))
heights

In [None]:
# Residuals for rows whose midparent height is about 70 (filtered with
# are.between(69.5, 70.5)). The sort only orders the array; min() and max()
# would give the same answers without it.
heights_residuals = heights.where('MidParent', are.between(69.5, 70.5)).sort('Residual').column('Residual')
heights_residuals.min(), heights_residuals.max()

## Residuals as regression diagnostics

In [None]:
# Re-draw the heights columns against MidParent before looking at the residual plot.
heights.scatter('MidParent')

In [None]:
def plot_residuals(t, x, y):
    """Draw two diagnostic plots for the regression of column y on column x of t.

    First plot: y and the fitted values against x.
    Second plot: the residuals against x.
    """
    augmented = t.with_columns(
        'Fitted', prediction(t, x, y),
        'Residual', residuals(t, x, y),
    )
    # Only x, y and the fitted values belong on the first plot.
    fit_view = augmented.select(x, y, 'Fitted')
    fit_view.scatter(0)
    augmented.scatter(x, 'Residual')

In [None]:
# Residual diagnostics for predicting Child from MidParent.
plot_residuals(heights, 'MidParent', 'Child')

**Q:** Do you see any pattern in the residuals?

A.  Yes  
B.  No  

<br/><br/><br/><br/>

In [None]:
# Residual diagnostics for the linear fit of msrp on mpg.
plot_residuals(msrp_mpg, 'mpg', 'msrp')

**Q:** Do you see any pattern in the residuals?

A.  Yes  
B.  No  

<br/><br/><br/><br/>

## Dugongs

In [None]:
# Load the dugong data from dugong.csv and display it.
# Length is in meters; age is in years, as estimated from physical
# characteristics such as teeth.
dugong = Table.read_table('dugong.csv')
dugong

In [None]:
# Correlation coefficient between the Length and Age columns.
correlation(dugong, 'Length', 'Age')

In [None]:
# Residual diagnostics for the linear fit of Age on Length.
plot_residuals(dugong, 'Length', 'Age')

**Q:** Do you see any pattern in the residuals?

A.  Yes  
B.  No  

<br/><br/><br/><br/>

In [None]:
def dugong_quadratic_rmse(a, b, c):
    """Return the root mean squared error of a candidate quadratic for Age vs. Length.

    The curve predicts Age as a*Length**2 + b*Length + c, using the
    'Length' and 'Age' columns of the dugong table.
    """
    length = dugong.column('Length')
    age = dugong.column('Age')
    errors = age - (a*(length**2) + b*length + c)
    return np.sqrt(np.mean(errors ** 2))

# Find the quadratic coefficients (a, b, c) that minimize the RMSE.
dugong_best_quadratic = minimize(dugong_quadratic_rmse)
# Evaluate the fitted quadratic at every observed Length.
x = dugong.column('Length')
dugong_quadratic_curve = dugong_best_quadratic.item(0)*(x**2) \
                  + dugong_best_quadratic.item(1)*x \
                  + dugong_best_quadratic.item(2)
# First plot: Age and the fitted curve against Length.
dugong.with_column(
    'Best Quadratic Curve', 
    dugong_quadratic_curve
).scatter('Length')
# Second plot: residuals of the quadratic fit (observed Age minus curve) against Length.
dugong.with_column(
    'Residual', dugong.column('Age') - dugong_quadratic_curve
).scatter('Length', 'Residual')

**Q:** Do you see any pattern in the residuals?

A.  Yes  
B.  No  

<br/><br/><br/><br/>

## Mean and SD of Residuals

On average, the deviation from the mean (a value in the middle) is 0.

In [None]:
# Small example: the deviations of these numbers from their mean average to 0.
x = make_array(4, 8, 15, 16, 23, 42)
mean = np.mean(x)
np.mean(x - mean)

On average, the deviation from the regression prediction (a line drawn through the middle) is 0.

In [None]:
# Average residual of the linear fit of Age on Length.
np.mean(residuals(dugong, 'Length', 'Age'))

In [None]:
# Average residual of the linear fit of Child on MidParent.
np.mean(residuals(heights, 'MidParent', 'Child'))

In [None]:
# Average residual of the linear fit of msrp on mpg.
np.mean(residuals(msrp_mpg, 'mpg', 'msrp'))

No matter what the shape of the scatter plot, the SD of the residuals from the regression line is a fraction of the SD of the observed values of $y$. The fraction is $\sqrt{1-r^2}$.

$$
\mbox{SD of residuals} ~=~ \sqrt{1 - r^2} \cdot \mbox{SD of }y
$$

In [None]:
# SD of the residuals, computed directly from the residuals themselves.
np.std(residuals(heights, 'MidParent', 'Child'))

In [None]:
# Correlation between MidParent and Child, used in the formula check that follows.
r = correlation(heights, 'MidParent', 'Child')
r

In [None]:
# Same SD obtained from the formula sqrt(1 - r^2) * SD of y.
np.sqrt(1 - r**2) * np.std(heights.column('Child'))

In [None]:
# SD of the residuals when mpg is predicted from acceleration in the hybrid table.
np.std(residuals(hybrid, 'acceleration', 'mpg'))

In [None]:
# Correlation between acceleration and mpg (this reassigns the notebook-level r).
r = correlation(hybrid, 'acceleration', 'mpg')
r

In [None]:
# Formula check for the hybrid data: sqrt(1 - r^2) * SD of mpg.
np.sqrt(1 - r**2)*np.std(hybrid.column('mpg'))

## Clustering

In [None]:
def rmse(t, x, y):
    """Return the root mean squared error of the regression line for y on x.

    t is a table; x and y are column labels. The value is computed with the
    identity

        SD of residuals = sqrt(1 - r**2) * SD of y

    where r is the correlation between the two columns. Because the
    residuals of the regression line average to 0, their SD is the RMSE.
    """
    r = correlation(t, x, y)
    # The square root matters: (1 - r**2) * SD(y) without it understates
    # the error whenever 0 < |r| < 1.
    return np.sqrt(1 - r ** 2) * np.std(t.column(y))

def plot_predictions(t, x, y):
    """Plot y and its regression predictions against x, then print summary numbers.

    t is a table; x and y are column labels. After drawing the scatter plot
    this prints the correlation r, the value returned by rmse, and that value
    divided by the SD of y.
    """
    xy = t.select(x, y)
    xy.with_columns('Prediction', prediction(t, x, y)).scatter(0)
    error = rmse(xy, x, y)
    print('r          :', correlation(t, x, y))
    print('RMSE       :', error)
    ratio = error / np.std(xy.column(y))
    print('RMSE/std(y):', ratio)

In [None]:
# Predictions and error summary for Child vs. MidParent.
plot_predictions(heights, 'MidParent', 'Child')

In [None]:
def random_r(r, n=1000):
    """Generate a table of n random (x, y) pairs with correlation approximately r.

    x and z are independent samples from the standard normal distribution,
    and y = r*x + sqrt(1 - r**2)*z.

    r: the target correlation; it should lie between -1 and 1, otherwise
       1 - r**2 is negative and the square root is not a real number.
    n: number of points to generate (defaults to 1000, the size that was
       previously hard-coded).

    Returns a Table with columns 'x' and 'y'.
    """
    x = np.random.normal(0, 1, n)
    z = np.random.normal(0, 1, n)
    y = r*x + (np.sqrt(1-r**2))*z
    return Table().with_columns('x', x, 'y', y)

In [None]:
# Synthetic data generated with the same correlation as the heights data, for comparison.
plot_predictions(random_r(correlation(heights, 'MidParent', 'Child')), 'x', 'y')

In [None]:
# Load the exam scores from exams.csv and plot predictions of 'final' from 'prelim'.
exams = Table.read_table('exams.csv')
plot_predictions(exams, 'prelim', 'final')

In [None]:
# Synthetic data generated with the same correlation as the exams data, for comparison.
plot_predictions(random_r(correlation(exams, 'prelim', 'final')), 'x', 'y')

In [None]:
# Synthetic data with correlation approximately 0.999.
plot_predictions(random_r(.999), 'x', 'y')

In [None]:
# Synthetic data with correlation approximately 0.5.
plot_predictions(random_r(.5), 'x', 'y')

In [None]:
# Synthetic data with correlation approximately 0.
plot_predictions(random_r(0), 'x', 'y')

## Bounds

In [None]:
# Predictions and error summary for the heights data again, as the starting point for this section.
plot_predictions(heights, 'MidParent', 'Child')

In [None]:
# Display the current heights table.
heights

In [None]:
# Express each residual in units of the regression's RMSE.
rmse_heights = rmse(heights, 'MidParent', 'Child')
heights = heights.with_column(
    'Residual / RMSE', heights.column('Residual') / rmse_heights
)
heights

In [None]:
# Histogram of the scaled residuals with unit-width bins whose edges run from -3 to 3.
heights.hist('Residual / RMSE', bins=np.arange(-3, 4, 1))

Rule of thumb:
* About 68% of actual y values are within 1 RMSE of the regression line prediction
* About 95% of actual y values are within 2 RMSE of the regression line prediction

In [None]:
def predict_height(mp):
    """Return the regression prediction of Child height for midparent height mp.

    The slope and intercept are those of the regression of 'Child' on
    'MidParent' in the heights table.
    """
    a = slope(heights, 'MidParent', 'Child')
    b = intercept(heights, 'MidParent', 'Child')
    return a * mp + b

# Smallest and largest midparent heights: the x-range over which the bands are drawn.
min_x = heights.column('MidParent').min()
max_x = heights.column('MidParent').max()

# Regression predictions at the two ends of that range.
pred_y_min_x = predict_height(min_x)
pred_y_max_x = predict_height(max_x)

plot_predictions(heights, 'MidParent', 'Child')
# Green lines: the regression line shifted down and up by 1 * rmse_heights.
plots.plot(make_array(min_x, max_x), make_array(pred_y_min_x, pred_y_max_x)-rmse_heights, color='green');
plots.plot(make_array(min_x, max_x), make_array(pred_y_min_x, pred_y_max_x)+rmse_heights, color='green');
# Blue lines: the regression line shifted down and up by 2 * rmse_heights.
plots.plot(make_array(min_x, max_x), make_array(pred_y_min_x, pred_y_max_x)-2*rmse_heights, color='blue');
plots.plot(make_array(min_x, max_x), make_array(pred_y_min_x, pred_y_max_x)+2*rmse_heights, color='blue');