BerkeleyX: Data8.3x

Foundations of Data Science: Prediction and Machine Learning

Section 6: Residuals

In [None]:
from datascience import *
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Lec 6.1 Introduction

In [None]:
def standard_units(any_numbers):
    """Rescale an array of numbers to standard units.

    Each value is expressed as its deviation from the average of
    ``any_numbers``, divided by the standard deviation (``np.std``).
    """
    center = np.average(any_numbers)
    spread = np.std(any_numbers)
    return (any_numbers - center) / spread

# Below, t is a table; x and y are column indices or labels.

def correlation(t, x, y):
    """Compute r, the correlation coefficient of columns x and y of t.

    r is the mean of the element-wise products of the two columns after
    each has been converted to standard units.
    """
    x_standard = standard_units(t.column(x))
    y_standard = standard_units(t.column(y))
    return np.mean(x_standard * y_standard)

def slope(t, x, y):
    """Slope of the regression line for estimating y from x (original units).

    Computed as r * SD(y) / SD(x), where r is the correlation of the two
    columns.
    """
    sd_x = np.std(t.column(x))
    sd_y = np.std(t.column(y))
    return correlation(t, x, y) * sd_y / sd_x

def intercept(t, x, y):
    """Intercept of the regression line for estimating y from x (original units).

    Computed as mean(y) - slope * mean(x).
    """
    mean_x = np.mean(t.column(x))
    mean_y = np.mean(t.column(y))
    return mean_y - slope(t, x, y) * mean_x

def fitted_values(t, x, y):
    """Regression estimates of y, one for each value in column x of t.

    Each estimate is slope * x + intercept, i.e. the height of the
    regression line at that x.
    """
    x_values = t.column(x)
    return slope(t, x, y) * x_values + intercept(t, x, y)

Residuals

Residual = actual value − regression estimate (the fitted value)

In [None]:
# Read the galton.csv table and keep only the two columns used below,
# under the shorter labels 'MidParent' and 'Child'.
galton = Table.read_table('../../data/galton.csv')

heights = Table().with_columns(
    'MidParent', galton.column('midparentHeight'),
    'Child', galton.column('childHeight')
)

# Preview the first and last rows (displayed as the cell's last expression).
heights.row(0), heights.row(-1)

In [None]:
# Add the regression estimate of column 1 ('Child') from column 0
# ('MidParent') as a 'Fitted' column.
heights = heights.with_columns('Fitted', fitted_values(heights, 0, 1))
# print() is required here: a bare expression is only displayed when it is
# the last statement of a cell, and this one is followed by the plot.
print(heights.row(0), heights.row(-1))

# Scatter the remaining columns against column 0 ('MidParent').
heights.scatter(0)

In [None]:
def residuals(t, x, y):
    """Residuals of the regression of y on x: actual y minus fitted value."""
    actual = t.column(y)
    estimated = fitted_values(t, x, y)
    return actual - estimated

In [None]:
# Add the residuals (actual 'Child' minus the regression estimate from
# 'MidParent') as a 'Residual' column.
heights = heights.with_columns('Residual', residuals(heights, 'MidParent', 'Child'))
# print() is required here: a bare expression is only displayed when it is
# the last statement of a cell, and this one is followed by the plot.
print(heights.row(0), heights.row(-1))

# Scatter the remaining columns against column 0 ('MidParent').
heights.scatter(0)

In [None]:
def plot_residuals(t, x, y):
    """Draw the regression fit and the residual plot for estimating y from x.

    Two scatter plots are produced: the first shows y and the fitted values
    against x, the second shows the residuals against x.
    """
    fitted = fitted_values(t, x, y)
    resid = residuals(t, x, y)
    with_residuals = t.with_columns('Fitted', fitted, 'Residual', resid)

    # First plot: x is column 0 of the three selected columns.
    fit_view = with_residuals.select(x, y, 'Fitted')
    fit_view.scatter(0)

    # Second plot: residuals only.
    with_residuals.scatter(x, 'Residual')
    
# Fit plot and residual plot for estimating 'Child' from 'MidParent'.
plot_residuals(heights, 'MidParent', 'Child')

Lec 6.2 Regression Diagnostics

Is an association linear? High correlation _and_ no pattern in the residuals

In [None]:
# Read the dugong table and preview its first and last rows.
dugong = Table.read_table('../../data/dugong.csv')
print(dugong.row(0), dugong.row(-1))

# Correlation coefficient r between 'Length' and 'Age'.
print(correlation(dugong, 'Length', 'Age'))

# The residuals show a pattern at the extreme values; a linear association
# is plausible only in the middle of the range.
plot_residuals(dugong, 'Length', 'Age')

In [None]:
# Read the us_women table, preview its first and last rows, and print the
# correlation between its first two columns.
height_vs_average_weight = Table.read_table('../../data/us_women.csv')
print(height_vs_average_weight.row(0), height_vs_average_weight.row(-1))
print(correlation(height_vs_average_weight, 0, 1))

# Even though the correlation is very high, the residual plot shows that the
# association is not linear.
plot_residuals(height_vs_average_weight, 0, 1)

Lec 6.3 Properties of Residuals

mean of residuals is always zero

In [None]:
def plot_fitted(t, x, y):
    """Scatter y and the regression fitted values against x.

    Only columns x and y of t are kept; the fitted values are added under
    the label 'Fitted Value' and everything is plotted against column 0 (x).
    """
    fitted = fitted_values(t, x, y)
    t.select(x, y).with_columns('Fitted Value', fitted).scatter(0)

In [None]:
# 'Child' and its fitted values plotted against 'MidParent'.
plot_fitted(heights, 'MidParent', 'Child')

In [None]:
# Compare r with the spread of the fitted values relative to the spread of
# the observed 'Child' values.
print('Correlation:', correlation(heights, 'MidParent', 'Child'))
# Variance of the fitted values divided by the variance of 'Child' ...
print('Ratio of variances', np.var(fitted_values(heights, 'MidParent', 'Child')) / np.var(heights.column('Child')))
# ... printed next to r squared for comparison.
print('r squared is same of ratio of variances', correlation(heights, 'MidParent', 'Child') ** 2)
# The same comparison on the SD scale: SD of fitted values / SD of 'Child'.
print('i.e. ratio of std is same of r:', np.std(fitted_values(heights, 'MidParent', 'Child')) / np.std(heights.column('Child')))

In [None]:
# 'Age' and its fitted values plotted against 'Length'.
plot_fitted(dugong, 'Length', 'Age')

In [None]:
# Compare r with SD(fitted values) / SD(observed values).
print('Correlation', correlation(dugong, 'Length', 'Age'))
# The value below is a ratio of SDs (np.std), not of variances, so it is
# labelled accordingly.
# NOTE(review): indices 0 and 1 are presumably 'Length' and 'Age' - confirm
# against the column order of dugong.csv.
print('Ratio of SDs', np.std(fitted_values(dugong, 0, 1)) / np.std(dugong.column(1)))

In [None]:
# Read the hybrid table and preview its first and last rows.
hybrid = Table.read_table('../../data/hybrid.csv')
hybrid.row(0), hybrid.row(-1)

In [None]:
# 'mpg' and its fitted values plotted against 'acceleration'.
plot_fitted(hybrid, 'acceleration', 'mpg')

In [None]:
# Correlation coefficient r between 'acceleration' and 'mpg'.
print(correlation(hybrid, 'acceleration', 'mpg'))
# SD of the fitted values divided by the SD of the observed values.
# NOTE(review): indices 3 and 4 are presumably 'acceleration' and 'mpg' -
# confirm against the column order of hybrid.csv.
print(np.std(fitted_values(hybrid, 3, 4)) / np.std(hybrid.column(4)))

The SD of the residuals is always equal to $\sqrt{1 - r^2}$ times the SD of $y$.