In [None]:
from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

In [None]:
def standard_units(arr):
    """Convert an array of numbers to standard units.

    Each value is re-expressed as its deviation from the array's average,
    measured in multiples of the array's SD (as computed by ``np.std``).
    """
    deviations = arr - np.average(arr)
    spread = np.std(arr)
    return deviations / spread

def correlation(t, x, y):
    """Return the correlation coefficient r between columns x and y of table t.

    t - table
    x - label of x column
    y - label of y column

    r is the average of the products of the two columns once each has been
    converted to standard units.
    """
    x_su, y_su = (standard_units(t.column(label)) for label in (x, y))
    return np.average(x_su * y_su)

def slope(t, x, y):
    """Return the slope of the regression line for predicting y from x.

    t - table
    x - label of x column
    y - label of y column

    Computed as r * (SD of y) / (SD of x), where r is correlation(t, x, y).
    """
    def sd_of(label):
        return np.std(t.column(label))

    # Evaluated left to right as (r * SD_y) / SD_x.
    return correlation(t, x, y) * sd_of(y) / sd_of(x)

def intercept(t, x, y):
    """Return the intercept of the regression line for predicting y from x.

    t - table
    x - label of x column
    y - label of y column

    Computed as (mean of y) - slope * (mean of x).
    """
    mean_y = np.mean(t.column(y))
    mean_x = np.mean(t.column(x))
    return mean_y - slope(t, x, y) * mean_x

def fitted_values(t, x, y):
    """Return an array of the regression estimates at all the x values.

    t - table
    x - label of x column
    y - label of y column

    Each estimate is slope * x + intercept, using the regression line
    computed from the same table.
    """
    x_values = t.column(x)
    return slope(t, x, y) * x_values + intercept(t, x, y)

In [None]:
# Load the movie review data; later cells rely on its "RottenTomatoes" and
# "IMDB" columns. The path is relative to the notebook's working directory.
movie_reviews = Table.read_table("movie_reviews.csv")
# Preview only the first 5 rows.
movie_reviews.show(5)

## Least Squares

In [None]:
# Slope of the regression line for predicting IMDB from RottenTomatoes
slope(movie_reviews, "RottenTomatoes", "IMDB")

In [None]:
# Intercept of the regression line for predicting IMDB from RottenTomatoes
intercept(movie_reviews, "RottenTomatoes", "IMDB")

In [None]:
def find_rmse(rmse_slope, rmse_intercept):
    """Return the root mean squared error of a candidate line.

    rmse_slope - slope of the candidate line
    rmse_intercept - intercept of the candidate line

    The line predicts the "IMDB" column of the global movie_reviews table
    from its "RottenTomatoes" column; the result is the square root of the
    average squared difference between the observed and predicted values.
    """
    rotten_tomatoes = movie_reviews.column("RottenTomatoes")
    imdb = movie_reviews.column("IMDB")
    errors = imdb - (rmse_slope * rotten_tomatoes + rmse_intercept)
    return np.mean(errors ** 2) ** 0.5

In [None]:
# RMSE of a guessed line with slope 0.4 and intercept 6
find_rmse(0.4, 6)

In [None]:
# RMSE of a guessed line with slope 100 and intercept 0.66
find_rmse(100, 0.66)

In [None]:
# Slope 0 and intercept 0 predict 0 for every movie, so this is the
# root mean square of the IMDB values themselves.
find_rmse(0, 0)

In [None]:
def parabola(x):
    """Return the square of x; the smallest possible value, 0, occurs at x = 0."""
    squared = x ** 2
    return squared

# Numerically search for the x that makes parabola(x) smallest; the true
# minimizer is x = 0. `minimize` is presumably the one exported by the
# datascience star import -- confirm.
minimize(parabola)

In [None]:
def parabola(x, y):
    """Return (x + 6)^2 + (y - 14)^2; the smallest value, 0, occurs at x = -6, y = 14.

    Note: this two-argument version replaces the one-argument parabola
    defined in an earlier cell.
    """
    x_term = (x + 6) ** 2
    y_term = (y - 14) ** 2
    return x_term + y_term

# With a two-argument function the search is over both arguments; the true
# minimizer here is x = -6, y = 14.
minimize(parabola)

In [None]:
# Search for the (slope, intercept) pair with the smallest RMSE; compare the
# result with the slope and intercept computed from the regression formulas.
minimize(find_rmse)

In [None]:
# Minimizing arguments of find_rmse, in the order (slope, intercept)
least_squares_line = minimize(find_rmse)
# RMSE achieved by that line
find_rmse(least_squares_line.item(0), least_squares_line.item(1))

## Residuals

In [None]:
def residuals(t, x, y):
    """Return an array of residuals: observed y minus the regression estimate.

    t - table
    x - label of x column
    y - label of y column
    """
    observed = t.column(y)
    return observed - fitted_values(t, x, y)

In [None]:
# One residual per movie: observed IMDB value minus the regression estimate
residuals(movie_reviews, "RottenTomatoes", "IMDB")

In [None]:
# Rebind movie_reviews to a table that also carries the regression estimate
# and the residual for each row; cells below this one see the extra columns.
movie_reviews = movie_reviews.with_columns(
    'Fitted Value', fitted_values(movie_reviews, 'RottenTomatoes', 'IMDB'),
    'Residual', residuals(movie_reviews, 'RottenTomatoes', 'IMDB')
)
movie_reviews

In [None]:
def plot_residuals(t, x, y):
    """Draw two scatter plots for the regression of y on x.

    t - table
    x - label of x column
    y - label of y column

    The first plot shows y and the fitted values against x; the second
    shows the residuals against x.
    """
    fitted = fitted_values(t, x, y)
    errors = residuals(t, x, y)
    augmented = t.with_columns('Fitted', fitted, 'Residual', errors)

    # Column 0 of the selection is x, so y and 'Fitted' are both drawn against it.
    with_fit = augmented.select(x, y, 'Fitted')
    with_fit.scatter(0)

    augmented.scatter(x, 'Residual')

# Regression plot and residual plot for predicting IMDB from RottenTomatoes
plot_residuals(movie_reviews, "RottenTomatoes", "IMDB")

No matter what the shape of the scatter plot, the SD of the residuals is a fraction of the SD of the observed values of $y$. The fraction is $\sqrt{1-r^2}$, where $r$ is the correlation between $x$ and $y$.

$$
\mbox{SD of residuals} ~=~ \sqrt{1 - r^2} \cdot \mbox{SD of }y
$$

In [None]:
# SD of the residuals obtained from the formula sqrt(1 - r^2) * SD of y
r = correlation(movie_reviews, "RottenTomatoes", "IMDB")
SD_y = np.std(movie_reviews.column("IMDB"))

SD_residuals = (1 - r**2) ** 0.5 * SD_y
SD_residuals

In [None]:
# SD of the residuals computed directly; should agree with the formula-based value
np.std(residuals(movie_reviews, "RottenTomatoes", "IMDB"))

## Regression Inference

In [None]:
def prediction_at(t, x, y, x_value):
    """Return the regression estimate of y at a given x value.

    t - table
    x - label of x column
    y - label of y column
    x_value - the x value for which we want to predict y

    The estimate is the height of the regression line, fitted to table t,
    at x_value.
    """
    line_slope = slope(t, x, y)
    line_intercept = intercept(t, x, y)
    return line_slope * x_value + line_intercept

# Predicted IMDB value for a movie whose RottenTomatoes value is 70
prediction_at_70 = prediction_at(movie_reviews, "RottenTomatoes", "IMDB", 70)
prediction_at_70

In [None]:
# Scatter plot with the fitted line, plus a vertical gold segment at x = 70
# running from y = 4 up to the prediction at 70.
# NOTE(review): 4 is presumably chosen to sit at the bottom of the plotted
# IMDB range -- confirm against the data.
x = 70
movie_reviews.scatter('RottenTomatoes', 'IMDB', fit_line=True)
plots.plot([x, x], [4, prediction_at_70], color='gold', lw=2);

In [None]:
# Each resample of the table gives a slightly different regression line and
# therefore a slightly different prediction at x = 70. Show four of them.
for _ in np.arange(4):
    resample = movie_reviews.sample()
    predicted_y = prediction_at(resample, "RottenTomatoes", "IMDB", 70)
    print('Predicted y from bootstrap sample was', predicted_y)

In [None]:
def bootstrap_prediction(t, x, y, new_x, repetitions=1000):
    """Bootstrap the regression prediction at new_x and display the results.

    t - table
    x - label of x column
    y - label of y column
    new_x - the x value at which to predict y
    repetitions - number of bootstrap resamples of t (default 1000)

    Draws a histogram of the bootstrapped predictions, marks the middle 95%
    of them with a yellow bar, and prints the endpoints and width of that
    interval. Returns nothing.
    """
    # Bootstrap the scatter, predict, collect.
    # Building the array once avoids re-copying it on every iteration, which
    # is what repeated np.append calls do.
    predictions = np.array([
        prediction_at(t.sample(), x, y, new_x)
        for _ in np.arange(repetitions)
    ])

    # Find the ends of the approximate 95% confidence interval
    # for the height of the true line at new_x
    left = percentile(2.5, predictions)
    right = percentile(97.5, predictions)

    # Display results
    Table().with_column('Prediction', predictions).hist(bins=20)
    plots.xlabel('predictions at x='+str(new_x))
    plots.plot([left, right], [0, 0], color='yellow', lw=8);
    print('Approximate 95%-confidence interval for height of true line:')
    print(left, right, '(width =', right - left, ')')
    
# Bootstrap the prediction of IMDB at RottenTomatoes = 70 (default 1000 resamples)
bootstrap_prediction(movie_reviews, "RottenTomatoes", "IMDB", 70)

In [None]:
def bootstrap_slope(t, x, y, repetitions=5000):
    """Bootstrap the slope of the regression line and display the results.

    t - table
    x - label of x column
    y - label of y column
    repetitions - number of bootstrap resamples of t (default 5000)

    Draws a histogram of the bootstrapped slopes, marks the middle 95% of
    them with a yellow bar, and prints the slope from the original sample
    together with the interval endpoints. Returns nothing.
    """
    # Bootstrap the scatter, find the slope, collect.
    # The array is built in one step (no per-iteration np.append copies), and
    # no local variable reuses this function's own name.
    slopes = np.array([
        slope(t.sample(), x, y)
        for _ in np.arange(repetitions)
    ])

    # Find the endpoints of the 95% confidence interval for the true slope
    left = percentile(2.5, slopes)
    right = percentile(97.5, slopes)

    # Slope of the regression line from the original sample
    observed_slope = slope(t, x, y)

    # Display results
    Table().with_column('Bootstrap Slopes', slopes).hist(bins=20)
    plots.plot(make_array(left, right), make_array(0, 0), color='yellow', lw=8);
    print('Slope of regression line:', observed_slope)
    print('Approximate 95%-confidence interval for the slope of the true line:')
    print(left, 'to', right)

# Bootstrap the slope for predicting IMDB from RottenTomatoes (default 5000 resamples)
bootstrap_slope(movie_reviews, "RottenTomatoes", "IMDB")