In [None]:
# HIDDEN
from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

In [None]:
# Some functions for plotting. You don't have to understand how any
# of the functions in this cell work, since they use things we 
# haven't learned about in Data 8.


def resize_window(lim=3.5):
    """Set both axes of the current plot to the range [-lim, lim]."""
    bounds = (-lim, lim)
    plots.xlim(*bounds)
    plots.ylim(*bounds)
    
def draw_line(slope=0, intercept=0, x=make_array(-4, 4), color='r'):
    """Plot the line y = slope*x + intercept on the current axes.

    x holds the x-coordinates at which the line is evaluated (by default
    the two endpoints -4 and 4); color is forwarded to the plot call.
    """
    heights = x * slope + intercept
    plots.plot(x, heights, color=color)
    
def draw_vertical_line(x_position, color='black'):
    """Draw a vertical segment at x = x_position, from y = -4 to y = 4."""
    plots.plot(
        make_array(x_position, x_position),  # same x for both endpoints
        make_array(-4, 4),
        color=color,
    )
    
def make_correlated_data(r, n=1000):
    """Simulate n (x, y) pairs whose correlation is approximately r.

    x and an independent noise term z are drawn from the standard normal
    distribution using NumPy's global random state, and y is formed as
    r*x + sqrt(1 - r**2)*z.

    Parameters:
        r: target correlation; must lie in [-1, 1].
        n: number of points to generate (default 1000).

    Returns:
        A tuple (x, y) of two arrays of length n.

    Raises:
        ValueError: if r is outside [-1, 1]. Without this check,
            sqrt(1 - r**2) is NaN and every y value silently becomes NaN.
    """
    if not -1 <= r <= 1:
        raise ValueError("r must be between -1 and 1, got {!r}".format(r))
    # Draw x before z: callers that seed the global generator rely on this
    # order to reproduce the same data.
    x = np.random.normal(0, 1, n)
    z = np.random.normal(0, 1, n)
    y = r*x + (np.sqrt(1-r**2))*z
    return x, y

def r_scatter(r):
    """Draw a 5x5 scatter plot of simulated data with correlation about r.

    Both axes are fixed to the range [-4, 4].
    """
    plots.figure(figsize=(5,5))
    points = make_correlated_data(r)
    plots.scatter(*points, color='darkblue', s=20)
    # Apply the same limits to the x axis first, then the y axis.
    for set_limits in (plots.xlim, plots.ylim):
        set_limits(-4, 4)
    
def r_table(r):
    """
    Build a two-column table ('x', 'y') of simulated points whose
    correlation is approximately r.

    NumPy's global random generator is reseeded with 8 on every call, so
    calling this twice with the same r produces the same table.
    """
    np.random.seed(8)
    xs, ys = make_correlated_data(r)
    return Table().with_column('x', xs).with_column('y', ys)

## Prediction lines

In [None]:
# Build the demo table with correlation close to 0.99 and preview 3 rows.
example = r_table(0.99)
example.show(3)

In [None]:
# Scatter y against x, then fix both axes to the default [-3.5, 3.5] window.
example.scatter('x', 'y')
resize_window()

In [None]:
def nn_prediction_example(x_val):
    """ Predicts y-value for x based on the example table """
    neighbors = example.where('x', are.between(x_val - .25, x_val + .25))
    return np.mean(neighbors.column('y'))
    

In [None]:
# Nearest-neighbor prediction at x = -2.25: the mean y of the rows of
# `example` whose x lies within 0.25 of that value.
nn_prediction_example(-2.25)

In [None]:
# Add a 'Predicted y' column by applying the nearest-neighbor predictor to
# every value in the 'x' column.
example = example.with_columns(
    'Predicted y', 
    example.apply(nn_prediction_example, 'x'))

In [None]:
# With only the x label given, scatter plots each remaining column
# (here 'y' and 'Predicted y') against x.
example.scatter('x')
resize_window()

In [None]:
# Same plot, with the line y = x (slope 1, intercept 0) overlaid in blue
# for comparison with the predictions.
example.scatter('x')
draw_line(slope=1, color='dodgerblue')
resize_window()

In [None]:
# Rebind `example` to an uncorrelated table (r = 0). nn_prediction_example
# reads the global `example`, so later predictions are based on this table.
example = r_table(0)
example.scatter('x', 'y')
resize_window()

In [None]:
# Add nearest-neighbor predictions for the r = 0 table.
example = example.with_columns(
    'Predicted y', 
    example.apply(nn_prediction_example, 'x'))

In [None]:
# NOTE(review): this recomputes the same 'Predicted y' values as the
# previous cell; presumably with_column overwrites the existing column of
# that name -- confirm. The flat line y = 0 is drawn for comparison.
example = example.with_column(
    'Predicted y', example.apply(nn_prediction_example, 'x'))
example.scatter('x')
draw_line(slope = 0)
resize_window()

In [None]:
# Moderately correlated example (r = 0.5).
example = r_table(0.5)
example.scatter('x', 'y')
resize_window()

In [None]:
# r_table reseeds the generator with a fixed seed, so this regenerates the
# same r = 0.5 data as before. A vertical marker at x = 1.5 and the line
# y = x are added to the scatter plot.
example = r_table(0.5)
example.scatter('x', 'y')
resize_window()
draw_vertical_line(1.5)
draw_line(slope=1, intercept=0)

In [None]:
# Add nearest-neighbor predictions for the r = 0.5 table and plot them
# together with the line y = x and the vertical marker at x = 1.5.
example = example.with_column('Predicted y', example.apply(nn_prediction_example, 'x'))
example.scatter('x')
draw_line(slope=1)
draw_vertical_line(1.5)
resize_window()

In [None]:
# Compare two candidate lines against the predictions:
# y = x in red and y = 0.5x in blue.
example.scatter('x')
draw_line(slope=1, intercept=0, color='red')
draw_line(slope=0.5, intercept=0, color='dodgerblue')
resize_window()

In [None]:
# Questions?
# Slides

## Linear regression: defining the line

In [None]:
# Helper functions reused from earlier material (they are not defined earlier in this notebook)
def standard_units(x):
    """Convert array x to standard units.

    Each value is expressed as its deviation from the mean of x, divided
    by the standard deviation of x (np.std).
    """
    deviations = x - np.mean(x)
    return deviations / np.std(x)

def correlation(t, x, y):
    """Return the correlation between columns x and y of table t.

    Both columns are converted to standard units; the result is the mean
    of the element-wise products.
    """
    x_su, y_su = (standard_units(t.column(label)) for label in (x, y))
    products = x_su * y_su
    return np.mean(products)


In [None]:
def slope(t, x, y):
    """Return the regression-line slope for predicting column y from column x.

    Computed as SD(y) / SD(x) * r, where r is the correlation of the two
    columns of table t.
    """
    r = correlation(t, x, y)
    sd_y = np.std(t.column(y))
    sd_x = np.std(t.column(x))
    return sd_y / sd_x * r
def intercept(t, x, y):
    """Return the regression-line intercept for predicting column y from column x.

    Computed as mean(y) - slope * mean(x) over the columns of table t.
    """
    line_slope = slope(t, x, y)
    y_mean = np.mean(t.column(y))
    x_mean = np.mean(t.column(x))
    return y_mean - line_slope * x_mean

In [None]:
# Regression slope and intercept for the r = 0.5 example table.
example = r_table(0.5)
# A cell only displays the value of its final expression, so both results
# are returned as one (slope, intercept) tuple; on separate lines the slope
# would be computed and then silently discarded.
slope(example, 'x', 'y'), intercept(example, 'x', 'y')

## Movies Data

In [None]:
# Load the movies data from movies.csv (a relative path, resolved against
# the current working directory) and preview 11 rows.
movies = Table.read_table('movies.csv')
movies.show(11)

In [None]:
# Keep only the two columns used for prediction: budget (x) and domestic gross (y).
cash = movies.select("Budget", "Domestic Gross")
cash

In [None]:
# Scatter the remaining column ('Domestic Gross') against 'Budget'.
cash.scatter('Budget')

In [None]:
def predict_gross_nn(b, width=20):
    """Return a prediction of the domestic gross for a movie
    with a budget of b.

    The prediction is the average domestic gross of the movies in the
    global `cash` table whose budget falls in the window
    are.between(b - width, b + width).

    Parameters:
        b: budget to predict for, in the units of the 'Budget' column
            (presumably millions of dollars -- confirm against the data).
        width: half-width of the budget window, in the same units as b.
            Defaults to 20, the value that was previously hard-coded.

    Returns:
        The mean of 'Domestic Gross' over the matching rows. If no movie
        falls in the window, this is the mean of an empty column (NaN).

    NOTE(review): per the datascience documentation, are.between includes
    the lower bound and excludes the upper bound -- confirm.
    """
    close_points = cash.where('Budget', are.between(b - width, b + width))
    return np.mean(close_points.column("Domestic Gross"))

In [None]:
# Attach a nearest-neighbor prediction for every movie by applying
# predict_gross_nn to each value in the 'Budget' column.
cash_with_predictions = cash.with_column(
    'Prediction', cash.apply(predict_gross_nn, 'Budget')
    )

In [None]:
# Plot actual gross and the nearest-neighbor predictions against budget.
cash_with_predictions.scatter('Budget')

In [None]:
# Nearest-neighbor prediction for a budget of 200 (in 'Budget' column units).
nn_prediction = predict_gross_nn(200)
nn_prediction

In [None]:
# Regression line for predicting domestic gross from budget; the final
# tuple displays both the slope and the intercept.
cash_slope = slope(cash, "Budget", "Domestic Gross")
cash_intercept = intercept(cash, "Budget", "Domestic Gross")
cash_slope, cash_intercept

In [None]:
# Regression-line prediction for the same budget of 200, for comparison
# with the nearest-neighbor prediction.
linear_prediction = cash_slope * 200 + cash_intercept
linear_prediction

In [None]:
# Evaluate the regression line at every movie's budget and plot these
# predictions alongside the actual gross and the nearest-neighbor predictions.
linear_predictions = cash_slope * cash.column("Budget") + cash_intercept
cash_with_predictions.with_column("Linear Prediction", linear_predictions).scatter("Budget")

In [None]:
# Same scatter as the previous cell, with the regression line drawn on top.
cash_with_predictions.with_column("Linear Prediction", linear_predictions).scatter("Budget")

# Draw the regression line in green between x = -100 and x = 500, which is
# wider than the x-limits set below. NOTE(review): "oos" in the original
# comment presumably meant "out of sample" -- confirm.
x_vals = make_array(-100, 500)
linear_predictions_range = cash_slope * x_vals + cash_intercept
plots.plot(x_vals, linear_predictions_range, c="g");
plots.xlim([0, 400]);