In [None]:
from datascience import * # from this library, import all functions
from math import *

import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

import numpy as np

In [None]:
import seaborn as sns

# Seaborn conveniently has a tool to read in starter datasets;
# here we load the one named "mpg" (a pandas dataframe, per the conversion below)
cars_df = sns.load_dataset("mpg")
cars = Table.from_df(cars_df) # convert from pandas dataframe to datascience table
# (more on pandas for the sklearn workshop next week)
cars.show(5) # preview: show is asked for 5 rows rather than the whole table

I have the suspicion that the heavier your car is, the longer it takes to get to full speed. In other words, I think that as car weight increases, acceleration decreases. If it does, then by how much?

Let's do some linear modeling to figure this out.

## Step 0: Exploratory Data Analysis

In [None]:
# Step 0: always graph your data!
# Exercise stub: replace the ... placeholder with the plot arguments
# (presumably x = "weight", y = "acceleration", data = cars, as in the later cells -- confirm)

sns.scatterplot(...)

In [None]:
# Let's quantify the linearity of this dataset, but notice the scale of the axes!

x = cars.column("weight")        # values of the "weight" column (the predictor in the models below)
y = cars.column("acceleration")  # values of the "acceleration" column (what the models below predict)

def standard_units(arr):
    """Convert `arr` to standard units.

    Exercise stub: replace the ... placeholder with the conversion
    (presumably (arr - mean of arr) / SD of arr -- confirm against the course definition).
    Until it is filled in, this returns the Ellipsis object.
    """
    return ...

weight_su = standard_units(x)
accel_su = standard_units(y)

In [None]:
# Plot the two columns in standard units. Standardizing is a linear
# transformation, so the relative distance of points stays the same.
sns.scatterplot(x=weight_su, y=accel_su)
plt.gca().set(
    xlabel="Weight (standard units)",
    ylabel="Acceleration (standard units)",
);

In [None]:
# Now, let's get an actual number. Calculate r using the equation
# (exercise stub: replace ... ; presumably the mean of the products of
#  weight_su and accel_su -- confirm against the course definition)
r = ...
r

In [None]:
# Shortcut: let's review using scipy.stats
from scipy.stats import pearsonr # notice the syntax: imports a single name from a submodule

# Exercise stub: replace ... with a pearsonr call
# (presumably on x and y, to check the r computed by hand -- confirm)
...

That looks good to me. Now, let's actually build the model!

## Step 1: Choose a model

In this case, let's focus on 2 models: 
1. The constant model (guess the same value for every value of x): $$ \hat{y} = \theta $$
1. A bivariate linear regression model where m = slope and b = intercept: $$ \hat{y} = mx + b $$

In [None]:
# The constant model: guess the same value, theta, for every value of x
# (exercise stub: replace ... ; a later cell describes this value as "the mean for theta")
theta = ...

sns.scatterplot(x = "weight", y = "acceleration", data = cars);
plt.axhline(theta, color = "gold"); # horizontal line at y = theta, i.e. the constant model's guess

In [None]:
# A linear model: seaborn calculates and draws the line automagically here,
# but we'll do this mathematically on our own further down

sns.regplot(
    data=cars,
    x="weight",
    y="acceleration",
    ci=None,
    line_kws=dict(color="gold"),
)
plt.gca().set(xlabel="weight", ylabel="acceleration");

So which model is "better"? How do we quantify and decide that? 

## Step 2: Choosing a loss function

A loss function is a way we can **quantify how bad a prediction is for a single observation.** If our prediction is close to the true value, loss should be low. If our prediction is far away, then we want a high loss value.

In [None]:
# L1 loss: mean absolute error
def mae(obs, pred):
    """Mean absolute error between observed values `obs` and predictions `pred`.

    Exercise stub: the body is a ... placeholder, so this currently returns None.
    """
    ...


# L2 loss: mean squared error
def mse(obs, pred):
    """Mean squared error between observed values `obs` and predictions `pred`.

    Exercise stub: the body is a ... placeholder, so this currently returns None.
    Callers in this notebook pass `pred` both as a sequence with one prediction
    per observation and as a single number, so the implementation should accept both.
    """
    ...

## Step 3: Fit our model by minimizing the objective (L2 loss) function 

For the purposes of our class, we will focus on MSE. Now that we have some models and loss functions, let's work on finding the **inputs** that will minimize our L2 loss.

The inputs in this case are as follows:
- Constant model = theta
- Linear model = slope and intercept

In [None]:
# Before we calculate our model's loss, choose a theta to play with
your_theta = ... # exercise stub: replace ... with a constant of your choice
your_predictions = [your_theta] * cars.num_rows # the same guess repeated once per row of cars
print("You choosing " + str(your_theta)+ " for theta gives an MSE of", mse(cars.column("acceleration"), your_predictions))

In [None]:
# Constant model: an array holding theta once for every row of cars
constant_predictions = np.full(cars.num_rows, theta)
print(
    "Using the mean for theta gives an MSE of",
    mse(cars.column("acceleration"), constant_predictions),
)

In [None]:
# Sweep a bunch of candidate constants and graph the MSE each one produces.
# Which candidate is the best?

error_thetas = np.linspace(0, 30, num=50) # 50 evenly spaced guesses for theta from 0-30

# One MSE per candidate; the loop variable has its own name so it is not
# mistaken for the notebook-level `theta`
constant_loss = [
    mse(cars.column("acceleration"), candidate_theta)
    for candidate_theta in error_thetas
]

plt.scatter(x=error_thetas, y=constant_loss);
plt.xlabel("Theta");
plt.ylabel("MSE");

#plt.axvline(theta, color = "red"); # something interesting

The property above (the mean of the y-values is the constant that minimizes MSE) is why we use **r** to inform our linear regression model; we'll use the mean of the y-values for each particular x-value to predict! 

So, after some algebraic rearrangements:
- y = mx+b

In standard units, m = r and b = 0
- y_su = r * x_su + 0

In original units, m = r * (SDy/SDx) and b = mean_y - slope * mean_x
- y_pred = (r * SDy/SDx) * x + mean_y - (r * SDy/SDx) * mean_x

Let's code that in to manually build the line.

In [None]:
# Using original units, calculate the descriptive stats
r # not displayed: only the last expression of a cell is shown

# Exercise stubs: replace each ... (presumably the mean and SD of the weight column -- confirm)
weight_mean = ...
weight_std = ...

# Exercise stubs: replace each ... (presumably the mean and SD of the acceleration column -- confirm)
accel_mean = ...
accel_std = ...

In [None]:
# Exercise stub: replace ... with the slope in original units, m = r * (SDy/SDx)
slope = ...
slope

In [None]:
# Exercise stub: replace ... with the intercept in original units, b = mean_y - slope * mean_x
intercept = ...
intercept

In [None]:
# Exercise stub: replace ... with the line's predictions
# (presumably slope * weight + intercept, one prediction per car -- confirm)
lsrl_predictions = ...
#lsrl_predictions

In [None]:
# Graphing your predictions onto the data!

# First the observed (weight, acceleration) points, then the predictions
# drawn in gold at the same weight values
sns.scatterplot(x = "weight", y = "acceleration", data = cars);
plt.scatter(cars.column("weight"), lsrl_predictions, color = "gold");

In [None]:
# What's your mean squared error for this line?
def predict_mse(m, b):
    """MSE of the line y = m*x + b on the cars dataset (exercise stub).

    m: slope of the line.
    b: intercept of the line.
    The body is a ... placeholder, so this currently returns None.
    Presumably x is weight and y is acceleration, as in the rest of the notebook -- confirm.
    """
    # Using y = mx + b and the cars dataset
    ...

# How does it compare to the constant mse using the mean of y? (7.585)
predict_mse(slope, intercept) 

### Another approach: numerical optimization

Using a numerical optimization approach, we can get the same results.

In [None]:
# If we wanted to find the x that gives the minimum value in a parabola, we can use optimization
def a_parabola(x):
    """Evaluate the parabola (x - 2)^2 + 3 at `x` (a number or a numpy array)."""
    shifted = x - 2
    squared = shifted ** 2
    return squared + 3

# 50 evenly spaced inputs from -1 to 5, plotted against the parabola's outputs
parabola_x = np.linspace(start=-1, stop=5, num=50)
plt.scatter(x=parabola_x, y=a_parabola(parabola_x));

In [None]:
# What is the input that will give the smallest output?
# Exercise stub: replace ... with the function to minimize (presumably a_parabola -- confirm).
# NOTE(review): minimize is not defined in this notebook; it presumably comes from one of
# the star imports at the top -- confirm.
minimize(...)

In [None]:
# We can do the same thing for our LSRL using our data and the loss function
# (exercise stub: replace ... ; presumably a minimize call on predict_mse, giving
#  the slope and intercept with the smallest MSE -- confirm)
min_slope, min_intercept = ...

# Print each hand-calculated value next to its numerically optimized counterpart.
# Both optimized values carry the same "minimize" label so the pairs read consistently.
print("Mathematically calculated slope = ", slope)
print("minimize slope = ", min_slope)
print("Mathematically calculated intercept = ", intercept)
print("minimize intercept = ", min_intercept)

## Miscellanea: visual diagnostics and non-linear data

In [None]:
# Now that we have the "best" line, how do we confirm that a line is the best tool?
# A residual plot (x on the x-axis, residuals on the y-axis). We are looking for:
#   - no pattern (a random cloud)
#   - points equally above and below y = 0
# We don't want to overpredict (negative residual) or underpredict (positive residual)

# Exercise stub: replace ... (the sign convention above implies observed - predicted)
residuals = ...

plt.scatter(cars.column("weight"), residuals);
plt.axhline(0, color = "red"); # reference line at residual = 0
plt.xlabel("Weight");
plt.ylabel("Residual");

In [None]:
# Example of a data transformation
# We can use math to make non-linear data linear!

# horsepower vs. mpg in original units. The trailing semicolon suppresses the
# text repr of the returned object, matching the notebook's other plotting cells.
sns.scatterplot(x = "horsepower", y = "mpg", data = cars);

In [None]:
# Exercise stub: replace ... with the transformed horsepower values
# (the x-axis label below indicates 1 / horsepower)
transformed_x = ...
plt.scatter(transformed_x, cars.column("mpg"));
plt.xlabel("1 / horsepower");
plt.ylabel("mpg");

In [None]:
# Calculating and adding a line: using sklearn

# Add the transformed predictor as a new column, then drop rows with missing values
cars_df["1/hp"] = 1 / cars_df["horsepower"]
cars_df = cars_df.dropna() # NOTE: rebinds cars_df; from here on it may have fewer rows than the `cars` table

from sklearn.linear_model import LinearRegression
model = LinearRegression() # create the model
model.fit(cars_df[["1/hp"]], cars_df["mpg"]) # fit the model: features are the one-column frame [["1/hp"]], target is mpg

In [None]:
# Predictions for the same rows the model was fit on
sklearn_predictions = model.predict(cars_df[["1/hp"]])
# NOTE(review): per the scikit-learn docs, coef_ is an array with one entry per feature,
# so this is presumably a length-1 array rather than a plain number -- confirm
sklearn_slope = model.coef_ 
sklearn_intercept = model.intercept_
sklearn_slope, sklearn_intercept

In [None]:
# Transformed data first, then the sklearn predictions overlaid on the same axes
plt.scatter(transformed_x, cars.column("mpg"));
plt.scatter(cars_df["1/hp"], sklearn_predictions);
plt.xlabel("1 / horsepower");
plt.ylabel("mpg");

# Don't forget to convert when doing predictions! The model was fit on the "1/hp" column,
# so a new horsepower value must be turned into 1 / horsepower before calling model.predict