# OLS Regression - Simple Train and Test

## Required Libraries

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
import math
import os

## Notebook Settings

In [None]:
# set the working directory; "." refers to the current directory, so this call
# does not move anywhere -- replace "." with a project path if one is needed
os.chdir(".")

# print the working directory to confirm where relative paths will resolve
print(os.getcwd())

# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

## Data Load

Weekly marketing spend (in thousands) by channel and the corresponding product sales (in millions).

In [None]:
# remote CSV holding the advertising data set
advertising_url = "http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv"

# load it, using the first column of the file as the row index
ad_df = pd.read_csv(advertising_url, index_col = [0])

# preview the first rows
ad_df.head()

## EDA

In [None]:
# summary statistics for the columns of the advertising data set
ad_df.describe()

In [None]:
# print a concise summary of the frame, including each column's data type
ad_df.info()

In [None]:
# pairwise relationships between every column of the data set
sns.pairplot(data = ad_df)

In [None]:
# distribution of the target: density-scaled histogram with a KDE overlay.
# histplot replaces distplot, which is deprecated since seaborn 0.11;
# stat="density" and cut=3 keep the look of the old distplot output.
sns.histplot(ad_df["sales"], kde=True, stat="density", kde_kws=dict(cut=3))

In [None]:
# pairwise correlations between the predictors only (the "sales" target is excluded)
feature_corr = ad_df.drop(columns = "sales").corr()

# annotated heatmap of the correlation matrix with square cells
sns.heatmap(feature_corr, annot=True, cmap="YlGnBu", square = True)

## Training - Model 1

In [None]:
from sklearn.model_selection import train_test_split

# predictors: every column other than the target
X = ad_df.drop(columns = "sales")

# response variable
y = ad_df["sales"]

# hold out 33% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# unfitted linear regression estimator for model 1
ols_m1 = linear_model.LinearRegression()

In [None]:
# number of rows in the training set, then in the testing set (one per line)
print(len(X_train), len(X_test), sep="\n")

In [None]:
# estimate the model 1 coefficients from the training data
ols_m1.fit(X = X_train, y = y_train)

## Assumption Checking

Errors should be independently and identically normally distributed with a mean of 0 and a fixed variance.

### Residuals vs. Fitted Values

In [None]:
# residuals-versus-fitted diagnostic plot
def plot_fit(fitted_values, residuals):
    """Scatter residuals against fitted values with a red line at zero.

    Draws through the pyplot interface and returns nothing.

    Parameters
    ----------
    fitted_values : values placed on the x-axis (labelled "Fitted Values").
    residuals : values placed on the y-axis (labelled "Residuals").
    """
    plt.scatter(fitted_values, residuals)
    # horizontal reference line at a residual of zero
    plt.axhline(0, color = "r")
    plt.xlabel("Fitted Values")
    plt.ylabel("Residuals")

In [None]:
# model 1 predictions for the training rows
fitted_values = ols_m1.predict(X_train)

# residuals, computed here as predicted minus actual
residuals = fitted_values - y_train

In [None]:
# residuals-vs-fitted plot for the current fitted values and residuals
plot_fit(fitted_values = fitted_values, residuals = residuals)

### Distribution of Residuals

In [None]:
# plot of residuals
def plot_residuals(residuals):
    """Plot the distribution of the residuals as a histogram with a KDE overlay.

    Parameters
    ----------
    residuals : pandas Series or other 1-D array-like of residual values.

    Returns nothing. The values are wrapped in a new Series named "Residuals"
    for plotting, so the object passed in by the caller keeps its own name.
    """
    # build a separately named Series instead of assigning to residuals.name,
    # which would rename the caller's Series and fail for plain arrays/lists
    labelled = pd.Series(residuals, name = "Residuals")

    # histplot replaces distplot, which is deprecated since seaborn 0.11;
    # stat="density" and cut=3 keep the look of the old distplot output
    sns.histplot(labelled, kde = True, stat = "density", kde_kws = dict(cut = 3))

In [None]:
# distribution plot for the current residuals
plot_residuals(residuals = residuals)

## Training - Model 2

### Remove High Influence Point?

In [None]:
# keep every training row except the one(s) whose residual equals the maximum residual
keep_rows = residuals != residuals.max()

# NOTE: X_train / y_train are overwritten here, so the unfiltered training
# rows are no longer available under these names after this cell runs
X_train = X_train[keep_rows]
y_train = y_train[keep_rows]

In [None]:
# unfitted linear regression estimator for model 2
ols_m2 = linear_model.LinearRegression()

# estimate the model 2 coefficients from the current X_train / y_train
ols_m2.fit(X = X_train, y = y_train)

## Assumption Checking

In [None]:
# model 2 predictions for the training rows
fitted_values = ols_m2.predict(X_train)

# residuals, computed here as predicted minus actual
residuals = fitted_values - y_train

# residuals-vs-fitted plot for model 2
plot_fit(fitted_values = fitted_values, residuals = residuals)

In [None]:
# distribution plot for the current residuals
plot_residuals(residuals = residuals)

## Final Model Interpretation

### Feature Importance?

In [None]:
# print each (feature name, coefficient) pair: all of model 1 first, then model 2
for label, model in (("Fit 1 estimate: ", ols_m1), ("Fit 2 estimate: ", ols_m2)):
    for estimate in zip(X_train.columns, model.coef_):
        print(label, estimate)

### Intercept?

In [None]:
# intercept of model 1, then of model 2
for model in (ols_m1, ols_m2):
    print(model.intercept_)

## Testing

In [None]:
# predicted
# test-set predictions from each fitted model
predicted_m1 = ols_m1.predict(X_test)
predicted_m2 = ols_m2.predict(X_test)

# actual test targets side by side with both sets of predictions
validate = y_test.to_frame(name = 'actual').assign(
    predicted_m1 = predicted_m1,
    predicted_m2 = predicted_m2,
)

# first ten rows of the comparison table
validate.head(10)

## Quality of Fit

In [None]:
# mean squared error on the test set: mean of (actual - predicted)^2
mse_m1 = np.sum((validate['actual'] - validate['predicted_m1'])**2) / len(validate)
mse_m2 = np.sum((validate['actual'] - validate['predicted_m2'])**2) / len(validate)

print("The Model 1 Mean Squared Error is " + str(mse_m1))
print("The Model 2 Mean Squared Error is " + str(mse_m2))

# sklearn equivalent of the manual calculation above:
# mean_squared_error(validate['actual'], validate['predicted_m1'])

# r squared: r2_score takes (y_true, y_pred) and is NOT symmetric in its
# arguments (the total sum of squares is computed from y_true), so the
# actual values must be passed first
r2_m1 = r2_score(validate['actual'], validate['predicted_m1'])
r2_m2 = r2_score(validate['actual'], validate['predicted_m2'])

print("The Model 1 R-Squared is " + str(r2_m1))
print("The Model 2 R-Squared is " + str(r2_m2))

## Next Steps

Can you spot any ways to improve the model? Is the OLS linear model appropriate here? Why do you think the R-squared value actually gets worse in model 2? Hint: Check the assumptions!!!
What about p-values???

## References

Data sourced from *An Introduction to Statistical Learning with Applications in R* by Gareth James, Daniela Witten, Trevor Hastie and Robert Tibshirani.