In [None]:
# Necessary imports
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
import scipy.stats as stats

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pickle

# get rid of warnings
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
%pylab inline

In [None]:
# Load the data file (with new features already engineered).
# NOTE: pd.read_pickle unpickles arbitrary Python objects, so this file should
# only ever come from a trusted source.
df = pd.read_pickle('data/w_eng_features.pkl')

In [None]:
# Preview the first 50 rows of the loaded frame.
df.head(50)

#### Setting up for modeling:

In [None]:
# Full candidate feature set: all fourteen predictor columns.
X = df.loc[
    :,
    [
        'air_quality',
        'rel_humidity',
        'avg_num_clear_days',
        'pct_area_water',
        'rainfall_inches',
        'temp_f',
        'highest_point_ft',
        'mean_elevation_ft',
        'HM__x__RF',
        'TM__x__RF',
        'HP__x__ME',
        'pct_area_water_bc',
        'mean_elevation_bc',
        'hp_bc',
    ],
]

# Response variable.
y = df['happiness_avg']

# NOTE(review): an earlier comment here mentioned creating an "overall quality
# squared term"; no such term is created in this cell.

In [None]:
# Reduced five-column feature set (author's note: this is a good mixture).
X = df.loc[
    :,
    [
        'mean_elevation_bc',
        'air_quality',
        'pct_area_water_bc',
        'HM__x__RF',
        'temp_f',
    ],
]

# Response variable.
y = df['happiness_avg']

In [None]:
# Four-column feature set; this is the last assignment to X before the
# train/test split.
X = df.loc[
    :,
    [
        'mean_elevation_bc',
        'air_quality',
        'pct_area_water_bc',
        'HM__x__RF',
    ],
]

# Response variable.
y = df['happiness_avg']

In [None]:
## Split the data 80 - 20 train_val/test

# The seed is fixed so that the split -- and every coefficient, metric and
# plot derived from it -- is identical after Restart Kernel -> Run All.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=43
)
# X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=.25, random_state=43)
X_train_val.shape

In [None]:
## Scale the data
# The scaler is fit on the train/validation predictors only, so the held-out
# test rows do not influence the centering/scaling statistics.
std = StandardScaler()
std.fit(X_train_val.values)

In [None]:
## Scale the Predictors on both the train and test set
# Both sets are transformed with the scaler fitted on the train/validation data.
X_tr = std.transform(X_train_val.values)
X_te = std.transform(X_test.values)

In [None]:
#Mean Absolute Error (MAE)
def mae(y_true, y_pred):
    """Return the mean absolute error between targets and predictions.

    Parameters
    ----------
    y_true, y_pred : array-like
        Equal-length sequences of numbers (lists, NumPy arrays, pandas
        Series, ...). They are compared position by position.

    Returns
    -------
    float
        Mean of ``|y_pred - y_true|``.
    """
    # Coerce to plain arrays so lists are accepted and pandas objects are not
    # silently re-aligned on their index labels before subtracting.
    errors = np.asarray(y_pred, dtype=float) - np.asarray(y_true, dtype=float)
    return np.mean(np.abs(errors))

In [None]:
#Mean Squared Error (MSE)
def mse(y_true, y_pred):
    """Return the mean squared error between targets and predictions.

    Parameters
    ----------
    y_true, y_pred : array-like
        Equal-length sequences of numbers (lists, NumPy arrays, pandas
        Series, ...). They are compared position by position.

    Returns
    -------
    float
        Mean of ``(y_pred - y_true) ** 2``.
    """
    # Coerce to plain arrays so lists are accepted and pandas objects are not
    # silently re-aligned on their index labels before subtracting.
    errors = np.asarray(y_pred, dtype=float) - np.asarray(y_true, dtype=float)
    return np.mean(np.square(errors))

In [None]:
#Adjusted R-squared
def get_adj_r2(r2, n, p):
    """Return R-squared adjusted for the number of predictors.

    r2 is the plain coefficient of determination, n the number of
    observations it was computed on, and p the number of predictors.
    """
    unexplained = 1 - r2
    total_dof = n - 1
    residual_dof = n - 1 - p
    return 1 - unexplained * total_dof / residual_dof

In [None]:
def make_resid_plot(actual, prediction):
    """Draw a normal Q-Q plot of the residuals ``actual - prediction``.

    A new 10x6 figure is opened under seaborn's 'white' axes style and the
    probability plot is drawn onto it through the pyplot interface.
    Nothing is returned.
    """
    residuals = actual - prediction
    with sns.axes_style('white'):
        # The new figure becomes pyplot's current figure, which is what
        # probplot draws on when handed the plt module.
        plt.subplots(figsize=(10, 6))
        stats.probplot(residuals, dist="norm", plot=plt)
        plt.title("Normal Q-Q plot")

## Run a LassoCV Model

In [None]:
# Run the cross validation, find the best alpha, refit the model on all the
# train/validation data with that alpha.

# Candidate penalties: 200 log-spaced values from 1e-2 to 1e2.
alphavec_l = np.logspace(-2,2,200)

# 5-fold cross-validation over the candidate alphas, fit on the scaled predictors.
lasso_model = LassoCV(alphas = alphavec_l, cv=5)
lasso_model.fit(X_tr, y_train_val)

In [None]:
# This is the best LASSO alpha value found by the cross validation
lasso_model.alpha_

In [None]:
# Pair each feature name with its LASSO coefficient (the model was fit on the
# scaled predictors X_tr).
list(zip(X_train_val.columns, lasso_model.coef_))

In [None]:
# Make predictions with the LASSO model on the test set and, for the
# in-sample comparison, on the train/validation set
test_set_pred_lasso = lasso_model.predict(X_te)
train_val_set_pred_lasso = lasso_model.predict(X_tr)

In [None]:
# Shape of the scaled test matrix. The array's own attribute is used because a
# bare `shape` function only exists via the `%pylab inline` star import.
X_te.shape

In [None]:
# Compute the test-set MSE for the LASSO model and report its square root (RMSE)
test_mse = mse(y_test, test_set_pred_lasso)
print('test_rmse: ', np.sqrt(test_mse))

In [None]:
# Find r2 score for LASSO on the held-out test set
r2_lasso = r2_score(y_test, test_set_pred_lasso)
r2_lasso

In [None]:
# In-sample r2 score for LASSO (computed on the train/validation data it was fit on)
r2_lasso_in_sample = r2_score(y_train_val, train_val_set_pred_lasso)
r2_lasso_in_sample

In [None]:
# Find adjusted r2 score for LASSO.
# r2_lasso was computed on the test set, so the observation and predictor
# counts come from the test split rather than from the full data set X.
adj_r2_lasso = get_adj_r2(r2_lasso, X_test.shape[0], X_test.shape[1])
adj_r2_lasso

In [None]:
# LASSO: predicted vs. true happiness. True values go on the x-axis for BOTH
# point sets so they share one coordinate system with the y = x reference line.
plt.scatter(y_test, test_set_pred_lasso, alpha = 0.1)
plt.scatter(y_train_val, train_val_set_pred_lasso, alpha = 0.5)
plt.xlabel('True Happiness')
plt.ylabel('Predicted Happiness')
plt.plot(np.linspace(50,65,1000), np.linspace(50,65,1000))

In [None]:
# Global seaborn styling for the figures drawn below.
sns.set(font='sans-serif', font_scale=1.25, palette="bright")

In [None]:
#Make a pretty seaborn plot of the LASSO fit on the train/validation data.
# The Ridge predictions are not computed until the RidgeCV section further
# down, so this cell plots LASSO only and runs on a fresh kernel; the
# Lasso-vs-Ridge comparison figure is drawn and saved near the end of the
# notebook, once both models exist.
fig, ax = plt.subplots(figsize=(10,6))
# sns.scatterplot(x=y_test, y=test_set_pred_lasso)
sns.scatterplot(x=y_train_val, y=train_val_set_pred_lasso, marker = 'x',
                label="Lasso Prediction", ax=ax)
# x/y are passed by keyword: recent seaborn releases reject positional vectors.
sns.lineplot(x=np.linspace(50,67,1000), y=np.linspace(50,67,1000), color='green',
             label="Perfect Prediction", ax=ax)

# Each artist carries its own label, so the legend does not depend on the
# order in which matplotlib enumerates lines and point collections.
ax.legend(loc = 4)
ax.set(xlabel='True Happiness', ylabel='Predicted Happiness', title='Lasso Regression')
plt.show()

In [None]:
# make_resid_plot(y_train_val, train_val_set_pred_lasso)

## Run a RidgeCV Model

In [None]:
# Run the cross validation, find the best alpha, refit the model on all the
# train/validation data with that alpha.

# Earlier, narrower grid kept for reference:
# alphavec_r = 10**np.linspace(-2,0,400)
# Candidate penalties: 200 log-spaced values from 1e-2 to 1e2.
alphavec_r = np.logspace(-2,2,200)
# alpha = 57

# 4-fold cross-validation over the candidate alphas, fit on the scaled predictors.
ridge_model = RidgeCV(alphas=alphavec_r, cv=4)
ridge_model.fit(X_tr, y_train_val)

In [None]:
# This is the best RIDGE alpha value found by the cross validation
ridge_model.alpha_

In [None]:
# These are the (standardized) coefficients found when it refit using that
# best alpha, paired with their feature names
list(zip(X_train_val.columns, ridge_model.coef_))

In [None]:
# Make predictions with the RIDGE model on the test set and, for the
# in-sample comparison, on the train/validation set
test_set_pred_ridge = ridge_model.predict(X_te)
train_val_set_pred_ridge = ridge_model.predict(X_tr)

In [None]:
# Find the MSE on the test and train/validation sets using this RIDGE model
# and report the square roots (RMSE)
test_mse = mse(y_test, test_set_pred_ridge)
train_val_mse = mse(y_train_val, train_val_set_pred_ridge)
print('test_rmse: ', np.sqrt(test_mse))
print('train_val_rmse: ', np.sqrt(train_val_mse))
# This value is the sum of the two RMSEs above, so it is labelled as an RMSE
# total rather than as an MSE.
print('total_rmse: ', np.sqrt(test_mse)+np.sqrt(train_val_mse))

In [None]:
# Find r2 score for RIDGE on the held-out test set
r2_ridge = r2_score(y_test, test_set_pred_ridge)
r2_ridge

In [None]:
# In-sample r2 score for RIDGE, computed on the train/validation data it was
# fit on (author's note: "THIS IS THE MONEY").
r2_ridge_in_sample = r2_score(y_train_val, train_val_set_pred_ridge)
r2_ridge_in_sample

In [None]:
# Find adjusted r2 score for RIDGE.
# r2_ridge was computed on the test set, so the observation and predictor
# counts come from the test split rather than from the full data set X.
adj_r2_ridge = get_adj_r2(r2_ridge, X_test.shape[0], X_test.shape[1])
adj_r2_ridge

In [None]:
# RIDGE: predicted vs. true happiness for the test and train/validation
# points, drawn over the y = x reference line.
identity_line = np.linspace(50, 65, 1000)
plt.scatter(y_test, test_set_pred_ridge, alpha=0.1)
plt.scatter(y_train_val, train_val_set_pred_ridge, alpha=0.5)
plt.plot(identity_line, identity_line)

In [None]:
# RIDGE: predicted vs. true happiness for test and train/validation points.
fig, ax = plt.subplots(figsize=(10,6))
sns.scatterplot(x=y_test, y=test_set_pred_ridge, label="Test Points", ax=ax)
sns.scatterplot(x=y_train_val, y=train_val_set_pred_ridge,
                label="Train/Validation Points", ax=ax)
# x/y are passed by keyword: recent seaborn releases reject positional vectors.
sns.lineplot(x=np.linspace(50,65,1000), y=np.linspace(50,65,1000),
             label="Perfect Prediction", ax=ax)

# Each artist carries its own label, so the legend does not depend on the
# order in which matplotlib enumerates lines and point collections.
ax.legend(loc = 4)
ax.set(xlabel='True Happiness', ylabel='Predicted Happiness', title='Ridge Regression')
plt.show()

In [None]:
#Pickle the model
# pickle.dump writes to an open binary file object, not to a path string, so
# the file is opened in 'wb' mode; the with-block closes it afterwards.
with open('data/model_1', 'wb') as model_file:
    pickle.dump(ridge_model, model_file)

In [None]:
# make_resid_plot(y_train_val, train_val_set_pred_ridge)

# RidgeCV is being weird. Let's try GridSearch instead

In [None]:
# Grid Search for Algorithm Tuning
#
# Reuses the alpha candidates from the RidgeCV cell (alphavec_r): a Ridge
# model is fit for every alpha and the best one is picked by cross validation.
model = Ridge()
grid = GridSearchCV(estimator=model, param_grid={'alpha': alphavec_r})
grid.fit(X_tr, y_train_val)
print(grid)
# Summary of the search: best cross-validated score, then the alpha of the
# best estimator, one per line.
print(grid.best_score_, grid.best_estimator_.alpha, sep='\n')

# OLS in scikit-learn

In [None]:
# Fit an ordinary least squares model on the scaled train/validation data
# (no alpha and no cross validation are involved here).
ols_model = LinearRegression()
ols_model.fit(X_tr, y_train_val)

In [None]:
# Pair each feature name with its OLS coefficient (the model was fit on the
# scaled predictors X_tr).
list(zip(X_train_val.columns, ols_model.coef_))

In [None]:
# Make predictions with the ols model on the test set and, for the in-sample
# comparison, on the train/validation set
test_set_pred_ols = ols_model.predict(X_te)
train_val_set_pred_ols = ols_model.predict(X_tr)

In [None]:
# Compute the test-set MSE for the ols model and report its square root (RMSE)
test_mse = mse(y_test, test_set_pred_ols)
print('OLS Model test_rmse: ', np.sqrt(test_mse))

In [None]:
#Make a pretty seaborn plot comparing the LASSO and RIDGE in-sample fits:
fig, ax = plt.subplots(figsize=(10,6))
# sns.scatterplot(x=y_test, y=test_set_pred_lasso)
sns.scatterplot(x=y_train_val, y=train_val_set_pred_lasso, marker = 'x',
                label="Lasso Prediction", ax=ax)
sns.scatterplot(x=y_train_val, y=train_val_set_pred_ridge,
                label="Ridge Prediction", ax=ax)
# sns.scatterplot(x=y_train_val, y=train_val_set_pred_ols, marker = '+')
# x/y are passed by keyword: recent seaborn releases reject positional vectors.
sns.lineplot(x=np.linspace(50,67,1000), y=np.linspace(50,67,1000), color='green',
             label="Perfect Prediction", ax=ax)

# Each artist carries its own label, so the legend does not depend on the
# order in which matplotlib enumerates lines and point collections.
ax.legend(loc = 4)
ax.set(xlabel='True Happiness', ylabel='Predicted Happiness', title='Lasso vs. Ridge Regression')
plt.savefig('data/lasso_v_ridge.png', dpi=100)
plt.show()

In [None]:
# resid_lasso = y_train_val - train_val_set_pred_lasso
# resid_ridge = y_train_val - train_val_set_pred_ridge
# resid_ols = y_train_val - train_val_set_pred_ols

# with sns.axes_style('white'):
#         fig, ax = plt.subplots(figsize=(10,6))
#         stats.probplot(resid_ols, dist="norm", plot=plt, marker='x')
#         stats.probplot(resid_lasso, dist="norm", plot=plt)
#         stats.probplot(resid_ridge, dist="norm", plot=plt)
#         plt.title("Normal Q-Q plot")
#         plt.show() 