In [282]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import metrics
import math
import pprint
import pickle

### Train-Test Splitting of the Data

In [283]:
data = pd.read_csv('../data/teams_normalized.csv')

# Hold out 15% of the rows for testing. The split is seeded so that the
# shapes, scores and coefficients produced by the cells below are the same on
# every Restart & Run All instead of changing with each random shuffle.
train, test = train_test_split(
    data, test_size=0.15, train_size=0.85, random_state=42)

# Feature columns fed to every model, and the target column (wins).
# NOTE(review): the plain linear regression fitted on these columns reports
# coefficients around 1e14 for ORtg/DRtg/NRtg, which suggests the three are
# (almost) perfectly collinear - presumably NRtg = ORtg - DRtg. Confirm, and
# consider dropping one of them.
x_cols = ['Age', 'ORtg', 'DRtg', 'NRtg', 'Pace', '3PAr_Norm', 'FTr', 'TS%', 'eFG%', 'TOV%', 'ORB%', 'FT/FGA', 'OeFG%', 'OTOV%', 'DRB%', 'OFT/FGA']
y_col = 'W'

x_train = train[x_cols]
y_train = train[y_col]
x_test = test[x_cols]
y_test = test[y_col]

# Sanity check: row counts of the two splits and the number of features.
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(957, 16) (957,) (169, 16) (169,)


### Linear Regression
We can now run the linear regression algorithm on the split data.

In [284]:
# Standardise the features, then fit ordinary least squares on the training
# split. Pipeline.fit returns the fitted pipeline itself, so it can be chained.
linear_regression = Pipeline([
    ('scaler', StandardScaler()),
    ('lr', linear_model.LinearRegression()),
]).fit(x_train, y_train)

# Predictions for the held-out rows; reused by the scoring cell.
y_pred = linear_regression.predict(x_test)

#### Scoring the Linear Regression Model

In [285]:
# For a regressor, score() is the R^2 on the given data, so the first two
# printed numbers are expected to agree.
accuracy = linear_regression.score(x_test, y_test)
print("Model accuracy on the test set: ", round(accuracy, 2), '\n')

# Hold-out error metrics computed from the predictions made above.
lr_r2 = metrics.r2_score(y_test, y_pred)
lr_mae = metrics.mean_absolute_error(y_test, y_pred)
lr_rmse = math.sqrt(metrics.mean_squared_error(y_test, y_pred))

print("R2 score =", round(lr_r2, 2))
print("Mean absolute error =", round(lr_mae, 2))
print("Root mean squared error =", round(lr_rmse, 2))

Model accuracy on the test set:  0.86 

R2 score = 0.86
Mean absolute error = 3.12
Root mean squared error = 4.61


#### Unpacking the Linear Regression Model

In [286]:
# Pull the fitted estimator out of the pipeline to inspect its parameters.
lr = linear_regression.named_steps['lr']

# One coefficient per feature, in x_cols order (these apply to the
# standardised features, since the scaler runs before the regressor).
pprint.pprint({feature: weight for feature, weight in zip(x_cols, lr.coef_)})
print("Intercept: ", lr.intercept_)

{'3PAr_Norm': 0.08984375,
 'Age': 0.6759997646140283,
 'DRB%': 0.81787109375,
 'DRtg': 104541620820097.9,
 'FT/FGA': 2.49609375,
 'FTr': -1.615234375,
 'NRtg': 144318916827112.12,
 'OFT/FGA': -0.8125,
 'ORB%': 2.9736328125,
 'ORtg': -114096159853452.33,
 'OTOV%': 0.78179931640625,
 'OeFG%': -1.8671875,
 'Pace': 0.1328125,
 'TOV%': -2.396484375,
 'TS%': 0.984375,
 'eFG%': 3.90625}
Intercept:  40.08539138227167


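While this model is quite accurate, the coefficients for `ORtg`, `DRtg` and `NRtg` have extremely high values (on the order of 1e14) with opposing signs. Coefficients this large are a symptom of an unstable, over-fitted solution: the linear regression model is trying to match the training results so accurately that the fit becomes too data-specific. The most likely cause here is that these three features are almost perfectly collinear (net rating is typically offensive rating minus defensive rating), so the least-squares solution for them is not uniquely determined.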

#### Saving the Linear Regression Model

In [287]:
# Persist the whole fitted pipeline (scaler + regressor) so it can be reloaded
# and applied to raw, unscaled feature rows.
with open("../models/linear_models/linear_regression.pickle", "wb") as model_file:
    pickle.dump(linear_regression, model_file)

### Ridge Model
Since the linear model has some extremely high coefficient values, we can now use the Ridge model to remedy the overfitting.

In [288]:
# Same preprocessing as before, but with an L2-penalised regressor.
ridge = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', linear_model.Ridge(alpha=0.5)),
])

# Fit on the training split.
ridge.fit(x_train, y_train)

# Predictions for the held-out rows; reused by the scoring cell.
ridge_y_pred = ridge.predict(x_test)

#### Scoring the Ridge Regression Model

In [289]:
# For a regressor, score() is the R^2 on the given data, so the first two
# printed numbers are expected to agree.
ridge_accuracy = ridge.score(x_test, y_test)
print("Model accuracy on the test set: ", round(ridge_accuracy, 2), '\n')

# Hold-out error metrics computed from the ridge predictions made above.
ridge_r2 = metrics.r2_score(y_test, ridge_y_pred)
ridge_mae = metrics.mean_absolute_error(y_test, ridge_y_pred)
ridge_rmse = math.sqrt(metrics.mean_squared_error(y_test, ridge_y_pred))

print("R2 score =", round(ridge_r2, 2))
print("Mean absolute error =", round(ridge_mae, 2))
print("Root mean squared error =", round(ridge_rmse, 2))

Model accuracy on the test set:  0.86 

R2 score = 0.86
Mean absolute error = 3.12
Root mean squared error = 4.63


#### Unpacking the Ridge Model

In [290]:
# Pull the fitted Ridge estimator out of the pipeline to inspect it.
ridge_model = ridge.named_steps['ridge']

# One coefficient per feature, in x_cols order (these apply to the
# standardised features, since the scaler runs before the regressor).
pprint.pprint({feature: weight for feature, weight in zip(x_cols, ridge_model.coef_)})
print("Intercept: ", ridge_model.intercept_)

{'3PAr_Norm': 0.04620336840960141,
 'Age': 0.6704251276514837,
 'DRB%': 0.7769499864427234,
 'DRtg': -3.2865385701532235,
 'FT/FGA': 1.6596162943501909,
 'FTr': -1.1019356330343018,
 'NRtg': 3.755058821846545,
 'OFT/FGA': -0.8080738316434724,
 'ORB%': 2.6807904609590576,
 'ORtg': 1.7384104165708651,
 'OTOV%': 0.7856099216709219,
 'OeFG%': -1.9367330201422028,
 'Pace': 0.19910230365454884,
 'TOV%': -2.1589405704662674,
 'TS%': 1.86982071259111,
 'eFG%': 2.6245124195049074}
Intercept:  40.083594566353185


These coefficients are all much more reasonable in magnitude as compared to the prior regression model

#### Saving the Ridge Model

In [291]:
# Persist the whole fitted pipeline (scaler + Ridge) so it can be reloaded and
# applied to raw, unscaled feature rows.
with open("../models/linear_models/ridge.pickle", "wb") as model_file:
    pickle.dump(ridge, model_file)

### The Lasso Model
Since this data has many columns, we might want the ability to whittle down the effective columns, and for this we employ the lasso model.

In [294]:
# Same preprocessing as the other models, but with an L1-penalised regressor.
# Pipeline.fit returns the fitted pipeline itself, so it can be chained.
lasso = Pipeline([
    ('scaler', StandardScaler()),
    ('lasso', linear_model.Lasso(alpha=0.1)),
]).fit(x_train, y_train)

# Predictions for the held-out rows; reused by the scoring cell.
lasso_y_pred = lasso.predict(x_test)

#### Scoring the Lasso Model

In [295]:
# For a regressor, score() is the R^2 on the given data, so the first two
# printed numbers are expected to agree.
lasso_accuracy = lasso.score(x_test, y_test)
print("Model accuracy on the test set: ", round(lasso_accuracy, 2), '\n')

# Hold-out error metrics computed from the lasso predictions made above.
lasso_r2 = metrics.r2_score(y_test, lasso_y_pred)
lasso_mae = metrics.mean_absolute_error(y_test, lasso_y_pred)
lasso_rmse = math.sqrt(metrics.mean_squared_error(y_test, lasso_y_pred))

print("R2 score =", round(lasso_r2, 2))
print("Mean absolute error =", round(lasso_mae, 2))
print("Root mean squared error =", round(lasso_rmse, 2))

Model accuracy on the test set:  0.86 

R2 score = 0.86
Mean absolute error = 3.07
Root mean squared error = 4.62


#### Unpacking the Lasso Model

In [297]:
# Pull the fitted Lasso estimator out of the pipeline to inspect it.
lasso_model = lasso.named_steps['lasso']

# One coefficient per feature, in x_cols order; features the L1 penalty has
# dropped show up with a coefficient of exactly zero.
pprint.pprint({feature: weight for feature, weight in zip(x_cols, lasso_model.coef_)})
print("Intercept: ", lasso_model.intercept_)

{'3PAr_Norm': -0.0,
 'Age': 0.6325781045002427,
 'DRB%': -0.0,
 'DRtg': -0.0,
 'FT/FGA': 0.0,
 'FTr': -0.0,
 'NRtg': 10.693290631725096,
 'OFT/FGA': -0.26367283620476933,
 'ORB%': 0.20629287709595318,
 'ORtg': 0.8810534117913228,
 'OTOV%': -0.0,
 'OeFG%': -0.4394722485194868,
 'Pace': 0.2812723894180477,
 'TOV%': -0.0,
 'TS%': 0.33599544638378054,
 'eFG%': 0.0}
Intercept:  40.083594566353185


This model has allowed us to zero in on some of the more influential factors.

#### Saving the Lasso Model

In [298]:
# Persist the fitted lasso pipeline (scaler + Lasso). The object dumped must
# be `lasso`, not `ridge`; otherwise lasso.pickle silently holds a second copy
# of the ridge model.
with open("../models/linear_models/lasso.pickle", "wb") as f:
  pickle.dump(lasso, f)