In [438]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import metrics
import math
import pprint
import pickle

### Train-Test Splitting of the Data

In [439]:
# Fixed seed so the split -- and therefore every score and coefficient reported
# below -- is reproducible under Restart Kernel -> Run All.
RANDOM_STATE = 42

data = pd.read_csv('../data/teams_normalized.csv')

# Hold out 15% of the rows for evaluation; the remaining 85% is used for fitting.
train, test = train_test_split(
    data, test_size=0.15, train_size=0.85, random_state=RANDOM_STATE
)

# Feature columns fed to every model, and the target column (number of wins).
x_cols = ['Age', 'ORtg', 'DRtg', 'NRtg', 'Pace', '3PAr_Norm', 'FTr', 'TS%', 'eFG%', 'TOV%', 'ORB%', 'FT/FGA', 'OeFG%', 'OTOV%', 'DRB%', 'OFT/FGA']
y_col = 'W'

x_train = train[x_cols]
y_train = train[y_col]
x_test = test[x_cols]
y_test = test[y_col]

# Sanity check: row counts of the two splits and the number of features.
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(957, 16) (957,) (169, 16) (169,)


Here we define some utilities for each of the models we will use.

In [440]:
def create_model(model_name, model):
  """Build a scale-then-estimate pipeline and fit it on the training split.

  Args:
    model_name: Step name under which `model` is registered in the pipeline.
    model: Unfitted estimator used as the final pipeline step.

  Returns:
    The Pipeline after fitting on the notebook-level `x_train` / `y_train`.
  """
  stages = [('scaler', StandardScaler()), (model_name, model)]
  # Pipeline.fit returns the pipeline itself, so the fitted object can be bound directly.
  fitted = Pipeline(steps=stages).fit(x_train, y_train)
  print("model trained and created")
  return fitted

def score_model(model):
  """Print R2, mean absolute error and RMSE for `model` on the test split.

  Args:
    model: Fitted object exposing `predict`; it is evaluated against the
      notebook-level `x_test` / `y_test`.
  """
  predictions = model.predict(x_test)
  # (label, metric function) pairs, evaluated and printed in order.
  reports = (
    ("R2 score =", metrics.r2_score),
    ("Mean absolute error =", metrics.mean_absolute_error),
    ("Root mean squared error =",
     lambda truth, pred: math.sqrt(metrics.mean_squared_error(truth, pred))),
  )
  for label, metric in reports:
    print(label, round(metric(y_test, predictions), 2))

def unpack_model(model_name, model):
  """Pretty-print the per-feature coefficients and the intercept of a pipeline step.

  Args:
    model_name: Name of the pipeline step holding the linear estimator.
    model: Fitted pipeline exposing `named_steps`.
  """
  final_step = model.named_steps[model_name]
  # Pair each feature name in `x_cols` with the coefficient at the same position.
  weights = {feature: weight for feature, weight in zip(x_cols, final_step.coef_)}
  pprint.pprint(weights)
  print("Intercept: ", final_step.intercept_)

def save_model(model, filename):
  """Pickle `model` to ../models/linear_models/<filename>.pickle.

  Args:
    model: Object to serialize (here, a fitted pipeline).
    filename: Base file name without extension.
  """
  pickle_name = filename + ".pickle"
  destination = "../models/linear_models/" + pickle_name
  with open(destination, "wb") as sink:
    pickle.dump(model, sink)
  print('Model saved as ' + pickle_name)

### Linear Regression
We can now run the linear regression algorithm on the split data.

In [441]:
# Baseline: ordinary least squares on the standardized features, with no regularization.
linear_regression = create_model('lr', linear_model.LinearRegression())

model trained and created


#### Scoring the Linear Regression Model

In [442]:
# Report R2, MAE and RMSE for the baseline on the held-out test split.
score_model(linear_regression)

R2 score = 0.88
Mean absolute error = 2.89
Root mean squared error = 4.1


#### Unpacking the Linear Regression Model

In [443]:
# Coefficients apply to the standardized features, because the pipeline scales its inputs first.
unpack_model('lr', linear_regression)

{'3PAr_Norm': 0.03515625,
 'Age': 0.5784039742947015,
 'DRB%': 1.375,
 'DRtg': 111636329189394.53,
 'FT/FGA': 7.740234375,
 'FTr': -4.87890625,
 'NRtg': 153389771611121.06,
 'OFT/FGA': -1.06640625,
 'ORB%': 3.277099609375,
 'ORtg': -120766236182078.25,
 'OTOV%': 1.58251953125,
 'OeFG%': -2.9140625,
 'Pace': 0.1015625,
 'TOV%': -2.62890625,
 'TS%': -6.7109375,
 'eFG%': 11.91015625}
Intercept:  40.37324783595032


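While this model is quite accurate, some of the coefficients (`ORtg`, `DRtg` and `NRtg`) have extremely high values. Coefficients of this size are a warning sign that the linear regression model is matching the training data too specifically, which is a symptom of overfitting. Here they most likely arise because these three features are (nearly) linearly dependent on one another, so the unregularized fit cannot determine their individual weights and they blow up while largely cancelling each other out.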

#### Saving the Linear Regression Model

In [444]:
# Pickle the whole fitted pipeline (scaler + estimator) under ../models/linear_models/.
save_model(linear_regression, 'linear_regression')

Model saved as linear_regression.pickle


### Ridge Model
Since the linear model has some extremely high coefficient values, we can now use the Ridge model to remedy the overfitting.  

In [445]:
# alpha=0.5 sets the strength of the L2 penalty applied to the coefficients.
ridge = create_model('ridge', linear_model.Ridge(alpha=0.5))

model trained and created


#### Scoring the Ridge Regression Model

In [446]:
# Report R2, MAE and RMSE for the ridge pipeline on the held-out test split.
score_model(ridge)

R2 score = 0.88
Mean absolute error = 2.86
Root mean squared error = 4.09


#### Unpacking the Ridge Model

In [447]:
# Print the ridge coefficients (on standardized features) and the intercept.
unpack_model('ridge', ridge)

{'3PAr_Norm': 0.03395036618622866,
 'Age': 0.6454712077285601,
 'DRB%': 1.2722675625967148,
 'DRtg': -2.7664799187290705,
 'FT/FGA': 3.5393412795508805,
 'FTr': -2.3006404346553393,
 'NRtg': 3.0064392117037184,
 'OFT/FGA': -1.0549815630012453,
 'ORB%': 3.18429774513603,
 'ORtg': 1.2612578313085998,
 'OTOV%': 1.5185209291590844,
 'OeFG%': -2.8592562437605826,
 'Pace': 0.10067155671978029,
 'TOV%': -2.631099942109758,
 'TS%': -0.07836925994716615,
 'eFG%': 5.538944136064887}
Intercept:  40.38244514106582


These coefficients are all much more reasonable in magnitude as compared to the prior regression model

#### Saving the Ridge Model

In [448]:
# Pickle the fitted ridge pipeline to ../models/linear_models/ridge.pickle.
save_model(ridge, 'ridge')

Model saved as ridge.pickle


### The Lasso Model
Since this data has many columns, we might want the ability to whittle down the effective columns, and for this we employ the Lasso model.

In [449]:
# alpha=0.1 sets the strength of the L1 penalty, which can drive coefficients to exactly zero.
lasso = create_model('lasso', linear_model.Lasso(alpha=0.1))

model trained and created


#### Scoring the Lasso Model

In [450]:
# Report R2, MAE and RMSE for the lasso pipeline on the held-out test split.
score_model(lasso)

R2 score = 0.88
Mean absolute error = 2.78
Root mean squared error = 4.0


#### Unpacking the Lasso Model

In [451]:
# Print the lasso coefficients (on standardized features) and the intercept.
unpack_model('lasso', lasso)

{'3PAr_Norm': -0.0,
 'Age': 0.5858091338812544,
 'DRB%': -0.0,
 'DRtg': -0.0,
 'FT/FGA': 0.0,
 'FTr': -0.0,
 'NRtg': 10.928436734435957,
 'OFT/FGA': -0.1561243486930151,
 'ORB%': 0.20139819315098914,
 'ORtg': 0.7051996696978766,
 'OTOV%': -0.0,
 'OeFG%': -0.3028669717298625,
 'Pace': 0.23056080992662434,
 'TOV%': -0.0,
 'TS%': 0.44977195213090093,
 'eFG%': 0.0}
Intercept:  40.38244514106583


This model has allowed us to zero in on some of the more influential factors.

#### Saving the Lasso Model

In [452]:
# Pickle the fitted lasso pipeline to ../models/linear_models/lasso.pickle.
save_model(lasso, 'lasso')

Model saved as lasso.pickle


### Combining Concepts: the Elastic Model
These models all have their own strengths and weaknesses, so there is also the Elastic Net model, which combines the idea of restricting coefficient values (as in Ridge) with setting as many of them to zero as possible (as in Lasso).

In [453]:
# alpha=0.1 is the overall penalty strength; l1_ratio=0.7 weights the mix towards
# the L1 (lasso) term, with the remainder going to the L2 (ridge) term.
elastic = create_model('elastic', linear_model.ElasticNet(alpha=0.1, l1_ratio=0.7))

model trained and created


#### Scoring the Elastic Model

In [454]:
# Report R2, MAE and RMSE for the elastic-net pipeline on the held-out test split.
score_model(elastic)

R2 score = 0.88
Mean absolute error = 2.84
Root mean squared error = 4.06


#### Unpacking the Elastic Model

In [455]:
# Print the elastic-net coefficients (on standardized features) and the intercept.
unpack_model('elastic', elastic)

{'3PAr_Norm': -0.006049679275853836,
 'Age': 0.6473811537341092,
 'DRB%': 0.0,
 'DRtg': -2.839709399076666,
 'FT/FGA': 0.0,
 'FTr': 0.0,
 'NRtg': 5.6689142204162595,
 'OFT/FGA': -0.386286496670391,
 'ORB%': 0.9266819722569105,
 'ORtg': 3.023415964424079,
 'OTOV%': 0.2582669874404685,
 'OeFG%': -1.0954479868155897,
 'Pace': 0.12276564256219205,
 'TOV%': -0.7282996765340775,
 'TS%': 1.4655998965826516,
 'eFG%': 0.6948438260103151}
Intercept:  40.38244514106583


We have now reached a middle ground between the sparsity of factors and the interpretable coefficients with this model.

#### Saving the Elastic Model

In [456]:
# Pickle the fitted elastic-net pipeline to ../models/linear_models/elastic.pickle.
save_model(elastic, 'elastic')

Model saved as elastic.pickle


### A Different Type of Linear Regression
#### Bayesian Regression
With this method, we can obtain the number of wins as a distribution, instead of simply obtaining one exact number. We can also use the same characteristic of the Ridge model to lessen the effect of overfitting. 

In [457]:
# BayesianRidge with its default hyperparameters; the regularization strength is
# estimated from the training data rather than set by hand.
bayesian_ridge = create_model('bayesian_ridge', linear_model.BayesianRidge())

model trained and created


#### Scoring the Ridge Bayesian Model

In [458]:
# Report R2, MAE and RMSE for the Bayesian ridge pipeline on the held-out test split.
score_model(bayesian_ridge)

R2 score = 0.88
Mean absolute error = 2.84
Root mean squared error = 4.07


#### Unpacking the Ridge Bayesian Model

In [461]:
# Print the coefficients (on standardized features) and the intercept.
unpack_model('bayesian_ridge', bayesian_ridge)
# Pull the fitted estimator out of the pipeline to read its learned precisions.
br = bayesian_ridge.named_steps['bayesian_ridge']
# alpha_: estimated precision (inverse variance) of the noise.
print('Alpha: ', br.alpha_)
# lambda_: estimated precision (inverse variance) of the weights.
print('Lambda: ', br.lambda_)

{'3PAr_Norm': 0.003243891942481092,
 'Age': 0.6572435211752546,
 'DRB%': 1.1084192450472063,
 'DRtg': -2.6658995421050977,
 'FT/FGA': 1.4721156144740246,
 'FTr': -0.9962156616901832,
 'NRtg': 3.5285018124646674,
 'OFT/FGA': -0.9739393486058664,
 'ORB%': 2.5670667037275723,
 'ORtg': 2.017325835243188,
 'OTOV%': 1.3574213328541813,
 'OeFG%': -2.6469497417430725,
 'Pace': 0.15188636416808216,
 'TOV%': -2.0800772880048677,
 'TS%': 2.0764235894360716,
 'eFG%': 2.478449807807311}
Intercept:  40.38244514106582
Alpha:  0.05666902020111394
Lambda:  0.21771479100674457


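These numbers are representative of the normal distribution that the model gives: alpha is the estimated precision (inverse variance) of the noise, and lambda is the estimated precision of the coefficient weights.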

#### Saving the Ridge Bayesian Model

In [460]:
# Pickle the fitted Bayesian ridge pipeline to ../models/linear_models/bayesian_ridge.pickle.
save_model(bayesian_ridge, 'bayesian_ridge')

Model saved as bayesian_ridge.pickle
