In [322]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import metrics
import math
import pprint
import pickle

### Train-Test Splitting of the Data

In [323]:
# Fixed seed so the train/test split (and therefore every score and coefficient
# reported below) is reproducible under Restart Kernel -> Run All.
RANDOM_STATE = 42

data = pd.read_csv('../data/teams_normalized.csv')

# Hold out 15% of the rows for evaluation; the remaining 85% is used for fitting.
train, test = train_test_split(
    data, test_size=0.15, train_size=0.85, random_state=RANDOM_STATE
)

# Feature columns fed to every model below.
# NOTE(review): NRtg is presumably ORtg - DRtg, which would make these three
# columns linearly dependent -- confirm, and consider dropping one of them.
x_cols = ['Age', 'ORtg', 'DRtg', 'NRtg', 'Pace', '3PAr_Norm', 'FTr', 'TS%', 'eFG%', 'TOV%', 'ORB%', 'FT/FGA', 'OeFG%', 'OTOV%', 'DRB%', 'OFT/FGA']
# Target column (presumably team wins -- verify against the CSV schema).
y_col = 'W'

x_train = train[x_cols]
y_train = train[y_col]
x_test = test[x_cols]
y_test = test[y_col]

# Sanity check: feature/target row counts must agree within each split.
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(957, 16) (957,) (169, 16) (169,)


Here we define some utilities for each of the models we will use.

In [344]:
def create_model(model_name, model):
  """Build and fit a two-step pipeline: standardization followed by `model`.

  Args:
    model_name: Name given to the estimator step inside the pipeline; the
      same name is needed later to retrieve the estimator via `named_steps`.
    model: An unfitted estimator to place after the scaler.

  Returns:
    The pipeline, fitted on the notebook-level `x_train` / `y_train`.
  """
  steps = [
    ('scaler', StandardScaler()),
    (model_name, model),
  ]
  # Pipeline.fit returns the pipeline itself, so it can be returned directly.
  return Pipeline(steps=steps).fit(x_train, y_train)

def score_model(model):
  """Print R2, mean absolute error and RMSE of `model` on the test split.

  Predictions are made on the notebook-level `x_test` and compared with
  `y_test`; every metric is rounded to two decimals before printing.

  Args:
    model: A fitted object exposing `predict`.
  """
  predictions = model.predict(x_test)

  def root_mean_squared_error(actual, predicted):
    # RMSE is the square root of the mean squared error.
    return math.sqrt(metrics.mean_squared_error(actual, predicted))

  reports = (
    ("R2 score =", metrics.r2_score),
    ("Mean absolute error =", metrics.mean_absolute_error),
    ("Root mean squared error =", root_mean_squared_error),
  )
  # Compute and print one metric at a time, in the listed order.
  for label, metric in reports:
    print(label, round(metric(y_test, predictions), 2))

def unpack_model(model_name, model):
  """Pretty-print the fitted coefficients and intercept of a pipeline step.

  Args:
    model_name: Name of the estimator step to look up in `model.named_steps`.
    model: A fitted pipeline containing that step.

  Coefficients are paired with the notebook-level `x_cols` in order.
  """
  fitted = model.named_steps[model_name]
  weights_by_feature = {
    feature: weight for feature, weight in zip(x_cols, fitted.coef_)
  }
  pprint.pprint(weights_by_feature)
  print("Intercept: ", fitted.intercept_)

def save_model(model, filename):
  """Pickle `model` to ../models/linear_models/<filename>.pickle.

  Args:
    model: The object to serialize.
    filename: Base file name, without directory or extension.
  """
  target_path = "".join(("../models/linear_models/", filename, ".pickle"))
  with open(target_path, "wb") as handle:
    pickle.dump(model, handle)

### Linear Regression
We can now run the linear regression algorithm on the split data.

In [339]:
# Fit the scaler + LinearRegression pipeline on the training split; 'lr' is the
# step name needed later to look the estimator up in unpack_model.
linear_regression = create_model('lr', linear_model.LinearRegression())

#### Scoring the Linear Regression Model

In [340]:
# Print R2, MAE and RMSE of the linear regression pipeline on the test split.
score_model(linear_regression)

R2 score = 0.88
Mean absolute error = 2.99
Root mean squared error = 4.18


#### Unpacking the Linear Regression Model

In [343]:
# Print the per-feature coefficients and intercept of the 'lr' step. The step
# sits after the StandardScaler, so coefficients apply to standardized features.
unpack_model('lr', linear_regression)

{'3PAr_Norm': 0.048828125,
 'Age': 0.6187052840365252,
 'DRB%': 1.22802734375,
 'DRtg': 58418151458314.05,
 'FT/FGA': 4.5068359375,
 'FTr': -2.8251953125,
 'NRtg': 79320437339908.38,
 'OFT/FGA': -0.837890625,
 'ORB%': 2.903564453125,
 'ORtg': -62326106803322.11,
 'OTOV%': 1.27197265625,
 'OeFG%': -2.4921875,
 'Pace': 0.21484375,
 'TOV%': -2.431640625,
 'TS%': -2.8125,
 'eFG%': 7.45703125}
Intercept:  40.13777439707633


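While this model is quite accurate, some of the coefficients (ORtg, DRtg, and NRtg) have extremely large values, which is a symptom of overfitting: the linear regression model is trying to match the training results so closely that the fit becomes too data-specific. Note that the inflated coefficients appear only on these three closely related rating columns and carry opposing signs, so they largely cancel each other out; this pattern is typical of strongly correlated (collinear) features, where the individual coefficients are unstable even though the predictions remain accurate.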

#### Saving the Linear Regression Model

In [345]:
# Pickle the fitted pipeline to ../models/linear_models/linear_regression.pickle.
save_model(linear_regression, 'linear_regression')

### Ridge Model
Since the linear model has some extremely high coefficient values, we can now use the Ridge model to remedy the overfitting.

In [346]:
# Fit the scaler + Ridge pipeline on the training split under the step name 'ridge'.
# NOTE(review): alpha=0.5 is hard-coded here, not tuned in this notebook.
ridge = create_model('ridge', linear_model.Ridge(alpha=0.5))

#### Scoring the Ridge Regression Model

In [347]:
# Print R2, MAE and RMSE of the ridge pipeline on the test split.
score_model(ridge)

R2 score = 0.88
Mean absolute error = 2.99
Root mean squared error = 4.19


#### Unpacking the Ridge Model

In [348]:
# Print the per-feature coefficients and intercept of the 'ridge' step.
unpack_model('ridge', ridge)

{'3PAr_Norm': 0.050540117811299065,
 'Age': 0.5748525060142323,
 'DRB%': 1.1591164070949584,
 'DRtg': -2.9895775606855484,
 'FT/FGA': 2.132089172297543,
 'FTr': -1.3404175465687957,
 'NRtg': 3.5180028060619373,
 'OFT/FGA': -0.8280646998766574,
 'ORB%': 2.814525053418911,
 'ORtg': 1.6751235037427543,
 'OTOV%': 1.212752384741614,
 'OeFG%': -2.4504296614507934,
 'Pace': 0.18865012740612996,
 'TOV%': -2.3641834923012577,
 'TS%': 0.8263328778974907,
 'eFG%': 3.8644050304154796}
Intercept:  40.09613375130616


These coefficients are all much more reasonable in magnitude compared to those of the prior linear regression model.

#### Saving the Ridge Model

In [349]:
# Pickle the fitted ridge pipeline to ../models/linear_models/ridge.pickle.
save_model(ridge, 'ridge')

### The Lasso Model
Since this data has many columns, we might want the ability to whittle down the set of effective columns; for this we employ the lasso model.

In [350]:
# Fit the scaler + Lasso pipeline on the training split under the step name 'lasso'.
# NOTE(review): alpha=0.1 is hard-coded here, not tuned in this notebook.
lasso = create_model('lasso', linear_model.Lasso(alpha=0.1))

#### Scoring the Lasso Model

In [334]:
# Print R2, MAE and RMSE of the lasso pipeline on the test split.
score_model(lasso)

R2 score = 0.88
Mean absolute error = 2.99
Root mean squared error = 4.21


#### Unpacking the Lasso Model

In [351]:
# Print the per-feature coefficients and intercept of the 'lasso' step.
unpack_model('lasso', lasso)

{'3PAr_Norm': -0.0,
 'Age': 0.5141073490925355,
 'DRB%': -0.0,
 'DRtg': -0.0,
 'FT/FGA': -0.0,
 'FTr': -0.0,
 'NRtg': 11.038544778854506,
 'OFT/FGA': -0.08656099126704868,
 'ORB%': 0.08792409127070973,
 'ORtg': 0.8066132321297876,
 'OTOV%': -0.0,
 'OeFG%': -0.27329295130797554,
 'Pace': 0.2527910529519817,
 'TOV%': -0.0,
 'TS%': 0.0,
 'eFG%': 0.22076566470601244}
Intercept:  40.09613375130616


This model has allowed us to zero in on some of the more influential factors: the features whose coefficients were not shrunk to zero.

#### Saving the Lasso Model

In [352]:
# Pickle the fitted lasso pipeline to ../models/linear_models/lasso.pickle.
save_model(lasso, 'lasso')