In [130]:
import pandas as pd
import numpy as np

### Train-Test Splitting of the Data

In [131]:
from sklearn.model_selection import train_test_split

# Fixed seed for the shuffle performed by train_test_split. Without it every
# Restart & Run All produces a different split, so the reported metrics, the
# printed coefficients and the pickled models would not be reproducible.
RANDOM_STATE = 42

data = pd.read_csv('../data/teams_processed.csv')

# Hold out 15% of the rows for testing; the remaining 85% are used for fitting.
train, test = train_test_split(
    data, test_size=0.15, train_size=0.85, random_state=RANDOM_STATE)

# Predictor columns fed to the models, and the target column they predict.
x_cols = ['Age', 'ORtg', 'DRtg', 'NRtg', 'Pace', 'FTr', 'TS%', 'eFG%', 'TOV%', 'ORB%', 'FT/FGA', 'OeFG%', 'OTOV%', 'DRB%', 'OFT/FGA']
y_col = 'W'

x_train = train[x_cols]
y_train = train[y_col]
x_test = test[x_cols]
y_test = test[y_col]

# Sanity check: feature frames should have len(x_cols) columns and the row
# counts of the feature/target pairs should match.
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(963, 15) (963,) (170, 15) (170,)


### Linear Regression
We can now run the linear regression algorithm on the split data.

In [132]:
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Two-stage pipeline: standardize every feature, then fit ordinary least
# squares on the scaled values. Keeping the scaler inside the pipeline means
# it is fitted on the training rows only and re-applied to anything predicted.
lr_steps = [
    ('scaler', StandardScaler()),
    ('lr', linear_model.LinearRegression()),
]
linear_regression = Pipeline(steps=lr_steps)

# Pipeline.fit returns the fitted pipeline itself, so fitting on the training
# split and predicting the held-out split can be chained.
y_pred = linear_regression.fit(x_train, y_train).predict(x_test)

#### Scoring the Linear Regression Model

In [133]:
from sklearn import metrics
import math

# For a regression pipeline .score() returns the coefficient of determination
# (R^2), so this value matches the "R2 score" line printed below.
accuracy = linear_regression.score(x_test, y_test)
print("Model accuracy on the test set: ", round(accuracy, 2), '\n')

# Error metrics of the held-out predictions, computed first and then reported.
lr_r2 = metrics.r2_score(y_test, y_pred)
lr_mae = metrics.mean_absolute_error(y_test, y_pred)
lr_rmse = math.sqrt(metrics.mean_squared_error(y_test, y_pred))

print("R2 score =", round(lr_r2, 2))
print("Mean absolute error =", round(lr_mae, 2))
print("Root mean squared error =", round(lr_rmse, 2))

Model accuracy on the test set:  0.87 

R2 score = 0.87
Mean absolute error = 2.86
Root mean squared error = 4.27


#### Unpacking the Linear Regression Model

In [134]:
import pprint

# Pull the fitted estimator out of the pipeline to inspect its parameters.
# The coefficients apply to the standardized features produced by the scaler
# step, not to the raw column values.
lr = linear_regression.named_steps['lr']

# Pair each feature name with its weight (coef_ follows the order of x_cols).
lr_coef_by_feature = {
    feature: weight for feature, weight in zip(x_cols, lr.coef_)
}
pprint.pprint(lr_coef_by_feature)
print("Intercept: ", lr.intercept_)

{'Age': 0.6144148484025759,
 'DRB%': 0.74267578125,
 'DRtg': 66556799074913.55,
 'FT/FGA': 3.228515625,
 'FTr': -2.158203125,
 'NRtg': 90759816350959.34,
 'OFT/FGA': -0.71484375,
 'ORB%': 2.3157958984375,
 'ORtg': -71321002730867.44,
 'OTOV%': 0.747528076171875,
 'OeFG%': -1.45703125,
 'Pace': 0.19140625,
 'TOV%': -1.79296875,
 'TS%': -0.775390625,
 'eFG%': 4.390625}
Intercept:  40.39264890469613


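While this model is quite accurate, some of the coefficients have extremely high values, which is a symptom of overfitting: the linear regression model is trying to match the training results so accurately that the fit becomes too data-specific. Note that the extreme coefficients all belong to `ORtg`, `DRtg`, and `NRtg`; these features are probably (nearly) linearly dependent (net rating is typically offensive rating minus defensive rating), which would leave their individual weights poorly determined.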

#### Saving the Linear Regression Model

In [135]:
import pickle

# Persist the whole fitted pipeline (scaler + regressor), so a loaded model
# applies the same scaling before predicting.
# NOTE: only unpickle this file from a trusted source; pickle can run code.
with open("../models/linear_regression.pickle", "wb") as model_file:
    pickle.dump(linear_regression, model_file)

### Ridge Model
Since the linear model has some extremely high coefficient values, we can now use the Ridge model to remedy the overfitting.  

In [136]:
# Same preprocessing as the linear model, but the regressor is Ridge, which
# adds an L2 penalty (strength alpha=0.5) on the coefficient magnitudes.
ridge_steps = [
    ('scaler', StandardScaler()),
    ('ridge', linear_model.Ridge(alpha=.5)),
]
ridge = Pipeline(steps=ridge_steps)

# Pipeline.fit returns the fitted pipeline itself, so fitting on the training
# split and predicting the held-out split can be chained.
ridge_y_pred = ridge.fit(x_train, y_train).predict(x_test)

#### Scoring the Ridge Regression Model

In [137]:
# For a regression pipeline .score() returns the coefficient of determination
# (R^2), so this value matches the "R2 score" line printed below.
ridge_accuracy = ridge.score(x_test, y_test)
print("Model accuracy on the test set: ", round(ridge_accuracy, 2), '\n')

# Error metrics of the held-out predictions, computed first and then reported.
ridge_r2 = metrics.r2_score(y_test, ridge_y_pred)
ridge_mae = metrics.mean_absolute_error(y_test, ridge_y_pred)
ridge_rmse = math.sqrt(metrics.mean_squared_error(y_test, ridge_y_pred))

print("R2 score =", round(ridge_r2, 2))
print("Mean absolute error =", round(ridge_mae, 2))
print("Root mean squared error =", round(ridge_rmse, 2))

Model accuracy on the test set:  0.87 

R2 score = 0.87
Mean absolute error = 2.88
Root mean squared error = 4.29


#### Unpacking the Ridge Model

In [139]:
# Pull the fitted Ridge estimator out of the pipeline to inspect its
# parameters. The coefficients apply to the standardized features produced by
# the scaler step, not to the raw column values.
ridge_model = ridge.named_steps['ridge']

# Pair each feature name with its weight (coef_ follows the order of x_cols).
ridge_coef_by_feature = {
    feature: weight for feature, weight in zip(x_cols, ridge_model.coef_)
}
pprint.pprint(ridge_coef_by_feature)
print("Intercept: ", ridge_model.intercept_)

{'Age': 0.5658592203087043,
 'DRB%': 0.7548371475117832,
 'DRtg': -3.511690008693406,
 'FT/FGA': 1.859362855432612,
 'FTr': -1.271674723012904,
 'NRtg': 4.270987178263317,
 'OFT/FGA': -0.7011798214232909,
 'ORB%': 2.2729508264788167,
 'ORtg': 2.157950108948622,
 'OTOV%': 0.7384263541954414,
 'OeFG%': -1.4762422311351964,
 'Pace': 0.1877653427839274,
 'TOV%': -1.726582153715316,
 'TS%': 1.2972456929621283,
 'eFG%': 2.417264362806264}
Intercept:  40.42159916926271


These coefficients are all much more reasonable in magnitude than those of the prior linear regression model.

#### Saving the Ridge Model

In [None]:
# Persist the whole fitted pipeline (scaler + ridge regressor), so a loaded
# model applies the same scaling before predicting.
# NOTE: only unpickle this file from a trusted source; pickle can run code.
with open("../models/ridge.pickle", "wb") as model_file:
    pickle.dump(ridge, model_file)