In [30]:
import pandas as pd
import numpy as np

### Train-Test Splitting of the Data

In [31]:
from sklearn.model_selection import train_test_split

data = pd.read_csv('../data/teams_processed.csv')

# Separate the dataset into train/test sections ready for model fitting.
# A fixed random_state makes the shuffle deterministic, so the split -- and
# therefore every score and coefficient reported further down -- is the same
# each time the notebook is restarted and re-run.
train, test = train_test_split(data, test_size=0.15, train_size=0.85, random_state=42)

# Feature columns fed to the models, and the target column they predict.
x_cols = ['Age', 'ORtg', 'DRtg', 'NRtg', 'Pace', 'FTr', 'TS%', 'eFG%', 'TOV%', 'ORB%', 'FT/FGA', 'OeFG%', 'OTOV%', 'DRB%', 'OFT/FGA']
y_col = 'W'

x_train = train[x_cols]
y_train = train[y_col]
x_test = test[x_cols]
y_test = test[y_col]

# Sanity check: row counts of both partitions and the number of feature columns.
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(963, 15) (963,) (170, 15) (170,)


### Linear Regression
We can now run the linear regression algorithm on the split data.

In [32]:
from sklearn import linear_model

# Ordinary least-squares model with scikit-learn's default settings.
# `fit` returns the estimator itself, so construction and training are chained.
linear_regression = linear_model.LinearRegression().fit(x_train, y_train)

# Predictions of the target column (`W`) for the held-out test rows.
y_pred = linear_regression.predict(x_test)

#### Scoring the Linear Regression Model

In [33]:
from sklearn import metrics
import math

# Note: for scikit-learn regressors `score` is the coefficient of
# determination (R^2), so this figure equals the explicit R2 score below.
accuracy = linear_regression.score(x_test, y_test)
print("Model accuracy on the test set: ", round(accuracy, 2), '\n')

# Error metrics of the linear regression predictions on the test split.
lr_r2 = metrics.r2_score(y_test, y_pred)
lr_mae = metrics.mean_absolute_error(y_test, y_pred)
lr_rmse = math.sqrt(metrics.mean_squared_error(y_test, y_pred))

print("R2 score =", round(lr_r2, 2))
print("Mean absolute error =", round(lr_mae, 2))
print("Root mean squared error =", round(lr_rmse, 2))

Model accuracy on the test set:  0.86 

R2 score = 0.86
Mean absolute error = 3.37
Root mean squared error = 4.78


#### Unpacking the Linear Regression Model

In [34]:
import pprint

# Pair every feature name with its fitted weight (coef_ follows the order of
# x_cols, the columns the model was trained on) and pretty-print the mapping.
linear_coefficients = {
    feature: weight
    for feature, weight in zip(x_cols, linear_regression.coef_)
}
pprint.pprint(linear_coefficients)
print("Intercept: ", linear_regression.intercept_)

{'Age': 0.3862936276262755,
 'DRB%': 0.2786865234375,
 'DRtg': 7384080356722.959,
 'FT/FGA': 153.80022525787354,
 'FTr': -78.66358757019043,
 'NRtg': 7384080356724.619,
 'OFT/FGA': -28.03125,
 'ORB%': 0.6651611328125,
 'ORtg': -7384080356723.37,
 'OTOV%': 0.765106201171875,
 'OeFG%': -99.03291893005371,
 'Pace': 0.024169921875,
 'TOV%': -1.65142822265625,
 'TS%': -93.76582622528076,
 'eFG%': 272.6583080291748}
Intercept:  4.567946562725453


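While this model is quite accurate, some of the coefficients have extremely large values, which is a symptom of overfitting. Note that the `ORtg`, `DRtg` and `NRtg` coefficients are all of the same enormous magnitude (about 7.4e12) and largely cancel each other out, which suggests these three features are (nearly) linearly dependent, so their individual weights are not meaningful. The linear regression model is matching the training data so closely that the fit becomes too specific to this particular data set.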

#### Saving the Linear Regression Model

In [35]:
import pickle

# Serialise the fitted linear regression estimator to disk for later reuse.
with open("../models/linear_regression.pickle", "wb") as model_file:
    pickle.dump(linear_regression, model_file)

### Ridge Model
Since the linear model has some extremely high coefficient values, we can now use the Ridge model, which adds a penalty on the size of the coefficients, to remedy the overfitting.

In [36]:
# Ridge regression with regularisation strength alpha = 0.5.
# `fit` returns the estimator itself, so construction and training are chained.
ridge = linear_model.Ridge(alpha=.5).fit(x_train, y_train)

# Predictions of the target column (`W`) for the held-out test rows.
ridge_y_pred = ridge.predict(x_test)

#### Scoring the Ridge Regression Model

In [37]:
# Note: for scikit-learn regressors `score` is the coefficient of
# determination (R^2), so this figure equals the explicit R2 score below.
ridge_accuracy = ridge.score(x_test, y_test)
print("Model accuracy on the test set: ", round(ridge_accuracy, 2), '\n')

# Error metrics of the ridge predictions on the test split.
ridge_r2 = metrics.r2_score(y_test, ridge_y_pred)
ridge_mae = metrics.mean_absolute_error(y_test, ridge_y_pred)
ridge_rmse = math.sqrt(metrics.mean_squared_error(y_test, ridge_y_pred))

print("R2 score =", round(ridge_r2, 2))
print("Mean absolute error =", round(ridge_mae, 2))
print("Root mean squared error =", round(ridge_rmse, 2))

Model accuracy on the test set:  0.86 

R2 score = 0.86
Mean absolute error = 3.41
Root mean squared error = 4.83


#### Unpacking the Ridge Model

In [38]:
# Pair every feature name with its fitted ridge weight (coef_ follows the order
# of x_cols, the columns the model was trained on) and pretty-print the mapping.
ridge_coefficients = {
    feature: weight
    for feature, weight in zip(x_cols, ridge.coef_)
}
pprint.pprint(ridge_coefficients)
print("Intercept: ", ridge.intercept_)

{'Age': 0.4047763682617255,
 'DRB%': -0.029661078062402937,
 'DRtg': -0.7376582469328505,
 'FT/FGA': 2.271762497384097,
 'FTr': -1.1425058892346036,
 'NRtg': 1.6204920002090006,
 'OFT/FGA': -4.063688662158415,
 'ORB%': 0.06914645451446785,
 'ORtg': 0.8828337532745272,
 'OTOV%': -0.2521618064189411,
 'OeFG%': -0.6349942959491796,
 'Pace': 0.06748989989336078,
 'TOV%': 0.09613913326483858,
 'TS%': 1.5422001727840018,
 'eFG%': 0.48192510005789563}
Intercept:  9.798291085399491


These coefficients are all much more reasonable in magnitude than those of the prior linear regression model.

#### Saving the Ridge Model

In [39]:
# Serialise the fitted ridge estimator to disk for later reuse.
with open("../models/ridge.pickle", "wb") as model_file:
    pickle.dump(ridge, model_file)