In [212]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn import ensemble
from sklearn import metrics
import math
from pprint import pprint
import pickle

In [213]:
def score_model(model):
  y_pred = model.predict(x_test)
  print("R2 score =", round(metrics.r2_score(y_test, y_pred), 2))
  print("Mean absolute error =", round(metrics.mean_absolute_error(y_test, y_pred), 2)) 
  print("Root mean squared error =", round(math.sqrt(metrics.mean_squared_error(y_test, y_pred)), 2))

def save_model(model, filename):
  with open("../models/tree_models/" + filename + ".pickle", "wb") as f:
    pickle.dump(model, f)
  print('Model saved as ' + filename + '.pickle')

### Loading and Splitting the Data

In [214]:
data = pd.read_csv('../data/teams_normalized.csv')

# separating the dataset into section ready for model fitting
train, test = train_test_split(data, test_size=0.15, train_size=0.85)
x_cols = ['Age', 'ORtg', 'DRtg', 'NRtg', 'Pace', '3PAr_Norm', 'FTr', 'TS%', 'eFG%', 'TOV%', 'ORB%', 'FT/FGA', 'OeFG%', 'OTOV%', 'DRB%', 'OFT/FGA']
y_col = 'W'

x_train = train[x_cols]
y_train = train[y_col]
x_test= test[x_cols]
y_test = test[y_col]

print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(957, 16) (957,) (169, 16) (169,)


### Training the Model
We can select the loss function as the criterion for the decision tree.

In [215]:
decision_tree = tree.DecisionTreeRegressor(
  max_depth=7
)

decision_tree.fit(x_train, y_train)

DecisionTreeRegressor(max_depth=7)

### Evaluating the Model

In [216]:
score_model(decision_tree)

R2 score = 0.8
Mean absolute error = 4.1
Root mean squared error = 6.03


In [217]:
print('Feature Importance:')
pprint(
  dict(zip(x_cols, decision_tree.feature_importances_))
  )

Feature Importance:
{'3PAr_Norm': 0.004411542001746181,
 'Age': 0.0032215280692138607,
 'DRB%': 0.002420516865841172,
 'DRtg': 4.075412998829142e-05,
 'FT/FGA': 0.0009412651216825948,
 'FTr': 0.0010019269919983019,
 'NRtg': 0.9490149036663714,
 'OFT/FGA': 0.0067193534634338225,
 'ORB%': 0.001793413765529656,
 'ORtg': 0.0032633304871720146,
 'OTOV%': 0.0016735613866001996,
 'OeFG%': 0.007883321744012524,
 'Pace': 0.010539597595364122,
 'TOV%': 0.0016426274788205312,
 'TS%': 0.003588266237393321,
 'eFG%': 0.0018440909948319998}


This model relies heavily on the Net Rating metric, and pretty minimizes all the other columns.

### Train-Test Spliting of the Data

In [218]:
tree.export_graphviz(
  decision_tree, 
  out_file='../models/tree_models/decision_tree.dot',
  feature_names=x_cols)

save_model(decision_tree, 'decision_tree')

Model saved as decision_tree.pickle


With this .dot file, we can visualize the actual decision tree the computer uses to calculate win totals. 

### Random Forest Regression
With the decison tree model, we can only have one tree, which limits the amount of computation, but with the RandomForest model we can use many different trees to make a forest and then make a compiled result. 

In [219]:
random_forest = ensemble.RandomForestRegressor(
  n_estimators=500
)

random_forest.fit(x_train, y_train)

RandomForestRegressor(n_estimators=500)

In [220]:
score_model(random_forest)

R2 score = 0.86
Mean absolute error = 3.53
Root mean squared error = 5.11
