In [131]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn import metrics
import math
from pprint import pprint

### Loading and Splitting the Data

In [132]:
# Load the team dataset (path is relative to the notebook's working directory).
data = pd.read_csv('../data/teams_normalized.csv')

# Fixed seed so that the shuffle performed by train_test_split -- and therefore
# every metric reported further down -- is reproducible on a fresh kernel.
SPLIT_SEED = 42

# Separate the dataset into sections ready for model fitting:
# 85% of the rows for training, 15% held out for evaluation.
train, test = train_test_split(
    data, test_size=0.15, train_size=0.85, random_state=SPLIT_SEED
)

# Sanity check: the split must partition the data without dropping rows.
assert len(train) + len(test) == len(data)

# Feature columns fed to the model, and the target column to predict.
x_cols = ['Age', 'ORtg', 'DRtg', 'NRtg', 'Pace', '3PAr_Norm', 'FTr', 'TS%', 'eFG%', 'TOV%', 'ORB%', 'FT/FGA', 'OeFG%', 'OTOV%', 'DRB%', 'OFT/FGA']
y_col = 'W'

x_train = train[x_cols]
y_train = train[y_col]
x_test = test[x_cols]
y_test = test[y_col]

print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(957, 16) (957,) (169, 16) (169,)


### Training the Model
The decision tree's `criterion` parameter selects the loss function used to evaluate candidate splits; here we keep the default.

In [133]:
# Seed the estimator: scikit-learn randomly permutes the candidate features at
# each split, so without a fixed random_state ties between equally good splits
# can be broken differently from run to run, changing the tree and its scores.
decision_tree = tree.DecisionTreeRegressor(random_state=42)

# Fit on the training portion only; the held-out rows are reserved for evaluation.
decision_tree.fit(x_train, y_train)

DecisionTreeRegressor()

### Evaluating the Model

In [134]:
# Predict the target for the held-out rows.
decision_tree_y_pred = decision_tree.predict(x_test)

# Compute each metric once, then report them all rounded to two decimals.
r2 = metrics.r2_score(y_test, decision_tree_y_pred)
mae = metrics.mean_absolute_error(y_test, decision_tree_y_pred)
rmse = math.sqrt(metrics.mean_squared_error(y_test, decision_tree_y_pred))

for label, value in (
    ("R2 score", r2),
    ("Mean absolute error", mae),
    ("Root mean squared error", rmse),
):
    print(f"{label} =", round(value, 2))

R2 score = 0.83
Mean absolute error = 4.08
Root mean squared error = 5.32


In [138]:
# Pair every feature name with the importance the fitted tree assigned to it.
importance_by_feature = {
    feature: importance
    for feature, importance in zip(x_cols, decision_tree.feature_importances_)
}

print('Feature Importance:')
# pprint lists the entries sorted by key (feature name), not by importance.
pprint(importance_by_feature)

Feature Importance:
{'3PAr_Norm': 0.004935428382509676,
 'Age': 0.00730018003397683,
 'DRB%': 0.00705007258558129,
 'DRtg': 0.017330968078491674,
 'FT/FGA': 0.004911948886159416,
 'FTr': 0.0036305175050385025,
 'NRtg': 0.9013762261159842,
 'OFT/FGA': 0.009768187695923828,
 'ORB%': 0.004671343019765389,
 'ORtg': 0.008826630477403211,
 'OTOV%': 0.004520271367993291,
 'OeFG%': 0.005917650567452693,
 'Pace': 0.003170741237144573,
 'TOV%': 0.0052279810249397166,
 'TS%': 0.006107457619306806,
 'eFG%': 0.005254395402328818}


This model relies heavily on the Net Rating metric (`NRtg`, roughly 90% of the total feature importance) and gives very little weight to all the other columns.