In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from plotnine import ggplot, geom_line, aes, labs, theme_minimal, geom_boxplot, coord_flip
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
# conda install -c conda-forge plotnine

In [9]:
# Load the modelling table and append WINNER_cat, a categorical copy of WINNER.
dat = pd.read_csv('model_data.csv').assign(
    WINNER_cat=lambda frame: frame['WINNER'].astype('category')
)
# A cell renders only its last expression, so the shape is the single value shown here.
dat.shape

(4620, 25)

In [28]:
def rf(X_train, X_test, y_train, y_test):
    """Tune a random forest by grid search and predict on both splits.

    A ``RandomForestClassifier(random_state=426)`` is tuned with a 3-fold
    ``GridSearchCV`` scored on accuracy. Predictions are taken from the
    fitted search object (with scikit-learn's default ``refit=True`` this is
    the best estimator refit on all of ``X_train``).

    Parameters
    ----------
    X_train, y_train
        Features and labels the grid search is fitted on.
    X_test
        Features predicted after fitting.
    y_test
        Not used inside the function; kept so existing four-argument calls
        keep working.

    Returns
    -------
    tuple
        ``(train_preds, train_preds_probs, test_preds, test_preds_probs,
        best_params)`` where ``best_params`` is ``gs.best_params_``.
    """
    mod = RandomForestClassifier(random_state = 426)
    # The grid must be bound to the exact name handed to GridSearchCV below.
    # It used to be stored as `param_grid_`, so the search picked up whatever
    # global `param_grid` happened to exist (or raised NameError on a fresh
    # kernel).
    param_grid = {'n_estimators': [100, 500, 1000],   # number of trees in the forest
                  'criterion': ["gini", "entropy"],
                  'max_depth': [3, 5, 7],             # max number of layers
                  'min_samples_leaf': [1, 10]}
    gs = GridSearchCV(mod, param_grid, scoring='accuracy', cv = 3)
    # fit() returns the search object itself, so no separate alias is needed.
    gs.fit(X_train, y_train)

    train_preds = gs.predict(X_train)
    train_preds_probs = gs.predict_proba(X_train)

    test_preds = gs.predict(X_test)
    test_preds_probs = gs.predict_proba(X_test)
    return train_preds, train_preds_probs, test_preds, test_preds_probs, gs.best_params_

# 1 - Home Ratio
# 2 - Away Ratio
# 3 - Home Diff
# 4 - Away Diff

In [8]:
# Even-positioned rows (0, 2, 4, ...) go to `away`, odd-positioned rows to `home`.
# NOTE(review): presumably each game occupies two consecutive rows, away team
# first -- confirm against model_data.csv.
away, home = dat.iloc[0::2], dat.iloc[1::2]

In [42]:
# Feature columns for the "ratio" models.
ratio_cols = [
    'DEFLECTIONS_RATIO',
    'CONTEST_RATE',
    'CONTEST_RATE_2',
    'CONTEST_RATE_3',
    'CHARGES_RATIO',
    'SCREEN_AST_RATIO',
    'SCREEN_AST_PTS_RATIO',
    'LOOSE_RATIO',
    'OFF_BOXOUT_RATIO',
    'DEF_BOXOUT_RATIO',
    'BOXOUT_TM_RBS_RATIO',
]
# Feature columns for the "diff" models (the three CONTEST_RATE columns are shared).
diff_cols = [
    'DEFLECTIONS_DIFF',
    'CONTEST_RATE',
    'CONTEST_RATE_2',
    'CONTEST_RATE_3',
    'CHARGES_DIFF',
    'SCREEN_AST_DIFF',
    'SCREEN_AST_PTS_DIFF',
    'LOOSE_DIFFERENCE',
    'OFF_BOXOUT_DIFF',
    'DEF_BOXOUT_DIFF',
    'BOXOUT_TM_RBS_DIFF',
]

# Targets: the WINNER column of each subset, as plain arrays.
y_a = away['WINNER'].values
y_h = home['WINNER'].values

# Design matrices: a = away rows, h = home rows; r = ratio features, d = diff features.
X_ar = away[ratio_cols].values
X_hr = home[ratio_cols].values
X_ad = away[diff_cols].values
X_hd = home[diff_cols].values

In [11]:
# X_train, X_test, y_train, y_test = train_test_split(X_ar, y_a, test_size = 0.2, random_state = 42)

In [16]:
#     mod = RandomForestClassifier(random_state = 426)
#     param_grid={'n_estimators': [100,500,1000], # number of trees in the forest
#              'criterion': ["gini", "entropy"],
#              'max_depth': [3, 5, 7],     # max number of layers
#              'min_samples_leaf': [1, 10]}
#     gs = GridSearchCV(mod, param_grid, scoring='accuracy',cv = 3)
#     rf_fit = gs.fit(X_train, y_train)
    
#     train_preds = rf_fit.predict(X_train)
#     train_preds_probs = rf_fit.predict_proba(X_train)
    
#     test_preds = rf_fit.predict(X_test)
#     test_preds_probs = rf_fit.predict_proba(X_test)

In [22]:
# np.mean(train_preds == y_train)
# np.mean(test_preds == y_test)
# gs.best_params_

{'criterion': 'gini',
 'max_depth': 5,
 'min_samples_leaf': 1,
 'n_estimators': 500}

## 1 - Home Ratio

In [26]:
# Hold out 20% of the home-ratio rows for testing; the fixed seed pins the shuffle.
X_hr_train, X_hr_test, y_hr_train, y_hr_test = train_test_split(
    X_hr,
    y_h,
    test_size=0.2,
    random_state=42,
)

In [29]:
# Grid-search a random forest on the home-ratio split; keep the predictions,
# class probabilities and the selected hyper-parameters.
(
    X_hr_train_preds,
    X_hr_train_probs,
    X_hr_test_preds,
    X_hr_test_probs,
    hr_best_parms,
) = rf(X_hr_train, X_hr_test, y_hr_train, y_hr_test)

In [34]:
# Hyper-parameters selected by the grid search for the home-ratio model.
hr_best_parms

{'criterion': 'entropy',
 'max_depth': 5,
 'min_samples_leaf': 1,
 'n_estimators': 1000}

In [32]:
# Training accuracy: fraction of predictions equal to the true label.
print((X_hr_train_preds == y_hr_train).mean())
# Per-class precision / recall / F1 on the same split.
print(metrics.classification_report(y_true=y_hr_train, y_pred=X_hr_train_preds))

0.7034632034632035
              precision    recall  f1-score   support

           0       0.71      0.53      0.61       806
           1       0.70      0.83      0.76      1042

   micro avg       0.70      0.70      0.70      1848
   macro avg       0.71      0.68      0.69      1848
weighted avg       0.71      0.70      0.70      1848



In [33]:
# Held-out accuracy: fraction of test predictions equal to the true label.
print((X_hr_test_preds == y_hr_test).mean())
# Per-class precision / recall / F1 on the test split.
print(metrics.classification_report(y_true=y_hr_test, y_pred=X_hr_test_preds))

0.6038961038961039
              precision    recall  f1-score   support

           0       0.52      0.39      0.45       188
           1       0.64      0.75      0.69       274

   micro avg       0.60      0.60      0.60       462
   macro avg       0.58      0.57      0.57       462
weighted avg       0.59      0.60      0.59       462



## 2 - Away Ratio

In [35]:
# Hold out 20% of the away-ratio rows for testing; the fixed seed pins the shuffle.
X_ar_train, X_ar_test, y_ar_train, y_ar_test = train_test_split(
    X_ar,
    y_a,
    test_size=0.2,
    random_state=42,
)

In [36]:
# Grid-search a random forest on the away-ratio split; keep the predictions,
# class probabilities and the selected hyper-parameters.
(
    X_ar_train_preds,
    X_ar_train_probs,
    X_ar_test_preds,
    X_ar_test_probs,
    ar_best_parms,
) = rf(X_ar_train, X_ar_test, y_ar_train, y_ar_test)

In [37]:
# Hyper-parameters selected by the grid search for the away-ratio model.
ar_best_parms

{'criterion': 'gini',
 'max_depth': 5,
 'min_samples_leaf': 1,
 'n_estimators': 500}

In [38]:
# Training accuracy: fraction of predictions equal to the true label.
print((X_ar_train_preds == y_ar_train).mean())
# Per-class precision / recall / F1 on the same split.
print(metrics.classification_report(y_true=y_ar_train, y_pred=X_ar_train_preds))

0.7278138528138528
              precision    recall  f1-score   support

           0       0.71      0.88      0.79      1042
           1       0.78      0.52      0.63       806

   micro avg       0.73      0.73      0.73      1848
   macro avg       0.74      0.70      0.71      1848
weighted avg       0.74      0.73      0.72      1848



In [39]:
# Held-out accuracy: fraction of test predictions equal to the true label.
print((X_ar_test_preds == y_ar_test).mean())
# Per-class precision / recall / F1 on the test split.
print(metrics.classification_report(y_true=y_ar_test, y_pred=X_ar_test_preds))

0.6168831168831169
              precision    recall  f1-score   support

           0       0.65      0.76      0.70       274
           1       0.54      0.40      0.46       188

   micro avg       0.62      0.62      0.62       462
   macro avg       0.60      0.58      0.58       462
weighted avg       0.61      0.62      0.60       462



## 3 - Home Diff

In [43]:
# Hold out 20% of the home-diff rows for testing; the fixed seed pins the shuffle.
X_hd_train, X_hd_test, y_hd_train, y_hd_test = train_test_split(
    X_hd,
    y_h,
    test_size=0.2,
    random_state=42,
)

In [44]:
# Grid-search a random forest on the home-diff split; keep the predictions,
# class probabilities and the selected hyper-parameters.
(
    X_hd_train_preds,
    X_hd_train_probs,
    X_hd_test_preds,
    X_hd_test_probs,
    hd_best_parms,
) = rf(X_hd_train, X_hd_test, y_hd_train, y_hd_test)

In [45]:
# Hyper-parameters selected by the grid search for the home-diff model.
hd_best_parms

{'criterion': 'gini',
 'max_depth': 7,
 'min_samples_leaf': 1,
 'n_estimators': 100}

In [46]:
# Training accuracy: fraction of predictions equal to the true label.
print((X_hd_train_preds == y_hd_train).mean())
# Per-class precision / recall / F1 on the same split.
print(metrics.classification_report(y_true=y_hd_train, y_pred=X_hd_train_preds))

0.7792207792207793
              precision    recall  f1-score   support

           0       0.85      0.60      0.70       806
           1       0.75      0.91      0.82      1042

   micro avg       0.78      0.78      0.78      1848
   macro avg       0.80      0.76      0.76      1848
weighted avg       0.79      0.78      0.77      1848



In [47]:
# Held-out accuracy: fraction of test predictions equal to the true label.
print((X_hd_test_preds == y_hd_test).mean())
# Per-class precision / recall / F1 on the test split.
print(metrics.classification_report(y_true=y_hd_test, y_pred=X_hd_test_preds))

0.5952380952380952
              precision    recall  f1-score   support

           0       0.50      0.36      0.42       188
           1       0.63      0.76      0.69       274

   micro avg       0.60      0.60      0.60       462
   macro avg       0.57      0.56      0.55       462
weighted avg       0.58      0.60      0.58       462



## 4 - Away Diff

In [48]:
# Hold out 20% of the away-diff rows for testing; the fixed seed pins the shuffle.
X_ad_train, X_ad_test, y_ad_train, y_ad_test = train_test_split(
    X_ad,
    y_a,
    test_size=0.2,
    random_state=42,
)

In [49]:
# Grid-search a random forest on the away-diff split; keep the predictions,
# class probabilities and the selected hyper-parameters.
(
    X_ad_train_preds,
    X_ad_train_probs,
    X_ad_test_preds,
    X_ad_test_probs,
    ad_best_parms,
) = rf(X_ad_train, X_ad_test, y_ad_train, y_ad_test)

In [50]:
# Hyper-parameters selected by the grid search for the away-diff model.
ad_best_parms

{'criterion': 'entropy',
 'max_depth': 7,
 'min_samples_leaf': 10,
 'n_estimators': 100}

In [51]:
# Training accuracy: fraction of predictions equal to the true label.
print((X_ad_train_preds == y_ad_train).mean())
# Per-class precision / recall / F1 on the same split.
print(metrics.classification_report(y_true=y_ad_train, y_pred=X_ad_train_preds))

0.7418831168831169
              precision    recall  f1-score   support

           0       0.72      0.90      0.80      1042
           1       0.81      0.54      0.64       806

   micro avg       0.74      0.74      0.74      1848
   macro avg       0.76      0.72      0.72      1848
weighted avg       0.76      0.74      0.73      1848



In [52]:
# Held-out accuracy: fraction of test predictions equal to the true label.
print((X_ad_test_preds == y_ad_test).mean())
# Per-class precision / recall / F1 on the test split.
print(metrics.classification_report(y_true=y_ad_test, y_pred=X_ad_test_preds))

0.5909090909090909
              precision    recall  f1-score   support

           0       0.63      0.76      0.69       274
           1       0.50      0.35      0.41       188

   micro avg       0.59      0.59      0.59       462
   macro avg       0.56      0.55      0.55       462
weighted avg       0.57      0.59      0.57       462

