In [9]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
from sklearn.model_selection import train_test_split, cross_val_score,cross_validate, GridSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from sportsreference.ncaab.teams import Teams
from sportsreference.ncaab.teams import Team
from sportsreference.ncaab.roster import Player
from tqdm import tqdm
from sportsreference.ncaab.boxscore import Boxscore
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from pprint import pprint
from sklearn.preprocessing import MinMaxScaler

In [4]:
# Load the modelling table; its `result` column is used as the target further down.
DATA_PATH = "ml_df_l3.csv"
df = pd.read_csv(DATA_PATH)

In [5]:
# Build the feature matrix / target vector and a stratified hold-out split.
#
# NOTE: this cell reassigns `df`, so it is not idempotent. The positions in
# `cols_to_drop` refer to the frame exactly as loaded from the CSV; re-run the
# load cell before re-running this one, otherwise different columns are dropped.
cols_to_drop = [0,71,72,74,75,78,79,7,8,17,42,43,62,73]
# `columns=` alone is enough; combining it with `axis=1` was redundant.
df = df.drop(columns=df.columns[cols_to_drop])
# Keyword form: passing the axis positionally (`df.drop('result', 1)`) is
# deprecated and raises a TypeError on pandas >= 2.0.
X = df.drop(columns='result')
y = df['result']
# Fixed seed (same value as the later re-split in this notebook) so the split,
# and every score computed from it, is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=3)

  This is separate from the ipykernel package so we can avoid doing imports until


In [20]:
# Instantiate an estimator with no arguments purely to display its repr in the cell output.
RandomForestClassifier()

RandomForestClassifier()

In [18]:
# Baseline random forest (250 trees, all cores). The report printed here is
# computed on the same rows the model was fitted on.
clf = RandomForestClassifier(n_jobs=-1, n_estimators=250).fit(X_train, y_train)
y_train_pred = clf.predict(X_train)
train_report = classification_report(y_train, y_train_pred, output_dict=True)
pprint(train_report)

{'0': {'f1-score': 0.9839749959350327,
       'precision': 0.983708413105516,
       'recall': 0.9842417232904438,
       'support': 27668},
 '1': {'f1-score': 0.9839657260615702,
       'precision': 0.9842326052365109,
       'recall': 0.9836989915784147,
       'support': 27667},
 'accuracy': 0.9839703623384838,
 'macro avg': {'f1-score': 0.9839703609983015,
               'precision': 0.9839705091710135,
               'recall': 0.9839703574344292,
               'support': 55335},
 'weighted avg': {'f1-score': 0.9839703610820627,
                  'precision': 0.9839705044344802,
                  'recall': 0.9839703623384838,
                  'support': 55335}}


In [19]:
# Classification report for the fitted forest on the held-out split.
y_test_pred = clf.predict(X_test)
test_report = classification_report(y_test, y_test_pred, output_dict=True)
pprint(test_report)

{'0': {'f1-score': 0.7330487870765574,
       'precision': 0.7304834715193281,
       'recall': 0.735632183908046,
       'support': 9222},
 '1': {'f1-score': 0.7311898155704261,
       'precision': 0.733784669141734,
       'recall': 0.7286132494849832,
       'support': 9223},
 'accuracy': 0.7321225264299268,
 'macro avg': {'f1-score': 0.7321193013234917,
               'precision': 0.732134070330531,
               'recall': 0.7321227166965145,
               'support': 18445},
 'weighted avg': {'f1-score': 0.7321192509312037,
                  'precision': 0.7321341598181327,
                  'recall': 0.7321225264299268,
                  'support': 18445}}


In [21]:
# Pair each training column with the fitted forest's importance for it, then
# show the 25 largest importances.
name_weight_pairs = list(zip(X_train.columns, clf.feature_importances_))
imp_dict = {
    'feature': [name for name, _ in name_weight_pairs],
    'importance': [weight for _, weight in name_weight_pairs],
}

imp_df = pd.DataFrame.from_dict(imp_dict)
imp_df.sort_values(by='importance', ascending=False).head(25)

Unnamed: 0,feature,importance
64,home,0.07
31,win_percentage,0.06
49,opp_win_percentage,0.06
41,opp_simple_rating_system,0.04
23,simple_rating_system,0.04
38,opp_offensive_rating,0.02
6,offensive_rating,0.02
20,allowed_true_shooting_percentage,0.02
62,opp_true_shooting_percentage_allowed,0.02
52,opp_effective_field_goal_percentage_allowed,0.02


In [None]:
# Sweep the number of trees and record 3-fold cross-validation results for each size.
master_dict = {key:[] for key in ('trees','fit_time','score_time','test_score')}
for trees in tqdm(range(50,1000,50)):
    clf = RandomForestClassifier(n_estimators = trees,n_jobs = -1)
    cv_results = cross_validate(clf,X_train,y_train,cv = 3)
    master_dict['trees'].append(trees)
    # cross_validate returns one value per fold for every key. Store the fold
    # mean for all three so each cell of the summary frame is a scalar
    # (previously the timings were stored as raw per-fold arrays).
    master_dict['fit_time'].append(np.mean(cv_results['fit_time']))
    master_dict['score_time'].append(np.mean(cv_results['score_time']))
    master_dict['test_score'].append(np.mean(cv_results['test_score']))

In [None]:
# Display the sweep results as a table (one column per recorded key).
pd.DataFrame(master_dict)

In [None]:
# Nested cross-validation for the random forest:
#   * outer loop (10 folds) estimates generalisation accuracy;
#   * inner loop (3 folds) grid-searches n_estimators / max_features on the
#     outer-training part only.
cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)
# The inner splitter and the search space do not depend on the outer fold,
# so they are built once instead of on every iteration.
cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)
space = {
    'n_estimators': [100, 500, 1000],
    'max_features': [3, 6, 12, 24],
}
outer_results = list()
param_dict = {key:[] for key in ('best_score','best_params')}
# split() yields lazily, so pass the fold count to tqdm to get a real progress bar.
for train_ix, test_ix in tqdm(cv_outer.split(X), total=cv_outer.get_n_splits()):
    # Fold-local names: do not overwrite the notebook-level
    # X_train / X_test / y_train / y_test hold-out split used by other cells.
    X_fold_train, X_fold_test = X.iloc[train_ix, :], X.iloc[test_ix, :]
    y_fold_train, y_fold_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model and the search over it
    model = RandomForestClassifier(random_state=1,n_jobs = -1)
    search = GridSearchCV(model, space, scoring='accuracy', cv=cv_inner, refit=True)
    # execute search
    result = search.fit(X_fold_train, y_fold_train)
    # refit=True: best_estimator_ is already refitted on the whole outer-training part
    best_model = result.best_estimator_
    # evaluate on the outer hold-out fold
    yhat = best_model.predict(X_fold_test)
    acc = accuracy_score(y_fold_test, yhat)
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))
    param_dict['best_score'].append(result.best_score_)
    param_dict['best_params'].append(result.best_params_)
# summarize the estimated performance of the model once, after all outer folds
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

In [None]:
# Grid over max_features x n_estimators for a class-weighted random forest.
# For every combination: accuracy on the rows it was fitted on, plus the mean
# and standard deviation of 5-fold CV accuracy on the same training split.
master_dict = {key: [] for key in ('max_features','trees','training_acc','testing_acc','testing_std')}
ests = [6,12,18,24,30,36,42]
trees = [100,250,500,1000,1500]
X_train,X_test,y_train,y_test = train_test_split(X, y, stratify=y, random_state=3)
for max_feat in tqdm(ests):
    for n_trees in trees:
        clf = RandomForestClassifier(
            n_estimators=n_trees,
            max_features=max_feat,
            n_jobs=-1,
            verbose=1,
            class_weight='balanced',
        )
        clf.fit(X_train, y_train)
        train_accuracy = clf.score(X_train, y_train)
        master_dict['training_acc'].append(train_accuracy)
        # cross_val_score refits on each fold; `clf` itself is only used as the template
        fold_scores = cross_val_score(clf, X_train, y_train, cv=5)
        master_dict['testing_acc'].append(np.mean(fold_scores))
        master_dict['testing_std'].append(np.std(fold_scores))
        master_dict['max_features'].append(max_feat)
        master_dict['trees'].append(n_trees)

In [None]:
# Display the grid results as a table (one column per recorded key).
pd.DataFrame(master_dict)

In [None]:
# Sweep the number of boosted trees and record 5-fold CV mean / std per setting.
master_dict = {key: [] for key in ('trees','acc','std')}
for n_rounds in tqdm(range(50,1500,50)):
    clf = xgb.XGBClassifier(n_estimators=n_rounds, n_jobs=-1)
    fold_scores = cross_val_score(clf, X_train, y_train, cv=5, n_jobs=-1)
    for column, value in (
        ('acc', np.mean(fold_scores)),
        ('std', np.std(fold_scores)),
        ('trees', n_rounds),
    ):
        master_dict[column].append(value)

In [None]:
# Display the sweep results as a table (one column per recorded key).
pd.DataFrame(master_dict)

In [None]:
# Sweep max_depth from 1 to 19 and record 5-fold CV mean / std per depth.
master_dict = {key: [] for key in ('md','acc','std')}
for depth in tqdm(range(1,20)):
    clf = xgb.XGBClassifier(max_depth=depth, n_jobs=-1)
    fold_scores = cross_val_score(clf, X_train, y_train, cv=5, n_jobs=-1)
    for column, value in (
        ('acc', np.mean(fold_scores)),
        ('std', np.std(fold_scores)),
        ('md', depth),
    ):
        master_dict[column].append(value)
pd.DataFrame(master_dict)

In [22]:
# Instantiate an estimator with no arguments purely to display its repr in the cell output.
xgb.XGBClassifier()

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              random_state=None, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=None, tree_method=None,
              validate_parameters=None, verbosity=None)

In [None]:
# Sweep the learning rate and record 5-fold CV mean / std per value.
# NOTE(review): XGBoost documents the learning rate (eta) range as [0, 1]; the
# upper part of this grid exceeds that - confirm the intended range.
master_dict = {key:[] for key in ('lr','acc','std')}
for i in tqdm(np.linspace(0.5,2,10)):
    # Fix: the swept value is a learning rate (it is recorded under 'lr'), but it
    # was being passed as `max_depth`, so the learning rate never actually varied.
    clf = xgb.XGBClassifier(n_jobs = -1,learning_rate = i)
    score = cross_val_score(clf,X_train,y_train,cv = 5,n_jobs = -1)
    master_dict['acc'].append(np.mean(score))
    master_dict['std'].append(np.std(score))
    master_dict['lr'].append(i)
pd.DataFrame.from_dict(master_dict)

In [55]:
# Fit the chosen XGBoost configuration and print accuracy on the training
# split followed by accuracy on the held-out split.
xgb_params = {
    'n_jobs': -1,
    'max_depth': 2,
    'subsample': 1,
    'n_estimators': 1000,
    'learning_rate': 0.0025,
}
clf = xgb.XGBClassifier(**xgb_params)
clf.fit(X_train, y_train)
train_pred = clf.predict(X_train)
test_pred = clf.predict(X_test)
score1 = accuracy_score(y_train, train_pred)
score2 = accuracy_score(y_test, test_pred)
print(score1, score2)

0.7539712659257252 0.7495798319327731
