# NFL Capstone:Modeling

### Starting Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
import os
import sys

## Import datasets

In [3]:
year = pd.read_csv('../data/teamstarterdraft.csv')
yearAV = pd.read_csv('../data/teamstarterdraftAV.csv')
week = pd.read_csv('../data/weekstarterdraft.csv')
weekAV = pd.read_csv('../data/weekstarterdraftAV.csv')
yearnocoach = year.drop(columns=['coach', 'offcoor', 'defcoor', 'offscheme', 'defalign'])
yearnocoachAV = yearAV.drop(columns=['coach', 'offcoor', 'defcoor', 'offscheme', 'defalign'])

## New Datasets with bucketization of major categorical variables and ordinal encoding of weeks

In [4]:
weekord = week.copy()
weekAVord = weekAV.copy()
weekord['Week'] = weekord['Week'].replace('Wild Card', 18)
weekord['Week'] = weekord['Week'].replace('Division', 19)
weekord['Week'] = weekord['Week'].replace('Conf. Champ.', 20)
weekord['Week'] = weekord['Week'].replace('SuperBowl', 21)

weekAVord['Week'] = weekAVord['Week'].replace('Wild Card', 18)
weekAVord['Week'] = weekAVord['Week'].replace('Division', 19)
weekAVord['Week'] = weekAVord['Week'].replace('Conf. Champ.', 20)
weekAVord['Week'] = weekAVord['Week'].replace('SuperBowl', 21)

weekord['Week']= weekord['Week'].astype(int)
weekAVord['Week']= weekAVord['Week'].astype(int)

In [5]:
yearbucket = year.copy()
yearAVbucket = yearAV.copy()
top = yearbucket['coach'].isin(yearbucket['coach'].value_counts().index[:61])
yearbucket.loc[~top, 'coach'] = 'other'

top = yearbucket['offcoor'].isin(yearbucket['offcoor'].value_counts().index[:61])
yearbucket.loc[~top, 'offcoor'] = 'other'

top = yearbucket['defcoor'].isin(yearbucket['defcoor'].value_counts().index[:61])
yearAVbucket.loc[~top, 'defcoor'] = 'other'

top = yearAVbucket['coach'].isin(yearAVbucket['coach'].value_counts().index[:61])
yearAVbucket.loc[~top, 'coach'] = 'other'

top = yearAVbucket['offcoor'].isin(yearAVbucket['offcoor'].value_counts().index[:61])
yearAVbucket.loc[~top, 'offcoor'] = 'other'

top = yearAVbucket['defcoor'].isin(yearAVbucket['defcoor'].value_counts().index[:61])
yearAVbucket.loc[~top, 'defcoor'] = 'other'

# Model Selection Regression

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
#import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV


dfdict = {'year':year, 'yearAV':yearAV, 'week':week,'weekAV':weekAV, 'yearnocoach': yearnocoach, 'yearnocoachAV':yearnocoachAV}


dfresults= pd.DataFrame()
for key, df in dfdict.items():
        X = df.drop('DraftTeamSelection', axis=1)
        y = df['DraftTeamSelection']
        
        categorical_features = list(X.select_dtypes(include=['category', object]).columns)
        categorical_transformer = OneHotEncoder(sparse=True, handle_unknown='ignore')

        numeric_features = list(X.select_dtypes(include=['int', 'float']).columns)
        numeric_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())
        ])

        preprocessor = ColumnTransformer(
        transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)])
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
        reg = Pipeline(steps=[('preprocessor', preprocessor),
                                ('regressor', LinearRegression())])
        search_space = [{'regressor': [LinearRegression()],
                        'regressor__normalize': [True, False]},
                        {'regressor':[Ridge()],
                        'regressor__alpha': np.logspace(-4, 0, 50),
                        'regressor__normalize': [True, False]},
                        {'regressor': [Lasso()],
                        'regressor__alpha': np.logspace(-4, 0, 50),
                        'regressor__normalize': [True, False]},
                        {'regressor': [ElasticNet()],
                        'regressor__l1_ratio': np.linspace(0,1,30),
                        'regressor__normalize': [True, False]},
                        {'regressor': [RandomForestRegressor()],
                        'regressor__n_estimators': np.logspace(2,3,20),
                        'regressor__max_depth': np.linspace(1,10,10),
                        'regressor__criterion': ['mse', 'mae']}]
        
        reg_CV = GridSearchCV(reg, search_space, cv=5, n_jobs=-1)
        best_model = reg_CV.fit(X_train, y_train)
        y_pred = best_model.predict(X_test)
        
        df_best_regressor = best_model.best_estimator_.get_params()['regressor']
        R2 = r2_score(y_test, y_pred)
        MSE = mean_squared_error(y_test, y_pred)
        RMSE = mean_squared_error(y_test, y_pred, squared=False)
        MAE = mean_absolute_error(y_test, y_pred)
        results = pd.DataFrame({"R2": R2, 'MSE':MSE, 'RMSE': RMSE, 'MAE':MAE, 'best regressor': df_best_regressor}, index=[key])
        dfresults = dfresults.append(results)
print(dfresults)

                     R2       MSE      RMSE       MAE  \
year           0.003490  6.803684  2.608387  2.203176   
yearAV         0.000286  6.996596  2.645108  2.220828   
week           0.027038  6.751551  2.598375  2.174834   
weekAV         0.038667  6.563252  2.561884  2.143305   
yearnocoach    0.010850  6.734904  2.595169  2.183651   
yearnocoachAV  0.003437  6.907644  2.628240  2.184574   

                                                  best regressor  
year           ElasticNet(alpha=1.0, copy_X=True, fit_interce...  
yearAV         Lasso(alpha=0.15264179671752318, copy_X=True, ...  
week           Lasso(alpha=0.0009540954763499944, copy_X=True...  
weekAV         Lasso(alpha=0.0011513953993264468, copy_X=True...  
yearnocoach    Ridge(alpha=1.0, copy_X=True, fit_intercept=Tr...  
yearnocoachAV  Ridge(alpha=1.0, copy_X=True, fit_intercept=Tr...  


In [13]:
dfdict = {'year':year, 'yearAV':yearAV, 'week':week,'weekAV':weekAV}
dfresults= pd.DataFrame()
for key, df in dfdict.items():
        X = df.drop('DraftTeamSelection', axis=1)
        y = df['DraftTeamSelection']

        X_dummies = pd.get_dummies(X)
        print(key, X.shape, X_dummies.shape)


year (4261, 73) (4261, 497)
yearAV (3140, 82) (3140, 448)
week (71017, 34) (71017, 93)
weekAV (52331, 43) (52331, 102)


# Model Selection Regression w/ PCA

In [34]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import FunctionTransformer
#import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV


dfdict = {'year':year, 'yearAV':yearAV, 'week':week,'weekAV':weekAV, 'yearnocoach': yearnocoach, 'yearnocoachAV':yearnocoachAV}


dfresults= pd.DataFrame()
for key, df in dfdict.items():
        X = df.drop('DraftTeamSelection', axis=1)
        y = df['DraftTeamSelection']
        
        categorical_features = list(X.select_dtypes(include=['category', object]).columns)
        categorical_transformer = OneHotEncoder(sparse=False, handle_unknown='ignore')

        numeric_features = list(X.select_dtypes(include=['int', 'float']).columns)
        numeric_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())
        ])

        preprocessor = ColumnTransformer(
        transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)])
        
        pca = PCA()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
        reg = Pipeline(steps=[('preprocessor', preprocessor),
                                ('pca', pca),
                                ('regressor', LinearRegression())])
        search_space = [{'pca__n_components':[2,15,30,50,100],
                        'regressor': [LinearRegression()],
                        'regressor__normalize': [True, False]},
                        {'pca__n_components':[2,15,30,50,100],
                        'regressor':[Ridge()],
                        'regressor__alpha': np.logspace(-4, 0, 50),
                        'regressor__normalize': [True, False]},
                        {'pca__n_components':[2,15,30,50,100],
                        'regressor': [Lasso()],
                        'regressor__alpha': np.logspace(-4, 0, 50),
                        'regressor__normalize': [True, False]},
                        {'pca__n_components':[2,15,30,50,100],
                        'regressor': [ElasticNet()],
                        'regressor__l1_ratio': np.linspace(0,1,30),
                        'regressor__normalize': [True, False]},
                        {'pca__n_components':[2,15,30,50,100],
                        'regressor': [RandomForestRegressor()],
                        'regressor__n_estimators': np.logspace(2,3,20),
                        'regressor__max_depth': np.linspace(1,10,10),
                        'regressor__criterion': ['mse', 'mae']}]
        
        reg_CV = GridSearchCV(reg, search_space, cv=5, n_jobs=-1)
        best_model = reg_CV.fit(X_train, y_train)
        y_pred = best_model.predict(X_test)
        
        df_best_regressor = best_model.best_estimator_.get_params()['regressor']
        R2 = r2_score(y_test, y_pred)
        MSE = mean_squared_error(y_test, y_pred)
        RMSE = mean_squared_error(y_test, y_pred, squared=False)
        MAE = mean_absolute_error(y_test, y_pred)
        pca_components = reg_CV.best_estimator_.named_steps['pca'].n_components
        results = pd.DataFrame({'PCA components': pca_components, "R2": R2, 'MSE':MSE, 'RMSE': RMSE, 'MAE':MAE, 'best regressor': df_best_regressor}, index=[key])
        dfresults = dfresults.append(results)
print(dfresults)

               PCA components        R2       MSE      RMSE       MAE  \
year                       50  0.006157  6.564303  2.562090  2.148639   
yearAV                     50 -0.000032  7.133128  2.670792  2.246177   
week                       50  0.012887  6.832590  2.613922  2.188608   
weekAV                     50  0.016964  6.642123  2.577232  2.155551   
yearnocoach                50 -0.005583  7.053212  2.655788  2.221354   
yearnocoachAV               2 -0.004378  7.119864  2.668307  2.228804   

                                                  best regressor  
year           Ridge(alpha=1.0, copy_X=True, fit_intercept=Tr...  
yearAV         Lasso(alpha=0.0011513953993264468, copy_X=True...  
week           Ridge(alpha=0.007543120063354615, copy_X=True,...  
weekAV         Lasso(alpha=0.004291934260128779, copy_X=True,...  
yearnocoach    Ridge(alpha=0.6866488450042998, copy_X=True, f...  
yearnocoachAV  Ridge(alpha=0.15264179671752318, copy_X=True, ...  


# Model Selection Regresson run on datasets with bucketized coach values

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
#import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

#this is what I changed from cell above
dfdict = {'yearbucket':yearbucket, 'yearAVbucket':yearAVbucket, 'week':weekord,'weekAV':weekAVord}


dfresults= pd.DataFrame()
for key, df in dfdict.items():
        X = df.drop('DraftTeamSelection', axis=1)
        y = df['DraftTeamSelection']
        categorical_features = list(df.select_dtypes(include=['category', object]).columns)
        categorical_transformer = OneHotEncoder(sparse=True, handle_unknown='ignore')

        numeric_features = list(df.select_dtypes(include=['int', 'float']).columns)
        numeric_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())
        ])

        preprocessor = ColumnTransformer(
        transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)])
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
        reg = Pipeline(steps=[('preprocessor', preprocessor),
                                ('regressor', LinearRegression())])
        search_space = [{'regressor': [LinearRegression()],
                        'regressor__normalize': [True, False]},
                        {'regressor':[Ridge()],
                        'regressor__alpha': np.logspace(-4, 0, 50),
                        'regressor__normalize': [True, False]},
                        {'regressor': [Lasso()],
                        'regressor__alpha': np.logspace(-4, 0, 50),
                        'regressor__normalize': [True, False]},
                        {'regressor': [ElasticNet()],
                        'regressor__l1_ratio': np.linspace(0,1,30),
                        'regressor__normalize': [True, False]},
                        {'regressor': [RandomForestRegressor()],
                        'regressor__n_estimators': np.logspace(2,3,20),
                        'regressor__max_depth': np.linspace(1,10,10),
                        'regressor__criterion': ['mse', 'mae']}]
        
        reg_CV = GridSearchCV(reg, search_space, cv=5, n_jobs=-1)
        best_model = reg_CV.fit(X_train, y_train)
        y_pred = best_model.predict(X_test)
        
        df_best_regressor = best_model.best_estimator_.get_params()['regressor']
        R2 = r2_score(y_test, y_pred)
        MSE = mean_squared_error(y_test, y_pred)
        RMSE = mean_squared_error(y_test, y_pred, squared=False)
        MAE = mean_absolute_error(y_test, y_pred)
        results = pd.DataFrame({"R2": R2, 'MSE':MSE, 'RMSE': RMSE, 'MAE':MAE, 'best regressor':                 df_best_regressor}, index=[key])
        dfresults = dfresults.append(results)
print(dfresults)

                    R2       MSE      RMSE       MAE  \
yearbucket    0.002520  7.142888  2.672618  2.240276   
yearAVbucket  0.013071  6.455497  2.540767  2.151045   
week          0.024096  6.787724  2.605326  2.179035   
weekAV        0.041493  6.602885  2.569608  2.149873   

                                                 best regressor  
yearbucket    Lasso(alpha=0.0016768329368110067, copy_X=True...  
yearAVbucket  Lasso(alpha=0.0020235896477251557, copy_X=True...  
week          Ridge(alpha=0.04941713361323833, copy_X=True, ...  
weekAV        Ridge(alpha=0.040949150623804234, copy_X=True,...  


# Classification Model Selection

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import auc

dfdict = {'year':year, 'yearAV':yearAV, 'week':week,'weekAV':weekAV, 'yearnocoach': yearnocoach, 'yearnocoachAV':yearnocoachAV}


dfresults= pd.DataFrame()
for key, df in dfdict.items():
        X = df.drop('DraftPosition', axis=1)
        y = df['DraftPosition']
        #categorical_features = list(X.select_dtypes(include=['category', object]).columns)
        #categorical_transformer = make_pipeline(OneHotEncoder(sparse=True, handle_unknown='ignore'))

        numeric_features = list(X.select_dtypes(include=['int', 'float']).columns)
        numeric_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())
        ])

        preprocessor = ColumnTransformer(
        transformers=[
                ('num', numeric_transformer, numeric_features)])
                #('cat', categorical_transformer, categorical_features)])
        pca = PCA()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
        clf = Pipeline(steps=[('preprocessor', preprocessor),
                                ('pca', pca),
                                ('classifier', RandomForestClassifier())])
        search_space = [{'pca__n_components':[2,15,30,50,100],
                        'classifier': [RandomForestClassifier()],
                        'classifier__n_estimators': [64,96,128,1000,2000],
                        'classifier__criterion': ['gini', 'entropy']},
                        {'pca__n_components':[2,15,30,50,100],
                        'classifier': [SVC()],
                        'classifier__C': [0.25, 0.50, 0.75, 1]}]
        
        clf_CV = GridSearchCV(clf, search_space, cv=5, n_jobs=-1)
        best_model = clf_CV.fit(X_train, y_train)
        y_pred = best_model.predict(X_test)
 
        ac = accuracy_score(y_test, y_pred)
        print(ac)
        f1 = f1_score(y_test, y_pred, average='weighted')
        print(f1)
        #cm = confusion_matrix(y_test, y_pred)
        df_best_classifier = best_model.best_estimator_.get_params()['classifier']
        print(df_best_classifier)
        pca_components = clf_CV.best_estimator_.named_steps['pca'].n_components
        #,'best classifier': df_best_classifier
        results = pd.DataFrame({'PCA components': pca_components, "Accuracy": ac, "f1" : f1}, index=[key])
        dfresults = dfresults.append(results)
print(dfresults)

0.20540156361051884
0.08477457890001086
SVC(C=0.25, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
0.20154291224686596
0.06761230924493578
SVC(C=0.25, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
0.269115890083632
0.24101193239029695
SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
0.2688477127967574
0.25192539054479235
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                    

# Best dataset (week) vs dummy classifier

In [29]:
#No depth limit no pca
dfdict = {'week':week}


dfresults= pd.DataFrame()
for key, df in dfdict.items():
        X = df.drop('DraftPosition', axis=1)
        y = df['DraftPosition']
        #categorical_features = list(X.select_dtypes(include=['category', object]).columns)
        #categorical_transformer = make_pipeline(OneHotEncoder(sparse=True, handle_unknown='ignore'))

        numeric_features = list(X.select_dtypes(include=['int', 'float']).columns)
        scalers_to_test = [StandardScaler(), MinMaxScaler()]
        numeric_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', MinMaxScaler())
        ])

        preprocessor = ColumnTransformer(
        transformers=[
                ('num', numeric_transformer, numeric_features)])
                #('cat', categorical_transformer, categorical_features)])
        pca = PCA()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
        clf = Pipeline(steps=[('preprocessor', preprocessor),
                                ('classifier', RandomForestClassifier())])
        search_space = [{'classifier': [RandomForestClassifier()],
                        'classifier__n_estimators': [64,96,128, 256],                         
                        'classifier__criterion': ['gini', 'entropy']},
                        {'classifier': [SVC()],
                        'classifier__C': [0.25, 1],
                        'classifier__kernel': ['poly', 'rbf']}]
        
        clf_CV = GridSearchCV(clf, search_space, cv=5, n_jobs=-1, verbose=10)
        best_model = clf_CV.fit(X_train, y_train)
        #saving model (https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/)
        filename = 'best_model_no_pruning.sav'
        pickle.dump(best_model, open(filename, 'wb'))##########

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   45.0s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 21.5min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed: 60.9min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 76.2min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 76.2min finished


In [32]:
#No depth limit no pca
from sklearn.dummy import DummyClassifier
filename = 'best_model_no_pruning.sav'
loaded_model = pickle.load(open(filename, 'rb'))

y_pred = loaded_model.predict(X_test)

cv_results_df = pd.DataFrame(loaded_model.cv_results_)
cv_results_df = cv_results_df.sort_values(by=['rank_test_score'])
cv_results_df[
    ['params', 'rank_test_score', 'mean_test_score', 'std_test_score']
    ]
#Check how well model generalizes
y_train_pred = loaded_model.predict(X_train)
train_ac = accuracy_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred, average='weighted')

#dummy classifier
dummy_score_dict = {}
strategies = ['stratified', 'most_frequent', 'uniform']
for strategy in strategies:
        dclf = DummyClassifier(strategy=strategy)
        dclf.fit(X_train, y_train)
        dummy_score = dclf.score(X_test, y_test)
        dummy_score_dict[strategy] = dummy_score
dummy_df = pd.DataFrame.from_dict(dummy_score_dict, orient='index', columns=['score'])

ac = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
#cm = confusion_matrix(y_test, y_pred)
df_best_classifier = loaded_model.best_estimator_.get_params()['classifier']
#,'best classifier': df_best_classifier
results = pd.DataFrame({"Train Accuracy": train_ac, "Test Accuracy": ac, "Train F1":train_f1, "Test F1" : f1}, index=[key])
print("Cross-Validation best parameters: ", cv_results_df.iloc[0]['param_classifier'])
print("Cross-validation mean test score:", cv_results_df.iloc[0]['mean_test_score'])
dfresults = dfresults.append(results)
print(dummy_df)
print(dfresults)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
cv_results_df.head()

Cross-Validation best parameters:  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=64,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Cross-validation mean test score: 0.4587754472064172
                  score
stratified     0.129459
most_frequent  0.202936
uniform        0.107996
      Train Accuracy  Test Accuracy  Train F1   Test F1
week             1.0       0.434332       1.0  0.432814
[[2477  263  227  373  483  102  268  180  383]
 [ 319  896   72  132  264   68  105   96  178]
 [ 274   75  832  186  198   64  127   81  137]
 [ 402  154

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier,param_classifier__criterion,param_classifier__n_estimators,param_classifier__C,param_classifier__kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
4,22.663899,0.448551,0.586909,0.044231,"RandomForestClassifier(bootstrap=True, ccp_alp...",entropy,64,,,{'classifier': RandomForestClassifier(bootstra...,0.464222,0.449138,0.452291,0.444725,0.483501,0.458775,0.013953,1
0,20.374341,0.90794,0.912101,0.177165,"RandomForestClassifier(bootstrap=True, ccp_alp...",gini,64,,,{'classifier': RandomForestClassifier(bootstra...,0.456867,0.478352,0.44567,0.449559,0.444409,0.454971,0.012471,2
1,28.361846,0.308601,1.050777,0.149217,"RandomForestClassifier(bootstrap=True, ccp_alp...",gini,96,,,{'classifier': RandomForestClassifier(bootstra...,0.450562,0.456494,0.419714,0.444515,0.457125,0.445682,0.013765,3
5,45.348789,3.549338,1.038997,0.265559,"RandomForestClassifier(bootstrap=True, ccp_alp...",entropy,96,,,{'classifier': RandomForestClassifier(bootstra...,0.44825,0.449664,0.451135,0.426965,0.447352,0.444673,0.008947,4
2,38.513515,0.670985,1.566372,0.286328,"RandomForestClassifier(bootstrap=True, ccp_alp...",gini,128,,,{'classifier': RandomForestClassifier(bootstra...,0.437848,0.45702,0.439155,0.447037,0.441467,0.444505,0.007003,5


In [33]:
#with depth limit and no pca
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import pickle


dfdict = {'week':week}


dfresults= pd.DataFrame()
for key, df in dfdict.items():
        X = df.drop('DraftPosition', axis=1)
        y = df['DraftPosition']
        #categorical_features = list(X.select_dtypes(include=['category', object]).columns)
        #categorical_transformer = make_pipeline(OneHotEncoder(sparse=True, handle_unknown='ignore'))

        numeric_features = list(X.select_dtypes(include=['int', 'float']).columns)
        scalers_to_test = [StandardScaler(), MinMaxScaler()]
        numeric_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', MinMaxScaler())
        ])

        preprocessor = ColumnTransformer(
        transformers=[
                ('num', numeric_transformer, numeric_features)])
                #('cat', categorical_transformer, categorical_features)])
        pca = PCA()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
        clf = Pipeline(steps=[('preprocessor', preprocessor),
                                ('pca', pca),
                                ('classifier', RandomForestClassifier())])
        search_space = [{'classifier': [RandomForestClassifier()],
                        'classifier__n_estimators': [64,96,128, 256],
                        'classifier__max_depth': np.linspace(1,50,5),
                        'classifier__criterion': ['gini', 'entropy']},
                        {'classifier': [SVC()],
                        'classifier__C': [0.25, 1],
                        'classifier__kernel': ['poly', 'rbf']}]
        
        clf_CV = GridSearchCV(clf, search_space, cv=5, n_jobs=-1, verbose=10)
        best_model = clf_CV.fit(X_train, y_train)
        #saving model (https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/)
        filename = 'best_model_pruned.sav'
        pickle.dump(best_model, open(filename, 'wb'))

Fitting 5 folds for each of 44 candidates, totalling 220 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   13.2s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   24.2s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   46.4s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 15.3min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed: 21.6min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed: 25.5min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed: 31.1min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed: 32.7min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed: 44.1min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 56

In [34]:
#with depth limit and no pca
filename = 'best_model_pruned.sav'
loaded_model = pickle.load(open(filename, 'rb'))

y_pred = loaded_model.predict(X_test)

cv_results_df = pd.DataFrame(loaded_model.cv_results_)
cv_results_df = cv_results_df.sort_values(by=['rank_test_score'])
cv_results_df[
    ['params', 'rank_test_score', 'mean_test_score', 'std_test_score']
    ]
#Check how well model generalizes
y_train_pred = loaded_model.predict(X_train)
train_ac = accuracy_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred, average='weighted')

#dummy classifier
dummy_score_dict = {}
strategies = ['stratified', 'most_frequent', 'uniform']
for strategy in strategies:
        dclf = DummyClassifier(strategy=strategy)
        dclf.fit(X_train, y_train)
        dummy_score = dclf.score(X_test, y_test)
        dummy_score_dict[strategy] = dummy_score
dummy_df = pd.DataFrame.from_dict(dummy_score_dict, orient='index', columns=['score'])

ac = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
#cm = confusion_matrix(y_test, y_pred)
df_best_classifier = loaded_model.best_estimator_.get_params()['classifier']
#,'best classifier': df_best_classifier
results = pd.DataFrame({"Train Accuracy": train_ac, "Test Accuracy": ac, "Train F1":train_f1, "Test F1" : f1}, index=[key])
print("Cross-Validation best parameters: ", cv_results_df.iloc[0]['param_classifier'])
print("Cross-validation mean test score:", cv_results_df.iloc[0]['mean_test_score'])
dfresults = dfresults.append(results)
print(dummy_df)
print(dfresults)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
cv_results_df.head()

Cross-Validation best parameters:  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=13.25,
                       max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=256, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)
Cross-validation mean test score: 0.32714773850188134
                  score
stratified     0.130227
most_frequent  0.204258
uniform        0.108892
      Train Accuracy  Test Accuracy  Train F1   Test F1
week        0.642252       0.347542  0.650995  0.300821
[[4077   52   30  121  317    8   31   15  136]
 [1044  383   21   91  365   10   41   33  155]
 [1105   27  199   74  301   27   42   19  103]
 [1566  

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier,param_classifier__criterion,param_classifier__max_depth,param_classifier__n_estimators,param_classifier__C,param_classifier__kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
27,296.309531,5.391066,0.877804,0.082429,"RandomForestClassifier(bootstrap=True, ccp_alp...",entropy,13.25,256,,,{'classifier': RandomForestClassifier(bootstra...,0.310602,0.324296,0.323665,0.33554,0.341635,0.327148,0.010718,1
26,148.728888,3.195423,0.544117,0.052281,"RandomForestClassifier(bootstrap=True, ccp_alp...",entropy,13.25,128,,,{'classifier': RandomForestClassifier(bootstra...,0.309026,0.327133,0.318621,0.33491,0.342055,0.326349,0.011665,2
25,110.084859,0.949237,0.340571,0.052484,"RandomForestClassifier(bootstrap=True, ccp_alp...",entropy,13.25,96,,,{'classifier': RandomForestClassifier(bootstra...,0.307975,0.321459,0.324926,0.332177,0.32934,0.323176,0.008439,3
24,75.484253,1.373811,0.230121,0.02746,"RandomForestClassifier(bootstrap=True, ccp_alp...",entropy,13.25,64,,,{'classifier': RandomForestClassifier(bootstra...,0.305453,0.311896,0.318516,0.329025,0.348256,0.322629,0.014998,4
7,119.171806,0.855958,1.006449,0.088094,"RandomForestClassifier(bootstrap=True, ccp_alp...",gini,13.25,256,,,{'classifier': RandomForestClassifier(bootstra...,0.296312,0.313157,0.299391,0.316625,0.328289,0.310755,0.011708,5
