# NFL Capstone:Modeling

### Starting Imports

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
import os
import sys

## Import datasets

In [22]:
year = pd.read_csv('../data/teamstarterdraft.csv')
yearAV = pd.read_csv('../data/teamstarterdraftAV.csv')
week = pd.read_csv('../data/weekstarterdraft.csv')
weekAV = pd.read_csv('../data/weekstarterdraftAV.csv')
yearnocoach = year.drop(columns=['coach', 'offcoor', 'defcoor', 'offscheme', 'defalign'])
yearnocoachAV = yearAV.drop(columns=['coach', 'offcoor', 'defcoor', 'offscheme', 'defalign'])

## New Datasets with bucketization of major categorical variables and ordinal encoding of weeks

In [23]:
weekord = week.copy()
weekAVord = weekAV.copy()
weekord['Week'] = weekord['Week'].replace('Wild Card', 18)
weekord['Week'] = weekord['Week'].replace('Division', 19)
weekord['Week'] = weekord['Week'].replace('Conf. Champ.', 20)
weekord['Week'] = weekord['Week'].replace('SuperBowl', 21)

weekAVord['Week'] = weekAVord['Week'].replace('Wild Card', 18)
weekAVord['Week'] = weekAVord['Week'].replace('Division', 19)
weekAVord['Week'] = weekAVord['Week'].replace('Conf. Champ.', 20)
weekAVord['Week'] = weekAVord['Week'].replace('SuperBowl', 21)

weekord['Week']= weekord['Week'].astype(int)
weekAVord['Week']= weekAVord['Week'].astype(int)

In [24]:
yearbucket = year.copy()
yearAVbucket = yearAV.copy()
top = yearbucket['coach'].isin(yearbucket['coach'].value_counts().index[:61])
yearbucket.loc[~top, 'coach'] = 'other'

top = yearbucket['offcoor'].isin(yearbucket['offcoor'].value_counts().index[:61])
yearbucket.loc[~top, 'offcoor'] = 'other'

top = yearbucket['defcoor'].isin(yearbucket['defcoor'].value_counts().index[:61])
yearAVbucket.loc[~top, 'defcoor'] = 'other'

top = yearAVbucket['coach'].isin(yearAVbucket['coach'].value_counts().index[:61])
yearAVbucket.loc[~top, 'coach'] = 'other'

top = yearAVbucket['offcoor'].isin(yearAVbucket['offcoor'].value_counts().index[:61])
yearAVbucket.loc[~top, 'offcoor'] = 'other'

top = yearAVbucket['defcoor'].isin(yearAVbucket['defcoor'].value_counts().index[:61])
yearAVbucket.loc[~top, 'defcoor'] = 'other'

# Model Selection Regression

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
#import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV


dfdict = {'year':year, 'yearAV':yearAV, 'week':week,'weekAV':weekAV, 'yearnocoach': yearnocoach, 'yearnocoachAV':yearnocoachAV}


dfresults= pd.DataFrame()
for key, df in dfdict.items():
        X = df.drop('DraftTeamSelection', axis=1)
        y = df['DraftTeamSelection']
        
        categorical_features = list(X.select_dtypes(include=['category', object]).columns)
        categorical_transformer = OneHotEncoder(sparse=True, handle_unknown='ignore')

        numeric_features = list(X.select_dtypes(include=['int', 'float']).columns)
        numeric_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())
        ])

        preprocessor = ColumnTransformer(
        transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)])
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
        reg = Pipeline(steps=[('preprocessor', preprocessor),
                                ('regressor', LinearRegression())])
        search_space = [{'regressor': [LinearRegression()],
                        'regressor__normalize': [True, False]},
                        {'regressor':[Ridge()],
                        'regressor__alpha': np.logspace(-4, 0, 50),
                        'regressor__normalize': [True, False]},
                        {'regressor': [Lasso()],
                        'regressor__alpha': np.logspace(-4, 0, 50),
                        'regressor__normalize': [True, False]},
                        {'regressor': [ElasticNet()],
                        'regressor__l1_ratio': np.linspace(0,1,30),
                        'regressor__normalize': [True, False]},
                        {'regressor': [RandomForestRegressor()],
                        'regressor__n_estimators': np.logspace(2,3,20),
                        'regressor__max_depth': np.linspace(1,10,10),
                        'regressor__criterion': ['mse', 'mae']}]
        
        reg_CV = GridSearchCV(reg, search_space, cv=5, n_jobs=-1)
        best_model = reg_CV.fit(X_train, y_train)
        y_pred = best_model.predict(X_test)
        
        df_best_regressor = best_model.best_estimator_.get_params()['regressor']
        R2 = r2_score(y_test, y_pred)
        MSE = mean_squared_error(y_test, y_pred)
        RMSE = mean_squared_error(y_test, y_pred, squared=False)
        MAE = mean_absolute_error(y_test, y_pred)
        results = pd.DataFrame({"R2": R2, 'MSE':MSE, 'RMSE': RMSE, 'MAE':MAE, 'best regressor': df_best_regressor}, index=[key])
        dfresults = dfresults.append(results)
print(dfresults)

                     R2       MSE      RMSE       MAE  \
year           0.003490  6.803684  2.608387  2.203176   
yearAV         0.000286  6.996596  2.645108  2.220828   
week           0.027038  6.751551  2.598375  2.174834   
weekAV         0.038667  6.563252  2.561884  2.143305   
yearnocoach    0.010850  6.734904  2.595169  2.183651   
yearnocoachAV  0.003437  6.907644  2.628240  2.184574   

                                                  best regressor  
year           ElasticNet(alpha=1.0, copy_X=True, fit_interce...  
yearAV         Lasso(alpha=0.15264179671752318, copy_X=True, ...  
week           Lasso(alpha=0.0009540954763499944, copy_X=True...  
weekAV         Lasso(alpha=0.0011513953993264468, copy_X=True...  
yearnocoach    Ridge(alpha=1.0, copy_X=True, fit_intercept=Tr...  
yearnocoachAV  Ridge(alpha=1.0, copy_X=True, fit_intercept=Tr...  


# Model Selection Regression w/ PCA

In [34]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import FunctionTransformer
#import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV


dfdict = {'year':year, 'yearAV':yearAV, 'week':week,'weekAV':weekAV, 'yearnocoach': yearnocoach, 'yearnocoachAV':yearnocoachAV}


dfresults= pd.DataFrame()
for key, df in dfdict.items():
        X = df.drop('DraftTeamSelection', axis=1)
        y = df['DraftTeamSelection']
        
        categorical_features = list(X.select_dtypes(include=['category', object]).columns)
        categorical_transformer = OneHotEncoder(sparse=False, handle_unknown='ignore')

        numeric_features = list(X.select_dtypes(include=['int', 'float']).columns)
        numeric_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())
        ])

        preprocessor = ColumnTransformer(
        transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)])
        
        pca = PCA()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
        reg = Pipeline(steps=[('preprocessor', preprocessor),
                                ('pca', pca),
                                ('regressor', LinearRegression())])
        search_space = [{'pca__n_components':[2,15,30,50,100],
                        'regressor': [LinearRegression()],
                        'regressor__normalize': [True, False]},
                        {'pca__n_components':[2,15,30,50,100],
                        'regressor':[Ridge()],
                        'regressor__alpha': np.logspace(-4, 0, 50),
                        'regressor__normalize': [True, False]},
                        {'pca__n_components':[2,15,30,50,100],
                        'regressor': [Lasso()],
                        'regressor__alpha': np.logspace(-4, 0, 50),
                        'regressor__normalize': [True, False]},
                        {'pca__n_components':[2,15,30,50,100],
                        'regressor': [ElasticNet()],
                        'regressor__l1_ratio': np.linspace(0,1,30),
                        'regressor__normalize': [True, False]},
                        {'pca__n_components':[2,15,30,50,100],
                        'regressor': [RandomForestRegressor()],
                        'regressor__n_estimators': np.logspace(2,3,20),
                        'regressor__max_depth': np.linspace(1,10,10),
                        'regressor__criterion': ['mse', 'mae']}]
        
        reg_CV = GridSearchCV(reg, search_space, cv=5, n_jobs=-1)
        best_model = reg_CV.fit(X_train, y_train)
        y_pred = best_model.predict(X_test)
        
        df_best_regressor = best_model.best_estimator_.get_params()['regressor']
        R2 = r2_score(y_test, y_pred)
        MSE = mean_squared_error(y_test, y_pred)
        RMSE = mean_squared_error(y_test, y_pred, squared=False)
        MAE = mean_absolute_error(y_test, y_pred)
        pca_components = reg_CV.best_estimator_.named_steps['pca'].n_components
        results = pd.DataFrame({'PCA components': pca_components, "R2": R2, 'MSE':MSE, 'RMSE': RMSE, 'MAE':MAE, 'best regressor': df_best_regressor}, index=[key])
        dfresults = dfresults.append(results)
print(dfresults)

               PCA components        R2       MSE      RMSE       MAE  \
year                       50  0.006157  6.564303  2.562090  2.148639   
yearAV                     50 -0.000032  7.133128  2.670792  2.246177   
week                       50  0.012887  6.832590  2.613922  2.188608   
weekAV                     50  0.016964  6.642123  2.577232  2.155551   
yearnocoach                50 -0.005583  7.053212  2.655788  2.221354   
yearnocoachAV               2 -0.004378  7.119864  2.668307  2.228804   

                                                  best regressor  
year           Ridge(alpha=1.0, copy_X=True, fit_intercept=Tr...  
yearAV         Lasso(alpha=0.0011513953993264468, copy_X=True...  
week           Ridge(alpha=0.007543120063354615, copy_X=True,...  
weekAV         Lasso(alpha=0.004291934260128779, copy_X=True,...  
yearnocoach    Ridge(alpha=0.6866488450042998, copy_X=True, f...  
yearnocoachAV  Ridge(alpha=0.15264179671752318, copy_X=True, ...  


# Model Selection Regresson run on datasets with bucketized coach values

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
#import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

#this is what I changed from cell above
dfdict = {'yearbucket':yearbucket, 'yearAVbucket':yearAVbucket, 'week':weekord,'weekAV':weekAVord}


dfresults= pd.DataFrame()
for key, df in dfdict.items():
        X = df.drop('DraftTeamSelection', axis=1)
        y = df['DraftTeamSelection']
        categorical_features = list(df.select_dtypes(include=['category', object]).columns)
        categorical_transformer = OneHotEncoder(sparse=True, handle_unknown='ignore')

        numeric_features = list(df.select_dtypes(include=['int', 'float']).columns)
        numeric_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())
        ])

        preprocessor = ColumnTransformer(
        transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)])
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
        reg = Pipeline(steps=[('preprocessor', preprocessor),
                                ('regressor', LinearRegression())])
        search_space = [{'regressor': [LinearRegression()],
                        'regressor__normalize': [True, False]},
                        {'regressor':[Ridge()],
                        'regressor__alpha': np.logspace(-4, 0, 50),
                        'regressor__normalize': [True, False]},
                        {'regressor': [Lasso()],
                        'regressor__alpha': np.logspace(-4, 0, 50),
                        'regressor__normalize': [True, False]},
                        {'regressor': [ElasticNet()],
                        'regressor__l1_ratio': np.linspace(0,1,30),
                        'regressor__normalize': [True, False]},
                        {'regressor': [RandomForestRegressor()],
                        'regressor__n_estimators': np.logspace(2,3,20),
                        'regressor__max_depth': np.linspace(1,10,10),
                        'regressor__criterion': ['mse', 'mae']}]
        
        reg_CV = GridSearchCV(reg, search_space, cv=5, n_jobs=-1)
        best_model = reg_CV.fit(X_train, y_train)
        y_pred = best_model.predict(X_test)
        
        df_best_regressor = best_model.best_estimator_.get_params()['regressor']
        R2 = r2_score(y_test, y_pred)
        MSE = mean_squared_error(y_test, y_pred)
        RMSE = mean_squared_error(y_test, y_pred, squared=False)
        MAE = mean_absolute_error(y_test, y_pred)
        results = pd.DataFrame({"R2": R2, 'MSE':MSE, 'RMSE': RMSE, 'MAE':MAE, 'best regressor':                 df_best_regressor}, index=[key])
        dfresults = dfresults.append(results)
print(dfresults)

                    R2       MSE      RMSE       MAE  \
yearbucket    0.002520  7.142888  2.672618  2.240276   
yearAVbucket  0.013071  6.455497  2.540767  2.151045   
week          0.024096  6.787724  2.605326  2.179035   
weekAV        0.041493  6.602885  2.569608  2.149873   

                                                 best regressor  
yearbucket    Lasso(alpha=0.0016768329368110067, copy_X=True...  
yearAVbucket  Lasso(alpha=0.0020235896477251557, copy_X=True...  
week          Ridge(alpha=0.04941713361323833, copy_X=True, ...  
weekAV        Ridge(alpha=0.040949150623804234, copy_X=True,...  


# Classification Model Selection

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import auc

dfdict = {'year':year, 'yearAV':yearAV, 'week':week,'weekAV':weekAV, 'yearnocoach': yearnocoach, 'yearnocoachAV':yearnocoachAV}


dfresults= pd.DataFrame()
for key, df in dfdict.items():
        X = df.drop('DraftPosition', axis=1)
        y = df['DraftPosition']
        #categorical_features = list(X.select_dtypes(include=['category', object]).columns)
        #categorical_transformer = make_pipeline(OneHotEncoder(sparse=True, handle_unknown='ignore'))

        numeric_features = list(X.select_dtypes(include=['int', 'float']).columns)
        numeric_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())
        ])

        preprocessor = ColumnTransformer(
        transformers=[
                ('num', numeric_transformer, numeric_features)])
                #('cat', categorical_transformer, categorical_features)])
        pca = PCA()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
        clf = Pipeline(steps=[('preprocessor', preprocessor),
                                ('pca', pca),
                                ('classifier', RandomForestClassifier())])
        search_space = [{'pca__n_components':[2,15,30,50,100],
                        'classifier': [RandomForestClassifier()],
                        'classifier__n_estimators': [64,96,128,1000,2000],
                        'classifier__criterion': ['gini', 'entropy']},
                        {'pca__n_components':[2,15,30,50,100],
                        'classifier': [SVC()],
                        'classifier__C': [0.25, 0.50, 0.75, 1]}]
        
        clf_CV = GridSearchCV(clf, search_space, cv=5, n_jobs=-1)
        best_model = clf_CV.fit(X_train, y_train)
        y_pred = best_model.predict(X_test)

        ac = accuracy_score(y_test, y_pred)
        print(ac)
        f1 = f1_score(y_test, y_pred, average='weighted')
        print(f1)
        #cm = confusion_matrix(y_test, y_pred)
        df_best_classifier = best_model.best_estimator_.get_params()['classifier']
        print(df_best_classifier)
        pca_components = clf_CV.best_estimator_.named_steps['pca'].n_components
        #,'best classifier': df_best_classifier
        results = pd.DataFrame({'PCA components': pca_components, "Accuracy": ac, "f1" : f1}, index=[key])
        dfresults = dfresults.append(results)
print(dfresults)

0.20540156361051884
0.08477457890001086
SVC(C=0.25, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
0.20154291224686596
0.06761230924493578
SVC(C=0.25, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
0.269115890083632
0.24101193239029695
SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
0.2688477127967574
0.25192539054479235
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                    

# Best dataset (week) vs dummy classifier

In [25]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import pickle


dfdict = {'week':week}


dfresults= pd.DataFrame()
for key, df in dfdict.items():
        X = df.drop('DraftPosition', axis=1)
        y = df['DraftPosition']
        #categorical_features = list(X.select_dtypes(include=['category', object]).columns)
        #categorical_transformer = make_pipeline(OneHotEncoder(sparse=True, handle_unknown='ignore'))

        numeric_features = list(X.select_dtypes(include=['int', 'float']).columns)
        scalers_to_test = [StandardScaler(), MinMaxScaler()]
        numeric_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', MinMaxScaler())
        ])

        preprocessor = ColumnTransformer(
        transformers=[
                ('num', numeric_transformer, numeric_features)])
                #('cat', categorical_transformer, categorical_features)])
        pca = PCA()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
        clf = Pipeline(steps=[('preprocessor', preprocessor),
                                ('pca', pca),
                                ('classifier', RandomForestClassifier())])
        search_space = [{'pca__n_components':[15,30],
                        'classifier': [RandomForestClassifier()],
                        'classifier__n_estimators': [64,96,128, 256],
                        'classifier__criterion': ['gini', 'entropy']},
                        {'pca__n_components':[15,30],
                        'classifier': [SVC()],
                        'classifier__C': [0.25, 1],
                        'classifier__kernel': ['poly', 'rbf']}]
        
        clf_CV = GridSearchCV(clf, search_space, cv=5, n_jobs=-1, verbose=10)
        best_model = clf_CV.fit(X_train, y_train)
        #saving model (https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/)
        filename = 'best_model.sav'
        pickle.dump(best_model, open(filename, 'wb'))

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 11.2min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 16.7min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed: 21.3min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 28.9min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed: 44.4min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed: 78.2min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed: 118.9min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed: 156.7min finished


In [26]:
from sklearn.dummy import DummyClassifier
y_pred = best_model.predict(X_test)

"""cv_results_df = pd.DataFrame(clf_CV.cv_results_)
cv_results_df = cv_results_df.sort_values(by=['rank_test_score'])
cv_results_df = (
            cv_results_df
            .set_index(cv_results_df["params"].apply(
            lambda x: "_".join(str(val) for val in x.values()))
            )
            .rename_axis('params')
    )
cv_results_df[
    ['params', 'rank_test_score', 'mean_test_score', 'std_test_score']
    ]"""
cv_results_df = pd.DataFrame(best_model.cv_results_)
cv_results_df = cv_results_df.sort_values(by=['rank_test_score'])
cv_results_df[
    ['params', 'rank_test_score', 'mean_test_score', 'std_test_score']
    ]
#Check how well model generalizes
y_train_pred = best_model.predict(X_train)
train_ac = accuracy_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred, average='weighted')

#dummy classifier
dummy_score_dict = {}
strategies = ['stratified', 'most_frequent', 'uniform']
for strategy in strategies:
        dclf = DummyClassifier(strategy=strategy)
        dclf.fit(X_train, y_train)
        dummy_score = dclf.score(X_test, y_test)
        dummy_score_dict[strategy] = dummy_score
dummy_df = pd.DataFrame.from_dict(dummy_score_dict, orient='index', columns=['score'])

ac = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
#cm = confusion_matrix(y_test, y_pred)
df_best_classifier = best_model.best_estimator_.get_params()['classifier']
pca_components = best_model.best_estimator_.named_steps['pca'].n_components
#,'best classifier': df_best_classifier
results = pd.DataFrame({'PCA components': pca_components,"Train Accuracy": train_ac, "Test Accuracy": ac, "Train F1":train_f1, "Test F1" : f1}, index=[key])
print("Cross-Validation best parameters: ", cv_results_df.iloc[0]['param_classifier'])
print("Cross-validation mean test score:", cv_results_df.iloc[0]['mean_test_score'])
dfresults = dfresults.append(results)
print(dummy_df)
print(dfresults)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
cv_results_df.head()

Cross-Validation best parameters:  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=96,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Cross-validation mean test score: 0.37182838850837613
                  score
stratified     0.129715
most_frequent  0.204728
uniform        0.112775
      PCA components  Train Accuracy  Test Accuracy  Train F1   Test F1
week              15        0.999874       0.394009  0.999874  0.391261
[[2398  281  234  408  560  112  267  150  388]
 [ 376  765   72  167  279   56  125   80  214]
 [ 336   74  693  181  245

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier,param_classifier__criterion,param_classifier__n_estimators,param_pca__n_components,param_classifier__C,param_classifier__kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
10,105.098951,1.582974,0.696645,0.09765,"RandomForestClassifier(bootstrap=True, ccp_alp...",entropy,96,15,,,{'classifier': RandomForestClassifier(bootstra...,0.405275,0.323035,0.387663,0.333859,0.409311,0.371828,0.036323,1
12,143.540457,3.542147,1.013205,0.260957,"RandomForestClassifier(bootstrap=True, ccp_alp...",entropy,128,15,,,{'classifier': RandomForestClassifier(bootstra...,0.383734,0.315153,0.382829,0.340794,0.418453,0.368193,0.036182,2
14,265.948975,4.355393,3.369695,1.214388,"RandomForestClassifier(bootstrap=True, ccp_alp...",entropy,256,15,,,{'classifier': RandomForestClassifier(bootstra...,0.39834,0.317465,0.366435,0.33491,0.411202,0.36567,0.035808,3
8,73.277475,1.379415,0.469021,0.074956,"RandomForestClassifier(bootstrap=True, ccp_alp...",entropy,64,15,,,{'classifier': RandomForestClassifier(bootstra...,0.396764,0.323876,0.344683,0.334174,0.417087,0.363317,0.036775,4
2,64.046363,4.411528,1.062383,0.181597,"RandomForestClassifier(bootstrap=True, ccp_alp...",gini,96,15,,,{'classifier': RandomForestClassifier(bootstra...,0.376379,0.311896,0.352354,0.32976,0.373476,0.348773,0.024926,5


In [23]:
dummy_score_dict = {}
strategies = ['stratified', 'most_frequent', 'uniform']
for strategy in strategies:
    dclf = DummyClassifier(strategy=strategy)
    dclf.fit(X_train, y_train)
    dummy_score = dclf.score(X_test, y_test)
    dummy_score_dict[strategy] = dummy_score
dummy_df = pd.DataFrame.from_dict(dummy_score_dict, orient='index', columns=['score'])

y_pred = best_model.predict(X_test)

cv_results_df = pd.DataFrame(clf_CV.cv_results_)
cv_results_df = cv_results_df.sort_values(by=['rank_test_score'])
cv_results_df = (
            cv_results_df
            .set_index(cv_results_df["params"].apply(
            lambda x: "_".join(str(val) for val in x.values()))
            )
            .rename_axis('params')
            )
cv_results_df[
            ['params', 'rank_test_score', 'mean_test_score', 'std_test_score']
            ]

     

ac = accuracy_score(y_test, y_pred)
print(ac)
f1 = f1_score(y_test, y_pred, average='weighted')
print(f1)
#cm = confusion_matrix(y_test, y_pred)
df_best_classifier = best_model.best_estimator_.get_params()['classifier']
pca_components = clf_CV.best_estimator_.named_steps['pca'].n_components
#,'best classifier': df_best_classifier
results = pd.DataFrame({'PCA components': pca_components, "Test Accuracy": ac, "Test f1" : f1}, index=[key])
dfresults = dfresults.append(results)
print(dummy_df)
print(cv_results_df.iloc[0]['param_classifier'])
print(cv_results_df)
print(results)

0.3731438812083973
0.3703634252904805
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=128,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
                                                    mean_fit_time  \
params                                                              
RandomForestClassifier(bootstrap=True, ccp_alph...     121.708640   
RandomForestClassifier(bootstrap=True, ccp_alph...      63.170216   
RandomForestClassifier(bootstrap=True, ccp_alph...     102.042401   
RandomForestClassifier(bootstrap=True, ccp_alph...      50.027442  

In [20]:
dummy_df

Unnamed: 0,score
stratified,0.128563
most_frequent,0.203149
uniform,0.113287


In [25]:
cv_results_df.head()

Unnamed: 0_level_0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier,param_classifier__criterion,param_classifier__n_estimators,param_pca__n_components,param_classifier__C,param_classifier__kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
params,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
"RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n criterion='entropy', max_depth=None, max_features='auto',\n max_leaf_nodes=None, max_samples=None,\n min_impurity_decrease=0.0, min_impurity_split=None,\n min_samples_leaf=1, min_samples_split=2,\n min_weight_fraction_leaf=0.0, n_estimators=128,\n n_jobs=None, oob_score=False, random_state=None,\n verbose=0, warm_start=False)_entropy_128_15",121.70864,3.134313,0.711036,0.08599,"RandomForestClassifier(bootstrap=True, ccp_alp...",entropy,128,15,,,{'classifier': RandomForestClassifier(bootstra...,0.495219,0.284994,0.31095,0.327974,0.417823,0.367392,0.07799,1
"RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n criterion='entropy', max_depth=None, max_features='auto',\n max_leaf_nodes=None, max_samples=None,\n min_impurity_decrease=0.0, min_impurity_split=None,\n min_samples_leaf=1, min_samples_split=2,\n min_weight_fraction_leaf=0.0, n_estimators=128,\n n_jobs=None, oob_score=False, random_state=None,\n verbose=0, warm_start=False)_entropy_64_15",63.170216,1.227362,0.372185,0.02624,"RandomForestClassifier(bootstrap=True, ccp_alp...",entropy,64,15,,,{'classifier': RandomForestClassifier(bootstra...,0.487233,0.293506,0.304224,0.339533,0.411833,0.367266,0.072887,2
"RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n criterion='entropy', max_depth=None, max_features='auto',\n max_leaf_nodes=None, max_samples=None,\n min_impurity_decrease=0.0, min_impurity_split=None,\n min_samples_leaf=1, min_samples_split=2,\n min_weight_fraction_leaf=0.0, n_estimators=128,\n n_jobs=None, oob_score=False, random_state=None,\n verbose=0, warm_start=False)_entropy_96_15",102.042401,4.364663,0.706636,0.194897,"RandomForestClassifier(bootstrap=True, ccp_alp...",entropy,96,15,,,{'classifier': RandomForestClassifier(bootstra...,0.467164,0.285099,0.309058,0.336171,0.423392,0.364177,0.069554,3
"RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n criterion='entropy', max_depth=None, max_features='auto',\n max_leaf_nodes=None, max_samples=None,\n min_impurity_decrease=0.0, min_impurity_split=None,\n min_samples_leaf=1, min_samples_split=2,\n min_weight_fraction_leaf=0.0, n_estimators=128,\n n_jobs=None, oob_score=False, random_state=None,\n verbose=0, warm_start=False)_gini_128_15",50.027442,0.494794,0.753048,0.018397,"RandomForestClassifier(bootstrap=True, ccp_alp...",gini,128,15,,,{'classifier': RandomForestClassifier(bootstra...,0.461805,0.269441,0.285834,0.307272,0.395124,0.343895,0.073189,4
"RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n criterion='entropy', max_depth=None, max_features='auto',\n max_leaf_nodes=None, max_samples=None,\n min_impurity_decrease=0.0, min_impurity_split=None,\n min_samples_leaf=1, min_samples_split=2,\n min_weight_fraction_leaf=0.0, n_estimators=128,\n n_jobs=None, oob_score=False, random_state=None,\n verbose=0, warm_start=False)_gini_64_15",25.947269,0.5544,0.470516,0.061937,"RandomForestClassifier(bootstrap=True, ccp_alp...",gini,64,15,,,{'classifier': RandomForestClassifier(bootstra...,0.457812,0.266919,0.285834,0.308323,0.400063,0.34379,0.073067,5


In [40]:
print(cv_results_df.iloc[0]['param_classifier'])

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=128,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)


In [33]:
results

Unnamed: 0,PCA components,Accuracy,f1
week,15,0.373144,0.370363


In [34]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[2263  267  214  440  567   96  285  164  465]
 [ 352  722   66  197  270   45  107   70  218]
 [ 350  104  622  178  248   72  126   63  184]
 [ 498  195  188 1112  476   91  163  137  248]
 [ 635  235  197  393 1660   97  220  101  323]
 [ 192   88   66  123  167  323   85   43   85]
 [ 412  141  151  178  309   63  651   92  167]
 [ 260  129   77  149  151   33  100  360  130]
 [ 606  186  159  271  395   72  175   91 1032]]
              precision    recall  f1-score   support

          DB       0.41      0.48      0.44      4761
          DE       0.35      0.35      0.35      2047
          DT       0.36      0.32      0.34      1947
          LB       0.37      0.36      0.36      3108
          OL       0.39      0.43      0.41      3861
          QB       0.36      0.28      0.31      1172
          RB       0.34      0.30      0.32      2164
          TE       0.32      0.26      0.29      1389
          WR       0.36      0.35      0.35      2987

    accuracy             