# NFL Capstone:Modeling

### Starting Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
import os
import sys

## Import datasets

In [3]:
year = pd.read_csv('../data/teamstarterdraft.csv')
yearAV = pd.read_csv('../data/teamstarterdraftAV.csv')
week = pd.read_csv('../data/weekstarterdraft.csv')
weekAV = pd.read_csv('../data/weekstarterdraftAV.csv')
yearnocoach = year.drop(columns=['coach', 'offcoor', 'defcoor', 'offscheme', 'defalign'])
yearnocoachAV = yearAV.drop(columns=['coach', 'offcoor', 'defcoor', 'offscheme', 'defalign'])

## New Datasets with bucketization of major categorical variables and ordinal encoding of weeks

In [4]:
weekord = week.copy()
weekAVord = weekAV.copy()
weekord['Week'] = weekord['Week'].replace('Wild Card', 18)
weekord['Week'] = weekord['Week'].replace('Division', 19)
weekord['Week'] = weekord['Week'].replace('Conf. Champ.', 20)
weekord['Week'] = weekord['Week'].replace('SuperBowl', 21)

weekAVord['Week'] = weekAVord['Week'].replace('Wild Card', 18)
weekAVord['Week'] = weekAVord['Week'].replace('Division', 19)
weekAVord['Week'] = weekAVord['Week'].replace('Conf. Champ.', 20)
weekAVord['Week'] = weekAVord['Week'].replace('SuperBowl', 21)

weekord['Week']= weekord['Week'].astype(int)
weekAVord['Week']= weekAVord['Week'].astype(int)

In [5]:
yearbucket = year.copy()
yearAVbucket = yearAV.copy()
top = yearbucket['coach'].isin(yearbucket['coach'].value_counts().index[:61])
yearbucket.loc[~top, 'coach'] = 'other'

top = yearbucket['offcoor'].isin(yearbucket['offcoor'].value_counts().index[:61])
yearbucket.loc[~top, 'offcoor'] = 'other'

top = yearbucket['defcoor'].isin(yearbucket['defcoor'].value_counts().index[:61])
yearAVbucket.loc[~top, 'defcoor'] = 'other'

top = yearAVbucket['coach'].isin(yearAVbucket['coach'].value_counts().index[:61])
yearAVbucket.loc[~top, 'coach'] = 'other'

top = yearAVbucket['offcoor'].isin(yearAVbucket['offcoor'].value_counts().index[:61])
yearAVbucket.loc[~top, 'offcoor'] = 'other'

top = yearAVbucket['defcoor'].isin(yearAVbucket['defcoor'].value_counts().index[:61])
yearAVbucket.loc[~top, 'defcoor'] = 'other'

# Model Selection Regression

In [24]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
#import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV


dfdict = {'year':year, 'yearAV':yearAV, 'week':week,'weekAV':weekAV, 'yearnocoach': yearnocoach,                        'yearnocoachAV':yearnocoachAV}


dfresults= pd.DataFrame()
for key, df in dfdict.items():
        X = df.drop('DraftTeamSelection', axis=1)
        y = df['DraftTeamSelection']
        
        categorical_features = list(X.select_dtypes(include=['category', object]).columns)
        categorical_transformer = OneHotEncoder(sparse=True, handle_unknown='ignore')

        numeric_features = list(X.select_dtypes(include=['int', 'float']).columns)
        numeric_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())
        ])

        preprocessor = ColumnTransformer(
        transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)])
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
        reg = Pipeline(steps=[('preprocessor', preprocessor),
                                ('regressor', LinearRegression())])
        search_space = [{'regressor': [LinearRegression()],
                        'regressor__normalize': [True, False]},
                        {'regressor':[Ridge()],
                        'regressor__alpha': np.logspace(-4, 0, 50),
                        'regressor__normalize': [True, False]},
                        {'regressor': [Lasso()],
                        'regressor__alpha': np.logspace(-4, 0, 50),
                        'regressor__normalize': [True, False]},
                        {'regressor': [ElasticNet()],
                        'regressor__l1_ratio': np.linspace(0,1,30),
                        'regressor__normalize': [True, False]},
                        {'regressor': [RandomForestRegressor()],
                        'regressor__n_estimators': np.logspace(2,3,20),
                        'regressor__max_depth': np.linspace(1,10,10),
                        'regressor__criterion': ['mse', 'mae']}]
        
        reg_CV = GridSearchCV(reg, search_space, cv=5, n_jobs=-1)
        best_model = reg_CV.fit(X_train, y_train)
        y_pred = best_model.predict(X_test)
        
        df_best_regressor = best_model.best_estimator_.get_params()['regressor']
        R2 = r2_score(y_test, y_pred)
        MSE = mean_squared_error(y_test, y_pred)
        RMSE = mean_squared_error(y_test, y_pred, squared=False)
        MAE = mean_absolute_error(y_test, y_pred)
        results = pd.DataFrame({"R2": R2, 'MSE':MSE, 'RMSE': RMSE, 'MAE':MAE, 'best regressor':                 df_best_regressor}, index=[key])
        dfresults = dfresults.append(results)
print(dfresults)

                     R2       MSE      RMSE       MAE  \
year          -0.003689  7.030284  2.651468  2.206876   
yearAV        -0.001526  6.826498  2.612757  2.168102   
week           0.027613  6.739145  2.595986  2.170780   
weekAV         0.034467  6.500216  2.549552  2.140941   
yearnocoach    0.015172  6.595608  2.568192  2.152198   
yearnocoachAV  0.007238  6.919831  2.630557  2.194490   

                                                  best regressor  
year           Lasso(alpha=0.0029470517025518097, copy_X=True...  
yearAV         Lasso(alpha=0.004291934260128779, copy_X=True,...  
week           Lasso(alpha=0.0013894954943731374, copy_X=True...  
weekAV         Lasso(alpha=0.0011513953993264468, copy_X=True...  
yearnocoach    Ridge(alpha=1.0, copy_X=True, fit_intercept=Tr...  
yearnocoachAV  Ridge(alpha=1.0, copy_X=True, fit_intercept=Tr...  


                     R2       MSE      RMSE       MAE  \
year           0.006471  7.053925  2.655923  2.210044   
yearAV         0.001267  6.893871  2.625618  2.224999   
week           0.027048  6.687490  2.586018  2.160993   
weekAV         0.038889  6.670500  2.582731  2.162373   
yearnocoach    0.008149  6.860356  2.619228  2.175273   
yearnocoachAV  0.000010  6.733512  2.594901  2.177637   

                                                  best regressor  
year           Lasso(alpha=0.033932217718953266, copy_X=True,...  
yearAV         Lasso(alpha=0.0029470517025518097, copy_X=True...  
week           Lasso(alpha=0.0011513953993264468, copy_X=True...  
weekAV         Lasso(alpha=0.0011513953993264468, copy_X=True...  
yearnocoach    Ridge(alpha=1.0, copy_X=True, fit_intercept=Tr...  
yearnocoachAV  Lasso(alpha=0.0020235896477251557, copy_X=True...  

# Regression Model Selection with new Datasets

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
#import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

#this is what I changed from cell above
dfdict = {'yearbucket':yearbucket, 'yearAVbucket':yearAVbucket, 'week':weekord,'weekAV':weekAVord}


dfresults= pd.DataFrame()
for key, df in dfdict.items():
        X = df.drop('DraftTeamSelection', axis=1)
        y = df['DraftTeamSelection']
        categorical_features = list(df.select_dtypes(include=['category', object]).columns)
        categorical_transformer = OneHotEncoder(sparse=True, handle_unknown='ignore')

        numeric_features = list(df.select_dtypes(include=['int', 'float']).columns)
        numeric_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())
        ])

        preprocessor = ColumnTransformer(
        transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)])
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
        reg = Pipeline(steps=[('preprocessor', preprocessor),
                                ('regressor', LinearRegression())])
        search_space = [{'regressor': [LinearRegression()],
                        'regressor__normalize': [True, False]},
                        {'regressor':[Ridge()],
                        'regressor__alpha': np.logspace(-4, 0, 50),
                        'regressor__normalize': [True, False]},
                        {'regressor': [Lasso()],
                        'regressor__alpha': np.logspace(-4, 0, 50),
                        'regressor__normalize': [True, False]},
                        {'regressor': [ElasticNet()],
                        'regressor__l1_ratio': np.linspace(0,1,30),
                        'regressor__normalize': [True, False]},
                        {'regressor': [RandomForestRegressor()],
                        'regressor__n_estimators': np.logspace(2,3,20),
                        'regressor__max_depth': np.linspace(1,10,10),
                        'regressor__criterion': ['mse', 'mae']}]
        
        reg_CV = GridSearchCV(reg, search_space, cv=5, n_jobs=-1)
        best_model = reg_CV.fit(X_train, y_train)
        y_pred = best_model.predict(X_test)
        
        df_best_regressor = best_model.best_estimator_.get_params()['regressor']
        R2 = r2_score(y_test, y_pred)
        MSE = mean_squared_error(y_test, y_pred)
        RMSE = mean_squared_error(y_test, y_pred, squared=False)
        MAE = mean_absolute_error(y_test, y_pred)
        results = pd.DataFrame({"R2": R2, 'MSE':MSE, 'RMSE': RMSE, 'MAE':MAE, 'best regressor':                 df_best_regressor}, index=[key])
        dfresults = dfresults.append(results)
print(dfresults)

                    R2       MSE      RMSE       MAE  \
yearbucket    0.002520  7.142888  2.672618  2.240276   
yearAVbucket  0.013071  6.455497  2.540767  2.151045   
week          0.024096  6.787724  2.605326  2.179035   
weekAV        0.041493  6.602885  2.569608  2.149873   

                                                 best regressor  
yearbucket    Lasso(alpha=0.0016768329368110067, copy_X=True...  
yearAVbucket  Lasso(alpha=0.0020235896477251557, copy_X=True...  
week          Ridge(alpha=0.04941713361323833, copy_X=True, ...  
weekAV        Ridge(alpha=0.040949150623804234, copy_X=True,...  


# CLassification Model Selection

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report,confusion_matrix,roc_curve,roc_auc_score
from sklearn.metrics import accuracy_score,log_loss
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc

dfdict = {'year':year, 'yearAV':yearAV, 'week':week,'weekAV':weekAV, 'yearnocoach': yearnocoach,                        'yearnocoachAV':yearnocoachAV}


dfresults= pd.DataFrame()
for key, df in dfdict.items():
        X = df.drop('DraftPosition', axis=1)
        y = df['DraftPosition']
        categorical_features = list(X.select_dtypes(include=['category', object]).columns)
        categorical_transformer = make_pipeline(OneHotEncoder(sparse=True, handle_unknown='ignore'))

        numeric_features = list(X.select_dtypes(include=['int', 'float']).columns)
        numeric_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())
        ])

        preprocessor = ColumnTransformer(
        transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)])
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
        clf = Pipeline(steps=[('preprocessor', preprocessor),
                                ('classifier', RandomForestClassifier())])
        search_space = [{'classifier': [RandomForestClassifier()],
                        'classifier__n_estimators': [100,500,1000],
                        'classifier__criterion': ['gini', 'entropy']}]
        
        clf_CV = GridSearchCV(clf, search_space, cv=5, n_jobs=-1)
        best_model = clf_CV.fit(X_train, y_train)
        y_pred = best_model.predict(X_test)

        ac = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        cm = confusion_matrix(y_test, y_pred)
        df_best_classifier = best_model.best_estimator_.get_params()['classifier']
        results = pd.DataFrame({"Accuracy": ac, 'f1': f1, 'best classifier': df_best_classifier}, index=[key])
        dfresults = dfresults.append(results)
print(dfresults)

ValueError: Shape of passed values is (1000, 3), indices imply (1, 3)