In [60]:
import zipfile, os
import pandas as pd
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Imputer
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from functions.functions import previous_yrs, opponent_stats

# Read in files

In [61]:
# Load both CSV tables from the bundled zip archive.
# The archive and each member handle are opened in `with` blocks so they are
# closed as soon as the frames are in memory (previously they were left open).
# `zf` stays bound after the block but refers to a closed archive.
_file = os.path.join('Data', 'finalgamedata.zip')
with zipfile.ZipFile(_file) as zf:
    with zf.open('gamestats.csv') as gamestats_handle:
        gamestats = pd.read_csv(gamestats_handle)
    with zf.open('gameslist.csv') as gamelist_handle:
        gamelist = pd.read_csv(gamelist_handle)

# Collapse double spaces in opponent names so they can be compared with the
# values in the 'Team' column.
gamelist['opponentName'] = gamelist['opponentName'].str.replace("  ", " ")

In [62]:
# Convert the TOP column from a duration to a plain float number of seconds.
# `.dt.total_seconds()` returns float64 on every pandas version, whereas
# `.astype('timedelta64[s]')` keeps a timedelta dtype on pandas >= 2.0, which
# would leave a non-numeric column in the feature matrix.
# NOTE(review): re-running this cell on the already-converted column would
# re-parse the numbers as durations; run it once per fresh load.
gamestats['TOP'] = pd.to_timedelta(gamestats['TOP']).dt.total_seconds()

In [63]:
# Distinct values of the 'Team' column, in order of first appearance
teamlist = [team for team in gamelist['Team'].unique()]

In [64]:
# Build the opponent-side view of the game list: drop the columns that only
# make sense from the team's own side, then let 'Team' play the role of
# 'opponentName' so the frame can be joined back on (opponentName, Date).
opponent_game_list_data = (
    gamelist
    .drop(['Opponent', 'Result', 'year', 'opponentName'], axis=1)
    .rename(columns={'Team': 'opponentName'})
)

# Prefix every column except the two join keys with 'opp_'
opponent_game_list_columns = [
    col if col in ('Date', 'opponentName') else 'opp_' + col
    for col in opponent_game_list_data.columns
]
opponent_game_list_data.columns = opponent_game_list_columns

In [65]:
# Attach the opponent's game-list columns (opp_*) to each row by matching on
# the opponent name and the game date. A left join keeps every row of
# `gamelist`, including those with no matching opponent row.
# NOTE(review): `gamelist` is overwritten, so running this cell a second time
# merges into the already-merged frame.
gamelist = pd.merge(
    gamelist,
    opponent_game_list_data,
    how='left',
    left_on=['opponentName', 'Date'],
    right_on=['opponentName', 'Date'],
)

# Target variables

In [66]:
# Statistic columns requested from the helper functions for every game.
# The order matters: it fixes the column order of the feature matrix.
target_variables = [
    'FG',
    'RushNetYards',
    'PassYards',
    'Interceptions',
    'Pct',
    'RZScores',
    'Punts',
    'FumblesLost',
    'Penalties',
    'Plays',
    'def_RushNetYards',
    'def_PassYards',
    'def_Interceptions',
    'def_Punts',
    'def_FF',
    'def_PenaltyYards',
    'def_PDef',
    'Sacks',
    'Tackles',
    'def_Plays',
    'TOP',
]

# Create opponent variable names

In [67]:
# Same statistic names as target_variables, prefixed with 'opp_', in the same order
opponent_variables = ['opp_{}'.format(stat) for stat in target_variables]

# Create dataset

In [68]:
# True for every row except game number 1 of 2013
# (a row is kept when it is not from 2013 or is not game number 1).
base_year_mask = (gamelist['year'] != 2013) | (gamelist['gamenumber'] != 1)

In [69]:
# Call previous_yrs once per row selected by base_year_mask and store the
# result in the target_variables columns of gamelist.
# NOTE(review): rows excluded by the mask are never passed to previous_yrs;
# presumably they end up as NaN through index alignment - confirm.
gamelist[target_variables] = gamelist.loc[base_year_mask].apply(
    lambda row: previous_yrs(
        team=row['Team'],
        year=row['year'],
        game=row['gamenumber'],
        cols=target_variables,
        gamestats=gamestats,
        debug=False,
    ),
    axis=1,
)

In [70]:
# Subset the dataset to only the needed games (independent copy of the masked rows)
games = gamelist[base_year_mask].copy()

# Calculate the opponent game data.
# opponent_stats is only called when the opponent is one of the known teams;
# otherwise the row gets a Series of None with one entry per target variable.
# NOTE(review): the apply runs over every row of `gamelist`, not only the rows
# kept in `games`, so the assignment relies on pandas aligning on the index.
games[opponent_variables] = gamelist.apply(
    lambda row: (
        opponent_stats(
            team=row['Team'],
            date=row['Date'],
            year=row['year'],
            cols=target_variables,
            gamestats=gamestats,
            debug=False,
        )
        if row['opponentName'] in teamlist
        else pd.Series([None] * len(target_variables))
    ),
    axis=1,
)

# Game list file variables to include

In [71]:
# Game-list columns used as features: the team's own columns first, then the
# matching 'opp_'-prefixed columns for the opponent, in the same order.
game_list_variables = ['gamenumber', 'Coach_years', 'Coach_WL', 'HomeAway', 'history_WL_1yrs']
game_list_variables += ['opp_' + name for name in game_list_variables]

In [72]:
# Full feature list: team stats, then opponent stats, then game-list columns.
# This order is the column order of the feature matrix.
final_variables = list(target_variables)
final_variables += opponent_variables
final_variables += game_list_variables

In [73]:
# Select every feature column from `games`. The trailing ';' suppresses the
# notebook display, so this cell shows nothing and assigns nothing.
# NOTE(review): presumably a sanity check that all columns exist - confirm
# whether the cell is still needed.
games[final_variables];

# Count the number of missing values in each row

In [74]:
# Per row: how many of the feature columns are null
games['missing'] = pd.isnull(games[final_variables]).sum(axis=1)

In [75]:
# Same column selection as before, evaluated after the 'missing' column was
# added. The trailing ';' suppresses the display, so nothing is shown or stored.
games[final_variables];

# Create variables for analysis 

In [76]:
# Keep only rows where no feature column is missing, then pull out the label
# vector (WinLoss) and the feature matrix as plain arrays.
y = games.loc[games['missing'] == 0, 'WinLoss'].values
X = games.loc[games['missing'] == 0, final_variables].values

# Run single Random Forest Classifier

In [77]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.33
)

In [78]:
# Create and fit the Random Forest classifier (library defaults, fixed seed).
# The fit call is the last expression so the fitted estimator is displayed.
dt = RandomForestClassifier(random_state=42)
dt.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False)

In [79]:
# Fraction of correct predictions; both lines print the same label,
# so the order matters: training split first, held-out test split second.
print("Accuracy:\t", (dt.predict(X_train) == y_train).mean())  # training split
print("Accuracy:\t", (dt.predict(X_test) == y_test).mean())    # test split

Accuracy:	 0.989112683723
Accuracy:	 0.716335540839


# Run Grid Search on Random Forest Classifier

In [80]:
# Two-step pipeline: fill missing values, then classify with a random forest.
# The step names ('imputer', 'forest') are the prefixes used in the parameter grid.
# NOTE(review): `Imputer` no longer exists in sklearn.preprocessing from
# scikit-learn 0.22 on (replaced by sklearn.impute.SimpleImputer); this cell
# needs an older scikit-learn.
imputer = Imputer()
forest = RandomForestClassifier()
pipe = Pipeline([('imputer', imputer), ('forest', forest)])

In [81]:
# Grid-search space. Keys are '<pipeline step>__<parameter>'.
# Only criterion, max_depth, n_estimators and min_samples_split have more than
# one candidate value; the single-value entries just pin a fixed setting.
parameters = dict(
    # random forest step
    forest__criterion=['gini', 'entropy'],
    forest__max_depth=[1, 2, 3, 4, 5],
    forest__n_estimators=[100, 500, 1000, 1250],
    forest__n_jobs=[-1],
    forest__random_state=[42],
    forest__min_samples_split=[5, 10],
    # imputer step
    imputer__missing_values=['NaN'],
    imputer__strategy=['mean'],
    imputer__axis=[0],
)

In [82]:
# Exhaustive search over `parameters` with 3-fold cross-validation.
# NOTE(review): the search is fit on the full X / y, not on X_train / y_train,
# so the train/test split above is not used by this step.
grid = GridSearchCV(estimator=pipe, param_grid=parameters, cv=3)
grid.fit(X, y)

GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('forest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_s...mators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'forest__criterion': ['gini', 'entropy'], 'forest__max_depth': [1, 2, 3, 4, 5], 'forest__n_estimators': [100, 500, 1000, 1250], 'forest__n_jobs': [-1], 'forest__random_state': [42], 'forest__min_samples_split': [5, 10], 'imputer__missing_values': ['NaN'], 'imputer__strategy': ['mean'], 'imputer__axis': [0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [83]:
# Report the best score and parameter combination found by the grid search,
# then keep a handle on the winning fitted pipeline.
print("Accuracy:\t", grid.best_score_)
print("Best Parameters:\t", grid.best_params_)
tree_model = grid.best_estimator_

Accuracy:	 0.7386073642
Best Parameters:	 {'forest__criterion': 'gini', 'forest__max_depth': 5, 'forest__min_samples_split': 10, 'forest__n_estimators': 1250, 'forest__n_jobs': -1, 'forest__random_state': 42, 'imputer__axis': 0, 'imputer__missing_values': 'NaN', 'imputer__strategy': 'mean'}


In [84]:
# Importance scores from the 'forest' step of the best pipeline
feature_importance = (
    grid.best_estimator_
    .named_steps['forest']
    .feature_importances_
)

In [85]:
# (feature name, importance) pairs, in feature-matrix column order
x = [(name, score) for name, score in zip(final_variables, feature_importance)]

In [86]:
# Two-column table: feature name and its importance score
importance = pd.DataFrame(
    data=x,
    columns=['var', 'importance'],
)

In [87]:
# Display the features ranked from most to least important
importance.sort_values(by='importance', ascending=False)

Unnamed: 0,var,importance
51,opp_history_WL_1yrs,0.204435
46,history_WL_1yrs,0.127477
31,opp_def_RushNetYards,0.069478
10,def_RushNetYards,0.061863
5,RZScores,0.048289
26,opp_RZScores,0.047525
22,opp_RushNetYards,0.035835
1,RushNetYards,0.033392
25,opp_Pct,0.028671
6,Punts,0.026829
