In [38]:
import zipfile, os
import pandas as pd
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Imputer
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from functions.functions import previous_yrs, opponent_stats

# Read in files

In [39]:
# Location of the zipped game data, relative to the notebook's working directory.
_file = os.path.join('Data', 'finalgamedata.zip')

# Open the archive and each member inside context managers so every handle is
# closed as soon as the CSVs are loaded, instead of staying open for the life
# of the kernel.
with zipfile.ZipFile(_file) as zf:
    with zf.open('gamestats.csv') as gamestats_csv:
        gamestats = pd.read_csv(gamestats_csv)
    with zf.open('gameslist.csv') as gamelist_csv:
        gamelist = pd.read_csv(gamelist_csv)

In [40]:
# Distinct team names, in order of first appearance in the game list.
teamlist = [team for team in gamelist['Team'].unique()]

# Target variables

In [41]:
# Per-game stat columns; this list is what gets passed as `cols` to
# previous_yrs and opponent_stats further down.
target_variables = [
    "FG", "RushNetYards", "PassYards", "Interceptions",
    "PassAttempts", "Punts", "Points", "FumblesLost",
    "Penalties", "Plays", "def_RushNetYards", "def_PassYards",
    "def_Interceptions", "def_Punts", "def_FF", "def_PenaltyYards",
    "def_PDef", "Sacks", "Tackles", "def_Plays",
]

# Create opponent variable names

In [42]:
# One 'opp_'-prefixed column name per target variable, in the same order.
opponent_variables = ['opp_' + stat_name for stat_name in target_variables]

# Create dataset

In [43]:
# True for every row except game number 1 of the 2013 season.
base_year_mask = (gamelist['year'] != 2013) | (gamelist['gamenumber'] != 1)

In [None]:
# For every game selected by base_year_mask, call previous_yrs once per row and
# store what it returns in the target-variable columns of gamelist.
# NOTE(review): previous_yrs comes from functions.functions; presumably it
# returns one value per entry in `cols`, derived from gamestats — confirm.
gamelist[target_variables] = gamelist[base_year_mask].apply(
    lambda row: previous_yrs(
        team=row['Team'],
        year=row['year'],
        game=row['gamenumber'],
        cols=target_variables,
        gamestats=gamestats,
        debug=False,
    ),
    axis=1,
)

In [None]:
# Subset the dataset to only the needed games (the rows kept by base_year_mask).
games = gamelist[base_year_mask].copy()

# Calculate the opponent game data.
# The apply runs over `games`, not the full `gamelist`: the result is assigned
# onto `games`, so rows outside the mask would only cost extra opponent_stats
# calls whose output is discarded.
# Rows whose opponent is not in `teamlist` get a placeholder of None values,
# one per target variable.
# NOTE(review): the membership test reads row['opponentName'] while
# opponent_stats is given team = row['Team'] — presumably the helper resolves
# the opponent from team/date itself; confirm against functions.functions.
games[opponent_variables] = games.apply(lambda row: opponent_stats(team = row['Team'],
                                           date = row['Date'],
                                           year = row['year'],
                                           cols = target_variables,
                                           gamestats = gamestats,
                                           debug = False
                                          )
                             if row['opponentName'] in teamlist
                             else pd.Series([None] * len(target_variables)),
                             axis = 1
                            )

# Game list file variables to include

In [25]:
# Column listing for `games`; the trailing semicolon suppresses the cell output.
games.columns.tolist();

In [26]:
# Build an independent copy of the game list to serve as the opponent-side
# table, without the columns that are not needed on that side.
opponent_game_list_data = gamelist.copy(deep=True).drop(
    ['Opponent', 'Result', 'year', 'opponentName'], axis=1
)

# Prefix every remaining column with 'opp_', except the two key columns
# ('Date' and 'Team'), which keep their names.
opponent_game_list_columns = [
    col if col in ('Date', 'Team') else 'opp_' + col
    for col in opponent_game_list_data.columns
]

opponent_game_list_data.columns = opponent_game_list_columns

In [29]:
# Column listing for `games`; the trailing semicolon suppresses the cell output.
games.columns.tolist();

In [36]:
# Attach the opponent's game-list columns (the opp_* columns) to every game.
# The opponent table is keyed by the opponent's own 'Team' name, so that key is
# relabelled to 'opponentName' for the join. Joining on the row's own
# ['Team', 'Date'] pairs each game with itself, which makes every opp_* column
# a copy of the team's own value.
# how='left' keeps every game; opp_* stays NaN when the opponent has no row for
# that date.
# NOTE(review): assumes 'opponentName' is spelled the same way as 'Team' and
# that both sides of a game carry the same 'Date' value — confirm against the data.
games2 = pd.merge(  gamelist,
                    opponent_game_list_data.rename(columns = {'Team': 'opponentName'}),
                    on = ['opponentName', 'Date'],
                    how = 'left'
                   )

In [37]:
# Show the column names of the merged frame.
games2.columns.tolist()

['Date',
 'Opponent',
 'Result',
 'Team',
 'year',
 'gamenumber',
 'opponentName',
 'history_wins_1yrs',
 'history_losses_1yrs',
 'history_WL_1yrs',
 'history_wins_2yrs',
 'history_losses_2yrs',
 'history_WL_2yrs',
 'history_wins_3yrs',
 'history_losses_3yrs',
 'history_WL_3yrs',
 'history_wins_5yrs',
 'history_losses_5yrs',
 'history_WL_5yrs',
 'history_wins_10yrs',
 'history_losses_10yrs',
 'history_WL_10yrs',
 'history_wins_maxyrs',
 'history_losses_maxyrs',
 'history_WL_maxyrs',
 'coach',
 'Coach_wins',
 'Coach_losses',
 'Coach_WL',
 'Coach_years',
 'HomeAway',
 'WinLoss',
 'FG',
 'RushNetYards',
 'PassYards',
 'Interceptions',
 'PassAttempts',
 'Punts',
 'Points',
 'FumblesLost',
 'Penalties',
 'Plays',
 'def_RushNetYards',
 'def_PassYards',
 'def_Interceptions',
 'def_Punts',
 'def_FF',
 'def_PenaltyYards',
 'def_PDef',
 'Sacks',
 'Tackles',
 'def_Plays',
 'opp_gamenumber',
 'opp_history_wins_1yrs',
 'opp_history_losses_1yrs',
 'opp_history_WL_1yrs',
 'opp_history_wins_2yrs',
 'o

In [None]:
# Game-list columns used as features: the five base names first, followed by
# the same five names with the 'opp_' prefix.
game_list_variables = [
    prefix + name
    for prefix in ('', 'opp_')
    for name in ('gamenumber', 'Coach_years', 'Coach_WL', 'HomeAway', 'history_WL_maxyrs')
]

In [None]:
# Full feature list: target variables, then opponent variables, then the
# game-list variables.
final_variables = list(target_variables)
final_variables.extend(opponent_variables)
final_variables.extend(game_list_variables)

In [None]:
# Show the column names currently present on `games`.
games.columns.tolist()

In [None]:
# Display the combined feature list (target, opponent and game-list variables).
final_variables

In [None]:
# Preview the feature columns; .head() shows only the first rows instead of
# rendering the whole frame.
# NOTE(review): final_variables contains opp_* game-list names (e.g.
# 'opp_gamenumber'), and no visible cell adds those columns to `games` (the
# merge result is stored in games2) — confirm this selection does not raise a
# KeyError.
games[final_variables].head()

# Count the number of missing feature values in each game row

In [None]:
# For each row, count how many of the final feature columns are null.
games['missing'] = (
    games[final_variables]
    .isnull()
    .sum(axis='columns')
)

In [None]:
# Preview the feature columns; .head() shows only the first rows instead of
# rendering the whole frame.
games[final_variables].head()

# Create variables for analysis 

In [None]:
# Model inputs are restricted to rows with no missing feature values
# (missing == 0): y is the WinLoss column, X holds the final feature columns,
# both as plain arrays.
y = games['WinLoss'][games['missing'] == 0].values
X = games[final_variables][games['missing'] == 0].values

# Run a single Random Forest Classifier

In [None]:
# Hold out 33% of the rows for testing; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.33,
)

In [None]:
# Build a random forest with default hyper-parameters (seeded for
# repeatability) and fit it on the training split.
dt = RandomForestClassifier(random_state=42)
dt.fit(X_train, y_train)

In [None]:
# Report accuracy (share of correct predictions) for the split the forest was
# fitted on and for the held-out split, with distinct labels so the two
# numbers can be told apart in the output.
print("Train accuracy:\t", (y_train == dt.predict(X_train)).mean())
print("Test accuracy:\t", (y_test == dt.predict(X_test)).mean())

# Run Grid Search on Random Forest Classifier

In [None]:
# Two-step pipeline: impute missing values, then classify with a random forest.
# The step names ('imputer', 'forest') are what the grid-search parameter keys
# refer to.
# NOTE(review): Imputer is only available in older scikit-learn releases
# (superseded by sklearn.impute.SimpleImputer) — confirm the pinned version.
forest = RandomForestClassifier()
imputer = Imputer()
pipe = Pipeline([('imputer', imputer), ('forest', forest)])

In [None]:
# Grid-search candidates, keyed as "<step name>__<parameter>". Only criterion,
# max_depth and n_estimators have more than one candidate; the rest are pinned
# to a single value.
parameters = dict(
    forest__criterion=['gini', 'entropy'],
    forest__max_depth=list(range(1, 6)),
    forest__n_estimators=[100, 500, 1000, 1250],
    forest__n_jobs=[-1],
    forest__random_state=[42],
    imputer__missing_values=['NaN'],
    imputer__strategy=['mean'],
    imputer__axis=[0],
)

In [None]:
# Exhaustive search over `parameters` with 3-fold cross-validation. The search
# is fitted on the full X / y rather than on the train split.
grid = GridSearchCV(estimator=pipe, param_grid=parameters, cv=3)
grid.fit(X, y)

In [None]:
# Keep a handle on the best estimator found by the grid search (the whole
# fitted pipeline), then report its score and the winning parameter set.
tree_model = grid.best_estimator_
# best_score_ is the search's score for the best parameter combination.
print("Accuracy:\t", grid.best_score_) 
print("Best Parameters:\t", grid.best_params_) 

In [None]:
# Pull the fitted forest out of the best pipeline by its step name and read its
# per-feature importances.
feature_importance = dict(grid.best_estimator_.steps)['forest'].feature_importances_

In [None]:
# Pair each feature name with its importance, in feature order.
x = [(name, weight) for name, weight in zip(final_variables, feature_importance)]

In [None]:
# Two-column table of (feature name, importance) pairs.
importance = pd.DataFrame(data=x, columns=['var', 'importance'])

In [None]:
# Show the features ranked from most to least important.
importance.sort_values(by='importance', ascending=False)