In [2]:
import zipfile, os
import pandas as pd
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Imputer
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from functions.functions import previous_yrs, opponent_stats

# Read in files

In [3]:
_file = os.path.join('Data', 'finalgamedata.zip')
zf = zipfile.ZipFile(_file)
gamestats = pd.read_csv(zf.open('gamestats.csv'))
gamelist = pd.read_csv(zf.open('gameslist.csv'))

In [4]:
# Create a list of the teams 
teamlist = list(gamelist['Team'].unique())

# Target variables

In [5]:
target_variables = ["FG","RushNetYards","PassYards","Interceptions","PassAttempts",
                    "Punts","Points","FumblesLost","Penalties","Plays","def_RushNetYards","def_PassYards",
                    "def_Interceptions","def_Punts", "def_FF","def_PenaltyYards","def_PDef","Sacks","Tackles",
                    "def_Plays"
                   ]

# Create opponent variable names

In [6]:
# Create the opponent variable names 
opponent_variables = ['opp_' + x for x in target_variables]

# Create dataset

In [7]:
base_year_mask = ~((gamelist.year == 2013) & (gamelist['gamenumber'] == 1))

In [8]:
gamelist[target_variables] = gamelist[base_year_mask].apply(lambda x: previous_yrs(team = x['Team'], 
                                                                                   year = x['year'], 
                                                                                   game = x['gamenumber'], 
                                                                                   cols = target_variables, 
                                                                                   gamestats = gamestats,
                                                                                   debug = False
                                                                                  ), 
                                                            axis = 1
                                                           )

In [9]:
# Subset the datset to only the needed games 
games = gamelist[base_year_mask].copy()

# Calculate the opponent data 
games[opponent_variables] = gamelist.apply(lambda x: opponent_stats(team = x['Team'],
                                           date = x['Date'], 
                                           year = x['year'], 
                                           cols = target_variables,
                                           gamestats = gamestats,
                                           debug = False
                                          )
                             if x['opponentName'] in teamlist
                             else pd.Series([None for x in range(0, len(target_variables))]),
                             axis = 1
                            )

# Count the number of missing variables

In [10]:
games['missing'] = games[target_variables + opponent_variables].isnull().sum(axis=1)

# Create variables for analysis 

In [11]:
y = games[games['missing'] == 0]['WinLoss'].values
X = games[games['missing'] == 0][target_variables + opponent_variables].values

# Run single Random Forest Classifier

In [12]:
# Create the train test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [13]:
# Create and fit the DT Classifier 
dt = RandomForestClassifier(random_state = 42)
dt.fit(X= X_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False)

In [14]:
print("Accuracy:\t", (y_train == dt.predict(X_train)).mean())
print("Accuracy:\t", (y_test == dt.predict(X_test)).mean())

Accuracy:	 0.980819529207
Accuracy:	 0.630088495575


# Run Grid Search on Random Forest Classifier

In [26]:
imputer = Imputer()
forest = RandomForestClassifier()
pipe = Pipeline(steps=[('imputer', imputer), ('forest', forest)])

In [30]:
parameters = {'forest__criterion': ['gini', 'entropy'], 
              'forest__max_depth':[1,2,3,4,5], 
              'forest__n_estimators':[100, 500, 1000, 1250, 2500], 
              'forest__n_jobs':[-1], 
              'forest__random_state':[42], 
              'imputer__missing_values':['NaN'], 
              'imputer__strategy':['mean'], 
              'imputer__axis':[0], 
             }

In [31]:
grid = GridSearchCV(pipe, parameters, cv=5)
grid.fit(X, y)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('forest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_s...mators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'forest__criterion': ['gini', 'entropy'], 'forest__max_depth': [1, 2, 3, 4, 5], 'forest__n_estimators': [100, 500, 1000, 1250, 2500], 'forest__n_jobs': [1], 'forest__random_state': [42], 'imputer__missing_values': ['NaN'], 'imputer__strategy': ['mean'], 'imputer__axis': [0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [32]:
tree_model = grid.best_estimator_
print("Accuracy:\t", grid.best_score_) 
print("Best Parameters:\t", grid.best_params_) 

Accuracy:	 0.665595794393
Best Parameters:	 {'forest__criterion': 'entropy', 'forest__max_depth': 5, 'forest__n_estimators': 1250, 'forest__n_jobs': 1, 'forest__random_state': 42, 'imputer__axis': 0, 'imputer__missing_values': 'NaN', 'imputer__strategy': 'mean'}


In [40]:
feature_importance = grid.best_estimator_.named_steps['forest'].feature_importances_

In [49]:
x = list(zip(target_variables, feature_importance))

In [51]:
importance = pd.DataFrame(x, columns=['var', 'importance'])

In [54]:
importance.sort_values('importance', ascending = False)

Unnamed: 0,var,importance
6,Points,0.126958
10,def_RushNetYards,0.096262
5,Punts,0.048704
1,RushNetYards,0.040939
13,def_Punts,0.025181
17,Sacks,0.019773
2,PassYards,0.016771
16,def_PDef,0.015491
11,def_PassYards,0.013168
4,PassAttempts,0.012321
