In [4]:
import zipfile, os
import pandas as pd
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Imputer
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import  cross_val_score

# Create files

In [6]:
_file = os.path.join('Data', 'finalgamedata.zip')
zf = zipfile.ZipFile(_file)
gamestats = pd.read_csv(zf.open('gamestats.csv'))
gamelist = pd.read_csv(zf.open('gameslist.csv'))

# Target variables

In [7]:
cols2 = ["Coach_WL", "Coach_years", "FG",
         "RushNetYards","PassYards","Interceptions","PassAttempts","Punts","Points","FumblesLost",
         "Penalties","Plays","def_RushNetYards","def_PassYards","def_Interceptions","def_Punts", 
         "def_FF","def_PenaltyYards","def_PDef","Sacks","Tackles","def_Plays",
        ]

# Create dataset

In [None]:


# Filter out game 1 of season 2013
base_year_mask = ~((gamebygame_part.year == 2013) & (gamebygame_part['count'] == 1))

gamebygame_part[cols2] = gamebygame_part[base_year_mask].apply(lambda x: previous_yrs(team = x['Team'], 
                                                     year = x['year'], 
                                                     game = x['count'], 
                                                     cols = cols2, 
                                                     final = final,                                                                                     
                                                     debug = False
                                                     ), axis = 1
                                                   )

In [None]:
gamedata = list(gamelistOut['Team'].unique())

In [None]:
# Create the oppoent variable names 
opps = ['opp_' + x for x in cols2]

# Subset the datset to games to keep it short
games = gamedata[~((gamebygame_part.year == 2013) & (gamebygame_part['count'] == 1))].copy()

# Calculate the opponent data 
games[opps] = gamedata.apply(lambda x: opp(team = x['Team'],
                                        date = x['Date'], 
                                        year = x['year'], 
                                        cols = cols2,
                                        final = final,
                                        debug = False
                                       )
                          if x['opponentName'] in gamedata
                          else pd.Series([None for x in range(0, len(cols2))]),
                          axis = 1
                         )

In [None]:
# Select all the games where they don't have missing data for rush data for home team and opponent team
finalgames = games[(pd.notnull(games['opp_def_RushNetYards'])) & (pd.notnull(games['RushNetYards']))]

In [None]:
finalgames[['HomeAway', 'WinLoss']] = finalgames.apply(lambda x: create_variables(x), axis = 1).copy()# Variables to drop out for analysis
drop_variables = ['Date', 'Opponent', 'Result', 'G', 'Team', 'OffenseDefense', 
                  'year', 'Opponent2', 'opponentName', 'WinLoss']

In [None]:
y = finalgames['WinLoss'].values
X = finalgames.drop(drop_variables, axis = 1).values

In [None]:
stats = list(final.columns)

# Run Models

In [294]:
# Create and fit an imputer for missing data 
imr = Imputer(missing_values = 'NaN', strategy='mean', axis = 0)
imr = imr.fit(X)

In [295]:
X_imp = imr.fit_transform(X)

In [296]:
# Create the train test split 
X_train, X_test, y_train, y_test = train_test_split(X_imp, y, test_size=0.33, random_state=42)

In [297]:
# Create and fit the DT Classifier 
dt = RandomForestClassifier()
dt.fit(X= X_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [298]:
print("Accuracy:\t", (y_train == dt.predict(X_train)).mean())
print("Accuracy:\t", (y_test == dt.predict(X_test)).mean())

Accuracy:	 0.978598405371
Accuracy:	 0.634893617021


In [299]:
tree_parameters = {'criterion':['gini','entropy'],
             'max_depth':[1,2,3,4,5,6,7,8,9,10],
             'n_estimators':[10,20,30,40,50,75,80,85,90, 100], 
             'n_jobs': [-1]
            }
clf = GridSearchCV(RandomForestClassifier(), tree_parameters, cv=5)
clf.fit(X_imp, y)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'criterion': ['gini', 'entropy'], 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'n_estimators': [10, 20, 30, 40, 50, 75, 80, 85, 90, 100], 'n_jobs': [-1]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [300]:
tree_model = clf.best_estimator_
print (clf.best_score_, clf.best_params_) 

0.6742551995503092 {'criterion': 'entropy', 'max_depth': 5, 'n_estimators': 80, 'n_jobs': -1}
