In [18]:
import zipfile, os
import pandas as pd
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Imputer
from sklearn.model_selection import GridSearchCV, cross_val_score
from functions.functions import previous_yrs, opponent_stats

# Read in files

In [19]:
_file = os.path.join('Data', 'finalgamedata.zip')
zf = zipfile.ZipFile(_file)
gamestats = pd.read_csv(zf.open('gamestats.csv'))
gamelist = pd.read_csv(zf.open('gameslist.csv'))

In [20]:
# Create a list of the teams 
teamlist = list(gamelist['Team'].unique())

# Target variables

In [21]:
target_variables = ["FG","RushNetYards","PassYards","Interceptions","PassAttempts",
                    "Punts","Points","FumblesLost","Penalties","Plays","def_RushNetYards","def_PassYards",
                    "def_Interceptions","def_Punts", "def_FF","def_PenaltyYards","def_PDef","Sacks","Tackles",
                    "def_Plays"
                   ]

# Create opponent variable names

In [22]:
# Create the opponent variable names 
opponent_variables = ['opp_' + x for x in target_variables]

# Create dataset

In [23]:
base_year_mask = ~((gamelist.year == 2013) & (gamelist['gamenumber'] == 1))

In [24]:
gamelist[target_variables] = gamelist[base_year_mask].apply(lambda x: previous_yrs(team = x['Team'], 
                                                                                   year = x['year'], 
                                                                                   game = x['gamenumber'], 
                                                                                   cols = target_variables, 
                                                                                   gamestats = gamestats,
                                                                                   debug = False
                                                                                  ), 
                                                            axis = 1
                                                           )

In [25]:
# Subset the datset to only the needed games 
games = gamelist[base_year_mask].copy()

# Calculate the opponent data 
games[opponent_variables] = gamelist.apply(lambda x: opponent_stats(team = x['Team'],
                                           date = x['Date'], 
                                           year = x['year'], 
                                           cols = target_variables,
                                           gamestats = gamestats,
                                           debug = False
                                          )
                             if x['opponentName'] in teamlist
                             else pd.Series([None for x in range(0, len(target_variables))]),
                             axis = 1
                            )

# Count the number of missing variables

In [26]:
games['missing'] = games[target_variables + opponent_variables].isnull().sum(axis=1)

# Create variables for analysis 

In [27]:
y = games[games['missing'] == 0]['WinLoss'].values
X = games[games['missing'] == 0][target_variables + opponent_variables].values

# Impute missing data

In [28]:
# Create and fit an imputer for missing data 
imr = Imputer(missing_values = 'NaN', strategy='mean', axis = 0)
imr = imr.fit(X)

In [29]:
X_imp = imr.fit_transform(X)

# Run single Random Forest Classifier

In [30]:
# Create the train test split 
X_train, X_test, y_train, y_test = train_test_split(X_imp, y, test_size=0.33, random_state=42)

In [31]:
# Create and fit the DT Classifier 
dt = RandomForestClassifier()
dt.fit(X= X_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [32]:
print("Accuracy:\t", (y_train == dt.predict(X_train)).mean())
print("Accuracy:\t", (y_test == dt.predict(X_test)).mean())

Accuracy:	 0.976024411508
Accuracy:	 0.618584070796


# Run Grid Search on Random Forest Classifier

In [16]:
tree_parameters = {'criterion':['gini','entropy'],
                   'max_depth':[1,2,3,4,5,6,7,8,9,10],
                   'n_estimators':[10,20,30,40,50,75,80,85,90, 100], 
                   'n_jobs': [-1]
                  }
clf = GridSearchCV(RandomForestClassifier(), tree_parameters, cv=5)
clf.fit(X_imp, y)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'criterion': ['gini', 'entropy'], 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'n_estimators': [10, 20, 30, 40, 50, 75, 80, 85, 90, 100], 'n_jobs': [-1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [17]:
tree_model = clf.best_estimator_
print("Accuracy:\t", clf.best_score_) 
print("Best Parameters:\t", clf.best_params_) 

Accuracy:	 0.667640186916
Best Parameters:	 {'criterion': 'entropy', 'max_depth': 2, 'n_estimators': 50, 'n_jobs': -1}
