In [568]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns
import re

df_2014 = pd.read_csv('2014_master.csv')
df_2015 = pd.read_csv('2015_master.csv')
df_2016 = pd.read_csv('2016_master.csv')
df_2017 = pd.read_csv('2017_master.csv')

In [569]:
df_2014['Season'] = 2014
df_2015['Season'] = 2015
df_2016['Season'] = 2016
df_2017['Season'] = 2017

df = pd.concat([df_2014, df_2015, df_2016, df_2017])

# not a real game type, dropped
df = df[df.game_type_0!=1]

# index counter to be removed
df.drop(['Unnamed: 0','game_type_0'], axis=1, inplace=True)

In [570]:
#rename columns so more readable

new_columns = df.columns.values
new_columns[58:65] = ['TNF', 'Sunday_AM', 'Sunday_PM', 'SNF', 'MNF', 'England', 'Saturday']
df.columns = new_columns

In [571]:
for col in range(66,114,4):
    title = df.columns[col][5:]
    df['home_off_spread_%s' % title] = df[df.columns[col]]-df[df.columns[col-57]]
for col in range(7,59,4):
    title = df.columns[col][5:]
    df['away_off_spread_%s' % title] = df[df.columns[col]]-df[df.columns[col+61]]

In [572]:
for col in range(65,115,4):
    title = df.columns[col][5:]
    df['home_off_spread_%s' % title] = df[df.columns[col]]-df[df.columns[col-57]]
for col in range(6,58,4):
    title = df.columns[col][5:]
    df['away_off_spread_%s' % title] = df[df.columns[col]]-df[df.columns[col+61]]

In [573]:
# dropping tampa and miami week 2 2017 games because they had no week 1 data because of flood 
df.reset_index(drop=True, inplace=True)

df.drop(df[(df.Season == 2017) & (df.Week == 2) & (df.Away_team == 'MIA')].index, inplace=True)
df.drop(df[(df.Season == 2017) & (df.Week == 2) & (df.Home_team == 'TAM')].index, inplace=True)

In [574]:
# # creating teams location features 
# #home timezone

def h_tz(x):
    if x in ['LAC','OAK','SEA','LAR','SFO']:
        tz = 'Home_team_west'
    elif x in ['DEN','ARI']:
        tz = 'Home_team_mtn'
    elif x in ['KAN','IND','NOR','DET','MIN','CHI','TEN','GNB','HOU','DAL']:
        tz = 'Home_team_midwest'
    else:
        tz = 'Home_team_east'
    return tz

df['Home_tz'] = df['Home_team'].apply(h_tz)    
        
df = df.join(pd.get_dummies(df['Home_tz']))
df.drop('Home_tz', axis=1, inplace=True)

#away timezone

def a_tz(x):
    if x in ['LAC','OAK','SEA','LAR','SFO']:
        tz = 'Away_team_west'
    elif x in ['DEN','ARI']:
        tz = 'Away_team_mtn'
    elif x in ['KAN','IND','NOR','DET','MIN','CHI','TEN','GNB','HOU','DAL']:
        tz = 'Away_team_midwest'
    else:
        tz = 'Away_team_east'
    return tz

df['Away_tz'] = df['Away_team'].apply(a_tz)  

df = df.join(pd.get_dummies(df['Away_tz']))
df.drop('Away_tz', axis=1, inplace=True)

# games timezone

def g_tz(x):
    if x[1]==1:
        tz = 'game_eng'
    elif x[0] in ['LAC','OAK','SEA','LAR','SFO']:
        tz = 'game_west'
    elif x[0] in ['DEN','ARI']:
        tz = 'game_mtn'
    elif x[0] in ['KAN','IND','NOR','DET','MIN','CHI','TEN','GNB','HOU','DAL']:
        tz = 'game_midwest'
    else:
        tz = 'game_east'
    return tz

df['game_tz'] = df[['Home_team','England']].apply(g_tz, axis=1)  

df = df.join(pd.get_dummies(df['game_tz']))
df.drop('game_tz', axis=1, inplace=True)

# # Adding column for division games# Adding 

def division_game(x):
    AFC_east = ['NWE','BUF','MIA','NYJ']
    AFC_north = ['PIT','BAL','CIN','CLE']
    AFC_south = ['JAX','TEN','IND','HOU']
    AFC_west = ['KAN','LAC','OAK','DEN']
    NFC_east = ['PHI','DAL','WAS','NYG']
    NFC_north = ['MIN','DET','GNB','CHI']
    NFC_south = ['NOR','CAR','ATL','TAM']
    NFC_west = ['LAR','SEA','ARI','SFO']
    home = x[0]
    away = x[1]
    if ((home in AFC_east) & (away in AFC_east)) | ((home in AFC_north) & (away in AFC_north)):
        d = 1
    elif ((home in AFC_south) & (away in AFC_south)) | ((home in AFC_west) & (away in AFC_west)):
        d = 1
    elif ((home in NFC_east) & (away in NFC_east)) | ((home in NFC_north) & (away in NFC_north)):
        d = 1
    elif ((home in NFC_south) & (away in NFC_south)) | ((home in NFC_west) & (away in NFC_west)):
        d = 1
    else:
        d=0
    return d


df['Division_game'] = df[['Home_team','Away_team']].apply(division_game, axis=1)

In [575]:
# measure time between games

# df['game_type'] = df[['TNF', 'Sunday_AM', 'Sunday_PM', 'SNF', 'MNF', 'England', 'Saturday']].idxmax(axis=1)

# def time_btwn(x):
#     week = x[0]
#     team = x[1]
#     season = x[2]
#     game_t = {'TNF':0, 'Sunday_AM':3, 'Sunday_PM':3, 'SNF':3, 'MNF':4, 'England':3, 'Saturday':2}
#     try: 
#         week_after = df[((df['Week']==week) & (df['Season']==season)) & ((df['Home_team']==team) | (df['Away_team']==team))]['game_type'].iloc[0]
#         try:
#             week_before = df[((df['Week']==week-1) & (df['Season']==season)) & ((df['Home_team']==team) | (df['Away_team']==team))]['game_type'].iloc[0]
#             weekly = 7
#         except:
#             week_before = df[((df['Week']==week-2) & (df['Season']==season)) & ((df['Home_team']==team) | (df['Away_team']==team))]['game_type'].iloc[0]
#             weekly = 14
#         tb = game_t[week_after] + weekly - game_t[week_before]
#     except:
#         tb = 7 
#     return tb

# df['Home_team_days_after_last_game'] = df[['Week','Home_team','Season']].apply(time_btwn, axis=1)
# df['Away_team_days_after_last_game'] = df[['Week','Away_team','Season']].apply(time_btwn, axis=1)

# df.drop('game_type', axis=1, inplace=True)

In [576]:
# ADD GAME DATA

pg = pd.read_csv('NFL_scrape.csv')
pg = pg.drop('Unnamed: 0', axis=1)
df_hw = pd.merge(df,pg,how='left', on=['Week','Season','Home_team','Away_team'])
df = df_hw.drop('Home_win',axis=1)

In [577]:
# DUMMY VARIABLES FOR TEAMS, MAKE SURE TO CHECK THAT TEAMS ARE NOT DROPPED FROM TRAINING DATA BELOW

df = pd.get_dummies(data=df,columns=['Home_team'])
df = pd.get_dummies(data=df,columns=['Away_team'])

In [578]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

In [579]:
df_test_by_season = df[df['Season']==2017]
df_train_by_season = df[df['Season']!=2017]

y_train = df_train_by_season['Home_cover']
X_train = df_train_by_season.drop(['Season','Home_cover'], axis=1)
# X_train = df_train_by_season.drop(['Season','Home_team', 'Away_team', 'Home_cover'], axis=1)

y_test = df_test_by_season['Home_cover']
X_test = df_test_by_season.drop(['Season','Home_cover'], axis=1)
# X_test = df_test_by_season.drop(['Season','Home_team', 'Away_team', 'Home_cover'], axis=1)

In [580]:
params = [{'n_estimators': [9,12,15],'max_features': ['auto','sqrt','log2'], 'max_depth':[9,12,15]}]

clf = GridSearchCV(RandomForestClassifier(random_state=42), params, scoring='accuracy', cv=StratifiedKFold(10)).fit(X_train, y_train)
clf.cv_results_
clf.best_params_
print('RFC GridSearchCV Best Score: ', clf.best_score_, ' with ' , clf.best_params_)

print('RFC Test Score: ', accuracy_score(y_test, clf.predict(X_test)))
print('RFC Train Score: ', accuracy_score(y_train, clf.predict(X_train)))

('RFC GridSearchCV Best Score: ', 0.51738525730180807, ' with ', {'max_features': 'auto', 'n_estimators': 15, 'max_depth': 9})
('RFC Test Score: ', 0.5104602510460251)
('RFC Train Score: ', 0.97913769123783034)


In [581]:
#using top5 predictions and seeing how model would have performed with top five predictions

guess = list(zip(abs(clf.predict_proba(X_test)[:,1]-.5), y_test, [0 if x<.5 else 1 for x in clf.predict_proba(X_test)[:,1]]))
top5 = sorted(guess, reverse=True)[:5]
print(top5)

[(0.34444740763981485, 1, 0), (0.32296164102790265, 1, 0), (0.31606060606060604, 1, 0), (0.31020414118158479, 0, 0), (0.2957587818567855, 1, 0)]


In [582]:
# how model would have done by week in 2017 for top five predictions random forest# how mo 

for week in range(2,18):
    df_test_by_week = df_test_by_season[df_test_by_season['Week']==week]

    y_test_by_week = df_test_by_week['Home_cover']
    X_test_by_week = df_test_by_week.drop(['Season', 'Home_cover'], axis=1)
#     X_test_by_week = df_test_by_week.drop(['Season','Home_team', 'Away_team', 'Home_cover'], axis=1)

    guess = list(zip(abs(clf.predict_proba(X_test_by_week)[:,1]-.5), y_test_by_week, [0 if x<.5 else 1 for x in clf.predict_proba(X_test_by_week)[:,1]]))
    top5 = sorted(guess, reverse=True)[:5]
    print(top5)
    w = 0
    for acc in top5:
        if (acc[1]==acc[2]):
            w+=1
    print('Week %d record: ' % week)
    print(str(w) +'-' + str(len(top5)-w))

[(0.278743961352657, 0, 1), (0.23030303030303034, 1, 0), (0.16956521739130426, 1, 1), (0.16596491228070176, 1, 1), (0.16138888888888892, 0, 1)]
Week 2 record: 
2-3
[(0.2845614035087719, 0, 0), (0.23999999999999999, 1, 0), (0.14128146453089246, 1, 0), (0.13512384565016144, 1, 1), (0.11704545454545456, 0, 1)]
Week 3 record: 
2-3
[(0.2007287672070035, 1, 0), (0.17626794258373202, 0, 1), (0.17216949416591687, 0, 1), (0.15618072208821621, 0, 0), (0.15233662926372965, 1, 0)]
Week 4 record: 
1-4
[(0.24659820958399159, 1, 0), (0.19232612704602564, 1, 0), (0.16562498612188054, 0, 0), (0.11395697193165544, 0, 1), (0.11326140086473463, 1, 0)]
Week 5 record: 
1-4
[(0.22975091697666072, 1, 1), (0.22175995544416599, 0, 0), (0.20212784212784213, 1, 0), (0.18699951459676861, 1, 0), (0.17170878014708901, 1, 0)]
Week 6 record: 
2-3
[(0.24853990376638463, 1, 0), (0.18150092875886265, 0, 0), (0.14528272893320471, 1, 1), (0.10307890426311478, 1, 0), (0.083533500405710615, 0, 0)]
Week 7 record: 
3-2
[(0.267