In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns
import re

# Read the per-season master tables (2014-2017) from the working directory,
# one DataFrame per season.
season_files = ('2014_master.csv', '2015_master.csv', '2016_master.csv', '2017_master.csv')
df_2014, df_2015, df_2016, df_2017 = map(pd.read_csv, season_files)

In [69]:
# Tag every per-season frame with the season it was loaded for.
for season, season_df in ((2014, df_2014), (2015, df_2015), (2016, df_2016), (2017, df_2017)):
    season_df['Season'] = season

# Stack the four seasons into one table (original row labels are kept as-is).
df = pd.concat([df_2014, df_2015, df_2016, df_2017])

# Rows with game_type_0 == 1 are not a real game type, so only the others are
# kept; the flag itself and the leftover index-counter column are then removed.
df = df[df['game_type_0'] != 1].drop(['Unnamed: 0', 'game_type_0'], axis='columns')

In [70]:
# Rename columns so they are more readable.

# Work on a *copy* of the column labels. `df.columns.values` exposes the
# array backing the (immutable) Index, so writing into it directly either
# mutates the Index behind pandas' back or fails when that array is read-only.
new_columns = df.columns.values.copy()
# Positions 58..64 receive the readable game-slot names below.
# NOTE(review): assumes those seven positions are the game-slot indicator
# columns in the master CSV layout -- confirm if the CSV schema changes.
new_columns[58:65] = ['TNF', 'Sunday_AM', 'Sunday_PM', 'SNF', 'MNF', 'England', 'Saturday']
df.columns = new_columns

In [71]:
# Build "home_off_spread_*" features: for every 4th column position from 66
# up to (not including) 114, subtract the column 57 positions to its left.
# The feature suffix is the source column name with its first 5 characters removed.
# NOTE(review): the neighbouring cell builds the same kind of features with the
# start positions shifted by one (65 / 6); confirm which set of positions is intended.
for home_pos in range(66, 114, 4):
    minuend = df.columns[home_pos]
    subtrahend = df.columns[home_pos - 57]
    df['home_off_spread_%s' % minuend[5:]] = df[minuend] - df[subtrahend]

# Build "away_off_spread_*" features: for every 4th column position from 7
# up to (not including) 59, subtract the column 61 positions to its right.
for away_pos in range(7, 59, 4):
    minuend = df.columns[away_pos]
    subtrahend = df.columns[away_pos + 61]
    df['away_off_spread_%s' % minuend[5:]] = df[minuend] - df[subtrahend]

In [72]:
# Build "home_off_spread_*" features: for every 4th column position from 65
# up to (not including) 115, subtract the column 57 positions to its left.
# The feature suffix is the source column name with its first 5 characters removed.
# NOTE(review): the neighbouring cell builds the same kind of features with the
# start positions shifted by one (66 / 7); confirm which set of positions is intended.
for home_pos in range(65, 115, 4):
    minuend = df.columns[home_pos]
    subtrahend = df.columns[home_pos - 57]
    df['home_off_spread_%s' % minuend[5:]] = df[minuend] - df[subtrahend]

# Build "away_off_spread_*" features: for every 4th column position from 6
# up to (not including) 58, subtract the column 61 positions to its right.
for away_pos in range(6, 58, 4):
    minuend = df.columns[away_pos]
    subtrahend = df.columns[away_pos + 61]
    df['away_off_spread_%s' % minuend[5:]] = df[minuend] - df[subtrahend]

In [73]:
# Remove the 2017 week-2 games where Miami is the away team or Tampa is the
# home team: per the original note, those teams had no week 1 data (flood).
# The index is renumbered first so that row labels are unique before dropping.
df.reset_index(drop=True, inplace=True)

week2_2017 = (df.Season == 2017) & (df.Week == 2)
missing_prior_week = week2_2017 & ((df.Away_team == 'MIA') | (df.Home_team == 'TAM'))
df.drop(df[missing_prior_week].index, inplace=True)

In [63]:
# Creating team location features
# Home team timezone

def h_tz(x):
    """Return the home-team zone label for a team abbreviation.

    Parameters
    ----------
    x : team abbreviation (e.g. 'SEA').

    Returns
    -------
    str
        'Home_team_west', 'Home_team_mtn' or 'Home_team_midwest' when the
        abbreviation appears in the matching list, otherwise 'Home_team_east'.

    NOTE(review): any abbreviation not listed below falls through to
    'Home_team_east' -- confirm the data never uses other codes (e.g. for
    relocated franchises). IND and DET are bucketed as 'midwest'; confirm that
    grouping if these labels are meant to be clock time zones.
    """
    zone_table = (
        (('LAC', 'OAK', 'SEA', 'LAR', 'SFO'), 'Home_team_west'),
        (('DEN', 'ARI'), 'Home_team_mtn'),
        (('KAN', 'IND', 'NOR', 'DET', 'MIN', 'CHI', 'TEN', 'GNB', 'HOU', 'DAL'), 'Home_team_midwest'),
    )
    for teams, label in zone_table:
        if x in teams:
            return label
    # Everything else is treated as eastern.
    return 'Home_team_east'

# Label every game with its home team's zone and append one indicator column
# per zone label; the intermediate label column itself is not kept.
home_zone = df['Home_team'].apply(h_tz).rename('Home_tz')
df = df.join(pd.get_dummies(home_zone))

#away timezone

def a_tz(x):
    """Return the away-team zone label for a team abbreviation.

    Parameters
    ----------
    x : team abbreviation (e.g. 'SEA').

    Returns
    -------
    str
        'Away_team_west', 'Away_team_mtn' or 'Away_team_midwest' when the
        abbreviation appears in the matching list, otherwise 'Away_team_east'.

    NOTE(review): any abbreviation not listed below falls through to
    'Away_team_east' -- confirm the data never uses other codes. IND and DET
    are bucketed as 'midwest'; confirm that grouping if these labels are meant
    to be clock time zones.
    """
    west = ('LAC', 'OAK', 'SEA', 'LAR', 'SFO')
    mountain = ('DEN', 'ARI')
    midwest = ('KAN', 'IND', 'NOR', 'DET', 'MIN', 'CHI', 'TEN', 'GNB', 'HOU', 'DAL')

    if x in west:
        return 'Away_team_west'
    if x in mountain:
        return 'Away_team_mtn'
    if x in midwest:
        return 'Away_team_midwest'
    # Everything else is treated as eastern.
    return 'Away_team_east'

# Label every game with its away team's zone and append one indicator column
# per zone label; the intermediate label column itself is not kept.
away_zone = df['Away_team'].apply(a_tz).rename('Away_tz')
df = df.join(pd.get_dummies(away_zone))

# games timezone

def g_tz(x):
    """Return the zone label of the place where a game is played.

    Parameters
    ----------
    x : sequence-like of (home team abbreviation, England flag), e.g. a row
        of ``df[['Home_team', 'England']]`` or a plain tuple/list.

    Returns
    -------
    str
        'game_eng' when the England flag equals 1; otherwise the zone of the
        home team: 'game_west', 'game_mtn', 'game_midwest', or 'game_east'
        for any abbreviation not listed.

    NOTE(review): unlisted abbreviations fall through to 'game_east' --
    confirm the data never uses other team codes.
    """
    # Unpack by position. Indexing a string-labelled pandas row with x[0] /
    # x[1] relies on the deprecated "integer key as position" fallback;
    # materialising the values works for Series, tuples and lists alike.
    values = list(x)
    home_team, england_flag = values[0], values[1]

    # Games flagged as played in England override the home team's zone.
    if england_flag == 1:
        return 'game_eng'

    zone_table = (
        (('LAC', 'OAK', 'SEA', 'LAR', 'SFO'), 'game_west'),
        (('DEN', 'ARI'), 'game_mtn'),
        (('KAN', 'IND', 'NOR', 'DET', 'MIN', 'CHI', 'TEN', 'GNB', 'HOU', 'DAL'), 'game_midwest'),
    )
    for teams, label in zone_table:
        if home_team in teams:
            return label
    return 'game_east'

# Label every game with the zone it is played in (from the home team and the
# England flag) and append one indicator column per zone label; the
# intermediate label column itself is not kept.
game_zone = df[['Home_team', 'England']].apply(g_tz, axis=1).rename('game_tz')
df = df.join(pd.get_dummies(game_zone))

# Adding column for division games

def division_game(x):
    """Return 1 when both teams belong to the same division, else 0.

    Parameters
    ----------
    x : sequence-like of (home team abbreviation, away team abbreviation),
        e.g. a row of ``df[['Home_team', 'Away_team']]`` or a tuple/list.

    Returns
    -------
    int
        1 if the home and away abbreviations both appear in one of the eight
        division lists below, otherwise 0 (including unknown abbreviations).
    """
    divisions = (
        ('NWE', 'BUF', 'MIA', 'NYJ'),  # AFC east
        ('PIT', 'BAL', 'CIN', 'CLE'),  # AFC north
        ('JAX', 'TEN', 'IND', 'HOU'),  # AFC south
        ('KAN', 'LAC', 'OAK', 'DEN'),  # AFC west
        ('PHI', 'DAL', 'WAS', 'NYG'),  # NFC east
        ('MIN', 'DET', 'GNB', 'CHI'),  # NFC north
        ('NOR', 'CAR', 'ATL', 'TAM'),  # NFC south
        ('LAR', 'SEA', 'ARI', 'SFO'),  # NFC west
    )
    # Unpack by position. Indexing a string-labelled pandas row with x[0] /
    # x[1] relies on the deprecated "integer key as position" fallback;
    # materialising the values works for Series, tuples and lists alike.
    teams = list(x)
    home, away = teams[0], teams[1]
    # One membership test replaces the four branches that all returned 1.
    return int(any(home in division and away in division for division in divisions))


# Flag each game with 1 when home and away teams share a division, else 0.
matchups = df[['Home_team', 'Away_team']]
df['Division_game'] = matchups.apply(division_game, axis=1)

In [42]:
# measure time between games

# df['game_type'] = df[['TNF', 'Sunday_AM', 'Sunday_PM', 'SNF', 'MNF', 'England', 'Saturday']].idxmax(axis=1)

# def time_btwn(x):
#     week = x[0]
#     team = x[1]
#     season = x[2]
#     game_t = {'TNF':0, 'Sunday_AM':3, 'Sunday_PM':3, 'SNF':3, 'MNF':4, 'England':3, 'Saturday':2}
#     try: 
#         week_after = df[((df['Week']==week) & (df['Season']==season)) & ((df['Home_team']==team) | (df['Away_team']==team))]['game_type'].iloc[0]
#         try:
#             week_before = df[((df['Week']==week-1) & (df['Season']==season)) & ((df['Home_team']==team) | (df['Away_team']==team))]['game_type'].iloc[0]
#             weekly = 7
#         except:
#             week_before = df[((df['Week']==week-2) & (df['Season']==season)) & ((df['Home_team']==team) | (df['Away_team']==team))]['game_type'].iloc[0]
#             weekly = 14
#         tb = game_t[week_after] + weekly - game_t[week_before]
#     except:
#         tb = 7 
#     return tb

# df['Home_team_days_after_last_game'] = df[['Week','Home_team','Season']].apply(time_btwn, axis=1)
# df['Away_team_days_after_last_game'] = df[['Week','Away_team','Season']].apply(time_btwn, axis=1)

# df.drop('game_type', axis=1, inplace=True)

In [74]:
# ADD GAME DATA
# Load the scraped per-game results and discard the leftover index-counter column.
pg = pd.read_csv('NFL_scrape.csv').drop('Unnamed: 0', axis=1)

# Left-join the results onto the feature table by week, season and the two teams.
# NOTE(review): a left join keeps games with no match in the scrape, leaving
# their result columns empty -- confirm every game is matched.
df_hw = df.merge(pg, how='left', on=['Week', 'Season', 'Home_team', 'Away_team'])

# The working table excludes 'Home_win'; df_hw keeps it.
df = df_hw.drop('Home_win', axis=1)

In [64]:
# DUMMY VARIABLES FOR TEAMS, MAKE SURE TO CHECK THAT TEAMS ARE NOT DROPPED FROM TRAINING DATA BELOW

# One-hot encode the home team first, then the away team; each pass replaces
# the team column with one indicator column per team.
for team_col in ('Home_team', 'Away_team'):
    df = pd.get_dummies(data=df, columns=[team_col])

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

In [75]:
# Hold out the 2017 season as the test set; train on every other season.
df_test_by_season = df[df['Season'] == 2017]
df_train_by_season = df[df['Season'] != 2017]

# Columns that must never be used as features: the season, the raw team
# names, and the target itself.
non_feature_cols = ['Season', 'Home_team', 'Away_team', 'Home_cover']

# errors='ignore' lets this cell run whether or not the team columns have
# already been replaced by dummy columns. Without it, dropping 'Home_team' /
# 'Away_team' raises KeyError once the dummy-encoding cell has run (e.g. on
# Restart & Run All), which is why two alternative drop lists used to be
# kept here as commented-out code.
y_train = df_train_by_season['Home_cover']
X_train = df_train_by_season.drop(non_feature_cols, axis=1, errors='ignore')

y_test = df_test_by_season['Home_cover']
X_test = df_test_by_season.drop(non_feature_cols, axis=1, errors='ignore')

In [None]:
# Grid-search the regularisation type and strength of a logistic regression
# with 5-fold cross-validation, selecting on accuracy.
n_params = [{'penalty':['l1','l2'], 'C':[.01,.1,1,10,100]}]

# The solver is pinned to 'liblinear' because it supports both the 'l1' and
# 'l2' penalties in the grid; the current scikit-learn default solver
# ('lbfgs') rejects 'l1', so half of the grid would fail to fit.
clf1 = GridSearchCV(LogisticRegression(solver='liblinear'), n_params, cv=5, scoring='accuracy')
clf1.fit(X_train, y_train)

# best_score_ is the mean cross-validated *accuracy* (scoring='accuracy'),
# so the label says accuracy rather than f1.
print('LR GridSearchCV accuracy Score: ', clf1.best_score_, ' with ' , clf1.best_params_ )

print('LR Test Score: ', accuracy_score(y_test, clf1.predict(X_test)))
print('LR Train Score: ', accuracy_score(y_train, clf1.predict(X_train)))

In [67]:
# Top 5 predictions per week, logistic regression.
# For each week of the held-out season, rank games by how far the predicted
# probability is from 0.5, take the five most confident picks and print the
# resulting win-loss record.

for week in range(2,18):
    df_test_by_week = df_test_by_season[df_test_by_season['Week']==week]
    if df_test_by_week.empty:
        # No games for this week: the classifier cannot score an empty frame.
        continue

    y_test_by_week = df_test_by_week['Home_cover']
    # errors='ignore': the team columns may already have been replaced by
    # dummy columns, in which case only the columns still present are dropped.
    X_test_by_week = df_test_by_week.drop(['Season','Home_team', 'Away_team', 'Home_cover'], axis=1, errors='ignore')

    # Probability of the positive class, computed once and reused for both
    # the confidence and the predicted label.
    home_cover_proba = clf1.predict_proba(X_test_by_week)[:,1]
    predicted = [0 if p<.5 else 1 for p in home_cover_proba]

    # Tuples of (confidence, actual outcome, predicted outcome).
    guess = list(zip(abs(home_cover_proba-.5), y_test_by_week, predicted))
    top5 = sorted(guess, reverse=True)[:5]
    print(top5)

    # Wins are picks whose prediction matches the actual outcome.
    w = sum(1 for _, actual, pick in top5 if actual == pick)
    print('Week %d record: ' % week)
    print(str(w) +'-' + str(len(top5)-w))

[(0.038969895997213788, 1, 0), (0.038969895997213788, 0, 0), (0.019415817899440191, 0, 0), (0.018319608630932238, 1, 0), (0.016615300845438996, 0, 0)]
Week 2 record: 
3-2
[(0.030015174475391482, 0, 0), (0.021506070113266496, 0, 0), (0.017246522541019271, 0, 0), (0.016394292310066005, 0, 0), (0.01322887600977668, 1, 0)]
Week 3 record: 
4-1
[(0.041920736477732645, 1, 0), (0.024690782417097601, 0, 0), (0.022380904923892331, 0, 0), (0.022136812743979606, 0, 0), (0.021285225626029247, 1, 0)]
Week 4 record: 
3-2
[(0.026415982109170622, 0, 0), (0.025321148096767609, 0, 0), (0.023862870258376301, 0, 0), (0.022767484789604053, 1, 0), (0.01765671895645593, 0, 0)]
Week 5 record: 
4-1
[(0.033602810807040751, 0, 0), (0.033602810807040751, 0, 0), (0.031053928773015493, 0, 0), (0.030447611803493924, 0, 0), (0.030203934617488215, 1, 0)]
Week 6 record: 
4-1
[(0.032776598858287154, 1, 0), (0.029377012021258642, 1, 0), (0.025730774439351678, 1, 0), (0.025730774439351678, 1, 0), (0.024879764042239461, 1, 