## Variable Importance ##
### Code for determining the top ten variables for predicting the winner of a football game ###

**Importing the data**

In [1]:
import boto3
import pandas as pd; pd.set_option('display.max_column', 100)
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier,  GradientBoostingClassifier 
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm #adds progress bar!

# Open an S3 resource handle and select the bucket that stores the data set
s3 = boto3.resource(service_name='s3')
bucket_name = 'craig-shaffer-data-445-bucket'
bucket = s3.Bucket(bucket_name)

# Key (object name) of the NFL games CSV inside that bucket
file_key = 'NFL_2011_21.csv'

# Request the object; the response's 'Body' entry is the readable byte stream
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# Parse the stream straight into a DataFrame and preview the first rows
games = pd.read_csv(filepath_or_buffer=file_content_stream)
games.head()

Unnamed: 0.1,Unnamed: 0,game_id,season,game_type,week,gameday,weekday,gametime,gametime_hour,gametime_minute,away_team,away_score,home_team,home_score,location,result,total,overtime,pfr,espn,away_rest,home_rest,away_moneyline,home_moneyline,spread_line,away_spread_odds,home_spread_odds,total_line,under_odds,over_odds,div_game,roof,surface,temp,wind,away_qb_id,home_qb_id,referee,stadium_id,stadium,outdoor,grass,playoff,home_DVOA_Rank,home_DVOA,away_DVOA_Rank,away_DVOA,home_win,home_afterbye,away_afterbye,tie
0,1,2011_01_NO_GB,2011,REG,1,9/8/2011,Thursday,20:30,20,30,NO,34,GB,42,Home,8,76,0,201109080gnb,310908009,7,7,222,-250,4.5,100,-108,47.5,104,-115,0,outdoors,grass,68,5,00-0020531,00-0023459,Clete Blakeman,GNB00,Lambeau Field,1,1,0,2,0.26,1,0.29,1,0,0,0
1,2,2011_01_PIT_BAL,2011,REG,1,9/11/2011,Sunday,13:00,13,0,PIT,7,BAL,35,Home,28,42,0,201109110rav,310911033,7,7,113,-125,1.0,108,-117,37.0,-101,-109,1,outdoors,sportturf,75,2,00-0022924,00-0026158,Tony Corrente,BAL00,M&T Bank Stadium,1,0,0,8,0.14,3,0.22,1,0,0,0
2,3,2011_01_ATL_CHI,2011,REG,1,9/11/2011,Sunday,13:00,13,0,ATL,12,CHI,30,Home,18,42,0,201109110chi,310911003,7,7,-116,105,-1.0,-109,101,40.5,-101,-109,0,outdoors,grass,76,7,00-0026143,00-0024226,Ed Hochuli,CHI98,Soldier Field,1,1,0,15,0.01,7,0.17,1,0,0,0
3,4,2011_01_CIN_CLE,2011,REG,1,9/11/2011,Sunday,13:00,13,0,CIN,27,CLE,17,Home,-10,44,0,201109110cle,310911005,7,7,273,-310,6.5,109,-118,36.5,-101,-109,1,outdoors,grass,72,9,00-0027973,00-0027688,Bill Leavy,CLE00,Cleveland Browns Stadium,1,1,0,25,-0.15,17,-0.01,0,0,0,0
4,5,2011_01_IND_HOU,2011,REG,1,9/11/2011,Sunday,13:00,13,0,IND,7,HOU,34,Home,27,41,0,201109110htx,310911034,7,7,369,-430,9.0,-110,102,44.0,-106,-104,1,open,grass,70,0,00-0003292,00-0022787,Walt Coleman,HOU00,Reliant Stadium,0,1,0,5,0.2,32,-0.35,1,0,0,0


**Removing variables that are not useful for prediction (e.g., identifiers and end-of-game scores) or that were replaced by another variable**

In [2]:
# Discard every column that cannot be used as a pre-game predictor.
# The list is grouped by the reason each set of columns is removed.
games = games.drop(columns=[
    # identifier columns that came with the file
    'Unnamed: 0', 'game_id', 'home_team', 'away_team', 'location', 'pfr',
    'espn', 'away_qb_id', 'home_qb_id', 'referee', 'stadium_id', 'stadium',
    # variables replaced during the data cleaning stage, e.g. by grass and
    # playoff (refer to the R file)
    'game_type', 'roof', 'surface', 'gametime',
    # values only known during/after the game (scores, overtime, ...)
    'away_score', 'home_score', 'result', 'total', 'overtime',
    # calendar fields
    'gameday', 'weekday',
])

games.head()

Unnamed: 0,season,week,gametime_hour,gametime_minute,away_rest,home_rest,away_moneyline,home_moneyline,spread_line,away_spread_odds,home_spread_odds,total_line,under_odds,over_odds,div_game,temp,wind,outdoor,grass,playoff,home_DVOA_Rank,home_DVOA,away_DVOA_Rank,away_DVOA,home_win,home_afterbye,away_afterbye,tie
0,2011,1,20,30,7,7,222,-250,4.5,100,-108,47.5,104,-115,0,68,5,1,1,0,2,0.26,1,0.29,1,0,0,0
1,2011,1,13,0,7,7,113,-125,1.0,108,-117,37.0,-101,-109,1,75,2,1,0,0,8,0.14,3,0.22,1,0,0,0
2,2011,1,13,0,7,7,-116,105,-1.0,-109,101,40.5,-101,-109,0,76,7,1,1,0,15,0.01,7,0.17,1,0,0,0
3,2011,1,13,0,7,7,273,-310,6.5,109,-118,36.5,-101,-109,1,72,9,1,1,0,25,-0.15,17,-0.01,0,0,0,0
4,2011,1,13,0,7,7,369,-430,9.0,-110,102,44.0,-106,-104,1,70,0,0,1,0,5,0.2,32,-0.35,1,0,0,0


**Removing games that end in a tie and removing the tie column**

Reasoning: In most cases, when you bet on a specific team to win and the game ends in a tie, your bet is refunded.

In [3]:
# Keep only the games whose tie flag is 0, then discard the flag itself:
# after the filter it holds the same value in every remaining row
games = games[games['tie'].eq(0)].drop(columns='tie')

**Determining the most important variables**

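Feature importances are averaged over random forest, AdaBoost, and gradient boosting models fit on 100 random train/test splits. The 10 variables with the highest average importance will be chosen for our final data set to make predictions.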

In [5]:
# Number of repeated train/test splits the importances are collected over
n_runs = 100

## Defining the input and target variables.
## They do not change between runs, so they are built once, outside the loop.
x = games.drop(columns = ['home_win'])
y = games['home_win']


def _make_adaboost(seed):
    """Return an unfitted AdaBoost classifier on depth-3 trees, seeded with `seed`.

    scikit-learn 1.2 renamed the `base_estimator` keyword to `estimator` and
    1.4 removed the old name. The new keyword is tried first and the old one is
    used as a fallback, so the cell runs on versions from either side of the rename.
    """
    tree = DecisionTreeClassifier(max_depth = 3)
    settings = dict(n_estimators = 500, learning_rate = 0.01, random_state = seed)
    try:
        return AdaBoostClassifier(estimator = tree, **settings)
    except TypeError:
        # older scikit-learn: only the original keyword name is accepted
        return AdaBoostClassifier(base_estimator = tree, **settings)


# One feature_importances_ vector is appended per fitted model (3 per run)
importances= []
for i in tqdm(range(0, n_runs)):
    # The run number seeds the split and every model: each run still sees a
    # different split, but re-running the notebook reproduces the same ranking.

    #Splitting data into train and test
    x_train, x_test, y_train, y_test= train_test_split(x, y, test_size= 0.2, stratify = y, random_state = i)

    #building 3 models to find the most important variables

    #random forest model
    rf_md = RandomForestClassifier(n_estimators = 500, max_depth = 3, random_state = i).fit(x_train, y_train)
    #extract variable importance
    importances.append(rf_md.feature_importances_)

    #adaboost model
    ada_md = _make_adaboost(i).fit(x_train, y_train)
    #extract variable importance
    importances.append(ada_md.feature_importances_)

    #gradient boosting model
    gb_md = GradientBoostingClassifier(max_depth = 3, n_estimators = 500, learning_rate = 0.01, random_state = i).fit(x_train, y_train)
    #extract variable importance
    importances.append(gb_md.feature_importances_)

100%|██████████| 100/100 [11:53<00:00,  7.14s/it]


In [6]:
# One row per fitted model, one column per feature. The labels are taken from
# the training matrix `x` itself rather than a hand-typed list, so they always
# match the columns the models were fit on; a length mismatch raises instead
# of silently mislabelling the importances.
importance = pd.DataFrame(importances, columns = x.columns)

# Mean importance of each feature over all rows, largest first
i = pd.DataFrame({'Importance': importance.mean(axis = 0)})
i = i.sort_values(by = 'Importance', ascending = False)
i.head(10)

Unnamed: 0,Importance
home_moneyline,0.147186
away_DVOA,0.124452
home_DVOA,0.121425
away_moneyline,0.105816
away_DVOA_Rank,0.105792
spread_line,0.093758
home_DVOA_Rank,0.090396
total_line,0.041853
temp,0.027272
home_spread_odds,0.019971


**The variables that will be included in the final models will be:**

home_moneyline, away_DVOA, home_DVOA, away_moneyline, away_DVOA_Rank, spread_line, home_DVOA_Rank, total_line, temp, and home_spread_odds

In [9]:
# Drop every predictor that is not among the ten selected variables;
# home_win is not listed, so it stays in the frame alongside them
gamesFinal = games.drop(columns=[
    'season', 'week', 'gametime_hour', 'gametime_minute',
    'away_rest', 'home_rest',
    'away_spread_odds', 'under_odds', 'over_odds',
    'div_game', 'wind', 'outdoor', 'grass', 'playoff',
    'home_afterbye', 'away_afterbye',
])
gamesFinal.head()

Unnamed: 0,away_moneyline,home_moneyline,spread_line,home_spread_odds,total_line,temp,home_DVOA_Rank,home_DVOA,away_DVOA_Rank,away_DVOA,home_win
0,222,-250,4.5,-108,47.5,68,2,0.26,1,0.29,1
1,113,-125,1.0,-117,37.0,75,8,0.14,3,0.22,1
2,-116,105,-1.0,101,40.5,76,15,0.01,7,0.17,1
3,273,-310,6.5,-118,36.5,72,25,-0.15,17,-0.01,0
4,369,-430,9.0,102,44.0,70,5,0.2,32,-0.35,1


**Export gamesFinal to CSV to use for model evaluation**

In [11]:
# Write the reduced games table to disk for the model prediction step;
# index=False keeps the row index out of the file
gamesFinal.to_csv(path_or_buf="games_Final.csv", index=False)