# Notebook to document that the hyperparameter tuned random forest does not improve to the point of a linear regression



# Feature engineering on the NFL spread_scores dataset
The general plan is to add a bunch of features, make a shallow learning predictive pipeline, and test the effectiveness of the features.  The goal is to improve the percent chance of winning prediction component of the simulation.    

Plan of basic features to add:  
* win/loss percent
* streak
* points for/ points against
* home/ away
* team name (one-hot-encode)

These features can be calculated with various parameter settings:
* number of games to look back
* whether to look back to the previous season
* whether to weight the data based on the opponent at the time
* operations on parameters, e.g., squared, log normalized, parameters multiplied or added together  

There are a lot of potential combinations.  All features will end up being doubled (added for the opponent as well).

There are two ways to approach this: either build all the features in a large CSV file, and then test the algorithms, or build them one or a few at a time and test as we go.  The second sounds more fun, so I'll go with that.  



In [17]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

## Load the data and do some wrangling

In [2]:
path = r'..\processed_data'
data = pd.read_csv(path + '\\spreadspoke_scores_processed.csv')

In [3]:
data.columns

Index(['Unnamed: 0', 'schedule_date', 'schedule_season', 'schedule_week',
       'schedule_playoff', 'team_home', 'score_home', 'score_away',
       'team_away', 'team_favorite_id', 'spread_favorite', 'over_under_line',
       'stadium', 'stadium_neutral', 'weather_temperature', 'weather_wind_mph',
       'weather_humidity', 'weather_detail', 'team_home_id', 'team_away_id',
       'winner', 'favorite_won', 'team_underdog_id'],
      dtype='object')

In [4]:
# base the "left hand" team based on the home team; make the spread negative = favorite   
data.loc[(data.team_home_id == data.team_favorite_id),'spread_home'] = data['spread_favorite']
data.loc[(data.team_home_id != data.team_favorite_id),'spread_home'] = -data['spread_favorite']

In [5]:
data['home_won'] = (data.winner == data.team_home_id)

In [6]:
# replace the playoffs with numbers
playoff_list = data.schedule_week.drop_duplicates().sort_values()[-6::].tolist()
number_playoffs = {'Conference':21, 'Division':20, 'SuperBowl':22, 'Superbowl':22, 'WildCard':19, 'Wildcard':19}
data.loc[data.schedule_week.isin(playoff_list), 'schedule_week'] = data.loc[data.schedule_week.isin(playoff_list), 'schedule_week'].map(number_playoffs)
data.schedule_week = pd.to_numeric(data.schedule_week)
data.score_home = pd.to_numeric(data.score_home)
data.score_away = pd.to_numeric(data.score_away)
data.head()

Unnamed: 0.1,Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,...,weather_wind_mph,weather_humidity,weather_detail,team_home_id,team_away_id,winner,favorite_won,team_underdog_id,spread_home,home_won
0,0,09/01/1979,1979,1,False,Tampa Bay Buccaneers,31.0,16.0,Detroit Lions,TB,...,9.0,87.0,,TB,DET,TB,True,DET,-3.0,True
1,1,11/23/1980,1980,12,False,Tampa Bay Buccaneers,10.0,24.0,Detroit Lions,TB,...,9.0,77.0,,TB,DET,DET,False,DET,-3.0,False
2,2,10/04/1981,1981,5,False,Tampa Bay Buccaneers,28.0,10.0,Detroit Lions,TB,...,9.0,76.0,,TB,DET,TB,True,DET,-1.0,True
3,3,12/26/1982,1982,8,False,Tampa Bay Buccaneers,23.0,21.0,Detroit Lions,TB,...,11.0,72.0,,TB,DET,TB,True,DET,-3.5,True
4,4,09/04/1983,1983,1,False,Tampa Bay Buccaneers,0.0,11.0,Detroit Lions,TB,...,7.0,83.0,,TB,DET,DET,False,DET,-3.0,False


## First Benchmark = logistic regression on point spreads

In [7]:
X = data.spread_home
X = X.values.reshape(-1, 1) 
y = data.home_won
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [8]:
model = LogisticRegression().fit(X_train, y_train)

In [9]:
percent_correct = sum(model.predict(X_test)==y_test)/len(y_test)
percent_correct

0.6597370834607154

In [10]:
log_loss(y_pred=model.predict_proba(X_test), y_true=y_test)

0.6148705115814553

## Second Benchmark = decision tree on point spreads

In [11]:
model = DecisionTreeClassifier().fit(X_train, y_train)

In [12]:
percent_correct = sum(model.predict(X_test)==y_test)/len(y_test)
percent_correct

0.6502598593702231

In [13]:
log_loss(y_pred=model.predict_proba(X_test), y_true=y_test)

0.6588701109321393

## Third Benchmark = random forest on point spreads

In [14]:
model = RandomForestClassifier().fit(X_train, y_train)

In [15]:
percent_correct = sum(model.predict(X_test)==y_test)/len(y_test)
percent_correct

0.6514827269948028

In [16]:
log_loss(y_pred=model.predict_proba(X_test), y_true=y_test)

0.6592703381789566

## Fourth Benchmark = extra trees on point spreads

In [19]:
model = ExtraTreesClassifier().fit(X_train, y_train)

In [20]:
percent_correct = sum(model.predict(X_test)==y_test)/len(y_test)
percent_correct

0.6502598593702231

In [21]:
log_loss(y_pred=model.predict_proba(X_test), y_true=y_test)

0.6588701109321392

## Make the first feature
Points for - points against in the last n games

### Some more wrangling

In [72]:
# reorganize the DF so each team is listed once (each game is listed twice)
games = data[['schedule_season', 'schedule_week', 'team_home_id', 'team_away_id', 'spread_home', 
              'score_home', 'score_away','home_won']].sort_values(by = ['schedule_season', 'schedule_week', 'team_home_id'])
games['home'] = True
games.rename(columns = {'team_home_id':'team', 'team_away_id': 'opponent', 'spread_home':'spread',
                        'score_home':'pts_for', 'score_away': 'pts_against', 'home_won' : 'won'}, inplace = True)
games.tail()

Unnamed: 0,schedule_season,schedule_week,team,opponent,spread,pts_for,pts_against,won,home
9901,2018,20,NE,LAC,-3.5,41.0,28.0,True,True
2514,2018,20,NO,PHI,-8.5,20.0,14.0,True,True
6312,2018,21,KC,NE,-3.0,31.0,37.0,False,True
370,2018,21,NO,LAR,-3.0,23.0,26.0,False,True
6316,2018,22,LAR,NE,2.0,3.0,13.0,False,True


In [73]:
copy = games.copy()
copy.rename(columns = {'team':'opponent', 'opponent':'team', 'pts_for': 'pts_against', 'pts_against':'pts_for'}, inplace = True)
copy.home = False
copy.spread = -copy.spread
copy.won = -copy.won
copy.tail()

Unnamed: 0,schedule_season,schedule_week,opponent,team,spread,pts_against,pts_for,won,home
9901,2018,20,NE,LAC,3.5,41.0,28.0,False,False
2514,2018,20,NO,PHI,8.5,20.0,14.0,False,False
6312,2018,21,KC,NE,3.0,31.0,37.0,True,False
370,2018,21,NO,LAR,3.0,23.0,26.0,True,False
6316,2018,22,LAR,NE,-2.0,3.0,13.0,True,False


In [74]:
games = pd.concat([games, copy]).sort_values(by = ['schedule_season', 'schedule_week', 'team'])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [75]:
games = games[[ 'schedule_season', 'schedule_week','team','opponent', 'home', 'spread','pts_for', 'pts_against', 'won']]

In [76]:
games.tail()

Unnamed: 0,schedule_season,schedule_week,team,opponent,home,spread,pts_for,pts_against,won
370,2018,21,LAR,NO,False,3.0,26.0,23.0,True
6312,2018,21,NE,KC,False,3.0,37.0,31.0,True
370,2018,21,NO,LAR,True,-3.0,23.0,26.0,False
6316,2018,22,LAR,NE,True,2.0,3.0,13.0,False
6316,2018,22,NE,LAR,False,-2.0,13.0,3.0,True


In [77]:
# add running tally of game counts for each team
teams = games.team.drop_duplicates().sort_values().to_list()
for team in teams:
    games.loc[games.team == team, 'team_game_count'] =  np.arange(sum(games.team == team)) 

In [78]:
games[games.team == team].head()

Unnamed: 0,schedule_season,schedule_week,team,opponent,home,spread,pts_for,pts_against,won,team_game_count
5801,1979,1,WAS,TEN,True,4.0,27.0,29.0,False,0.0
4584,1979,2,WAS,DET,False,0.0,27.0,24.0,True,1.0
1269,1979,3,WAS,NYG,True,-6.0,27.0,0.0,True,2.0
4570,1979,4,WAS,ARI,False,3.0,17.0,7.0,True,3.0
4667,1979,5,WAS,ATL,False,3.0,16.0,7.0,True,4.0


## Add points for/ points against features

In [79]:
def log_reg(X, y):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    lr = LogisticRegression().fit(X_train, y_train)
    
    percent_correct = sum(lr.predict(X_test)==y_test)/len(y_test)
    ll = log_loss(y_pred=lr.predict_proba(X_test), y_true=y_test)
    print('percent_correct = ',percent_correct*100)
    print('log_loss = ', ll)

In [80]:
# add previous pts_for/ pts_against in previous n games

def for_against(df, lookbacks = [1]):
    games = df.copy()
    
    average_pts_for = games.pts_for.mean() # for imputing the first value
    params = ['pts_for', 'pts_against']
    new_features = []


    for lookback in lookbacks:

        for param in params:
            new_feature = param+'_roll_'+str(lookback)
            new_features.append(new_feature)
            for team in teams:
                rolling  = games[games.team == team][param].rolling(window = lookback, min_periods = 1).mean().to_list()
                # rolling average is inclusive - shift back and impute the first value as the global average
                rolling.insert(0,average_pts_for)
                del rolling[-1]
                games.loc[games.team == team, new_feature] = rolling

    # add opponents' pts for and against
    opp_features = []
    for feature in new_features:
        opp_features.append('opp_'+feature)

    col_names = { 'team':'opponent'}
    col_names.update(dict(zip(new_features, opp_features)))
    col_names

    games = games.merge(games[['schedule_season', 'schedule_week', 'team']+new_features
                     ].rename(columns  = col_names), 
                on = ['schedule_season', 'schedule_week', 'opponent'] )

    games.columns

    features = new_features+opp_features
    target = 'won'

    # keep home games and features
    X = games[games.home][features]
    y = games[games.home][target]
    print('lookbacks = ',lookbacks)
    log_reg(X, y)

In [82]:
games.head()

Unnamed: 0,schedule_season,schedule_week,team,opponent,home,spread,pts_for,pts_against,won,team_game_count
7346,1979,1,ARI,DAL,True,4.0,21.0,22.0,False,0.0
3888,1979,1,ATL,NO,False,5.0,40.0,34.0,True,0.0
4784,1979,1,BUF,MIA,True,5.0,7.0,9.0,False,0.0
839,1979,1,CHI,GB,True,-3.0,6.0,3.0,True,0.0
5452,1979,1,CIN,DEN,False,3.0,0.0,10.0,False,0.0


In [85]:
# logistic regression based on the point spreads only
features = 'spread'
target = 'won'
X = games[games.home][features]
X = X.values.reshape(-1, 1) 
y = games[games.home][target]
log_reg(X, y)

percent_correct =  65.60684805869764
log_loss =  0.61929562483345




In [86]:
for l in range(1,21):
    lookbacks = [l]
    for_against(games, lookbacks = lookbacks)

lookbacks =  [1]
percent_correct =  60.26894865525673
log_loss =  0.6653583924555065




lookbacks =  [2]
percent_correct =  62.011002444987774
log_loss =  0.6567845479612643




lookbacks =  [3]
percent_correct =  61.980440097799516
log_loss =  0.6489572274590097




lookbacks =  [4]
percent_correct =  63.6002444987775
log_loss =  0.6408653205514933




lookbacks =  [5]
percent_correct =  63.997555012224936
log_loss =  0.6383520778954074




lookbacks =  [6]
percent_correct =  64.82273838630806
log_loss =  0.6355631187644272




lookbacks =  [7]
percent_correct =  64.60880195599023
log_loss =  0.6333306841198364




lookbacks =  [8]
percent_correct =  64.70048899755501
log_loss =  0.6310308028084285




lookbacks =  [9]
percent_correct =  64.60880195599023
log_loss =  0.6298247966674227




lookbacks =  [10]
percent_correct =  65.00611246943765
log_loss =  0.6284413480901965




lookbacks =  [11]
percent_correct =  65.03667481662592
log_loss =  0.6268448517354653




lookbacks =  [12]
percent_correct =  64.82273838630806
log_loss =  0.6273310266480503




lookbacks =  [13]
percent_correct =  65.3117359413203
log_loss =  0.6270436155738742




lookbacks =  [14]
percent_correct =  65.25061124694376
log_loss =  0.6279597693537401




lookbacks =  [15]
percent_correct =  64.79217603911981
log_loss =  0.6287333999920309




lookbacks =  [16]
percent_correct =  64.8838630806846
log_loss =  0.6283667875785148




lookbacks =  [17]
percent_correct =  64.91442542787286
log_loss =  0.6287644871671937




lookbacks =  [18]
percent_correct =  64.63936430317848
log_loss =  0.6291610534802983




lookbacks =  [19]
percent_correct =  64.39486552567237
log_loss =  0.6298917731400313




lookbacks =  [20]
percent_correct =  64.18092909535453
log_loss =  0.6309164693404898




10 - 14 are the best.  Try with combinations of 2 lookbacks

In [389]:
def rand_for(X, y):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)

    clf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=4, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1600, n_jobs=None,
            oob_score=False, verbose=0,
            warm_start=False, random_state=42)
    clf.fit(X_train, y_train) 
    
    percent_correct = sum(clf.predict(X_test)==y_test)/len(y_test)
    ll = log_loss(y_pred = clf.predict_proba(X_test), y_true=y_test)
    print('percent_correct = ',percent_correct*100)
    print('log_loss = ', ll)



In [390]:
rand_for(X, y)

percent_correct =  64.3643031784841
log_loss =  0.6349863853069838


Ouch!  Badly overfit.

In [367]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train) 

percent_correct = sum(clf.predict(X_test)==y_test)/len(y_test)
ll = log_loss(y_pred = clf.predict_proba(X_test), y_true=y_test)
print('percent_correct = ',percent_correct*100)
print('log_loss = ', ll)

percent_correct =  57.12102689486552
log_loss =  0.8951736766868521




In [379]:
# try to whole dataset to see how bad overfitting can get
clf.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [380]:
sum(clf.predict(X)==y)/len(y)

0.9836578230606274

In [381]:
log_loss(y_pred = clf.predict_proba(X), y_true=y)

0.19527542206080925

In [384]:
# hyperparameter tuning with help from here: 
# https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

random_grid

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
 'max_features': ['auto', 'sqrt'],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'min_samples_split': [2, 5, 10],
 'min_samples_leaf': [1, 2, 4],
 'bootstrap': [True, False]}

In [386]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, 
                               n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)# Fit the random search model
rf_random.fit(X, y)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 10.4min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 20.3min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [388]:
rf_random.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=4, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1600, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [373]:
clf.predict(X_test)

array([False,  True,  True, ..., False, False,  True])

In [369]:
y_test

11660    False
16078     True
721       True
7077      True
11996    False
5485     False
2308     False
14383    False
17060    False
13506     True
8604      True
6785     False
19200    False
16669     True
18994    False
13327    False
8084      True
18992     True
14712    False
4465     False
9410      True
17616     True
622       True
5477      True
13938    False
14934     True
69        True
15507    False
17218    False
1704     False
         ...  
5251      True
11163    False
16357     True
9069      True
1609     False
9108      True
9683      True
10941    False
17556    False
15371    False
6929      True
10852     True
2486      True
16370     True
10933     True
19610     True
1094      True
6043     False
16055     True
7825      True
50        True
9731     False
17490     True
5016     False
2867      True
5164     False
2605     False
11424    False
12813     True
19709     True
Name: won, Length: 3272, dtype: bool