# Feature engineering on the NFL spread_scores dataset

Try more feature-engineering combinations, using the functions from the previous feature_engineering notebook, which have now been moved into plain `.py` modules (`feature_engineering.py` and `prediction_functions.py`).

In [1]:
import pandas as pd
import numpy as np
# feature_engineering.py has the functions that we want
import feature_engineering as fe
# prediction_functions.py has some wrappers around some shallow learning algos
# including train_test split 
import prediction_functions as pf

## Load the data and add some features

In [2]:
# Read the dataset and pass it straight through the module's processing step.
data = fe.process(fe.read_data())

# Build the weighted for/against features for lookbacks of 4 and 14. The call
# returns the updated frame plus the list of column names it added.
data, new_features = fe.for_against_weighted(
    data,
    lookbacks=[4, 14],
    drop_unweighted=True,  # presumably drops the unweighted intermediates; confirm in fe
)

In [3]:
# Preview the first rows to check which columns the feature step produced.
data.head()

Unnamed: 0,schedule_season,schedule_week,team,opponent,home,spread,pts_for,pts_against,won,pts_for_adj_roll_4,pts_against_adj_roll_4,pts_for_adj_roll_14,pts_against_adj_roll_14,opp_pts_for_adj_roll_4,opp_pts_against_adj_roll_4,opp_pts_for_adj_roll_14,opp_pts_against_adj_roll_14
0,1979,1,ARI,DAL,True,4.0,21.0,22.0,False,0.034506,0.034506,0.034506,0.034506,0.034506,0.034506,0.034506,0.034506
1,1979,1,ATL,NO,False,5.0,40.0,34.0,True,0.034506,0.034506,0.034506,0.034506,0.034506,0.034506,0.034506,0.034506
2,1979,1,BUF,MIA,True,5.0,7.0,9.0,False,0.034506,0.034506,0.034506,0.034506,0.034506,0.034506,0.034506,0.034506
3,1979,1,CHI,GB,True,-3.0,6.0,3.0,True,0.034506,0.034506,0.034506,0.034506,0.034506,0.034506,0.034506,0.034506
4,1979,1,CIN,DEN,False,3.0,0.0,10.0,False,0.034506,0.034506,0.034506,0.034506,0.034506,0.034506,0.034506,0.034506


In [4]:
# Display the list of engineered column names that the models below select on.
new_features

['pts_for_adj_roll_4',
 'pts_against_adj_roll_4',
 'pts_for_adj_roll_14',
 'pts_against_adj_roll_14',
 'opp_pts_for_adj_roll_4',
 'opp_pts_against_adj_roll_4',
 'opp_pts_for_adj_roll_14',
 'opp_pts_against_adj_roll_14']

## Try several options to see how the new features fare

In [5]:
# Baseline: logistic regression on the spread alone. The single column is
# reshaped to a 2-D (n, 1) array before being handed to the wrapper.
pf.log_reg(
    X=data['spread'].values.reshape(-1, 1),
    y=data['won'],
)

percent_correct =  65.68462830102274
log_loss =  0.6197260837410923




In [6]:
# Logistic regression on the engineered features together with the spread,
# for comparison against the spread-only baseline.
pf.log_reg(
    X=data[[*new_features, 'spread']],
    y=data['won'],
)

percent_correct =  65.4556556250954
log_loss =  0.6200619268454557




In [7]:
# Logistic regression on the engineered features only (no spread), to see how
# much signal they carry on their own.
pf.log_reg(
    X=data[list(new_features)],
    y=data['won'],
)

percent_correct =  61.120439627537785
log_loss =  0.6526686862934007




In [8]:
# Random forest on the engineered features plus the spread.
# NOTE(review): no seed is visible here; scores may shift between runs unless
# pf.rand_for fixes one internally. Confirm before comparing small differences.
pf.rand_for(
    X=data[[*new_features, 'spread']],
    y=data['won'],
)

    percent_correct =  65.54724469546635
    log_loss =  0.6243288950238


## Try some more features

In [9]:
# Add a cubed-spread feature. fe.param_cube returns the updated frame and the
# name of the column it added.
data, new_feature = fe.param_cube(data, param='spread')

# Guard the append so that re-running this cell does not register the same
# column twice; a duplicate name would make data[new_features + ['spread']]
# select the column twice and silently change the model inputs.
if new_feature not in new_features:
    new_features.append(new_feature)

In [10]:
# Preview again to confirm the newly added column appears in the frame.
data.head()

Unnamed: 0,schedule_season,schedule_week,team,opponent,home,spread,pts_for,pts_against,won,pts_for_adj_roll_4,pts_against_adj_roll_4,pts_for_adj_roll_14,pts_against_adj_roll_14,opp_pts_for_adj_roll_4,opp_pts_against_adj_roll_4,opp_pts_for_adj_roll_14,opp_pts_against_adj_roll_14,spread3
0,1979,1,ARI,DAL,True,4.0,21.0,22.0,False,0.034506,0.034506,0.034506,0.034506,0.034506,0.034506,0.034506,0.034506,64.0
1,1979,1,ATL,NO,False,5.0,40.0,34.0,True,0.034506,0.034506,0.034506,0.034506,0.034506,0.034506,0.034506,0.034506,125.0
2,1979,1,BUF,MIA,True,5.0,7.0,9.0,False,0.034506,0.034506,0.034506,0.034506,0.034506,0.034506,0.034506,0.034506,125.0
3,1979,1,CHI,GB,True,-3.0,6.0,3.0,True,0.034506,0.034506,0.034506,0.034506,0.034506,0.034506,0.034506,0.034506,-27.0
4,1979,1,CIN,DEN,False,3.0,0.0,10.0,False,0.034506,0.034506,0.034506,0.034506,0.034506,0.034506,0.034506,0.034506,27.0


In [11]:
# Re-fit the random forest now that the feature list has grown.
# NOTE(review): no seed is visible here; confirm pf.rand_for is deterministic
# before reading much into small score changes.
pf.rand_for(
    X=data[[*new_features, 'spread']],
    y=data['won'],
)

    percent_correct =  65.7914822164555
    log_loss =  0.6227814536319197


#### Spread cubed is a slight improvement

Next, try difference features that could be useful in theory: a team's rolling points-for minus its opponent's rolling points-against (and vice versa), each also cubed.

In [13]:
# For each (own stat, opposing stat) pair, add the difference of the two
# columns and then the cube of that difference. Both helpers return the
# updated frame and the name of the column they added.
stat_pairs = [
    ('pts_for_adj_roll_4', 'opp_pts_against_adj_roll_4'),
    ('pts_against_adj_roll_4', 'opp_pts_for_adj_roll_4'),
]
for own_stat, opp_stat in stat_pairs:
    data, new_feature = fe.params_subtract(data, param0=own_stat, param1=opp_stat)
    # Skip names already registered so re-running the cell cannot create
    # duplicate entries (and therefore duplicate model input columns).
    if new_feature not in new_features:
        new_features.append(new_feature)
    data, new_feature = fe.param_cube(data, param=new_feature)
    if new_feature not in new_features:
        new_features.append(new_feature)

In [14]:
# Random forest with the difference and cubed-difference features included.
# NOTE(review): no seed is visible here; confirm pf.rand_for is deterministic
# before reading much into small score changes.
pf.rand_for(
    X=data[[*new_features, 'spread']],
    y=data['won'],
)

    percent_correct =  65.40986108990994
    log_loss =  0.6229624860279247


Not much happening. What next? Add each team's win/loss record to the mix, again with lookbacks of 4 and 14.

In [15]:
# Add win/loss record features for lookbacks of 4 and 14. fe.win_loss returns
# the updated frame and the list of column names it added.
data, more_new_features = fe.win_loss(data, lookbacks=[4, 14])

# Extend in place, but only with names not already registered, so re-running
# this cell does not duplicate entries in the feature list.
new_features += [name for name in more_new_features if name not in new_features]

In [16]:
# Random forest with the win/loss record features included.
# NOTE(review): no seed is visible here; confirm pf.rand_for is deterministic
# before reading much into small score changes.
pf.rand_for(
    X=data[[*new_features, 'spread']],
    y=data['won'],
)

    percent_correct =  66.68679245283019
    log_loss =  0.613425558194899


Helps a little.  Try 10^spread?  

In [17]:
# Add an exponential transform of the spread. fe.param_exp returns the updated
# frame and the name of the column it added.
data, new_feature = fe.param_exp(data, param='spread')

# Guard the append: this cell also fits a model, so it is likely to be re-run,
# and an unconditional append would add the same column name again each time.
if new_feature not in new_features:
    new_features.append(new_feature)

# Re-fit the random forest with the new column included.
pf.rand_for(X=data[new_features + ['spread']], y=data.won)

    percent_correct =  66.70188679245283
    log_loss =  0.6129176842213229


In [18]:
# Logistic regression on the full engineered feature set plus the spread, for
# comparison with the random forest result above.
pf.log_reg(
    X=data[[*new_features, 'spread']],
    y=data['won'],
)

percent_correct =  65.87169811320754
log_loss =  0.6171984151410731


