In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import seaborn as sns
sns.set()

from clean2 import clean_games

pd.set_option("display.max_columns", 250)
pd.set_option("display.max_rows", 73)

Let's clean & load our data. For time's sake, we're only going to use years 1990-2020.

In [2]:
game_df = clean_games("data/games.pickle", start_year=1990)

In [3]:
game_df.shape

(5954, 293)

In [5]:
# print all columns
for col_name in game_df.columns:
    print(col_name)

game_id
season_year
week_num
team_year
full_game_date
team
opp
decade
log_year
margin
game_outcome
pts_off
pts_def
pass_cmp
pass_att
pass_yds
pass_td
pass_int
pass_sacked
pass_sacked_yds
pass_yds_per_att
pass_net_yds_per_att
pass_cmp_perc
pass_rating
rush_att
rush_yds
rush_yds_per_att
rush_td
fgm
fga
xpm
xpa
punt
punt_yds
overtime
time_of_poss
third_down_success
third_down_att
fourth_down_success
fourth_down_att
team_home_game
result_tie
result_win
total_td_off
total_yds_off
prev_week_num
prev_result_win
prev_result_tie
prev_margin
off_bye
sn_total_pts_off
sn_total_margin
sn_total_pts_def
sn_total_pass_cmp
sn_total_pass_att
sn_total_pass_yds
sn_total_pass_td
sn_total_pass_int
sn_total_pass_sacked
sn_total_pass_sacked_yds
sn_total_pass_yds_per_att
sn_total_pass_net_yds_per_att
sn_total_pass_cmp_perc
sn_total_pass_rating
sn_total_rush_att
sn_total_rush_yds
sn_total_rush_yds_per_att
sn_total_rush_td
sn_total_fgm
sn_total_fga
sn_total_xpm
sn_total_xpa
sn_total_punt
sn_total_punt_yds
sn_tot

In [6]:
# construct a list containing all of the columns we could use in model
indicators = ['prev_', 'roll3', 'ewma_']
key_variables = [col for col in game_df.columns if col[:5] in indicators]
key_variables.sort()

additional = ['season_year', 'full_game_date', 'team', 'team_home_game', 'opp', 'margin']

for col in additional[::-1]:
    key_variables.insert(0, col)

key_variables

['season_year',
 'full_game_date',
 'team',
 'team_home_game',
 'opp',
 'margin',
 'prev_margin',
 'prev_margin_opp',
 'prev_result_tie',
 'prev_result_tie_opp',
 'prev_result_win',
 'prev_result_win_opp',
 'prev_week_num',
 'prev_week_num_opp',
 'roll3_num_ties',
 'roll3_num_ties_opp',
 'roll3_num_wins',
 'roll3_num_wins_opp']

Our DF is still too large to use `df.info()`.

Let's split the columns into sections. Since we have so many columns, we need to do quite a bit of splitting.

Let's start with a small selection of columns.

In [8]:
small_df = game_df[['margin', 'team', 'opp', 'game_id', 'season_year', 'sn_total_margin', 'ewma3_margin',
                    'ewma19_margin', 'sn_total_margin_opp', 'ewma3_margin_opp', 'ewma19_margin_opp']]

In [9]:
indicators = ['prev_', 'roll3', 'ewma_']

prev_vars = [col for col in game_df.columns if col[:5] == 'prev_']
roll_vars = [col for col in game_df.columns if col[:4] == 'roll']
sn_total_vars = [col for col in game_df.columns if col[:4] == 'sn_t']
ewma_vars = [col for col in game_df.columns if col[:4] == 'ewma']

# remove dupe in ewma_vars
ewma_vars = list(dict.fromkeys(ewma_vars))

prev_df = game_df[prev_vars]
roll3_df = game_df[roll_vars]
ewma_df = game_df[roll_vars]
sn_total_df = game_df[sn_total_vars]

In [10]:
# sns.pairplot(small_df, plot_kws=dict(alpha=.1, edgecolor='none'))

In [11]:
# sns.pairplot(def_df, plot_kws=dict(alpha=.1, edgecolor='none'))

In [12]:
# new ewma cols? i.e. ewma_total_yards_off - ewma_total_yards_off_opp

# create list of all ewma vars + row indicators & remove dupes
ewma_vars
ewma_vars

['ewma3_pts_off',
 'ewma3_margin',
 'ewma3_pts_def',
 'ewma3_pass_cmp',
 'ewma3_pass_att',
 'ewma3_pass_yds',
 'ewma3_pass_td',
 'ewma3_pass_int',
 'ewma3_pass_sacked',
 'ewma3_pass_sacked_yds',
 'ewma3_pass_yds_per_att',
 'ewma3_pass_net_yds_per_att',
 'ewma3_pass_cmp_perc',
 'ewma3_pass_rating',
 'ewma3_rush_att',
 'ewma3_rush_yds',
 'ewma3_rush_yds_per_att',
 'ewma3_rush_td',
 'ewma3_fgm',
 'ewma3_fga',
 'ewma3_xpm',
 'ewma3_xpa',
 'ewma3_punt',
 'ewma3_punt_yds',
 'ewma3_third_down_success',
 'ewma3_third_down_att',
 'ewma3_fourth_down_success',
 'ewma3_fourth_down_att',
 'ewma3_team_home_game',
 'ewma3_result_tie',
 'ewma3_result_win',
 'ewma3_total_td_off',
 'ewma3_total_yds_off',
 'ewma19_pts_off',
 'ewma19_margin',
 'ewma19_pts_def',
 'ewma19_pass_cmp',
 'ewma19_pass_att',
 'ewma19_pass_yds',
 'ewma19_pass_td',
 'ewma19_pass_int',
 'ewma19_pass_sacked',
 'ewma19_pass_sacked_yds',
 'ewma19_pass_yds_per_att',
 'ewma19_pass_net_yds_per_att',
 'ewma19_pass_cmp_perc',
 'ewma19_pass_

In [13]:
opp_ewmas = [var for var in ewma_vars if var[-4:] == '_opp']
team_ewmas = [var for var in ewma_vars[5:] if var not in opp_ewmas]

opp_ewmas.sort()
team_ewmas.sort()

In [14]:
opp_ewmas, team_ewmas

(['ewma19_fga_opp',
  'ewma19_fgm_opp',
  'ewma19_fourth_down_att_opp',
  'ewma19_fourth_down_success_opp',
  'ewma19_margin_opp',
  'ewma19_pass_att_opp',
  'ewma19_pass_cmp_opp',
  'ewma19_pass_cmp_perc_opp',
  'ewma19_pass_int_opp',
  'ewma19_pass_net_yds_per_att_opp',
  'ewma19_pass_rating_opp',
  'ewma19_pass_sacked_opp',
  'ewma19_pass_sacked_yds_opp',
  'ewma19_pass_td_opp',
  'ewma19_pass_yds_opp',
  'ewma19_pass_yds_per_att_opp',
  'ewma19_pts_def_opp',
  'ewma19_pts_off_opp',
  'ewma19_punt_opp',
  'ewma19_punt_yds_opp',
  'ewma19_result_tie_opp',
  'ewma19_result_win_opp',
  'ewma19_rush_att_opp',
  'ewma19_rush_td_opp',
  'ewma19_rush_yds_opp',
  'ewma19_rush_yds_per_att_opp',
  'ewma19_team_home_game_opp',
  'ewma19_third_down_att_opp',
  'ewma19_third_down_success_opp',
  'ewma19_total_td_off_opp',
  'ewma19_total_yds_off_opp',
  'ewma19_xpa_opp',
  'ewma19_xpm_opp',
  'ewma3_fga_opp',
  'ewma3_fgm_opp',
  'ewma3_fourth_down_att_opp',
  'ewma3_fourth_down_success_opp',
  

In [15]:
plot_cols = game_df[['ewma19_margin', 'ewma19_margin_opp', 'ewma19_pts_off',
                'ewma19_pts_off_opp', 'margin']]



In [16]:
# sns.pairplot(plot_cols, plot_kws=dict(alpha=.1, edgecolor='none'))

In [17]:
plot_cols.describe()

Unnamed: 0,ewma19_margin,ewma19_margin_opp,ewma19_pts_off,ewma19_pts_off_opp,margin
count,5954.0,5954.0,5954.0,5954.0,5954.0
mean,0.038458,0.588235,21.646045,21.692438,-0.772422
std,7.273097,7.250326,5.294727,5.134292,14.627858
min,-38.151,-32.166,2.092,5.694,-58.0
25%,-5.062,-4.407,17.993,18.11325,-10.0
50%,0.1335,0.7085,21.409,21.473,-1.0
75%,5.13875,5.55875,24.9885,25.0065,7.0
max,29.708,29.151,46.35,44.848,59.0


In [18]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder

def split_and_validate(X, y):
    '''
        For a set of features and target X, y, perform a 80/20 train/val split, 
        fit and validate a linear regression model, and report results
    '''

    # perform train/val split
    X_train, X_val, y_train, y_val = \
        train_test_split(X, y, test_size=0.2, random_state=42)
    
    # fit linear regression to training data
    lr_model = LinearRegression()
    print(y_train[:5])
    lr_model.fit(X_train, y_train)
    
    # score fit model on validation data
    val_score = lr_model.score(X_val, y_val)
    
    # report results
    print('\nValidation R^2 score was:', val_score)
    print('Feature coefficient results: \n')
    for feature, coef in zip(X.columns, lr_model.coef_):
        print(feature, ':', f'{coef:.2f}') 
    
    return lr_model

In [19]:
game_df.columns

Index(['game_id', 'season_year', 'week_num', 'team_year', 'full_game_date',
       'team', 'opp', 'decade', 'log_year', 'margin',
       ...
       'fgm_opp', 'sn_total_total_td_off_opp', 'xpa_opp', 'ewma3_pass_td_opp',
       'sn_total_xpm_opp', 'sn_total_pass_att_opp', 'sn_total_margin_opp',
       'sn_total_result_win_opp', 'ewma3_margin_diff', 'ewma19_margin_diff'],
      dtype='object', length=293)

In [20]:
mask = game_df['team_year'] == 'chi-2019'
game_df[mask][['ewma19_pts_off', 'ewma19_pts_def']]

Unnamed: 0,ewma19_pts_off,ewma19_pts_def
11074,17.649,13.173
11075,17.17,11.088
11076,18.105,14.241
11077,19.577,18.885
11078,18.891,18.523
11079,18.032,19.134
11080,18.353,18.132
11081,16.61,17.959
11082,16.959,17.382
11083,17.94,17.747


Let's try all of the roll3 columns and compare the results to ewma columns. 

In [38]:
ewma_cols = [col for col in game_df.columns if col[:4] == 'ewma']
sn_total_cols = [col for col in game_df.columns if col[:4] == 'sn_t']

In [39]:
ewma_cols = list(set(ewma_cols))

In [40]:
X = game_df[ewma_cols]
y = game_df['margin']

split_and_validate(X, y)

8591    -29.0
5280    -18.0
1210     24.0
11699   -19.0
4252     -4.0
Name: margin, dtype: float64

Validation R^2 score was: 0.09152450363950027
Feature coefficient results: 

ewma19_pass_net_yds_per_att : 5.00
ewma3_result_tie : -2.82
ewma3_third_down_att : -0.98
ewma19_pass_att : 0.09
ewma19_xpm_opp : 0.36
ewma3_total_yds_off_opp : 149.64
ewma19_fourth_down_success : 0.10
ewma19_pass_int : 1.32
ewma19_rush_yds_per_att : -0.96
ewma19_punt_yds : -0.06
ewma19_result_win : -0.43
ewma19_fourth_down_att : -1.48
ewma19_punt : 1.72
ewma3_pass_cmp : -0.52
ewma19_pass_cmp_opp : -0.89
ewma3_fgm_opp : -1.43
ewma19_result_win_opp : -12.36
ewma3_fgm : -2.49
ewma3_pass_int : -1.66
ewma19_rush_att : -0.13
ewma3_third_down_success_opp : -1.20
ewma19_pass_cmp : 0.41
ewma3_xpm : 0.10
ewma3_pass_yds_per_att : 0.95
ewma19_pts_def : 224.80
ewma3_rush_att : 0.20
ewma19_third_down_att_opp : -0.07
ewma3_margin_diff : -94.28
ewma3_total_td_off : -98.47
ewma19_pass_cmp_perc : -0.15
ewma19_pass_td_opp : -165.1

LinearRegression()

In [41]:
X = game_df[roll3_cols]
y = game_df['margin']

split_and_validate(X, y)

8591    -29.0
5280    -18.0
1210     24.0
11699   -19.0
4252     -4.0
Name: margin, dtype: float64

Validation R^2 score was: 0.049437748757453326
Feature coefficient results: 

roll3_num_wins : 2.26
roll3_num_ties : -1.01
roll3_num_ties_opp : -1.21
roll3_num_wins_opp : -2.68


LinearRegression()

At first glance, it appears ewma is better. Let's break them into halves to confirm.

In [42]:
split = len(ewma_cols) // 2

X = game_df[ewma_cols[:split]]
y = game_df['margin']

split_and_validate(X, y)

8591    -29.0
5280    -18.0
1210     24.0
11699   -19.0
4252     -4.0
Name: margin, dtype: float64

Validation R^2 score was: 0.10583044662699714
Feature coefficient results: 

ewma19_pass_net_yds_per_att : 0.67
ewma3_result_tie : -0.31
ewma3_third_down_att : 0.03
ewma19_pass_att : 0.18
ewma19_xpm_opp : 0.61
ewma3_total_yds_off_opp : -0.00
ewma19_fourth_down_success : -0.04
ewma19_pass_int : 0.13
ewma19_rush_yds_per_att : 0.49
ewma19_punt_yds : -0.01
ewma19_result_win : 0.15
ewma19_fourth_down_att : 0.54
ewma19_punt : 0.51
ewma3_pass_cmp : -0.20
ewma19_pass_cmp_opp : -0.16
ewma3_fgm_opp : -0.24
ewma19_result_win_opp : -11.13
ewma3_fgm : -0.27
ewma3_pass_int : -0.87
ewma19_rush_att : 0.11
ewma3_third_down_success_opp : -0.70
ewma19_pass_cmp : -0.15
ewma3_xpm : 0.65
ewma3_pass_yds_per_att : -1.15
ewma19_pts_def : -0.06
ewma3_rush_att : 0.05
ewma19_third_down_att_opp : 0.52
ewma3_margin_diff : -0.14
ewma3_total_td_off : -0.47
ewma19_pass_cmp_perc : 0.13
ewma19_pass_td_opp : -0.10
ewma19_m

LinearRegression()

In [43]:
split = len(sn_total_cols) // 2

X = game_df[sn_total_cols[:split]]
y = game_df['margin']

split_and_validate(X, y)

8591    -29.0
5280    -18.0
1210     24.0
11699   -19.0
4252     -4.0
Name: margin, dtype: float64

Validation R^2 score was: 0.04601182497668954
Feature coefficient results: 

sn_total_pts_off : -0.01
sn_total_margin : 0.01
sn_total_pts_def : -0.02
sn_total_pass_cmp : -0.11
sn_total_pass_att : 0.06
sn_total_pass_yds : 0.00
sn_total_pass_td : 0.06
sn_total_pass_int : -0.08
sn_total_pass_sacked : 0.01
sn_total_pass_sacked_yds : 0.01
sn_total_pass_yds_per_att : -0.33
sn_total_pass_net_yds_per_att : 0.29
sn_total_pass_cmp_perc : 0.02
sn_total_pass_rating : 0.00
sn_total_rush_att : 0.02
sn_total_rush_yds : -0.00
sn_total_rush_yds_per_att : 0.02
sn_total_rush_td : -0.07
sn_total_fgm : 0.03
sn_total_fga : -0.04
sn_total_xpm : -0.15
sn_total_xpa : 0.24
sn_total_punt : 0.01
sn_total_punt_yds : -0.00
sn_total_third_down_success : -0.06
sn_total_third_down_att : -0.00
sn_total_fourth_down_success : 0.12
sn_total_fourth_down_att : -0.14
sn_total_team_home_game : -2.43
sn_total_result_tie : -0.88


LinearRegression()

And now the other half...

In [44]:
split = len(ewma_cols) // 2

X = game_df[ewma_cols[split:]]
y = game_df['margin']

split_and_validate(X, y)

8591    -29.0
5280    -18.0
1210     24.0
11699   -19.0
4252     -4.0
Name: margin, dtype: float64

Validation R^2 score was: 0.11682894266237198
Feature coefficient results: 

ewma3_pts_def : 234.90
ewma3_pass_att : 0.05
ewma19_total_yds_off : 0.02
ewma19_xpa_opp : -0.10
ewma19_third_down_success_opp : 0.35
ewma3_pass_sacked_opp : 0.58
ewma19_rush_td : -0.76
ewma3_pass_cmp_perc : -0.03
ewma3_rush_yds_per_att : 0.41
ewma3_result_tie_opp : -2.93
ewma19_punt_opp : 1.63
ewma3_xpm_opp : -0.06
ewma3_pass_sacked_yds_opp : 0.03
ewma3_total_yds_off : -0.02
ewma19_pts_off : -0.34
ewma3_result_win : 0.40
ewma3_fourth_down_att : 0.59
ewma3_pass_rating : 0.01
ewma19_team_home_game_opp : 10.61
ewma19_pass_sacked : -0.30
ewma3_punt_yds_opp : 0.00
ewma3_rush_td_opp : -0.33
ewma3_total_td_off_opp : 0.29
ewma3_fourth_down_success : -0.37
ewma19_margin_diff : 0.55
ewma19_pass_sacked_yds_opp : 0.05
ewma19_result_tie : 2.82
ewma3_pass_yds_opp : -0.01
ewma19_pass_yds : -0.01
ewma3_pass_sacked_yds : -0.03
e

LinearRegression()

In [45]:
split = len(sn_total_cols) // 2

X = game_df[sn_total_cols[split:]]
y = game_df['margin']

split_and_validate(X, y)

8591    -29.0
5280    -18.0
1210     24.0
11699   -19.0
4252     -4.0
Name: margin, dtype: float64

Validation R^2 score was: 0.04882559493700811
Feature coefficient results: 

sn_total_third_down_att_opp : 0.05
sn_total_result_tie_opp : -4.04
sn_total_pts_def_opp : 0.01
sn_total_pass_cmp_opp : -0.01
sn_total_pass_td_opp : -0.15
sn_total_punt_opp : -0.02
sn_total_punt_yds_opp : 0.00
sn_total_rush_yds_per_att_opp : -0.25
sn_total_rush_att_opp : -0.04
sn_total_pass_cmp_perc_opp : -0.02
sn_total_fgm_opp : 0.16
sn_total_pass_sacked_yds_opp : 0.02
sn_total_pass_net_yds_per_att_opp : 0.01
sn_total_pts_off_opp : -0.01
sn_total_fga_opp : -0.07
sn_total_rush_td_opp : 0.12
sn_total_fourth_down_att_opp : 0.05
sn_total_xpa_opp : 0.12
sn_total_pass_sacked_opp : -0.13
sn_total_rush_yds_opp : 0.00
sn_total_pass_int_opp : 0.19
sn_total_third_down_success_opp : -0.02
sn_total_pass_yds_opp : -0.00
sn_total_team_home_game_opp : 2.43
sn_total_total_yds_off_opp : 0.00
sn_total_pass_yds_per_att_opp : -0.13


LinearRegression()

Okay, that checks out. Moving forward, we'll use columns with the `ewma` prefix.

Let's also start factoring in home game, which we know often impacts who wins a game.

Here, we'll introduce `off_bye`, which tells us whether or not the team is coming off a bye week. Perhaps teams play better when they've had an extra week of rest time.

In [46]:
[col for col in game_df.columns]

['game_id',
 'season_year',
 'week_num',
 'team_year',
 'full_game_date',
 'team',
 'opp',
 'decade',
 'log_year',
 'margin',
 'game_outcome',
 'pts_off',
 'pts_def',
 'pass_cmp',
 'pass_att',
 'pass_yds',
 'pass_td',
 'pass_int',
 'pass_sacked',
 'pass_sacked_yds',
 'pass_yds_per_att',
 'pass_net_yds_per_att',
 'pass_cmp_perc',
 'pass_rating',
 'rush_att',
 'rush_yds',
 'rush_yds_per_att',
 'rush_td',
 'fgm',
 'fga',
 'xpm',
 'xpa',
 'punt',
 'punt_yds',
 'overtime',
 'time_of_poss',
 'third_down_success',
 'third_down_att',
 'fourth_down_success',
 'fourth_down_att',
 'team_home_game',
 'result_tie',
 'result_win',
 'total_td_off',
 'total_yds_off',
 'prev_week_num',
 'prev_result_win',
 'prev_result_tie',
 'prev_margin',
 'off_bye',
 'sn_total_pts_off',
 'sn_total_margin',
 'sn_total_pts_def',
 'sn_total_pass_cmp',
 'sn_total_pass_att',
 'sn_total_pass_yds',
 'sn_total_pass_td',
 'sn_total_pass_int',
 'sn_total_pass_sacked',
 'sn_total_pass_sacked_yds',
 'sn_total_pass_yds_per_att',

In [29]:
X = game_df[[
 'year',
 'log_year',
 'sn_total_pts_off',
 'sn_total_pts_off_opp',
 'ewma19_total_yds_off',
 'ewma19_total_yds_off_opp',
 'ewma19_total_td_off',
 'ewma19_total_td_off_opp',
 'ewma19_total_td_off_opp',
 '',
 '',
 '',
 '',
 '',
]]

y = game_df['margin']

split_and_validate(X, y)

KeyError: "['ewma_to_def', 'ewma_rush_yds_def', 'ewma_pass_yds_off_opp', 'ewma_pts_def', 'ewma_to2_def_opp', 'ewma_to2_off_opp', 'ewma_result_win', 'ewma_rush_yds_off_opp', 'ewma_pts_off_opp', 'ewma_yards_def_opp', 'ewma_yards_off_opp', 'ewma_to_off_opp', 'ewma_first_down_off_opp', 'ewma_to_def_opp', 'ewma_result_tie', 'ewma_pass_yds_def_opp', 'ewma_pass_yds_off', 'ewma_to2_def', 'ewma_first_down_off', 'ewma_yards_off', 'off_bye_opp', 'ewma_to2_off', 'ewma_margin', 'ewma_rush_yds_def_opp', 'ewma_pass_yds_def', 'ewma_pts_def_opp', 'ewma_to_off', 'ewma_margin_opp', 'ewma_yards_def', 'ewma_rush_yds_off', 'ewma_pts_off', 'ewma_result_win_opp'] not in index"