In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns

from clean3 import clean_games

Let's clean & load our data. For time's sake, we're only going to use years 1990-2020.

In [2]:
# Source file and earliest season to keep, named so they are easy to change.
GAMES_PICKLE_PATH = "data/games.pickle"
START_YEAR = 1990

game_df = clean_games(GAMES_PICKLE_PATH, start_year=START_YEAR)

In [3]:
# Check the (rows, columns) dimensions of the cleaned frame.
game_df.shape

(11956, 187)

In [4]:
# Set how many columns / rows pandas will render before truncating output.
pd.options.display.max_columns = 250
pd.options.display.max_rows = 73

In [5]:
# Show every column name as a plain Python list.
list(game_df.columns)

['game_id',
 'season_year',
 'week_num',
 'team_year',
 'full_game_date',
 'team',
 'decade',
 'opp',
 'game_day_of_week',
 'game_outcome',
 'pts_off',
 'margin',
 'pts_def',
 'pass_cmp',
 'pass_att',
 'pass_yds',
 'pass_td',
 'pass_int',
 'pass_sacked',
 'pass_sacked_yds',
 'pass_yds_per_att',
 'pass_net_yds_per_att',
 'pass_cmp_perc',
 'pass_rating',
 'rush_att',
 'rush_yds',
 'rush_yds_per_att',
 'rush_td',
 'fgm',
 'fga',
 'xpm',
 'xpa',
 'punt',
 'punt_yds',
 'overtime',
 'time_of_poss',
 'third_down_success',
 'third_down_att',
 'fourth_down_success',
 'fourth_down_att',
 'team_home_game',
 'pass_cmp_def',
 'pass_att_def',
 'pass_yds_def',
 'pass_td_def',
 'pass_int_def',
 'pass_sacked_def',
 'pass_sacked_yds_def',
 'pass_yds_per_att_def',
 'pass_net_yds_per_att_def',
 'pass_cmp_perc_def',
 'pass_rating_def',
 'rush_att_def',
 'rush_yds_def',
 'rush_yds_per_att_def',
 'rush_td_def',
 'fgm_def',
 'fga_def',
 'third_down_success_def',
 'third_down_att_def',
 'fourth_down_success_de

In [6]:
# Build the list of columns we could use in a model: every column whose name
# starts with one of the five-character prefixes below (sorted), preceded by a
# fixed list of extra columns.
indicators = ['prev_', 'roll3', 'ewma3', 'ewma1']
additional = ['season_year', 'date', 'team', 'team_home_game', 'opp', 'margin']

engineered = sorted(
    name for name in game_df.columns if name.startswith(tuple(indicators))
)
key_variables = additional + engineered

key_variables

['season_year',
 'date',
 'team',
 'team_home_game',
 'opp',
 'margin',
 'ewma19_fga',
 'ewma19_fga_def',
 'ewma19_fga_def_opp',
 'ewma19_fga_opp',
 'ewma19_fgm',
 'ewma19_fgm_def',
 'ewma19_fgm_def_opp',
 'ewma19_fgm_opp',
 'ewma19_fourth_down_att',
 'ewma19_fourth_down_att_def',
 'ewma19_fourth_down_att_def_opp',
 'ewma19_fourth_down_att_opp',
 'ewma19_fourth_down_success',
 'ewma19_fourth_down_success_def',
 'ewma19_fourth_down_success_def_opp',
 'ewma19_fourth_down_success_opp',
 'ewma19_margin',
 'ewma19_margin_opp',
 'ewma19_pass_att',
 'ewma19_pass_att_def',
 'ewma19_pass_att_def_opp',
 'ewma19_pass_att_opp',
 'ewma19_pass_cmp',
 'ewma19_pass_cmp_def',
 'ewma19_pass_cmp_def_opp',
 'ewma19_pass_cmp_opp',
 'ewma19_pass_cmp_perc',
 'ewma19_pass_cmp_perc_def',
 'ewma19_pass_cmp_perc_def_opp',
 'ewma19_pass_cmp_perc_opp',
 'ewma19_pass_int',
 'ewma19_pass_int_def',
 'ewma19_pass_int_def_opp',
 'ewma19_pass_int_opp',
 'ewma19_pass_net_yds_per_att_def',
 'ewma19_pass_net_yds_per_att_de

Our DataFrame is still too wide for `df.info()` to give a readable summary.

Let's split the columns into sections. Since we have so many columns, we need to do quite a bit of splitting.


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder

def split_and_validate(X, y, test_size=0.2, random_state=42):
    '''
        Split X, y into train/validation sets, fit a linear regression on the
        training part, and report how it does on the validation part.

        Parameters
        ----------
        X : feature table; must expose `.columns` (used to label the printed
            coefficients), e.g. a pandas DataFrame
        y : target values aligned with the rows of X
        test_size : share of rows held out for validation (default 0.2,
            i.e. an 80/20 train/val split)
        random_state : seed passed to train_test_split so the split is
            reproducible (default 42)

        Returns
        -------
        The fitted LinearRegression model.

        Side effects: prints the validation R^2 and one line per feature
        with its fitted coefficient.
    '''

    # perform train/val split
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # fit linear regression to training data
    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)

    # score fit model on validation data (LinearRegression.score returns R^2)
    val_score = lr_model.score(X_val, y_val)

    # report results
    print('\nValidation R^2 score was:', val_score)
    print('Feature coefficient results: \n')
    for feature, coef in zip(X.columns, lr_model.coef_):
        print(feature, ':', f'{coef:.2f}')

    return lr_model

In [23]:
# Baseline OLS of game margin on a hand-picked subset of the engineered columns.
# Only the families listed in all_stats are active. The other ewma19_* families
# that were considered (fga, fgm, fourth_down_att, fourth_down_success, margin,
# pass_att, pass_cmp, pass_cmp_perc, pass_int, pass_net_yds_per_att,
# pass_rating, pass_sacked, pass_sacked_yds, pass_td, pass_yds_per_att,
# pts_def, pts_off, result_tie, result_win, rush_att) are left out of this fit.
all_stats = [
    # game context
    'season_year', 'week_num', 'team_home_game',
    # ewma19 pass_yds family
    'ewma19_pass_yds', 'ewma19_pass_yds_def',
    'ewma19_pass_yds_def_opp', 'ewma19_pass_yds_opp',
    # ewma19 rush_td family
    'ewma19_rush_td', 'ewma19_rush_td_def',
    'ewma19_rush_td_def_opp', 'ewma19_rush_td_opp',
    # ewma19 rush_yds family
    'ewma19_rush_yds', 'ewma19_rush_yds_def',
    'ewma19_rush_yds_def_opp', 'ewma19_rush_yds_opp',
    # ewma19 rush_yds_per_att family
    'ewma19_rush_yds_per_att', 'ewma19_rush_yds_per_att_def',
    'ewma19_rush_yds_per_att_def_opp', 'ewma19_rush_yds_per_att_opp',
    # ewma19 team_home_game
    'ewma19_team_home_game',
    # ewma19 third_down_att family
    'ewma19_third_down_att', 'ewma19_third_down_att_def',
    'ewma19_third_down_att_def_opp', 'ewma19_third_down_att_opp',
    # ewma19 third_down_success family
    'ewma19_third_down_success', 'ewma19_third_down_success_def',
    'ewma19_third_down_success_def_opp', 'ewma19_third_down_success_opp',
    # previous result / rolling wins
    'prev_result_win', 'prev_result_win_opp',
    'roll3_wins', 'roll3_wins_opp',
]

# No rows are filtered before fitting, and X is handed to sm.OLS as-is (no
# constant column is added), so the summary reports an uncentered R-squared.
X = game_df[all_stats]
y = game_df['margin']

model = sm.OLS(endog=y, exog=X)
fit = model.fit()
fit.summary()

0,1,2,3
Dep. Variable:,margin,R-squared (uncentered):,0.14
Model:,OLS,Adj. R-squared (uncentered):,0.138
Method:,Least Squares,F-statistic:,60.8
Date:,"Tue, 19 Jan 2021",Prob (F-statistic):,0.0
Time:,16:13:17,Log-Likelihood:,-48172.0
No. Observations:,11956,AIC:,96410.0
Df Residuals:,11924,BIC:,96640.0
Df Model:,32,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
season_year,-0.0011,0.002,-0.485,0.628,-0.005,0.003
week_num,-0.0043,0.030,-0.142,0.887,-0.063,0.054
team_home_game,5.3361,0.281,19.007,0.000,4.786,5.886
ewma19_pass_yds,0.0285,0.004,6.749,0.000,0.020,0.037
ewma19_pass_yds_def,-0.0226,0.004,-5.062,0.000,-0.031,-0.014
ewma19_pass_yds_def_opp,0.0228,0.004,5.097,0.000,0.014,0.032
ewma19_pass_yds_opp,-0.0285,0.004,-6.755,0.000,-0.037,-0.020
ewma19_rush_td,0.6355,0.408,1.557,0.119,-0.164,1.435
ewma19_rush_td_def,-1.1498,0.420,-2.739,0.006,-1.973,-0.327

0,1,2,3
Omnibus:,34.701,Durbin-Watson:,1.966
Prob(Omnibus):,0.0,Jarque-Bera (JB):,45.026
Skew:,-0.0,Prob(JB):,1.67e-10
Kurtosis:,3.301,Cond. No.,24700.0


In [13]:
# Report the largest observed value of each modelling column, one per line.
for stat in all_stats:
    peak = game_df[stat].max()
    print(stat, peak, sep=' :  ')

season_year :  2020
week_num :  21.0
team_home_game :  1.0
margin :  59.0
ewma19_fga :  4.734
ewma19_fga_def :  4.827
ewma19_fga_def_opp :  4.827
ewma19_fga_opp :  4.734
ewma19_fgm :  4.402
ewma19_fgm_def :  4.558
ewma19_fgm_def_opp :  4.558
ewma19_fgm_opp :  4.402
ewma19_fourth_down_att :  3.942
ewma19_fourth_down_att_def :  3.435
ewma19_fourth_down_att_def_opp :  3.435
ewma19_fourth_down_att_opp :  3.942
ewma19_fourth_down_success :  2.322
ewma19_fourth_down_success_def :  2.107
ewma19_fourth_down_success_def_opp :  2.107
ewma19_fourth_down_success_opp :  2.322
ewma19_margin :  29.708
ewma19_margin_opp :  29.708
ewma19_pass_att :  51.591
ewma19_pass_att_def :  52.321
ewma19_pass_att_def_opp :  52.321
ewma19_pass_att_opp :  51.591
ewma19_pass_cmp :  34.913
ewma19_pass_cmp_def :  34.675
ewma19_pass_cmp_def_opp :  34.675
ewma19_pass_cmp_opp :  34.913
ewma19_pass_cmp_perc :  80.229
ewma19_pass_cmp_perc_def :  81.432
ewma19_pass_cmp_perc_def_opp :  81.432
ewma19_pass_cmp_perc_opp :  80.22

In [None]:
# Validate a linear regression on a wider feature set, with season_year
# entering on a log scale.
# Note: this stores the new 'log_season_year' column on game_df itself.
game_df['log_season_year'] = np.log(game_df['season_year'])

# Column order is kept fixed because split_and_validate prints the
# coefficients in this order.
feature_cols = [
    'log_season_year', 'week_num', 'team_home_game',
    'roll3_wins', 'roll3_wins_opp',
    'ewma19_rush_yds', 'ewma19_rush_yds_def',
    'ewma19_rush_yds_opp', 'ewma19_rush_yds_def_opp',
    'ewma19_pass_yds', 'ewma19_pass_yds_def',
    'ewma19_pass_yds_opp', 'ewma19_pass_yds_def_opp',
    'ewma19_pass_rating', 'ewma19_pass_rating_def',
    'ewma19_pass_rating_opp', 'ewma19_pass_rating_def_opp',
    'ewma19_pts_off', 'ewma19_pts_off_opp',
    'ewma19_pts_def', 'ewma19_pts_def_opp',
    'ewma19_margin', 'ewma19_margin_opp',
    'ewma19_pass_td', 'ewma19_pass_td_def',
    'ewma19_pass_td_opp', 'ewma19_pass_td_def_opp',
]
X = game_df[feature_cols]
y = game_df['margin']

split_and_validate(X, y)

In [None]:
# Pull the rows for one team-season ('chi-2019') and show only the columns
# needed to compare the game margin with the ewma19 margin features.
mask = game_df['team_year'] == 'chi-2019'
inspect_cols = [
    'team', 'opp',
    'season_year', 'week_num',
    'pts_off', 'pts_def',
    'margin', 'ewma19_margin', 'ewma19_margin_opp',
]
game_df.loc[mask, inspect_cols]

In [None]:

# Pairwise plot grid for a handful of columns alongside margin.
pairplot_cols = [
    'season_year', 'week_num',
    'roll3_wins',
    'ewma19_pts_off', 'ewma19_pts_def',
    'ewma19_margin', 'ewma19_margin_opp',
    'margin',
]
plot_df = game_df[pairplot_cols]

sns.pairplot(plot_df)