In [97]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV

from clean3 import clean_games

Let's clean & load our data. For time's sake, we're only going to use years 1990-2020.

In [2]:
df = clean_games("data/games.pickle", start_year=1990)

In [98]:
game_df = df.copy()

In [99]:
game_df.shape

(5971, 193)

In [100]:
pd.set_option("display.max_columns", 250)
pd.set_option("display.max_rows", 73)

In [101]:
# print all columns
[col for col in game_df.columns]

['game_id',
 'season_year',
 'week_num',
 'team_year',
 'full_game_date',
 'team',
 'decade',
 'opp',
 'game_day_of_week',
 'game_outcome',
 'pts_off',
 'margin',
 'pts_def',
 'pass_cmp',
 'pass_att',
 'pass_yds',
 'pass_td',
 'pass_int',
 'pass_sacked',
 'pass_sacked_yds',
 'pass_yds_per_att',
 'pass_net_yds_per_att',
 'pass_cmp_perc',
 'pass_rating',
 'rush_att',
 'rush_yds',
 'rush_yds_per_att',
 'rush_td',
 'fgm',
 'fga',
 'xpm',
 'xpa',
 'punt',
 'punt_yds',
 'overtime',
 'time_of_poss',
 'third_down_success',
 'third_down_att',
 'fourth_down_success',
 'fourth_down_att',
 'team_home_game',
 'pass_cmp_def',
 'pass_att_def',
 'pass_yds_def',
 'pass_td_def',
 'pass_int_def',
 'pass_sacked_def',
 'pass_sacked_yds_def',
 'pass_yds_per_att_def',
 'pass_net_yds_per_att_def',
 'pass_cmp_perc_def',
 'pass_rating_def',
 'rush_att_def',
 'rush_yds_def',
 'rush_yds_per_att_def',
 'rush_td_def',
 'fgm_def',
 'fga_def',
 'third_down_success_def',
 'third_down_att_def',
 'fourth_down_success_de

In [102]:
selected_columns = [
 'season_year',
 'week_num', 
 'team_home_game',
 'ewma_margin',
 'ewma_margin_opp',
 'ewma_pass_rating',
 'ewma_pass_rating_opp',
 'ewma_pass_td',
 'ewma_pass_td_opp',
 'ewma_rush_td',
 'ewma_rush_td_opp',
]

In [103]:
X = game_df[selected_columns]
y = game_df['margin']

In [104]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2,random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=.25, random_state=43)

Our DF is still too large to use `df.info()`.

Let's split the columns into sections. Since we have so many columns, we need to do quite a bit of splitting.


In [141]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder

def split_and_validate(X, y):
    '''
        For a set of features and target X, y, perform a 80/20 train/val split, 
        fit and validate a linear regression model, and report results
    '''

    # perform train/val split
    X_train, X_val, y_train, y_val = \
        train_test_split(X, y, test_size=0.2, random_state=42)
    
    # standard scaler resulting in lower R^2
#     scaler = StandardScaler()

#     X_train_scaled = scaler.fit_transform(X_train.values)
#     X_val_scaled = scaler.transform(X_val.values)
#     X_test_scaled = scaler.transform(X_test.values)
    
    # fit linear regression to training data
    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)
                         
    # fit simple poly regression to training data (not working)
#     poly = PolynomialFeatures(degree=2)
#     X_train_poly = poly.fit_transform(X_train.values)
#     X_val_poly = poly.transform(X_val.values)
#     X_test_poly = poly.transform(X_test.values)
#     lm_poly = LinearRegression()
#     lm_poly.fit(X_train_poly, y_train)
    
    
    # fit ridge regression to training data    
    lr_model_ridge = Ridge(alpha = 100000)
    lr_model_ridge.fit(X_train, y_train)
    
    # lasso model results very poor
    # fit lasso regression to training data
#     lr_model_lasso = Lasso(alpha = 100000)
#     lr_model_lasso.fit(X_train, y_train)
    
    # score fit model on validation data
    lr_val_score = lr_model.score(X_val, y_val)
    lr_ridge_val_score = lr_model_ridge.score(X_val, y_val)
#     lr_lasso_val_score = lr_model_lasso.score(X_val, y_val)
#     lm_poly_val_score = lm_poly.score(X_val_poly, y_val)
    
    # report results
    print('\nSIMPLE Validation R^2 score was:', lr_val_score)
    print('Feature coefficient results: \n')
    for feature, coef in zip(X.columns, lr_model.coef_):
        print(feature, ':', f'{coef:.2f}') 
    
    print('\nRIDGE Validation R^2 score was:', lr_ridge_val_score)
    print('Feature coefficient results: \n')
    for feature, coef in zip(X.columns, lr_model_ridge.coef_):
        print(feature, ':', f'{coef:.2f}') 
        
#     print('\nLASSO Validation R^2 score was:', lr_lasso_val_score)
#     print('Feature coefficient results: \n')
#     for feature, coef in zip(X.columns, lr_model_lasso.coef_):
#         print(feature, ':', f'{coef:.2f}') 
    
    return lr_model

In [142]:
# let's try all of our columns at once.

all_stats = [
 'season_year',
 'week_num', 
 'team_home_game',
 'ewma_fga',
 'ewma_fga_def',
 'ewma_fga_def_opp',
 'ewma_fga_opp',
 'ewma_fgm',
 'ewma_fgm_def',
 'ewma_fgm_def_opp',
 'ewma_fgm_opp',
 'ewma_margin',
 'ewma_margin_opp',
 'ewma_pass_att',
 'ewma_pass_att_def',
 'ewma_pass_att_def_opp',
 'ewma_pass_att_opp',
 'ewma_pass_cmp',
 'ewma_pass_cmp_def',
 'ewma_pass_cmp_def_opp',
 'ewma_pass_cmp_opp',
 'ewma_pass_cmp_perc',
 'ewma_pass_cmp_perc_def',
 'ewma_pass_cmp_perc_def_opp',
 'ewma_pass_cmp_perc_opp',
 'ewma_pass_int',
 'ewma_pass_int_def',
 'ewma_pass_int_def_opp',
 'ewma_pass_int_opp',
 'ewma_pass_net_yds_per_att_def',
 'ewma_pass_net_yds_per_att_def_opp',
 'ewma_pass_rating',
 'ewma_pass_rating_def',
 'ewma_pass_rating_def_opp',
 'ewma_pass_rating_opp',
 'ewma_pass_sacked',
 'ewma_pass_sacked_def',
 'ewma_pass_sacked_def_opp',
 'ewma_pass_sacked_opp',
 'ewma_pass_sacked_yds',
 'ewma_pass_sacked_yds_def',
 'ewma_pass_sacked_yds_def_opp',
 'ewma_pass_sacked_yds_opp',
 'ewma_pass_td',
 'ewma_pass_td_def',
 'ewma_pass_td_def_opp',
 'ewma_pass_td_opp',
 'ewma_pass_yds',
 'ewma_pass_yds_opp',
 'ewma_pass_yds_per_att',
 'ewma_pass_yds_per_att_def',
 'ewma_pass_yds_per_att_def_opp',
 'ewma_pass_yds_per_att_opp',
 'ewma_pts_def',
 'ewma_pts_def_opp',
 'ewma_pts_off',
 'ewma_pts_off_opp',
 'ewma_result_tie',
 'ewma_result_tie_opp',
 'ewma_result_win',
 'ewma_result_win_opp',
 'ewma_rush_att',
 'ewma_rush_att_def',
 'ewma_rush_att_def_opp',
 'ewma_rush_att_opp',
 'ewma_rush_td',
 'ewma_rush_td_def',
 'ewma_rush_td_def_opp',
 'ewma_rush_td_opp',
 'ewma_rush_yds',
 'ewma_rush_yds_def',
 'ewma_rush_yds_def_opp',
 'ewma_rush_yds_opp',
 'ewma_rush_yds_per_att',
 'ewma_rush_yds_per_att_def',
 'ewma_rush_yds_per_att_def_opp',
 'ewma_rush_yds_per_att_opp',
 'ewma_third_down_att',
 'ewma_third_down_att_def',
 'ewma_third_down_att_def_opp',
 'ewma_third_down_att_opp',
 'ewma_third_down_success',
 'ewma_third_down_success_def',
 'ewma_third_down_success_def_opp',
 'ewma_third_down_success_opp',
 'prev_result_win',
 'prev_result_win_opp',
 'roll3_wins',
 'roll3_wins_opp',
 'ewma_to_off', 
 'ewma_to_def', 
 'ewma_to_off_opp',
 'ewma_to_def_opp',
]


In [143]:

# game_df = game_df.dropna(subset=all_stats
#                          .append('margin'), how='any', inplace=True)
X = game_df[all_stats]
y = game_df['margin']

model = sm.OLS(y_train, sm.add_constant(X_train))
fit = model.fit()
fit.summary()
# X.describe()

0,1,2,3
Dep. Variable:,margin,R-squared:,0.159
Model:,OLS,Adj. R-squared:,0.157
Method:,Least Squares,F-statistic:,61.53
Date:,"Wed, 20 Jan 2021",Prob (F-statistic):,5.5299999999999996e-126
Time:,00:03:37,Log-Likelihood:,-14443.0
No. Observations:,3582,AIC:,28910.0
Df Residuals:,3570,BIC:,28980.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.6627,63.988,0.042,0.967,-122.794,128.120
season_year,-0.0043,0.033,-0.131,0.896,-0.068,0.060
week_num,0.0350,0.055,0.642,0.521,-0.072,0.142
team_home_game,5.9973,0.458,13.095,0.000,5.099,6.895
ewma_margin,0.3518,0.046,7.589,0.000,0.261,0.443
ewma_margin_opp,-0.5103,0.046,-11.012,0.000,-0.601,-0.419
ewma_pass_rating,0.0365,0.031,1.168,0.243,-0.025,0.098
ewma_pass_rating_opp,-0.0361,0.032,-1.144,0.253,-0.098,0.026
ewma_pass_td,0.9085,0.707,1.284,0.199,-0.478,2.295

0,1,2,3
Omnibus:,9.93,Durbin-Watson:,1.943
Prob(Omnibus):,0.007,Jarque-Bera (JB):,12.266
Skew:,-0.016,Prob(JB):,0.00217
Kurtosis:,3.285,Cond. No.,563000.0


In [144]:
game_df['log_year'] = np.log(game_df['season_year'])

In [145]:
X = game_df[all_stats]
y = game_df['margin']

split_and_validate(X, y)


SIMPLE Validation R^2 score was: 0.12121304088387352
Feature coefficient results: 

season_year : 0.01
week_num : -0.01
team_home_game : 5.67
ewma_fga : -0.97
ewma_fga_def : -1.32
ewma_fga_def_opp : 0.64
ewma_fga_opp : -0.12
ewma_fgm : 1.26
ewma_fgm_def : 0.39
ewma_fgm_def_opp : -0.57
ewma_fgm_opp : -0.26
ewma_margin : 274.92
ewma_margin_opp : 1153.35
ewma_pass_att : 0.28
ewma_pass_att_def : -0.14
ewma_pass_att_def_opp : 0.73
ewma_pass_att_opp : -0.16
ewma_pass_cmp : -0.27
ewma_pass_cmp_def : 0.12
ewma_pass_cmp_def_opp : -1.18
ewma_pass_cmp_opp : -0.29
ewma_pass_cmp_perc : 0.12
ewma_pass_cmp_perc_def : 0.05
ewma_pass_cmp_perc_def_opp : 0.49
ewma_pass_cmp_perc_opp : -0.18
ewma_pass_int : 0.44
ewma_pass_int_def : -1.78
ewma_pass_int_def_opp : -3.12
ewma_pass_int_opp : 1.45
ewma_pass_net_yds_per_att_def : 0.50
ewma_pass_net_yds_per_att_def_opp : -2.27
ewma_pass_rating : -0.01
ewma_pass_rating_def : -0.14
ewma_pass_rating_def_opp : -0.15
ewma_pass_rating_opp : 0.20
ewma_pass_sacked : -0.2

LinearRegression()

In [167]:
selected_columns = [
    'team_home_game',
    'ewma_pass_rating',
    'ewma_pass_rating_def',
    'ewma_pass_rating_def_opp',
    'ewma_rush_att',
    'ewma_margin',
    'ewma_margin_opp',
    'ewma_pass_yds',
    'ewma_pass_yds_def',
    'ewma_pass_yds_def_opp',
    'ewma_pass_td_opp',
    'ewma_rush_td',
    'ewma_pass_sacked',
    'ewma_third_down_att',
    'ewma_third_down_success',
    'ewma_third_down_att_opp',
    'ewma_third_down_success_opp',
    'ewma_result_win_opp',
    'ewma_fga_opp',
    'ewma_fga_def',
    'ewma_pass_int',
    'ewma_to_off',
    'ewma_to_off_opp',
    'ewma_to_def',
    'ewma_to_def_opp',
    'ewma_pass_td',
    'ewma_pass_att',
    'ewma_pass_att_def',
#     'ewma_pass_att_def_opp',
    'ewma_pass_att_opp',
    'ewma_pass_cmp',
    'ewma_pass_cmp_def',
#     'ewma_pass_cmp_def_opp',
#     'ewma_pass_cmp_opp',
#     'ewma_pass_cmp_perc',
    'ewma_pass_cmp_perc_def',
#     'ewma_pass_cmp_perc_def_opp',
#     'ewma_pass_cmp_perc_opp',
#     'ewma_pass_int',
#     'ewma_pass_int_def',
#     'ewma_pass_int_def_opp',
#     'ewma_pass_int_opp',
]

X = game_df[selected_columns]
y = game_df['margin']

split_and_validate(X, y)


SIMPLE Validation R^2 score was: 0.13436903100878173
Feature coefficient results: 

team_home_game : 5.68
ewma_pass_rating : -0.01
ewma_pass_rating_def : 0.02
ewma_pass_rating_def_opp : 0.03
ewma_rush_att : 0.18
ewma_margin : 0.40
ewma_margin_opp : -0.38
ewma_pass_yds : -0.01
ewma_pass_yds_def : -0.01
ewma_pass_yds_def_opp : 0.00
ewma_pass_td_opp : 0.34
ewma_rush_td : -0.00
ewma_pass_sacked : -0.08
ewma_third_down_att : -1.10
ewma_third_down_success : 0.54
ewma_third_down_att_opp : 1.40
ewma_third_down_success_opp : -0.80
ewma_result_win_opp : -4.15
ewma_fga_opp : -0.27
ewma_fga_def : -0.32
ewma_pass_int : 1.08
ewma_to_off : -6.71
ewma_to_off_opp : 1.66
ewma_to_def : 4.39
ewma_to_def_opp : -0.78
ewma_pass_td : 0.71
ewma_pass_att : 0.27
ewma_pass_att_def : -0.06
ewma_pass_att_opp : -0.13
ewma_pass_cmp : -0.08
ewma_pass_cmp_def : 0.18
ewma_pass_cmp_perc_def : -0.11

RIDGE Validation R^2 score was: 0.09769575791008911
Feature coefficient results: 

team_home_game : 0.06
ewma_pass_rating 

LinearRegression()