# Predicting the Margin of NFL Games using linear regression

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

import matplotlib.pyplot as plt
import seaborn as sns

from clean3 import clean_games

In [2]:
# gather & pre-process our data (for details, see clean_game_data.ipynb and clean3.py)
df = clean_games('data/games.pickle', start_year=1990)

In [6]:
game_df = df.copy() # to prevent needing to rerun clean_games

Now, let's model the features we refined in feature_engineering.ipynb using the standard Linear Regression model.

In [10]:
features = [
    'season_year',
    'team_home_game',
    'ewma_team_home_game',
    'ewma10_margin',
    'ewma10_wins_opp',
    'roll19_wins',
    'roll19_margin_opp',
    'ewma_margin_opp',
    'ewma_margin',
    'ewma_pass_yds_def',
    'ewma_total_yds_def',
    'ewma_total_yds_def_opp',
    'ewma_result_win_opp',
    'ewma_third_conv_pct',
    'ewma_third_conv_pct_opp',
    'ewma_pass_cmp_def',
    'ewma_pass_cmp_perc_def', 
    'ewma_pass_cmp_perc_def_opp', 
    'ewma4_margin_opp',
]

target = 'margin'

print("Number of Features: ", len(features))

Number of Features:  19


In [11]:
X = game_df[features]
y = game_df[target]

KeyError: "['ewma_third_conv_pct_opp', 'ewma_total_yds_def', 'ewma_total_yds_def_opp', 'ewma_third_conv_pct'] not in index"

In [8]:
def split_validate(X, y):
    '''
    Performs KFold cross-validation on linear regression model and computes model metrics
    
    Features are scaled using standard scaler.
    
    Parameters
    ----------
    X: DataFrame or numpy array of features/independent variables from training set
    y: numpy array of target/dependent variable from training set
    Returns
    ----------
    Model metrics with cross-validation: R^2 score for training and validation sets
    with each fold, mean R^2 score for validation set across all folds;
    mean training/validation score ratios, MAE, RMSE across all folds
    '''
    kf = KFold(n_splits=5, shuffle=True, random_state=71)   # default values
    train_results = []
    val_results = []
    ratios = []
    maes = []
    rmses = []
    
    for train_ind, val_ind in kf.split(X, y):
        
        # iterate thru 5 shuffled train/validation sets & collect results
        X_train, y_train = X[train_ind], y[train_ind]
        X_val, y_val = X[val_ind], y[val_ind]
        
        
        model = LinearRegression()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        train_results.append(model.score(X_train, y_train))
        val_results.append(model.score(X_val, y_val))
        ratios.append(model.score(X_train, y_train) / model.score(X_val, y_val))
        maes.append(mean_absolute_error(y_val, y_pred))
        rmses.append(np.sqrt(mean_squared_error(y_val, y_pred)))
    print('Linear regression train R^2: ', train_results)
    print('Linear regression val R^2: ', val_results)
    print(f'Linear regression mean val R^2: {np.mean(val_results):.3f} +- {np.std(r2s_val):.3f}')
    print(f'Mean train/val R^2 ratio: {np.mean(ratios):.3f} +- {np.std(ratios):.3f}')
    print('Mean MAE: ', np.mean(maes))
    print('Mean RMSE: ', np.mean(rmses))
    
    return model