In [56]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import json

import seaborn as sns

from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Ridge, Lasso
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
import time

In [25]:
df = pd.read_csv('intermediate-data/nfl_team_all_stats_2013_2022.csv', index_col=['Year', 'Team'])
df.drop(df.columns[df.columns.str.contains('unnamed', case=False)], axis=1, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 320 entries, (2022, 'Seattle Seahawks') to (2021, 'Seattle Seahawks')
Data columns (total 67 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   MadePlayoffs                     320 non-null    float64
 1   QB_COST                          320 non-null    int64  
 2   RB_COST                          320 non-null    int64  
 3   WR_COST                          320 non-null    int64  
 4   TE_COST                          320 non-null    int64  
 5   OL_COST                          320 non-null    int64  
 6   Offense_COST                     320 non-null    int64  
 7   IDL_COST                         320 non-null    int64  
 8   EDGE_COST                        320 non-null    int64  
 9   LB_COST                          320 non-null    int64  
 10  S_COST                           320 non-null    int64  
 11  CB_COST                          320

In [26]:
# dataset balance of subscribed class (derived from column '1 or 0').
df['MadePlayoffs'].value_counts()

MadePlayoffs
0.0    194
1.0    126
Name: count, dtype: int64

In [37]:
# get column names for target categories, off-stats, def-stats, off-cost, def-cost
target_categorical_columns = ['points_scored', 'points_allowed', 'score_differential', 'MadePlayoffs', 'wins', 'losses', 'ties']
off_cost_columns =  ['QB_COST', 'RB_COST', 'WR_COST', 'TE_COST', 'OL_COST', 'Offense_COST']
def_cost_columns = ['IDL_COST', 'EDGE_COST', 'LB_COST', 'S_COST', 'CB_COST', 'Defense_COST']
off_stats_columns = ['offense_completion_percentage', 'offense_total_yards_gained_pass', 'offense_total_yards_gained_run', 'offense_ave_yards_gained_pass', 'offense_ave_yards_gained_run', 'offense_total_air_yards', 'offense_ave_air_yards', 'offense_total_yac', 'offense_ave_yac', 'offense_n_plays_pass', 'offense_n_plays_run', 'offense_n_interceptions', 'offense_n_fumbles_lost_pass', 'offense_n_fumbles_lost_run', 'offense_total_epa_pass', 'offense_total_epa_run', 'offense_ave_epa_pass', 'offense_ave_epa_run', 'offense_total_wpa_pass', 'offense_total_wpa_run', 'offense_ave_wpa_pass', 'offense_ave_wpa_run', 'offense_success_rate_pass', 'offense_success_rate_run']
def_stats_columns = ['defense_completion_percentage', 'defense_total_yards_gained_pass', 'defense_total_yards_gained_run', 'defense_ave_yards_gained_pass', 'defense_ave_yards_gained_run', 'defense_total_air_yards', 'defense_ave_air_yards', 'defense_total_yac', 'defense_ave_yac', 'defense_n_plays_pass', 'defense_n_plays_run', 'defense_n_interceptions', 'defense_n_fumbles_lost_pass', 'defense_n_fumbles_lost_run', 'defense_total_epa_pass', 'defense_total_epa_run', 'defense_ave_epa_pass', 'defense_ave_epa_run', 'defense_total_wpa_pass', 'defense_total_wpa_run', 'defense_ave_wpa_pass', 'defense_ave_wpa_run', 'defense_success_rate_pass', 'defense_success_rate_run']

numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
feature_columns = df.drop(['points_scored', 'points_allowed', 'score_differential', 'MadePlayoffs', 'wins', 'losses', 'ties'], axis=1).columns.tolist()

# create training and test data frames where the training data is from the year prior to 2021 and
# test data is after 2020.

# target categories
points_allowed_y_train = df.loc[:, ['points_allowed']].query('Year <= 2020')
points_allowed_y_test = df.loc[:, ['points_allowed']].query('Year > 2020')

points_scored_y_train = df.loc[:, ['points_scored']].query('Year <= 2020')
points_scored_y_test = df.loc[:, ['points_scored']].query('Year > 2020')

score_diff_y_train = df.loc[:, ['score_differential']].query('Year <= 2020')
score_diff_y_test = df.loc[:, ['score_differential']].query('Year > 2020')

# feature data sets
offcost_X_train = df.loc[:, off_cost_columns].query('Year <= 2020')
offcost_X_test = df.loc[:, off_cost_columns].query('Year > 2020')

defcost_X_train = df.loc[:, def_cost_columns].query('Year <= 2020')
defcost_X_test = df.loc[:, def_cost_columns].query('Year > 2020')

offstats_X_train = df.loc[:, off_stats_columns].query('Year <= 2020')
offstats_X_test = df.loc[:, off_stats_columns].query('Year > 2020')

defstats_X_train = df.loc[:, def_stats_columns].query('Year <= 2020')
defstats_X_test = df.loc[:, def_stats_columns].query('Year > 2020')

target_list = off_cost_columns + def_cost_columns
allcosts_X_train = df.loc[:, target_list].query('Year <= 2020')
allcosts_X_test = df.loc[:, target_list].query('Year > 2020')

target_list = off_stats_columns + def_stats_columns
allstats_X_train = df.loc[:, target_list].query('Year <= 2020')
allstats_X_test = df.loc[:, target_list].query('Year > 2020')


In [None]:
# Calculate the correlation between features
# and create a heat map from the results
def createCorrelationHeatMap(X) :
    corr = X.corr()

    # Generate the heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr,annot=True, cmap='coolwarm', linewidths=0.5, linecolor='black',
                xticklabels=corr.columns, yticklabels=corr.columns)  # Consider turning off annotations for speed
    plt.show()

# use standard encoder to encode features
def encodeFeatures(feature_columns) :
    preprocessor = ColumnTransformer(
        transformers=[
            ('encoder', StandardScaler(), feature_columns)
        ])
    return preprocessor

# Encode the target category/class using a
# label encoder
def encodeCategory(y) :
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)

    return y_encoded

# get root mean squared error using test data and predicted results
def rmse(y_test, y_pred) :
    return np.sprt(mean_squared_error(y_test, y_pred))

# this calculates each feature's permutation-importance on the test data
# using a model regressor (i.e. RandomForestRegressor(n_estimators=100, random_state=42))
# 
# NOTE: can be used to compare accuracy and correlation of features with permutation importance
# If high accuracy is observed with no features having importance there might be multicollinearity
# occuring within feature set.
# 
# Return: Tuple {
#                data frame: containing importance score for each column in training/test data, 
#                float:      accuracy score from baseline prediction using model-regressor
#               }
# 
def evaluateFeatures(model_regressor, X_train, y_train, X_test, y_test) :
    pipe_model = Pipeline([('preprocessor', encodeFeatures(X_train.columns.tolist())),
                           ('regressor', model_regressor)
                        ])
    
    pipe_model.fit(X_train, y_train)
    score = pipe_model.score(X_test, y_test)

    results = permutation_importance(pipe_model, X_test, y_test, n_repeats=10, n_jobs=-1, random_state=42)
    
    return pd.DataFrame(data=results.importances_mean, index=X_train.columns, columns=['Importance']).sort_values(by='Importance', ascending=False), score
    # for feature_name, importance_score in zip(X_train.columns, result.importances_mean) :


In [38]:
# evaluate features
# offensive stats vs points scored
selected_features, score = evaluateFeatures(RandomForestRegressor(n_estimators=100, random_state=42),
                                            offstats_X_train, points_scored_y_train, offstats_X_test, points_scored_y_test
                                        )
print(f'Feature Evaluation for [offensive stats vs points scored]:\nAccuracy: {score}\n{selected_features.head(20)}')

# total stats vs points scored
selected_features, score = evaluateFeatures(RandomForestRegressor(n_estimators=100, random_state=42),
                                            allstats_X_train, points_scored_y_train, allstats_X_test, points_scored_y_test
                                        )
print(f'Feature Evaluation for [total stats vs points scored]:\nAccuracy: {score}\n{selected_features.head(20)}')

# defensive stats vs points allowed
selected_features, score = evaluateFeatures(RandomForestRegressor(n_estimators=100, random_state=42),
                                            defstats_X_train, points_allowed_y_train, defstats_X_test, points_allowed_y_test
                                        )
print(f'Feature Evaluation for [defensive stats vs points allowed]:\nAccuracy: {score}\n{selected_features.head(20)}')

# total stats vs points scored
selected_features, score = evaluateFeatures(RandomForestRegressor(n_estimators=100, random_state=42),
                                            allstats_X_train, points_allowed_y_train, allstats_X_test, points_allowed_y_test
                                        )
print(f'Feature Evaluation for [total stats vs points allowed]:\nAccuracy: {score}\n{selected_features.head(20)}')

# offensive stats vs points differential
selected_features, score = evaluateFeatures(RandomForestRegressor(n_estimators=100, random_state=42),
                                            offstats_X_train, score_diff_y_train, offstats_X_test, score_diff_y_test
                                        )
print(f'Feature Evaluation for [offensive stats vs points differential]:\nAccuracy: {score}\n{selected_features.head(20)}')

# defensive stats vs points differential
selected_features, score = evaluateFeatures(RandomForestRegressor(n_estimators=100, random_state=42),
                                            defstats_X_train, score_diff_y_train, defstats_X_test, score_diff_y_test
                                        )
print(f'Feature Evaluation for [defensive stats vs points differential]:\nAccuracy: {score}\n{selected_features.head(20)}')

# all stats vs points differential
selected_features, score = evaluateFeatures(RandomForestRegressor(n_estimators=100, random_state=42),
                                            allstats_X_train, score_diff_y_train, allstats_X_test, score_diff_y_test
                                        )
print(f'Feature Evaluation for [total stats vs points differential]:\nAccuracy: {score}\n{selected_features.head(20)}')



  return fit_method(estimator, *args, **kwargs)


Feature Evaluation for [offensive stats vs points scored]:
Accuracy: 0.7555663929920715
                                 Importance
offense_ave_epa_pass               0.393259
offense_total_epa_pass             0.180124
offense_ave_wpa_run                0.012357
offense_total_yards_gained_run     0.009446
offense_ave_yards_gained_pass      0.004890
offense_success_rate_run           0.004411
offense_total_yards_gained_pass    0.004363
offense_total_wpa_run              0.003911
offense_ave_yards_gained_run       0.003136
offense_ave_epa_run                0.002828
offense_n_plays_pass               0.002574
offense_n_plays_run                0.002520
offense_ave_wpa_pass               0.002335
offense_total_yac                  0.002067
offense_total_epa_run              0.000583
offense_ave_yac                    0.000347
offense_total_wpa_pass            -0.000046
offense_total_air_yards           -0.000220
offense_n_fumbles_lost_run        -0.001169
offense_completion_percentage   

  return fit_method(estimator, *args, **kwargs)


Feature Evaluation for [total stats vs points scored]:
Accuracy: 0.759797966108089
                                 Importance
offense_ave_epa_pass               0.356137
offense_total_epa_pass             0.170487
offense_success_rate_run           0.009039
offense_total_yards_gained_run     0.007573
offense_ave_wpa_run                0.007517
defense_n_plays_pass               0.007389
offense_ave_wpa_pass               0.006226
offense_ave_yards_gained_run       0.004655
offense_ave_epa_run                0.004582
defense_n_fumbles_lost_pass        0.004377
offense_total_wpa_run              0.004114
offense_total_yards_gained_pass    0.003325
defense_n_interceptions            0.002911
offense_n_plays_pass               0.002664
offense_total_yac                  0.002319
offense_total_epa_run              0.002301
offense_ave_yac                    0.001480
offense_total_wpa_pass             0.001370
defense_ave_epa_pass               0.001326
offense_ave_yards_gained_pass      0.

  return fit_method(estimator, *args, **kwargs)


Feature Evaluation for [defensive stats vs points allowed]:
Accuracy: 0.5713743879770854
                                 Importance
defense_ave_epa_pass               0.213684
defense_total_yards_gained_run     0.083642
defense_total_epa_pass             0.071565
defense_n_plays_run                0.031472
defense_ave_wpa_run                0.014305
defense_ave_yards_gained_pass      0.011838
defense_total_yards_gained_pass    0.008746
defense_success_rate_pass          0.008112
defense_success_rate_run           0.003520
defense_total_wpa_run              0.003286
defense_ave_air_yards              0.003052
defense_total_epa_run              0.002443
defense_ave_epa_run                0.002392
defense_total_air_yards            0.001048
defense_n_plays_pass              -0.000146
defense_ave_yards_gained_run      -0.000579
defense_n_fumbles_lost_pass       -0.000746
defense_completion_percentage     -0.001396
defense_ave_wpa_pass              -0.001850
defense_n_fumbles_lost_run     

  return fit_method(estimator, *args, **kwargs)


Feature Evaluation for [total stats vs points allowed]:
Accuracy: 0.5285990755796877
                                 Importance
defense_ave_epa_pass               0.173616
defense_total_yards_gained_run     0.076547
defense_total_epa_pass             0.056631
offense_n_interceptions            0.044040
defense_n_plays_run                0.023014
defense_ave_yards_gained_pass      0.014875
defense_ave_yards_gained_run       0.010527
defense_ave_epa_run                0.010237
defense_total_epa_run              0.010018
defense_total_wpa_run              0.009488
defense_ave_wpa_run                0.007703
defense_total_yards_gained_pass    0.006976
offense_n_fumbles_lost_pass        0.005845
offense_n_plays_pass               0.005772
offense_ave_epa_run                0.004192
defense_success_rate_pass          0.003274
offense_total_epa_run              0.002887
offense_total_air_yards            0.002343
offense_total_epa_pass             0.001850
offense_total_yards_gained_pass    

  return fit_method(estimator, *args, **kwargs)


Feature Evaluation for [offensive stats vs points differential]:
Accuracy: 0.6659588492819399
                                Importance
offense_total_epa_pass            0.346610
offense_ave_epa_pass              0.328379
offense_total_yards_gained_run    0.018314
offense_n_plays_run               0.016332
offense_ave_epa_run               0.015968
offense_ave_wpa_run               0.012913
offense_total_wpa_run             0.006751
offense_n_fumbles_lost_run        0.004357
offense_ave_air_yards             0.003777
offense_n_fumbles_lost_pass       0.002584
offense_total_air_yards           0.002208
offense_ave_yards_gained_pass     0.001438
offense_completion_percentage     0.000611
offense_total_wpa_pass            0.000221
offense_ave_yac                  -0.000851
offense_n_plays_pass             -0.001237
offense_total_yac                -0.002216
offense_ave_wpa_pass             -0.003315
offense_success_rate_run         -0.004271
offense_n_interceptions          -0.005269


  return fit_method(estimator, *args, **kwargs)


Feature Evaluation for [defensive stats vs points differential]:
Accuracy: 0.48030774030577594
                                 Importance
defense_n_plays_run                0.384079
defense_ave_epa_pass               0.140127
defense_completion_percentage      0.099345
defense_success_rate_run           0.020558
defense_total_epa_pass             0.015088
defense_n_fumbles_lost_pass        0.013895
defense_n_interceptions            0.013163
defense_n_fumbles_lost_run         0.009754
defense_ave_yards_gained_pass      0.008766
defense_n_plays_pass               0.000475
defense_ave_yards_gained_run       0.000078
defense_total_wpa_run             -0.000548
defense_ave_wpa_run               -0.000661
defense_ave_epa_run               -0.000724
defense_ave_yac                   -0.000774
defense_total_yards_gained_pass   -0.002074
defense_total_wpa_pass            -0.002985
defense_total_epa_run             -0.004032
defense_success_rate_pass         -0.004130
defense_total_yac        

  return fit_method(estimator, *args, **kwargs)


Feature Evaluation for [total stats vs points differential]:
Accuracy: 0.8341737040963376
                                Importance
offense_total_epa_pass            0.180254
offense_ave_epa_pass              0.162008
defense_ave_epa_pass              0.046784
offense_ave_epa_run               0.015094
defense_n_plays_run               0.010329
offense_total_epa_run             0.010082
defense_total_epa_pass            0.007084
defense_ave_yards_gained_pass     0.005235
offense_n_interceptions           0.004367
offense_n_plays_run               0.003898
offense_ave_yards_gained_pass     0.003668
offense_ave_wpa_run               0.003325
offense_total_wpa_run             0.003212
defense_ave_wpa_pass              0.003105
defense_n_interceptions           0.002801
offense_success_rate_run          0.002736
defense_success_rate_pass         0.002655
defense_total_wpa_pass            0.002575
offense_total_yards_gained_run    0.002329
defense_total_yards_gained_run    0.002140


In [52]:
# Baseline Models:
models = {
    'knn-r': (KNeighborsRegressor()),
    'randomforest': (RandomForestRegressor()),
    'svr': (SVR()),
    'decisiontree-r': (DecisionTreeRegressor()),
    'Ridge' : (Ridge(random_state=42)),
    'Lasso': (Lasso())
}

In [None]:
results = []

# set target data set
X_train = allstats_X_train
X_test = allstats_X_test
y_train = score_diff_y_train
y_test = score_diff_y_test

# Execute pipeline containing encoded data
# and target models
rmse_scorer = make_scorer(rmse, greater_is_better=False)
for name, (model) in models.items():
    # Create a pipeline
    pipeline = Pipeline([
        ('preprocessor', encodeFeatures(X_train.columns.tolist())),
        (name, model)
    ])
    
    # Perform grid search
    grid_search = GridSearchCV(pipeline, param_grid={}, cv=5, n_jobs=-1, scoring=rmse_scorer)
    
    # Fit the model and time it
    start_time = time.time()
    grid_search.fit(X_train, y_train)
    fit_time = (time.time() - start_time) / len(grid_search.cv_results_['mean_fit_time'])
    
    # Get the best estimator
    best_model = grid_search.best_estimator_
    best_rmse = -grid_search.best_score_
    # print("Best params: \n", grid_search.best_params_)
    print("\nBest estimator: \n", grid_search.best_estimator_)
    
    # Evaluate on training and test sets
    train_score = best_model.score(X_train, y_train)
    test_score = best_model.score(X_test, y_test)
    
    # Append the results
    results.append([name, train_score, test_score, best_rmse, fit_time])

# Create the results DataFrame
results_df = pd.DataFrame(results, columns=['model', 'train score', 'test score', 'rmse', 'average fit time'])
results_df.reset_index(inplace=True)

results_df.to_json('results/baseline_model_results.json', orient='records', double_precision=10)

Best params: 
 {}

Best estimator: 
 Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('encoder', StandardScaler(),
                                                  ['offense_completion_percentage',
                                                   'offense_total_yards_gained_pass',
                                                   'offense_total_yards_gained_run',
                                                   'offense_ave_yards_gained_pass',
                                                   'offense_ave_yards_gained_run',
                                                   'offense_total_air_yards',
                                                   'offense_ave_air_yards',
                                                   'offense_total_yac',
                                                   'offense_ave_yac',
                                                   'offens...
                                                   'offense_total_wpa_r

  return fit_method(estimator, *args, **kwargs)


Best params: 
 {}

Best estimator: 
 Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('encoder', StandardScaler(),
                                                  ['offense_completion_percentage',
                                                   'offense_total_yards_gained_pass',
                                                   'offense_total_yards_gained_run',
                                                   'offense_ave_yards_gained_pass',
                                                   'offense_ave_yards_gained_run',
                                                   'offense_total_air_yards',
                                                   'offense_ave_air_yards',
                                                   'offense_total_yac',
                                                   'offense_ave_yac',
                                                   'offens...
                                                   'offense_total_wpa_r

  y = column_or_1d(y, warn=True)


In [None]:
# baseline results
baseline_df = pd.read_json(r'results/baseline_model_results.json')
baseline_df[['model', 'train score','test score', 'rmse','average fit time']]

Unnamed: 0,model,train score,test score,rmse,average fit time
0,knn-r,0.884443,0.782013,40.686729,2.002221
1,randomforest,0.978108,0.850417,39.281983,2.294578
2,svr,0.194358,0.166547,90.057336,1.208097
3,decisiontree-r,1.0,0.663067,55.809739,1.095784
4,Ridge,0.957276,0.921112,25.098497,0.029948
5,Lasso,0.950154,0.927551,24.553355,0.045877
