In [47]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import json

import seaborn as sns

from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, LabelEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Ridge, Lasso, ElasticNet
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
import time

In [2]:
# Load the per-team season table indexed by (Year, Team), discard every column
# whose name contains "unnamed" (case-insensitive), then summarize the frame.
df = pd.read_csv(
    'intermediate-data/nfl_team_all_stats_2013_2022.csv',
    index_col=['Year', 'Team'],
)
unnamed_columns = df.columns[df.columns.str.contains('unnamed', case=False)]
df = df.drop(columns=unnamed_columns)
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 320 entries, (2022, 'Seattle Seahawks') to (2021, 'Seattle Seahawks')
Data columns (total 67 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   MadePlayoffs                     320 non-null    float64
 1   QB_COST                          320 non-null    int64  
 2   RB_COST                          320 non-null    int64  
 3   WR_COST                          320 non-null    int64  
 4   TE_COST                          320 non-null    int64  
 5   OL_COST                          320 non-null    int64  
 6   Offense_COST                     320 non-null    int64  
 7   IDL_COST                         320 non-null    int64  
 8   EDGE_COST                        320 non-null    int64  
 9   LB_COST                          320 non-null    int64  
 10  S_COST                           320 non-null    int64  
 11  CB_COST                          320

In [3]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,MadePlayoffs,QB_COST,RB_COST,WR_COST,TE_COST,OL_COST,Offense_COST,IDL_COST,EDGE_COST,LB_COST,...,defense_ave_wpa_pass,defense_ave_wpa_run,defense_success_rate_pass,defense_success_rate_run,points_scored,points_allowed,wins,losses,ties,score_differential
Year,Team,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2022,Seattle Seahawks,1.0,1557085,10799653,16831423,12778788,27955261,69922210,7701509,25013832,7377232,...,0.002187,0.002329,0.424958,0.410609,407,401,9,8,0,6
2022,San Francisco 49ers,1.0,2897535,9284062,16963177,10569598,17581156,57295528,11028483,10194288,7666512,...,-0.001752,-0.002444,0.422535,0.37037,450,277,13,4,0,173
2022,Arizona Cardinals,0.0,6172993,5306754,15022466,4425078,19171690,50098981,20498000,5347625,7964030,...,0.001529,0.002561,0.508716,0.435897,340,449,4,13,0,-109
2022,Los Angeles Rams,0.0,13705176,2637944,5479738,6217555,19240160,47280573,9376091,13967275,15711252,...,0.00328,0.002446,0.477234,0.41573,307,384,5,12,0,-77
2022,Carolina Panthers,0.0,7579454,9290450,11041036,4325000,18981126,51217066,5096152,11629844,12335911,...,0.00156,-0.000132,0.455285,0.425263,347,374,7,10,0,-27


In [4]:
# Class balance of the MadePlayoffs target: number of team-seasons per value.
df['MadePlayoffs'].value_counts()

MadePlayoffs
0.0    194
1.0    126
Name: count, dtype: int64

In [35]:
# Column groups: outcome/target columns, positional cost columns, and the
# offensive / defensive play statistics.
target_categorical_columns = ['points_scored', 'points_allowed', 'score_differential', 'MadePlayoffs', 'wins', 'losses', 'ties']
off_cost_columns = ['QB_COST', 'RB_COST', 'WR_COST', 'TE_COST', 'OL_COST', 'Offense_COST']
def_cost_columns = ['IDL_COST', 'EDGE_COST', 'LB_COST', 'S_COST', 'CB_COST', 'Defense_COST']

# Offense and defense carry the same statistics under different prefixes, so
# both name lists are generated from one list of suffixes.
_stat_suffixes = [
    'completion_percentage',
    'total_yards_gained_pass', 'total_yards_gained_run',
    'ave_yards_gained_pass', 'ave_yards_gained_run',
    'total_air_yards', 'ave_air_yards',
    'total_yac', 'ave_yac',
    'n_plays_pass', 'n_plays_run',
    'n_interceptions', 'n_fumbles_lost_pass', 'n_fumbles_lost_run',
    'total_epa_pass', 'total_epa_run',
    'ave_epa_pass', 'ave_epa_run',
    'total_wpa_pass', 'total_wpa_run',
    'ave_wpa_pass', 'ave_wpa_run',
    'success_rate_pass', 'success_rate_run',
]
off_stats_columns = [f'offense_{suffix}' for suffix in _stat_suffixes]
def_stats_columns = [f'defense_{suffix}' for suffix in _stat_suffixes]

numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
feature_columns = df.drop(columns=target_categorical_columns).columns.tolist()

# Time-based split: seasons up to and including 2020 train the models,
# seasons after 2020 are held out as test data.
seasons_lte_2020 = df.query('Year <= 2020')
seasons_gt_2020 = df.query('Year > 2020')

# target category
y_lte_2020 = seasons_lte_2020['MadePlayoffs'].reset_index(drop=True)  # training data
y_gt_2020 = seasons_gt_2020['MadePlayoffs'].reset_index(drop=True)    # test data

# feature data: all play statistics plus every outcome column except the target.
# NOTE(review): wins/losses/points are end-of-season outcomes and may leak the
# MadePlayoffs target - confirm that using them as features is intended.
target_list = off_stats_columns + def_stats_columns + [
    column for column in target_categorical_columns if column != 'MadePlayoffs'
]
X_lte_2020 = seasons_lte_2020[target_list].reset_index(drop=True)  # training data
X_gt_2020 = seasons_gt_2020[target_list].reset_index(drop=True)    # test data


In [36]:
# Preview the first rows of the training feature matrix.
X_lte_2020.head()


Unnamed: 0,offense_completion_percentage,offense_total_yards_gained_pass,offense_total_yards_gained_run,offense_ave_yards_gained_pass,offense_ave_yards_gained_run,offense_total_air_yards,offense_ave_air_yards,offense_total_yac,offense_ave_yac,offense_n_plays_pass,...,defense_ave_wpa_pass,defense_ave_wpa_run,defense_success_rate_pass,defense_success_rate_run,points_scored,points_allowed,score_differential,wins,losses,ties
0,0.570571,4091,2084,6.142643,4.590308,5288,8.474359,2014,5.3,666,...,-0.000379,-0.000629,0.430818,0.416842,444,338,106,12,4,0
1,0.505703,2932,2164,5.574144,4.461856,4326,9.031315,1345,5.056391,526,...,0.00032,-0.003379,0.4336,0.297561,290,387,-97,8,8,0
2,0.550077,3567,1447,5.496148,4.255882,5690,9.660441,1535,4.29972,649,...,-0.002415,0.001803,0.434572,0.445161,317,335,-18,8,8,0
3,0.523643,3107,2338,5.441331,4.42803,4506,8.648752,1508,5.043478,571,...,-0.003505,-0.000593,0.378029,0.368421,339,388,-49,6,10,0
4,0.590909,4136,1770,6.714286,3.839479,5023,8.571672,2080,5.714286,616,...,-0.00224,-0.003505,0.422018,0.342105,430,305,125,11,5,0


In [44]:
def encodeCategory(y):
    """
    Convert the target category/class values into integer labels.

    A new LabelEncoder is created and fitted on every call, so the mapping
    is derived solely from the values present in ``y``.

    Args:
        y: target category from data set

    Returns:
        y_encoded: encoded version of target category from data set
    """
    return LabelEncoder().fit_transform(y)


In [43]:
# Fit ONE label encoder on the training labels and reuse it for the test
# labels. Fitting a separate encoder per split derives each mapping from that
# split alone, so the same class could receive different integers in train and
# test whenever the two splits do not contain exactly the same set of classes.
playoff_label_encoder = LabelEncoder().fit(y_lte_2020)

# encode training data
y_lte_2020_encoded = playoff_label_encoder.transform(y_lte_2020)

# encode test data (raises ValueError on a class never seen during training)
y_gt_2020_encoded = playoff_label_encoder.transform(y_gt_2020)

In [59]:
def rmse(y_test, y_pred):
    """
    Root mean squared error between true and predicted target values.

    Args:
        y_test: target category from test data
        y_pred: predicted target category from test data

    Returns:
        float: root mean squared error (RMSE)
    """
    mse = mean_squared_error(y_test, y_pred)
    return np.sqrt(mse)

# Wrap rmse as a scikit-learn scorer computed from `predict` output.
# greater_is_better=False marks RMSE as a loss, so the scorer reports it
# negated and consumers must flip the sign to recover the RMSE.
rmse_scorer = make_scorer(
    rmse,
    response_method=['predict'],
    greater_is_better=False,
)

def createEncoderPreprocessor(scaler, feature_columns):
    """
    Build a ColumnTransformer that applies one scaler/encoder to the listed
    feature columns.

    Args:
        scaler: transformer used to scale/encode the data (i.e. StandardScaler(), OrdinalEncoder())
        feature_columns: list of feature names the transformer is applied to

    Returns:
        ColumnTransformer with a single step named 'encoder' wrapping ``scaler``
    """
    encoder_step = ('encoder', scaler, feature_columns)
    return ColumnTransformer(transformers=[encoder_step])

# Default encoder/scaler handed to createEncoderPreprocessor by the model
# pipelines in this notebook when no other encoder is supplied.
ENCODER = StandardScaler()




In [48]:
# Baseline models: every estimator keeps its default hyper-parameters; only the
# random seed is fixed where the estimator accepts one.
models = {
    # classifiers
    'knn-c': KNeighborsClassifier(),
    'decisiontree-c': DecisionTreeClassifier(random_state=42),
    'randomforest-c': RandomForestClassifier(random_state=42),
    'svc': SVC(random_state=42),
    # regressors
    'knn-r': KNeighborsRegressor(),
    'decisiontree-r': DecisionTreeRegressor(random_state=42),
    'randomforest-r': RandomForestRegressor(random_state=42),
    'svr': SVR(),
    # regularized linear regressors
    'ElasticNet': ElasticNet(random_state=42),
    'Ridge': Ridge(random_state=42),
    'Lasso': Lasso(random_state=42),
}

In [52]:
def executeModelsForBaselineEval(pipe_models, X_train, y_train, X_test, y_test, scorer_fx) :
    """
    Fit every model with its current (baseline) hyper-parameters inside a
    preprocessing pipeline and collect one row of scores per model.

    Each model is wrapped in a Pipeline (ENCODER preprocessor + model) and run
    through GridSearchCV with an empty parameter grid, i.e. a single
    candidate evaluated with 5-fold cross-validation and then refitted.

    Args:
        pipe_models: dict: model name -> estimator instance
        X_train: DataFrame: contains training features
        y_train: ndarray: contains target category training data
        X_test: DataFrame: contains test features
        y_test: ndarray: contains target category test data
        scorer_fx: scorer passed to GridSearchCV; the 'rmse' column is only
            meaningful when this is a negated-error scorer such as rmse_scorer

    Returns:
        DataFrame: one row per model with the columns
            'index'            - row number (from reset_index)
            'model'            - model name suffixed with '-baseline'
            'feature count'    - number of columns in X_train
            'best params'      - best_params_ of the grid search
            'train score'      - best estimator's score() on the training data
            'test score'       - best estimator's score() on the test data
            'rmse'             - negated best cross-validation score
            'average fit time' - elapsed seconds of the whole grid-search fit
                                 divided by the number of parameter candidates
    """
    # The feature list does not change between models; build it once.
    feature_names = X_train.columns.tolist()
    feature_cnt = len(feature_names)

    results = []
    for name, model in pipe_models.items():
        # Create a pipeline
        pipeline = Pipeline([
            ('preprocessor', createEncoderPreprocessor(ENCODER, feature_names)),
            (name, model)
        ])

        # Empty grid => one candidate (the model as configured), cross-validated
        grid_search = GridSearchCV(pipeline, param_grid={}, cv=5, n_jobs=-1, scoring=scorer_fx)

        # Time the fit with a monotonic clock: time.time() follows the system
        # clock and can jump, which would corrupt the measured interval.
        start_time = time.perf_counter()
        grid_search.fit(X_train, y_train)
        elapsed = time.perf_counter() - start_time
        fit_time = elapsed / len(grid_search.cv_results_['mean_fit_time'])

        # Get the best (refitted) estimator
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_

        # Evaluate on training and test sets using the estimator's own score()
        train_score = best_model.score(X_train, y_train)
        test_score = best_model.score(X_test, y_test)

        # The scorer reports a negated error; flip the sign to recover it
        best_rmse = -grid_search.best_score_

        # Append the results
        results.append([name + '-baseline', feature_cnt, best_params, train_score, test_score, best_rmse, fit_time])

    # Create the results DataFrame
    results_df = pd.DataFrame(results, columns=['model', 'feature count', 'best params', 'train score', 'test score', 'rmse', 'average fit time'])
    results_df = results_df.reset_index()

    return results_df


In [53]:
# STEP 1: Feature Selection - baseline for all stats vs MadePlayoffs, using the
# training features X_lte_2020 and the held-out seasons X_gt_2020.
baseline_results_path = 'results/allstats_makeplayoffs_baseline_model_results_df.json'

# fit every baseline model and persist the scores
results_df = executeModelsForBaselineEval(
    models,
    X_lte_2020, y_lte_2020_encoded,
    X_gt_2020, y_gt_2020_encoded,
    rmse_scorer,
)
results_df.to_json(baseline_results_path, orient='records', double_precision=10)

# baseline results, reloaded from the persisted file
baseline_df = pd.read_json(baseline_results_path)
baseline_df[['model', 'train score', 'test score', 'rmse', 'average fit time']]


Unnamed: 0,model,train score,test score,rmse,average fit time
0,knn-c-baseline,0.910156,0.828125,0.354344,0.097406
1,decisiontree-c-baseline,1.0,0.90625,0.335189,0.034633
2,randomforest-c-baseline,1.0,0.875,0.277802,0.350802
3,svc-baseline,0.957031,0.828125,0.351932,0.035344
4,knn-r-baseline,0.725549,0.471746,0.312917,0.107342
5,decisiontree-r-baseline,1.0,0.619048,0.322601,0.039159
6,randomforest-r-baseline,0.956657,0.633829,0.25732,0.613784
7,svr-baseline,0.924564,0.478675,0.308364,0.049256
8,ElasticNet-baseline,0.0,-0.012153,0.48616,0.023503
9,Ridge-baseline,0.687929,0.532914,0.333586,0.034782


In [60]:
def createCorrelationHeatMap(X, figsize):
    """
    Plot the pairwise feature correlations of ``X`` as an annotated heat map.

    Args:
        X: DataFrame: contains feature data
        figsize: tuple containing the size of the diagram
    """
    corr = X.corr()
    tick_labels = corr.columns

    heatmap_options = dict(
        annot=True,  # Consider turning off annotations for speed
        cmap='coolwarm',
        linewidths=0.5,
        linecolor='black',
        xticklabels=tick_labels,
        yticklabels=tick_labels,
    )

    # Generate the heatmap on a new figure of the requested size
    plt.figure(figsize=figsize)
    sns.heatmap(corr, **heatmap_options)
    plt.show()

def calculatePermutationImportance(model_regressor, X_train, y_train, X_test, y_test, encoder=None) :
    """
    Fit a preprocessing pipeline around the given model and compute each
    feature's permutation importance on the test data.

    The pipeline (encoder preprocessor + model) is fitted on the training
    data. Its score and RMSE are then computed on the test data, and
    permutation_importance is run on the test data with 10 repeats and
    random_state=42.

    NOTE: can be used to compare accuracy and correlation of features with permutation importance.
    If high accuracy is observed with no features having importance there might be multicollinearity
    occurring within the feature set.

    Args:
        model_regressor: estimator used for selecting features
            (i.e. RandomForestRegressor(n_estimators=100, random_state=42))
        X_train: features from training data set
        y_train: category from training data set
        X_test: features from test data set
        y_test: category from test data set
        encoder: scaler/encoder used to encode the data set; defaults to ENCODER

    Returns:
        tuple of four values:
            DataFrame: one row per feature (indexed by feature name) with the mean
                       permutation importance in column 'Importance', sorted descending
            int:       number of features in that DataFrame
            float:     the fitted pipeline's score() on the test data
            float:     RMSE between y_test and the pipeline's predictions
    """
    # Identity check: `== None` would go through the encoder's __eq__.
    if encoder is None:
        encoder = ENCODER # default

    pipe_model = Pipeline([('preprocessor', createEncoderPreprocessor(encoder, X_train.columns.tolist())),
                           ('regressor', model_regressor)
                        ])

    pipe_model.fit(X_train, y_train)
    score = pipe_model.score(X_test, y_test)
    y_pred = pipe_model.predict(X_test)
    error = rmse(y_test, y_pred)

    results = permutation_importance(pipe_model, X_test, y_test, n_repeats=10, n_jobs=-1, random_state=42)
    results_df = pd.DataFrame(data=results.importances_mean, index=X_train.columns, columns=['Importance']).sort_values(by='Importance', ascending=False)

    return results_df, len(results_df), score, error



In [65]:
# all stats vs making playoffs
step1_perm_imp_selected_features, feature_cnt, score, error = calculatePermutationImportance(
    RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    X_lte_2020, y_lte_2020_encoded, X_gt_2020, y_gt_2020_encoded
)
# Alternative experiment kept for reference (not run here):
# step1_perm_imp_selected_features, feature_cnt, score, error = calculatePermutationImportance(Lasso(alpha=1.0, fit_intercept=False, max_iter=10000, random_state=42),
#                                             allstats_X_train, score_diff_y_train, allstats_X_test, score_diff_y_test
#                                         )
# print(f'STEP 1: Feature Evaluation for [total stats vs points differential]:\nAccuracy: feature-count: {feature_cnt}, {score}, RMSE: {error}\n{step1_perm_imp_selected_features}\n')

# The label names the model and target actually used above (the previous text
# said "Lasso(L1)" / "points differential"), and each value sits after its own label.
print(f'STEP 1: Feature Evaluation using RandomForestClassifier for [all stats vs MadePlayoffs]:\nfeature-count: {feature_cnt}, Accuracy: {score}, RMSE: {error}')


STEP 1: Feature Evaluation using Lasso(L1) for [total stats vs points differential]:
Accuracy: feature-count: 54, 0.875, RMSE: 0.3535533905932738


In [None]:
# Correlation heat map of the training feature matrix (seasons <= 2020).
createCorrelationHeatMap(figsize=(40, 32), X=X_lte_2020)


In [68]:
# Candidate coefficient seeds.
# The full grid, Cs = np.logspace(-5, .5), holds 50 log-spaced values running
# from 1.00000000e-05 up to 3.16227766e+00; six of those 50 values were
# randomly selected to keep the search small.
Cs = [
    1.00000000e-05,
    8.09400122e-04,
    2.27584593e-03,
    6.55128557e-02,
    5.17947468e-01,
    3.16227766e+00,
]

In [70]:
def createCoefToFeatureList(X_train, y_train, coefseeds, pipe_models=None) :
    """
    Build a convergence table of fitted coefficients per feature, one row per
    coefficient seed (and per model when several models are supplied).

    For every seed and every model, a pipeline (ENCODER preprocessor + model)
    is tuned with GridSearchCV (cv=10, scored with rmse_scorer) and the
    ``coef_`` of the best estimator is recorded.

    The seed is applied by pinning the model's regularization parameter to that
    single value in the search grid: 'C' when the model has such a parameter,
    otherwise 'alpha'. An entry for that parameter in the supplied grid is
    replaced by the seed. Models with neither parameter are fitted with the
    supplied grid unchanged.

    Args:
        X_train : X features from training data set
        y_train : target category from training data set
        coefseeds : list of regularization values to sweep
        pipe_models : dict: model name -> (estimator, hyper-parameter grid),
            where the grid is a dict or a list of dicts as accepted by GridSearchCV

    Returns:
        DataFrame: one column per feature holding the fitted coefficients.
            With a single model the index is ``coefseeds``; with several models
            the index is a (coefseed, model) MultiIndex.

    Raises:
        ValueError: if pipe_models is None/empty, or a model's coefficients do
            not flatten to one value per feature (e.g. multi-class coef_).
        AttributeError: if a fitted model does not expose ``coef_``.
    """
    if not pipe_models:
        raise ValueError("pipe_models must be a non-empty dict of name -> (model, hyperparams)")

    coefseeds = list(coefseeds)
    feature_names = X_train.columns.tolist()

    coef_rows = []
    row_labels = []
    for C in coefseeds:
        for name, (model, hyperparams) in pipe_models.items():
            # Create a pipeline
            pipeline = Pipeline([
                ('preprocessor', createEncoderPreprocessor(ENCODER, feature_names)),
                (name, model)
            ])

            # Pin the seed into the grid; without this every seed would fit
            # exactly the same model and all rows would be identical.
            model_params = model.get_params()
            seed_param = next((p for p in ('C', 'alpha') if p in model_params), None)
            grids = [hyperparams] if isinstance(hyperparams, dict) else list(hyperparams)
            if seed_param is not None:
                grids = [{**grid, f'{name}__{seed_param}': [C]} for grid in grids]

            model_grid = GridSearchCV(pipeline, param_grid=grids, cv=10, n_jobs=-1, scoring=rmse_scorer)
            model_grid.fit(X_train, y_train)

            fitted_model = model_grid.best_estimator_.named_steps[name]
            # getattr with a default also covers estimators whose coef_
            # property raises AttributeError for the selected configuration.
            coef = getattr(fitted_model, 'coef_', None)
            if coef is None:
                raise AttributeError(
                    f"model '{name}' ({type(fitted_model).__name__}) does not expose coef_; "
                    "only models with per-feature coefficients can be tabulated"
                )

            # Binary classifiers report coef_ with shape (1, n_features); flatten
            # it so each row holds exactly one value per feature.
            coef = np.ravel(coef)
            if coef.size != len(feature_names):
                raise ValueError(
                    f"model '{name}' produced {coef.size} coefficients for "
                    f"{len(feature_names)} features"
                )

            coef_rows.append(coef)
            row_labels.append((C, name))

    coef_df = pd.DataFrame(data=coef_rows, columns=X_train.columns)
    if len(pipe_models) == 1:
        coef_df.index = coefseeds
    else:
        # Several models yield len(coefseeds) * len(pipe_models) rows, so the
        # seeds alone cannot label them.
        coef_df.index = pd.MultiIndex.from_tuples(row_labels, names=['coefseed', 'model'])
    return coef_df



In [None]:
# Candidate models and their hyper-parameter grids. Every grid key must be
# prefixed with the model's dict key, because that key becomes the pipeline
# step name the parameters are routed to.
# NOTE(review): createCoefToFeatureList reads `coef_` from the best estimator;
# KNN, decision-tree and random-forest classifiers (and SVC with a non-linear
# kernel) do not provide it - confirm this model set is the intended one.
feature_select_hypermodels = {
    'knn-c': (KNeighborsClassifier(n_jobs=-1), {
                                    # was 'knn__n_neighbors': no step is named 'knn',
                                    # so GridSearchCV rejected the parameter
                                    'knn-c__n_neighbors': [2, 5, 10],
                                    'knn-c__weights': ['uniform', 'distance'],
                                    'knn-c__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                                    'knn-c__p': [1]}), # l1 error only
    'dt-c': (DecisionTreeClassifier(random_state=42), {
                                                    'dt-c__criterion': ['gini', 'entropy', 'log_loss']
                                                }),
    'rf-c': (RandomForestClassifier(random_state=42, n_jobs=-1), {
                                                    'rf-c__n_estimators': [100, 1000],
                                                    'rf-c__criterion': ['gini', 'entropy', 'log_loss']
                                                }),
    'svc': (SVC(random_state=42), {
                'svc__C': [0.1, 1, 10],
                # 'linear' was listed twice, which only repeated identical fits
                'svc__kernel': ['linear', 'rbf', 'poly', 'sigmoid']
            }),
}

# STEP 1: Feature Selection: create initial coeficients
print("STEP 1: Feature selection - generating initial coeficients")
optimal_features_df = createCoefToFeatureList(X_lte_2020, y_lte_2020_encoded, Cs, feature_select_hypermodels)
optimal_features_df