# Load Packages

In [20]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as ss
from collections import Counter
import math
from scipy import stats
%matplotlib inline

# Feature Selection general package imports
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# We probably don't need gridlines, but if we do comment this line
sns.set(style='ticks')

flatui = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"]
flatui = sns.color_palette(flatui)

# Set Up Parameters

In [17]:
# If you want to hard limit features selected, or if feature selection methods require a max:
num_feats = 50

# Classification or Regression Togggle
classification_toggle = 1

# Read Data and Preprocess

In [21]:
# Read dataframe

data = pd.read_csv("../../data/clean/model_plays.csv", low_memory = False)

# Test for run and pass predictors. Comment out if we don't want this
data = data[((data['target'] == 'run') |  (data['target'] == 'pass'))]

# Remove labels
del data['play_id']
del data['game_id']

# Separate Target from dataset
#firstdata['target'] = data['target'].astype('category')
#target_cat_label = dict(enumerate(data.target.categories))

# Change categorical variables to numerical
data['target'] = data['target'].astype('category').cat.codes
# data['target'] = data['target'].map(target_cat_label)

target = data['target']
del data['target']
y = target

# Normalize features
scaler = StandardScaler()
df = pd.DataFrame(scaler.fit_transform(data), columns = data.columns)

# Min Max features
min_scaler = MinMaxScaler()
df2 = pd.DataFrame(min_scaler.fit_transform(data), columns = data.columns)

feature_name = list(df.columns)

# Filter Methods

Filter methods consider relationship between features and target variable

# Pearson Correlation

Pearson correlation usually works best with all numerical variables. Binary variables may (will) cause some issues, especially if there are more binary/categorical than numeric.

In [13]:
def cor_selector(X, y, num_feats):
    cor_list = []
    feature_name = X.columns.tolist()
    # calculate the correlation with y for each feature
    for i in X.columns.tolist():
        cor = np.corrcoef(X[i], y)[0,1]
        cor_list.append(cor)
    # Replace NaN with 0
    cor_list = [0 if np.isnan(i) else i for i in cor_list]
    # Feature name
    cor_feature = X.iloc[:, np.argsort(np.abs(cor_list))[-num_feats:]].columns.tolist()
    # Feature selection? 0 for not select, 1 for select
    cor_support = [True if i in cor_feature else False for i in feature_name]
    return cor_support, cor_feature
cor_support, cor_feature = cor_selector(df, y, num_feats)
print(str(len(cor_feature)), 'selected features')
    

50 selected features


  c /= stddev[:, None]
  c /= stddev[None, :]


In [14]:
cor_feature

['pd_avg_interceptions',
 'ld_plays',
 'ld_opp_outcome_fumble_lost',
 'def_lolb',
 'Dome',
 'off_hb',
 'pd_average_plays',
 'ld_outcome_punt',
 'off_qb',
 'game_seconds_remaining',
 'play_type_pass',
 'air_yards',
 'off_lt',
 'ld_opp_outcome_interception',
 'pd_pass_completion_pct',
 'ld_opp_outcome_no_ld',
 'qb_scramble',
 'game_half',
 'play_type_run',
 'pd_average_sacks',
 'ld_outcome_turnover_on_downs',
 'ld_outcome_no_ld',
 'off_wr',
 'ld_opp_outcome_field_goal',
 'play_type_qb_spike',
 'qb_spike',
 'pd_run_percentage',
 'pd_average_points',
 'ld_opp_outcome_touchdown',
 'pd_passer_rating',
 'ld_expl_run',
 'down',
 'play_end_total_ydstogo',
 'ep',
 'pd_expl_run',
 'ld_start_yds_to_go',
 'ld_opp_outcome_turnover_on_downs',
 'ydsnet',
 'sack',
 'play_end_first_down_ydstogo',
 'no_huddle',
 'ydstogo',
 'yards_after_catch',
 'shotgun',
 'def_team_pts_scored_vs_pace',
 'pts_scored_vs_pace',
 'adj_wp',
 'score_differential',
 'yards_gained',
 'epa']

# Chi-Squared (Classification)

Use calculated Chi-Squared to find feature set with the best scores. Only use for classification problems!

In [22]:
if classification_toggle == 1:
    from sklearn.feature_selection import SelectKBest, chi2
    
    chi_selector = SelectKBest(chi2, k = num_feats)
    chi_selector.fit(df2, y)
    chi_support = chi_selector.get_support()
    chi_feature = df.loc[:,chi_support].columns.tolist()
    print(str(len(chi_feature)), 'selected features')
else:
    None

50 selected features


In [24]:
chi_feature

['game_seconds_remaining',
 'game_half',
 'down',
 'ydstogo',
 'ydsnet',
 'yards_gained',
 'shotgun',
 'no_huddle',
 'qb_dropback',
 'qb_spike',
 'qb_scramble',
 'score_differential',
 'ep',
 'epa',
 'sack',
 'fumble',
 'Snow',
 'Dome',
 'off_lt',
 'off_qb',
 'pts_scored_vs_pace',
 'def_team_pts_scored_vs_pace',
 'adj_wp',
 'play_end_first_down_ydstogo',
 'play_end_total_ydstogo',
 'ld_expl_run',
 'ld_expl_pass',
 'ld_start_yds_to_go',
 'pd_expl_run',
 'pd_average_points',
 'pd_average_sacks',
 'pd_avg_interceptions',
 'pd_passer_rating',
 'pd_run_percentage',
 'pd_pass_completion_pct',
 'play_type_pass',
 'play_type_qb_spike',
 'play_type_run',
 'ld_outcome_fumble_lost',
 'ld_outcome_interception',
 'ld_outcome_no_ld',
 'ld_outcome_punt',
 'ld_outcome_touchdown',
 'ld_outcome_turnover_on_downs',
 'ld_opp_outcome_field_goal',
 'ld_opp_outcome_fumble_lost',
 'ld_opp_outcome_interception',
 'ld_opp_outcome_no_ld',
 'ld_opp_outcome_touchdown',
 'ld_opp_outcome_turnover_on_downs']

# F-Test (Regression)

Least Squares F-Test. Essentially, run a linear regression and test model and feature significance

In [32]:
from sklearn.feature_selection import f_regression, f_classif

if classification_toggle == 0:
    f_selector = SelectKBest(f_regression, k = num_feats)
    f_selector.fit(df,y)
    f_support = f_selector.get_support()
    f_feature = df.loc[:,f_support].columns.tolist()
    print(str(len(f_feature)), 'selected features')
else:
    X_indices = np.arange(df.shape[-1])
    f_selector = SelectKBest(f_classif, k = num_feats)
    f_selector.fit(df, y)
    f_support = f_selector.get_support()
    f_feature = df.loc[:,f_support].columns.tolist()
    print(str(len(f_feature)), 'selected features')


50 selected features


  f = msb / msw


In [33]:
f_feature

['game_seconds_remaining',
 'game_half',
 'down',
 'ydstogo',
 'ydsnet',
 'yards_gained',
 'shotgun',
 'no_huddle',
 'qb_spike',
 'qb_scramble',
 'air_yards',
 'yards_after_catch',
 'score_differential',
 'ep',
 'epa',
 'sack',
 'Dome',
 'off_hb',
 'off_lt',
 'off_qb',
 'off_wr',
 'def_lolb',
 'pts_scored_vs_pace',
 'def_team_pts_scored_vs_pace',
 'adj_wp',
 'play_end_first_down_ydstogo',
 'play_end_total_ydstogo',
 'ld_plays',
 'ld_expl_run',
 'ld_start_yds_to_go',
 'pd_expl_run',
 'pd_average_points',
 'pd_average_plays',
 'pd_average_sacks',
 'pd_avg_interceptions',
 'pd_passer_rating',
 'pd_run_percentage',
 'pd_pass_completion_pct',
 'play_type_pass',
 'play_type_qb_spike',
 'play_type_run',
 'ld_outcome_no_ld',
 'ld_outcome_punt',
 'ld_outcome_turnover_on_downs',
 'ld_opp_outcome_field_goal',
 'ld_opp_outcome_fumble_lost',
 'ld_opp_outcome_interception',
 'ld_opp_outcome_no_ld',
 'ld_opp_outcome_touchdown',
 'ld_opp_outcome_turnover_on_downs']

# Mutual Information

Measures the dependence of one variable to another. Has relative advantage over F-Test because it can contend with non-linear relationships

In [34]:
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif

if classification_toggle == 0:
    mutual_selector = SelectKBest(mutual_info_regression, k = num_feats)
    mutual_selector.fit(df, y)
    mutual_support = mutual_selector.get_support()
    mutual_feature = df.loc[:, mutual_support].columns.tolist()
    print(str(len(mutual_feature)), 'selected features')
else:
    mutual_selector = SelectKBest(mutual_info_classif, k = num_feats)
    mutual_selector.fit(df, y)
    mutual_support = mutual_selector.get_support()
    mutual_feature = df.loc[:, mutual_support].columns.tolist()
    print(str(len(mutual_feature)), 'selected features')

50 selected features


In [35]:
mutual_feature

['game_seconds_remaining',
 'game_half',
 'qtr',
 'down',
 'ydstogo',
 'ydsnet',
 'yards_gained',
 'shotgun',
 'no_huddle',
 'qb_dropback',
 'yards_after_catch',
 'score_differential',
 'ep',
 'epa',
 'sack',
 'Rain',
 'Wind',
 'Dome',
 'GameMonth',
 'pts_scored_vs_pace',
 'def_team_pts_scored_vs_pace',
 'adj_wp',
 'ld_expl_run',
 'ld_expl_pass',
 'pd_expl_run',
 'pd_average_points',
 'pd_average_top',
 'pd_average_sacks',
 'pd_average_tfl',
 'pd_avg_interceptions',
 'pd_passer_rating',
 'pd_run_percentage',
 'pd_pass_yard_att',
 'pd_rush_yard_att',
 'home',
 'play_type_pass',
 'play_type_run',
 'ld_outcome_field_goal',
 'ld_outcome_fumble_lost',
 'ld_outcome_interception',
 'ld_outcome_no_ld',
 'ld_outcome_punt',
 'ld_outcome_touchdown',
 'ld_opp_outcome_end_of_half',
 'ld_opp_outcome_field_goal',
 'ld_opp_outcome_fumble_lost',
 'ld_opp_outcome_no_ld',
 'ld_opp_outcome_punt',
 'ld_opp_outcome_touchdown',
 'ld_opp_outcome_turnover_on_downs']

# Variance Threshold

Variance threshold identifies features with variations below a certain cutoff point. The intuition is - if a feature doesn't vary too much, we assume it doesn't have a lot of predictive power.

NOTE: This metric does not consider relationship of the feature and the target variable

In [87]:
from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold()
selector.fit(df)

variance_selectors = selector.get_support()
variance_selectors
variance_features = list(df.columns[np.array(variance_selectors).astype(bool)])
variance_features

['yardline_100',
 'game_seconds_remaining',
 'game_half',
 'qtr',
 'down',
 'ydstogo',
 'ydsnet',
 'yards_gained',
 'shotgun',
 'no_huddle',
 'qb_dropback',
 'qb_spike',
 'qb_scramble',
 'air_yards',
 'yards_after_catch',
 'score_differential',
 'ep',
 'epa',
 'sack',
 'fumble',
 'Rain',
 'Snow',
 'Wind',
 'Fog',
 'Dome',
 'GameMonth',
 'off_c',
 'off_hb',
 'off_lg',
 'off_lt',
 'off_qb',
 'off_rg',
 'off_rt',
 'off_te',
 'off_wr',
 'def_cb',
 'def_dt',
 'def_fs',
 'def_le',
 'def_lolb',
 'def_mlb',
 'def_re',
 'def_rolb',
 'def_ss',
 'pts_scored_vs_pace',
 'def_team_pts_scored_vs_pace',
 'adj_wp',
 'play_end_first_down_ydstogo',
 'play_end_total_ydstogo',
 'ld_plays',
 'ld_drive_length',
 'ld_expl_run',
 'ld_expl_pass',
 'ld_start_yds_to_go',
 'pd_expl_pass',
 'pd_expl_run',
 'pd_average_points',
 'pd_average_plays',
 'pd_average_top',
 'pd_average_sacks',
 'pd_average_tfl',
 'pd_avg_interceptions',
 'pd_passer_rating',
 'pd_run_percentage',
 'pd_pass_yard_att',
 'pd_rush_yard_att',
 

In [88]:
print(len(variance_features))

89


In [89]:
tups = []

for c in enumerate(df.columns):
    tups.append((c[1], selector.variances_[c[0]]))
    
tups

[('yardline_100', 0.9999999999999997),
 ('game_seconds_remaining', 1.0000000000000002),
 ('game_half', 1.0000000000000007),
 ('qtr', 0.9999999999999997),
 ('down', 0.9999999999999992),
 ('ydstogo', 1.0000000000000002),
 ('ydsnet', 1.0),
 ('yards_gained', 0.9999999999999999),
 ('shotgun', 1.0),
 ('no_huddle', 0.9999999999999999),
 ('qb_dropback', 1.0000000000000007),
 ('qb_spike', 1.000000000000001),
 ('qb_scramble', 1.0),
 ('air_yards', 1.0),
 ('yards_after_catch', 1.0000000000000002),
 ('score_differential', 1.0),
 ('ep', 1.0),
 ('epa', 1.0),
 ('sack', 1.0000000000000002),
 ('fumble', 1.0000000000000007),
 ('Rain', 1.0),
 ('Snow', 1.0000000000000002),
 ('Wind', 0.9999999999999999),
 ('Fog', 1.0000000000000007),
 ('Dome', 0.9999999999999997),
 ('GameMonth', 1.0),
 ('off_c', 1.0000000000000002),
 ('off_hb', 0.9999999999999996),
 ('off_lg', 1.0),
 ('off_lt', 1.0000000000000002),
 ('off_qb', 1.0),
 ('off_rg', 1.0000000000000002),
 ('off_rt', 1.0),
 ('off_te', 1.0000000000000002),
 ('off_w

# Recursive Feature Elimination

Eliminates worst performing features one after another until a best subset is known

In [43]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import GradientBoostingClassifier

clf=GradientBoostingClassifier(n_estimators = 800, 
                               loss = 'exponential', 
                               learning_rate = 0.01, 
                               max_depth = 7, 
                               min_samples_split = 50, 
                               min_samples_leaf = 64,
                               max_features = 'auto')
rfe_selector = RFE(estimator = clf, n_features_to_select = num_feats, step = 10, verbose = 5)
rfe_selector.fit(df, y)

Fitting estimator with 90 features.
Fitting estimator with 80 features.
Fitting estimator with 70 features.
Fitting estimator with 60 features.


RFE(estimator=GradientBoostingClassifier(learning_rate=0.01, loss='exponential',
                                         max_depth=7, max_features='auto',
                                         min_samples_leaf=64,
                                         min_samples_split=50,
                                         n_estimators=800),
    n_features_to_select=50, step=10, verbose=5)

In [45]:
rfe_support = rfe_selector.get_support()
rfe_feature = df.loc[:,rfe_support].columns.tolist()
print(str(len(rfe_feature)), 'selected features')

50 selected features


In [46]:
rfe_feature

['yardline_100',
 'game_seconds_remaining',
 'game_half',
 'qtr',
 'down',
 'ydstogo',
 'ydsnet',
 'yards_gained',
 'shotgun',
 'no_huddle',
 'qb_dropback',
 'air_yards',
 'score_differential',
 'ep',
 'epa',
 'off_c',
 'off_hb',
 'off_lg',
 'off_lt',
 'off_qb',
 'off_rg',
 'off_rt',
 'off_te',
 'off_wr',
 'def_cb',
 'def_dt',
 'def_fs',
 'def_le',
 'def_lolb',
 'def_mlb',
 'def_re',
 'def_rolb',
 'def_ss',
 'pts_scored_vs_pace',
 'def_team_pts_scored_vs_pace',
 'adj_wp',
 'play_end_first_down_ydstogo',
 'play_end_total_ydstogo',
 'ld_drive_length',
 'ld_start_yds_to_go',
 'pd_expl_run',
 'pd_average_points',
 'pd_average_plays',
 'pd_average_top',
 'pd_passer_rating',
 'pd_run_percentage',
 'pd_pass_yard_att',
 'pd_rush_yard_att',
 'pd_pass_completion_pct',
 'play_type_run']

# Sequential Feature Selector

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

knn = KNeighborsClassifier(n_neighbors=3)

# Selection Forward Selection
sfs1 = SFS(knn, 
           k_features= num_feats, 
           forward=True, 
           floating=False, 
           verbose=2,
           scoring='accuracy',
           cv=0)

sfs1 = sfs1.fit(df, y)

sfs1.k_feature_idx_

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.4s remaining:    0.0s


In [None]:
sfs1.k_feature_names_

In [None]:
sfs1.k_score_

# Embedded Methods

## Lasso Regression

Put an L1 penalty to squeeze out unimportant features

In [90]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

embedded_lr_selector = SelectFromModel(LassoCV(), max_features = num_feats)
embedded_lr_selector.fit(df, y)

SelectFromModel(estimator=LassoCV(), max_features=50)

In [91]:
embedded_lr_support = embedded_lr_selector.get_support()
embedded_lr_feature = df.loc[:, embedded_lr_support].columns.tolist()
print(str(len(embedded_lr_feature)), 'selected features')

50 selected features


In [92]:
embedded_lr_feature

['yardline_100',
 'game_seconds_remaining',
 'game_half',
 'qtr',
 'down',
 'ydstogo',
 'yards_gained',
 'shotgun',
 'no_huddle',
 'qb_dropback',
 'qb_spike',
 'air_yards',
 'yards_after_catch',
 'score_differential',
 'ep',
 'epa',
 'sack',
 'fumble',
 'Rain',
 'Snow',
 'Dome',
 'off_c',
 'off_lt',
 'off_qb',
 'off_rt',
 'off_wr',
 'def_cb',
 'def_dt',
 'def_mlb',
 'pts_scored_vs_pace',
 'adj_wp',
 'play_end_total_ydstogo',
 'ld_plays',
 'ld_drive_length',
 'ld_expl_run',
 'ld_expl_pass',
 'ld_start_yds_to_go',
 'pd_expl_run',
 'pd_average_points',
 'pd_average_top',
 'pd_passer_rating',
 'pd_run_percentage',
 'pd_pass_completion_pct',
 'home',
 'play_type_qb_spike',
 'ld_outcome_turnover_on_downs',
 'ld_opp_outcome_fumble_lost',
 'ld_opp_outcome_interception',
 'ld_opp_outcome_touchdown',
 'ld_opp_outcome_turnover_on_downs']

## Random Forest

In [52]:
from sklearn.ensemble import RandomForestClassifier

embedded_clf_selector = SelectFromModel(RandomForestClassifier(n_estimators = 100))
embedded_clf_selector.fit(df, y)

embedded_clf_support = embedded_clf_selector.get_support()
embedded_clf_feature = df.loc[:, embedded_clf_support].columns.tolist()
print(str(len(embedded_clf_feature)), 'selected features')

42 selected features


In [53]:
embedded_clf_feature

['yardline_100',
 'game_seconds_remaining',
 'down',
 'ydstogo',
 'ydsnet',
 'yards_gained',
 'air_yards',
 'score_differential',
 'ep',
 'epa',
 'off_c',
 'off_hb',
 'off_lg',
 'off_lt',
 'off_qb',
 'off_rg',
 'off_rt',
 'off_te',
 'off_wr',
 'def_cb',
 'def_dt',
 'def_fs',
 'def_le',
 'def_lolb',
 'def_mlb',
 'def_re',
 'def_rolb',
 'def_ss',
 'pts_scored_vs_pace',
 'def_team_pts_scored_vs_pace',
 'adj_wp',
 'play_end_first_down_ydstogo',
 'play_end_total_ydstogo',
 'ld_drive_length',
 'ld_start_yds_to_go',
 'pd_average_plays',
 'pd_average_top',
 'pd_passer_rating',
 'pd_run_percentage',
 'pd_pass_yard_att',
 'pd_rush_yard_att',
 'pd_pass_completion_pct']

## LightGBM

In [54]:
from lightgbm.sklearn import LGBMClassifier

In [56]:
lgbc = LGBMClassifier()

embedded_lgb_selector = SelectFromModel(lgbc, max_features = num_feats)
embedded_lgb_selector.fit(df,y)

embedded_lgb_support = embedded_lgb_selector.get_support()
embedded_lgb_feature = df.loc[:,  embedded_lgb_support].columns.tolist()
print(str(len(embedded_lgb_feature)), 'selected features')

23 selected features


In [57]:
embedded_lgb_feature

['yardline_100',
 'game_seconds_remaining',
 'down',
 'ydstogo',
 'ydsnet',
 'yards_gained',
 'shotgun',
 'qb_dropback',
 'score_differential',
 'ep',
 'epa',
 'off_c',
 'off_lt',
 'off_qb',
 'off_wr',
 'pts_scored_vs_pace',
 'def_team_pts_scored_vs_pace',
 'adj_wp',
 'play_end_first_down_ydstogo',
 'play_end_total_ydstogo',
 'pd_passer_rating',
 'pd_run_percentage',
 'pd_rush_yard_att']

# CatBoost

In [62]:
from catboost import CatBoostClassifier

cbr = CatBoostClassifier()

embedded_cbr_selector = SelectFromModel(cbr)
embedded_cbr_selector.fit(df, y)

embedded_cbr_support = embedded_cbr_selector.get_support()
embedded_cbr_feature = df.loc[:,  embedded_cbr_support].columns.tolist()
print(str(len(embedded_cbr_feature)), 'selected features')

Learning rate set to 0.097862
0:	learn: 0.6773633	total: 201ms	remaining: 3m 20s
1:	learn: 0.6650804	total: 235ms	remaining: 1m 57s
2:	learn: 0.6544021	total: 267ms	remaining: 1m 28s
3:	learn: 0.6455472	total: 300ms	remaining: 1m 14s
4:	learn: 0.6382975	total: 333ms	remaining: 1m 6s
5:	learn: 0.6326226	total: 362ms	remaining: 60s
6:	learn: 0.6277659	total: 390ms	remaining: 55.3s
7:	learn: 0.6231966	total: 421ms	remaining: 52.2s
8:	learn: 0.6194382	total: 452ms	remaining: 49.8s
9:	learn: 0.6161917	total: 478ms	remaining: 47.3s
10:	learn: 0.6135855	total: 505ms	remaining: 45.4s
11:	learn: 0.6112632	total: 535ms	remaining: 44s
12:	learn: 0.6077778	total: 561ms	remaining: 42.6s
13:	learn: 0.6055598	total: 589ms	remaining: 41.5s
14:	learn: 0.6037437	total: 618ms	remaining: 40.6s
15:	learn: 0.6022806	total: 647ms	remaining: 39.8s
16:	learn: 0.6008801	total: 676ms	remaining: 39.1s
17:	learn: 0.5990918	total: 704ms	remaining: 38.4s
18:	learn: 0.5980844	total: 729ms	remaining: 37.6s
19:	learn: 

160:	learn: 0.5630827	total: 5.24s	remaining: 27.3s
161:	learn: 0.5629656	total: 5.27s	remaining: 27.3s
162:	learn: 0.5628966	total: 5.3s	remaining: 27.2s
163:	learn: 0.5628217	total: 5.33s	remaining: 27.2s
164:	learn: 0.5627517	total: 5.37s	remaining: 27.2s
165:	learn: 0.5626414	total: 5.4s	remaining: 27.2s
166:	learn: 0.5625251	total: 5.44s	remaining: 27.1s
167:	learn: 0.5624485	total: 5.47s	remaining: 27.1s
168:	learn: 0.5623516	total: 5.5s	remaining: 27s
169:	learn: 0.5622966	total: 5.53s	remaining: 27s
170:	learn: 0.5622198	total: 5.56s	remaining: 26.9s
171:	learn: 0.5621122	total: 5.59s	remaining: 26.9s
172:	learn: 0.5620165	total: 5.63s	remaining: 26.9s
173:	learn: 0.5619490	total: 5.66s	remaining: 26.9s
174:	learn: 0.5618866	total: 5.68s	remaining: 26.8s
175:	learn: 0.5617956	total: 5.71s	remaining: 26.7s
176:	learn: 0.5617231	total: 5.74s	remaining: 26.7s
177:	learn: 0.5616464	total: 5.77s	remaining: 26.6s
178:	learn: 0.5615714	total: 5.8s	remaining: 26.6s
179:	learn: 0.561515

323:	learn: 0.5510936	total: 10.5s	remaining: 22s
324:	learn: 0.5510316	total: 10.6s	remaining: 21.9s
325:	learn: 0.5510161	total: 10.6s	remaining: 21.9s
326:	learn: 0.5509491	total: 10.6s	remaining: 21.8s
327:	learn: 0.5508833	total: 10.6s	remaining: 21.8s
328:	learn: 0.5508209	total: 10.7s	remaining: 21.8s
329:	learn: 0.5507486	total: 10.7s	remaining: 21.7s
330:	learn: 0.5506807	total: 10.7s	remaining: 21.7s
331:	learn: 0.5506138	total: 10.8s	remaining: 21.6s
332:	learn: 0.5505693	total: 10.8s	remaining: 21.6s
333:	learn: 0.5505205	total: 10.8s	remaining: 21.6s
334:	learn: 0.5504633	total: 10.9s	remaining: 21.5s
335:	learn: 0.5503868	total: 10.9s	remaining: 21.5s
336:	learn: 0.5503320	total: 10.9s	remaining: 21.5s
337:	learn: 0.5502875	total: 11s	remaining: 21.5s
338:	learn: 0.5502366	total: 11s	remaining: 21.4s
339:	learn: 0.5501629	total: 11s	remaining: 21.4s
340:	learn: 0.5501062	total: 11.1s	remaining: 21.4s
341:	learn: 0.5500344	total: 11.1s	remaining: 21.3s
342:	learn: 0.549966

485:	learn: 0.5420018	total: 15.6s	remaining: 16.5s
486:	learn: 0.5419444	total: 15.7s	remaining: 16.5s
487:	learn: 0.5418696	total: 15.7s	remaining: 16.5s
488:	learn: 0.5418283	total: 15.7s	remaining: 16.4s
489:	learn: 0.5418174	total: 15.7s	remaining: 16.4s
490:	learn: 0.5417700	total: 15.8s	remaining: 16.4s
491:	learn: 0.5417151	total: 15.8s	remaining: 16.3s
492:	learn: 0.5416746	total: 15.8s	remaining: 16.3s
493:	learn: 0.5416233	total: 15.9s	remaining: 16.3s
494:	learn: 0.5415673	total: 15.9s	remaining: 16.2s
495:	learn: 0.5415050	total: 15.9s	remaining: 16.2s
496:	learn: 0.5414642	total: 16s	remaining: 16.2s
497:	learn: 0.5414229	total: 16s	remaining: 16.1s
498:	learn: 0.5413693	total: 16s	remaining: 16.1s
499:	learn: 0.5413238	total: 16.1s	remaining: 16.1s
500:	learn: 0.5412802	total: 16.1s	remaining: 16s
501:	learn: 0.5412212	total: 16.1s	remaining: 16s
502:	learn: 0.5411611	total: 16.2s	remaining: 16s
503:	learn: 0.5410753	total: 16.2s	remaining: 15.9s
504:	learn: 0.5410340	to

645:	learn: 0.5340439	total: 20.7s	remaining: 11.4s
646:	learn: 0.5339775	total: 20.8s	remaining: 11.3s
647:	learn: 0.5339325	total: 20.8s	remaining: 11.3s
648:	learn: 0.5338717	total: 20.8s	remaining: 11.3s
649:	learn: 0.5338323	total: 20.9s	remaining: 11.2s
650:	learn: 0.5337895	total: 20.9s	remaining: 11.2s
651:	learn: 0.5337539	total: 20.9s	remaining: 11.2s
652:	learn: 0.5337094	total: 21s	remaining: 11.1s
653:	learn: 0.5336705	total: 21s	remaining: 11.1s
654:	learn: 0.5336336	total: 21s	remaining: 11.1s
655:	learn: 0.5335755	total: 21s	remaining: 11s
656:	learn: 0.5335377	total: 21.1s	remaining: 11s
657:	learn: 0.5334954	total: 21.1s	remaining: 11s
658:	learn: 0.5334313	total: 21.1s	remaining: 10.9s
659:	learn: 0.5333802	total: 21.2s	remaining: 10.9s
660:	learn: 0.5333299	total: 21.2s	remaining: 10.9s
661:	learn: 0.5332828	total: 21.2s	remaining: 10.8s
662:	learn: 0.5332309	total: 21.3s	remaining: 10.8s
663:	learn: 0.5331806	total: 21.3s	remaining: 10.8s
664:	learn: 0.5331240	tota

804:	learn: 0.5263407	total: 25.8s	remaining: 6.24s
805:	learn: 0.5263069	total: 25.8s	remaining: 6.21s
806:	learn: 0.5262737	total: 25.8s	remaining: 6.17s
807:	learn: 0.5262369	total: 25.9s	remaining: 6.14s
808:	learn: 0.5262042	total: 25.9s	remaining: 6.11s
809:	learn: 0.5261641	total: 25.9s	remaining: 6.08s
810:	learn: 0.5261091	total: 25.9s	remaining: 6.05s
811:	learn: 0.5260639	total: 26s	remaining: 6.01s
812:	learn: 0.5260354	total: 26s	remaining: 5.98s
813:	learn: 0.5259879	total: 26s	remaining: 5.95s
814:	learn: 0.5259533	total: 26.1s	remaining: 5.92s
815:	learn: 0.5259146	total: 26.1s	remaining: 5.88s
816:	learn: 0.5258529	total: 26.1s	remaining: 5.85s
817:	learn: 0.5258044	total: 26.2s	remaining: 5.82s
818:	learn: 0.5257726	total: 26.2s	remaining: 5.79s
819:	learn: 0.5257340	total: 26.2s	remaining: 5.75s
820:	learn: 0.5256953	total: 26.3s	remaining: 5.72s
821:	learn: 0.5256381	total: 26.3s	remaining: 5.69s
822:	learn: 0.5255878	total: 26.3s	remaining: 5.66s
823:	learn: 0.5255

963:	learn: 0.5196326	total: 30.9s	remaining: 1.15s
964:	learn: 0.5195996	total: 30.9s	remaining: 1.12s
965:	learn: 0.5195437	total: 30.9s	remaining: 1.09s
966:	learn: 0.5194983	total: 31s	remaining: 1.06s
967:	learn: 0.5194542	total: 31s	remaining: 1.02s
968:	learn: 0.5194128	total: 31s	remaining: 992ms
969:	learn: 0.5193575	total: 31s	remaining: 960ms
970:	learn: 0.5193043	total: 31.1s	remaining: 928ms
971:	learn: 0.5192586	total: 31.1s	remaining: 896ms
972:	learn: 0.5192293	total: 31.1s	remaining: 864ms
973:	learn: 0.5191847	total: 31.2s	remaining: 832ms
974:	learn: 0.5191547	total: 31.2s	remaining: 800ms
975:	learn: 0.5191239	total: 31.2s	remaining: 768ms
976:	learn: 0.5190748	total: 31.3s	remaining: 736ms
977:	learn: 0.5190365	total: 31.3s	remaining: 704ms
978:	learn: 0.5189948	total: 31.3s	remaining: 672ms
979:	learn: 0.5189525	total: 31.3s	remaining: 640ms
980:	learn: 0.5188972	total: 31.4s	remaining: 607ms
981:	learn: 0.5188461	total: 31.4s	remaining: 575ms
982:	learn: 0.518826

In [63]:
embedded_cbr_feature

['yardline_100',
 'game_seconds_remaining',
 'qtr',
 'down',
 'ydstogo',
 'ydsnet',
 'yards_gained',
 'shotgun',
 'qb_dropback',
 'score_differential',
 'ep',
 'epa',
 'pts_scored_vs_pace',
 'def_team_pts_scored_vs_pace',
 'adj_wp',
 'play_end_total_ydstogo',
 'pd_run_percentage']

# Summary Results

In [93]:
pd.set_option('display.max_rows', None)
# put all selections together
feature_selection_df = pd.DataFrame({'Feature': feature_name, 'Pearson Correlation':cor_support, 'F-Test':f_support, 
                                     'Chi-Squared' : chi_support, 'Mutual Information' : mutual_support, 
                                     'Variance Threshold': variance_selectors, 'LassoCV': embedded_lr_support,'RFE - GB': rfe_support, 
                                     'RandomForest': embedded_clf_support, 'CatBoost': embedded_cbr_support, 
                                     'LightGBM' : embedded_lgb_support})

# Count the selected times for each feature
feature_selection_df['Total'] = np.sum(feature_selection_df, axis = 1)

# Display the top 100
feature_selection_df = feature_selection_df.sort_values(['Total', 'Feature'] , ascending = False)
feature_selection_df.index = range(1, len(feature_selection_df) + 1)
feature_selection_df.head(100)

Unnamed: 0,Feature,Pearson Correlation,F-Test,Chi-Squared,Mutual Information,Variance Threshold,LassoCV,RFE - GB,RandomForest,CatBoost,LightGBM,Total
1,ydstogo,True,True,True,True,True,True,True,True,True,True,10
2,yards_gained,True,True,True,True,True,True,True,True,True,True,10
3,score_differential,True,True,True,True,True,True,True,True,True,True,10
4,pts_scored_vs_pace,True,True,True,True,True,True,True,True,True,True,10
5,pd_run_percentage,True,True,True,True,True,True,True,True,True,True,10
6,game_seconds_remaining,True,True,True,True,True,True,True,True,True,True,10
7,epa,True,True,True,True,True,True,True,True,True,True,10
8,ep,True,True,True,True,True,True,True,True,True,True,10
9,down,True,True,True,True,True,True,True,True,True,True,10
10,adj_wp,True,True,True,True,True,True,True,True,True,True,10
