# Load Packages

In [20]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as ss
from collections import Counter
import math
from scipy import stats
%matplotlib inline

# Feature Selection general package imports
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# We probably don't need gridlines, but if we do comment this line
sns.set(style='ticks')

flatui = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"]
flatui = sns.color_palette(flatui)

# Set Up Parameters

In [17]:
# If you want to hard limit features selected, or if feature selection methods require a max:
num_feats = 50

# Classification or Regression Togggle
classification_toggle = 1

# Read Data and Preprocess

In [21]:
# Read dataframe

data = pd.read_csv("../../data/clean/model_plays.csv", low_memory = False)

# Test for run and pass predictors. Comment out if we don't want this
data = data[((data['target'] == 'run') |  (data['target'] == 'pass'))]

# Remove labels
del data['play_id']
del data['game_id']

# Separate Target from dataset
#firstdata['target'] = data['target'].astype('category')
#target_cat_label = dict(enumerate(data.target.categories))

# Change categorical variables to numerical
data['target'] = data['target'].astype('category').cat.codes
# data['target'] = data['target'].map(target_cat_label)

target = data['target']
del data['target']
y = target

# Normalize features
scaler = StandardScaler()
df = pd.DataFrame(scaler.fit_transform(data), columns = data.columns)

# Min Max features
min_scaler = MinMaxScaler()
df2 = pd.DataFrame(min_scaler.fit_transform(data), columns = data.columns)

feature_name = list(df.columns)

# Filter Methods

Filter methods consider relationship between features and target variable

# Pearson Correlation

Pearson correlation usually works best with all numerical variables. Binary variables may (will) cause some issues, especially if there are more binary/categorical than numeric.

In [13]:
def cor_selector(X, y, num_feats):
    cor_list = []
    feature_name = X.columns.tolist()
    # calculate the correlation with y for each feature
    for i in X.columns.tolist():
        cor = np.corrcoef(X[i], y)[0,1]
        cor_list.append(cor)
    # Replace NaN with 0
    cor_list = [0 if np.isnan(i) else i for i in cor_list]
    # Feature name
    cor_feature = X.iloc[:, np.argsort(np.abs(cor_list))[-num_feats:]].columns.tolist()
    # Feature selection? 0 for not select, 1 for select
    cor_support = [True if i in cor_feature else False for i in feature_name]
    return cor_support, cor_feature
cor_support, cor_feature = cor_selector(df, y, num_feats)
print(str(len(cor_feature)), 'selected features')
    

50 selected features


  c /= stddev[:, None]
  c /= stddev[None, :]


In [14]:
cor_feature

['pd_avg_interceptions',
 'ld_plays',
 'ld_opp_outcome_fumble_lost',
 'def_lolb',
 'Dome',
 'off_hb',
 'pd_average_plays',
 'ld_outcome_punt',
 'off_qb',
 'game_seconds_remaining',
 'play_type_pass',
 'air_yards',
 'off_lt',
 'ld_opp_outcome_interception',
 'pd_pass_completion_pct',
 'ld_opp_outcome_no_ld',
 'qb_scramble',
 'game_half',
 'play_type_run',
 'pd_average_sacks',
 'ld_outcome_turnover_on_downs',
 'ld_outcome_no_ld',
 'off_wr',
 'ld_opp_outcome_field_goal',
 'play_type_qb_spike',
 'qb_spike',
 'pd_run_percentage',
 'pd_average_points',
 'ld_opp_outcome_touchdown',
 'pd_passer_rating',
 'ld_expl_run',
 'down',
 'play_end_total_ydstogo',
 'ep',
 'pd_expl_run',
 'ld_start_yds_to_go',
 'ld_opp_outcome_turnover_on_downs',
 'ydsnet',
 'sack',
 'play_end_first_down_ydstogo',
 'no_huddle',
 'ydstogo',
 'yards_after_catch',
 'shotgun',
 'def_team_pts_scored_vs_pace',
 'pts_scored_vs_pace',
 'adj_wp',
 'score_differential',
 'yards_gained',
 'epa']

# Chi-Squared (Classification)

Use calculated Chi-Squared to find feature set with the best scores. Only use for classification problems!

In [22]:
if classification_toggle == 1:
    from sklearn.feature_selection import SelectKBest, chi2
    
    chi_selector = SelectKBest(chi2, k = num_feats)
    chi_selector.fit(df2, y)
    chi_support = chi_selector.get_support()
    chi_feature = df.loc[:,chi_support].columns.tolist()
    print(str(len(chi_feature)), 'selected features')
else:
    None

50 selected features


In [24]:
chi_feature

['game_seconds_remaining',
 'game_half',
 'down',
 'ydstogo',
 'ydsnet',
 'yards_gained',
 'shotgun',
 'no_huddle',
 'qb_dropback',
 'qb_spike',
 'qb_scramble',
 'score_differential',
 'ep',
 'epa',
 'sack',
 'fumble',
 'Snow',
 'Dome',
 'off_lt',
 'off_qb',
 'pts_scored_vs_pace',
 'def_team_pts_scored_vs_pace',
 'adj_wp',
 'play_end_first_down_ydstogo',
 'play_end_total_ydstogo',
 'ld_expl_run',
 'ld_expl_pass',
 'ld_start_yds_to_go',
 'pd_expl_run',
 'pd_average_points',
 'pd_average_sacks',
 'pd_avg_interceptions',
 'pd_passer_rating',
 'pd_run_percentage',
 'pd_pass_completion_pct',
 'play_type_pass',
 'play_type_qb_spike',
 'play_type_run',
 'ld_outcome_fumble_lost',
 'ld_outcome_interception',
 'ld_outcome_no_ld',
 'ld_outcome_punt',
 'ld_outcome_touchdown',
 'ld_outcome_turnover_on_downs',
 'ld_opp_outcome_field_goal',
 'ld_opp_outcome_fumble_lost',
 'ld_opp_outcome_interception',
 'ld_opp_outcome_no_ld',
 'ld_opp_outcome_touchdown',
 'ld_opp_outcome_turnover_on_downs']

# F-Test (Regression)

Least Squares F-Test. Essentially, run a linear regression and test model and feature significance

In [32]:
from sklearn.feature_selection import f_regression, f_classif

if classification_toggle == 0:
    f_selector = SelectKBest(f_regression, k = num_feats)
    f_selector.fit(df,y)
    f_support = f_selector.get_support()
    f_feature = df.loc[:,f_support].columns.tolist()
    print(str(len(f_feature)), 'selected features')
else:
    X_indices = np.arange(df.shape[-1])
    f_selector = SelectKBest(f_classif, k = num_feats)
    f_selector.fit(df, y)
    f_support = f_selector.get_support()
    f_feature = df.loc[:,f_support].columns.tolist()
    print(str(len(f_feature)), 'selected features')


50 selected features


  f = msb / msw


In [33]:
f_feature

['game_seconds_remaining',
 'game_half',
 'down',
 'ydstogo',
 'ydsnet',
 'yards_gained',
 'shotgun',
 'no_huddle',
 'qb_spike',
 'qb_scramble',
 'air_yards',
 'yards_after_catch',
 'score_differential',
 'ep',
 'epa',
 'sack',
 'Dome',
 'off_hb',
 'off_lt',
 'off_qb',
 'off_wr',
 'def_lolb',
 'pts_scored_vs_pace',
 'def_team_pts_scored_vs_pace',
 'adj_wp',
 'play_end_first_down_ydstogo',
 'play_end_total_ydstogo',
 'ld_plays',
 'ld_expl_run',
 'ld_start_yds_to_go',
 'pd_expl_run',
 'pd_average_points',
 'pd_average_plays',
 'pd_average_sacks',
 'pd_avg_interceptions',
 'pd_passer_rating',
 'pd_run_percentage',
 'pd_pass_completion_pct',
 'play_type_pass',
 'play_type_qb_spike',
 'play_type_run',
 'ld_outcome_no_ld',
 'ld_outcome_punt',
 'ld_outcome_turnover_on_downs',
 'ld_opp_outcome_field_goal',
 'ld_opp_outcome_fumble_lost',
 'ld_opp_outcome_interception',
 'ld_opp_outcome_no_ld',
 'ld_opp_outcome_touchdown',
 'ld_opp_outcome_turnover_on_downs']

# Mutual Information

Measures the dependence of one variable to another. Has relative advantage over F-Test because it can contend with non-linear relationships

In [34]:
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif

if classification_toggle == 0:
    mutual_selector = SelectKBest(mutual_info_regression, k = num_feats)
    mutual_selector.fit(df, y)
    mutual_support = mutual_selector.get_support()
    mutual_feature = df.loc[:, mutual_support].columns.tolist()
    print(str(len(mutual_feature)), 'selected features')
else:
    mutual_selector = SelectKBest(mutual_info_classif, k = num_feats)
    mutual_selector.fit(df, y)
    mutual_support = mutual_selector.get_support()
    mutual_feature = df.loc[:, mutual_support].columns.tolist()
    print(str(len(mutual_feature)), 'selected features')

50 selected features


In [35]:
mutual_feature

['game_seconds_remaining',
 'game_half',
 'qtr',
 'down',
 'ydstogo',
 'ydsnet',
 'yards_gained',
 'shotgun',
 'no_huddle',
 'qb_dropback',
 'yards_after_catch',
 'score_differential',
 'ep',
 'epa',
 'sack',
 'Rain',
 'Wind',
 'Dome',
 'GameMonth',
 'pts_scored_vs_pace',
 'def_team_pts_scored_vs_pace',
 'adj_wp',
 'ld_expl_run',
 'ld_expl_pass',
 'pd_expl_run',
 'pd_average_points',
 'pd_average_top',
 'pd_average_sacks',
 'pd_average_tfl',
 'pd_avg_interceptions',
 'pd_passer_rating',
 'pd_run_percentage',
 'pd_pass_yard_att',
 'pd_rush_yard_att',
 'home',
 'play_type_pass',
 'play_type_run',
 'ld_outcome_field_goal',
 'ld_outcome_fumble_lost',
 'ld_outcome_interception',
 'ld_outcome_no_ld',
 'ld_outcome_punt',
 'ld_outcome_touchdown',
 'ld_opp_outcome_end_of_half',
 'ld_opp_outcome_field_goal',
 'ld_opp_outcome_fumble_lost',
 'ld_opp_outcome_no_ld',
 'ld_opp_outcome_punt',
 'ld_opp_outcome_touchdown',
 'ld_opp_outcome_turnover_on_downs']

# Variance Threshold

Variance threshold identifies features with variations below a certain cutoff point. The intuition is - if a feature doesn't vary too much, we assume it doesn't have a lot of predictive power.

NOTE: This metric does not consider relationship of the feature and the target variable

In [37]:
from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold()
selector.fit(df)

VarianceThreshold()

In [38]:
variance_selectors = selector.get_support()
variance_selectors
variance_features = list(df.columns[np.array(variance_selectors).astype(bool)])
variance_features

['yardline_100',
 'game_seconds_remaining',
 'game_half',
 'qtr',
 'down',
 'ydstogo',
 'ydsnet',
 'yards_gained',
 'shotgun',
 'no_huddle',
 'qb_dropback',
 'qb_spike',
 'qb_scramble',
 'air_yards',
 'yards_after_catch',
 'score_differential',
 'ep',
 'epa',
 'sack',
 'fumble',
 'Rain',
 'Snow',
 'Wind',
 'Fog',
 'Dome',
 'GameMonth',
 'off_c',
 'off_hb',
 'off_lg',
 'off_lt',
 'off_qb',
 'off_rg',
 'off_rt',
 'off_te',
 'off_wr',
 'def_cb',
 'def_dt',
 'def_fs',
 'def_le',
 'def_lolb',
 'def_mlb',
 'def_re',
 'def_rolb',
 'def_ss',
 'pts_scored_vs_pace',
 'def_team_pts_scored_vs_pace',
 'adj_wp',
 'play_end_first_down_ydstogo',
 'play_end_total_ydstogo',
 'ld_plays',
 'ld_drive_length',
 'ld_expl_run',
 'ld_expl_pass',
 'ld_start_yds_to_go',
 'pd_expl_pass',
 'pd_expl_run',
 'pd_average_points',
 'pd_average_plays',
 'pd_average_top',
 'pd_average_sacks',
 'pd_average_tfl',
 'pd_avg_interceptions',
 'pd_passer_rating',
 'pd_run_percentage',
 'pd_pass_yard_att',
 'pd_rush_yard_att',
 

In [39]:
tups = []

for c in enumerate(df.columns):
    tups.append((c[1], selector.variances_[c[0]]))
    
tups

[('yardline_100', 0.9999999999999997),
 ('game_seconds_remaining', 1.0000000000000002),
 ('game_half', 1.0000000000000007),
 ('qtr', 0.9999999999999997),
 ('down', 0.9999999999999992),
 ('ydstogo', 1.0000000000000002),
 ('ydsnet', 1.0),
 ('yards_gained', 0.9999999999999999),
 ('shotgun', 1.0),
 ('no_huddle', 0.9999999999999999),
 ('qb_dropback', 1.0000000000000007),
 ('qb_spike', 1.000000000000001),
 ('qb_scramble', 1.0),
 ('air_yards', 1.0),
 ('yards_after_catch', 1.0000000000000002),
 ('score_differential', 1.0),
 ('ep', 1.0),
 ('epa', 1.0),
 ('sack', 1.0000000000000002),
 ('fumble', 1.0000000000000007),
 ('Rain', 1.0),
 ('Snow', 1.0000000000000002),
 ('Wind', 0.9999999999999999),
 ('Fog', 1.0000000000000007),
 ('Dome', 0.9999999999999997),
 ('GameMonth', 1.0),
 ('off_c', 1.0000000000000002),
 ('off_hb', 0.9999999999999996),
 ('off_lg', 1.0),
 ('off_lt', 1.0000000000000002),
 ('off_qb', 1.0),
 ('off_rg', 1.0000000000000002),
 ('off_rt', 1.0),
 ('off_te', 1.0000000000000002),
 ('off_w

# Recursive Feature Elimination

Eliminates worst performing features one after another until a best subset is known

In [41]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import GradientBoostingClassifier

clf=GradientBoostingClassifier(n_estimators = 800, 
                               loss = 'exponential', 
                               learning_rate = 0.01, 
                               max_depth = 7, 
                               min_samples_split = 50, 
                               min_samples_leaf = 64,
                               max_features = 'auto')
rfe_selector = RFE(estimator = clf, n_features_to_select = num_feats, step = 10, verbose = 5)
rfe_selector.fit(x, y)