Goal:  Select features for a play-level model that calculates the potential of each play call (e.g. pass, rush, field goal, etc.) based on the situation and power scores
    - X values are all the features we've gathered at the level of one record per game, where the target y = WIN/LOSS
    - Use stats and ML to select the features that impact wins and losses
    - we'll use these to create an offense-power and defense-power for each play in each  game
    - that should allow us to score each play call using the game situation together with those power scores

Input: nfl_ml_weekly_stats.parquet

Activity:
    - Extract performance data
    - convert ints to float
    - measure correlation against the WIN/LOSS target
    - Classify important features from XGBoost
    - Classify weights from a shallow neural net
    - review that they are all in agreement and pick the best from each analysis
    - Use SHAP to get the relative weights (importance) for each feature
    - Manually separate stats that are offense-related from those that are defense-related. For any given drive the offense and defense flip, and we want to be able to provide the stats for whichever team is on offense vs. the stats for the opposing team
    - take the weighted average of each feature * its weight from SHAP, and sum them all up to get a single power score
    - this produces a **power_scores** dataset with single offense_power score and defense_power score for each team, for each season and week
    - query a version of the **play_actions** table from the database that we'll use to input for a play calling model
    - merge the **power_scores** and **play_actions** datasets, so we have the correct offense and defense 'powers' scored for each play, depending on which team is offense and defense.



# imports

In [1]:
import os
import pandas as pd
import sys

sys.path.append(os.path.abspath("../src"))

In [2]:
import numpy as np
import xgboost as xgb
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

import warnings

warnings.filterwarnings('ignore')


In [3]:
from src import *

# set flags

In [4]:
# --- notebook-wide flags and names ---
DEBUG = False
SCHEMA = 'controls'
file_name = "nfl_ml_weekly_stats"

# --- resources looked up through get_config ---
db = database_loader.DatabaseLoader(get_config('connection_string'))
data_directory = get_config('data_directory')

# --- plotting defaults ---
plt.style.use('ggplot')


# load the NFL stats dataset

# perform feature selection

## prepare our data for feature selection
We'll have an X, and y set plus the original stats_df datasets, which we'll use going forward

In [5]:
%%time
# Read the play-by-play actions file from the configured data directory
# and preview the first rows.
full_path = os.path.join(data_directory, "pbp_actions.parquet")
pbp_actions_df = pd.read_parquet(path=full_path)
pbp_actions_df.head(n=5)

CPU times: user 43 ms, sys: 24.4 ms, total: 67.5 ms
Wall time: 41.3 ms


Unnamed: 0,row_id,season,week,game_id,drive,play_counter,posteam,posteam_score,posteam_score_post,defteam,...,defteam_score_post,point_differential,next_starting_score,down,ydstogo,yards_to_goal,game_seconds_remaining,action,yards_gained,points_gained
0,3,2016,1,2016_01_DET_IND,1.0,55.0,IND,0,0,DET,...,0.0,0.0,0.0,1.0,10.0,75,3600.0,pass,6.0,0
1,4,2016,1,2016_01_MIN_TEN,1.0,58.0,TEN,0,0,MIN,...,0.0,0.0,0.0,1.0,10.0,71,3594.0,rush,6.0,0
2,6,2016,1,2016_01_NYG_DAL,1.0,60.0,DAL,0,0,NYG,...,0.0,0.0,0.0,1.0,10.0,75,3600.0,rush,1.0,0
3,7,2016,1,2016_01_GB_JAX,1.0,62.0,JAX,0,0,GB,...,0.0,0.0,0.0,1.0,10.0,82,3596.0,rush,6.0,0
4,10,2016,1,2016_01_MIA_SEA,1.0,75.0,SEA,0,0,MIA,...,0.0,0.0,0.0,2.0,3.0,68,3569.0,pass,6.0,0


In [6]:
# Optional season filter, currently disabled:
# pbp_actions_df = pbp_actions_df.loc[(pbp_actions_df.season> 2016)]

# Relabel the defending-team code 'LA' as 'LAC' (modifies pbp_actions_df in place).
# NOTE(review): only `defteam` is relabelled; `posteam` is left as-is — confirm
# this matches the team codes used in the power-score files.
la_on_defense = pbp_actions_df['defteam'].eq('LA')
pbp_actions_df.loc[la_on_defense, 'defteam'] = 'LAC'


In [7]:
%%time
# Read the per-team, per-week offense power scores and preview the first rows.
full_path = os.path.join(data_directory, "offense_power.parquet")
offense_powers_df = pd.read_parquet(path=full_path)
offense_powers_df.head(n=5)

CPU times: user 2.26 ms, sys: 1.13 ms, total: 3.39 ms
Wall time: 2.39 ms


Unnamed: 0,team,season,week,offense_power
0,ARI,2016,1,28.751645
1,ARI,2016,2,38.010825
2,ARI,2016,3,18.23125
3,ARI,2016,4,28.217851
4,ARI,2016,5,33.784695


In [8]:
%%time
# Read the per-team, per-week defense power scores and preview the first rows.
full_path = os.path.join(data_directory, "defense_power.parquet")
defense_powers_df = pd.read_parquet(path=full_path)
defense_powers_df.head(n=5)

CPU times: user 2.03 ms, sys: 1.18 ms, total: 3.21 ms
Wall time: 2.17 ms


Unnamed: 0,team,season,week,defense_power
0,ARI,2016,1,16.028442
1,ARI,2016,2,17.571463
2,ARI,2016,3,19.256932
3,ARI,2016,4,12.996981
4,ARI,2016,5,21.221539


In [9]:
def drop_extras(df: pd.DataFrame):
    """Remove merge leftovers from ``df`` in place.

    Drops the ``team`` column (the right-hand join key left behind by the
    power-score merges) and every column whose name ends in ``_x`` or ``_y``
    (the default suffixes pandas adds to overlapping column names in a merge).

    Only columns that are actually present are dropped, so calling this on a
    frame without a ``team`` column (for example when a cell is re-run on an
    already-cleaned frame) is a no-op instead of raising ``KeyError``.

    Args:
        df: frame to clean; it is modified in place.

    Returns:
        None
    """
    drops = [
        col for col in df.columns
        if col == 'team' or str(col).endswith(('_x', '_y'))
    ]
    # Nothing to remove: leave the frame untouched.
    if drops:
        df.drop(columns=drops, inplace=True)


In [10]:

# Attach the offense power of the team in possession to every play.
# NOTE(review): this is an inner join, so plays with no matching
# (season, week, team) row in offense_powers_df are not kept — confirm intended.
df = pbp_actions_df.merge(
    offense_powers_df,
    how='inner',
    left_on=['season', 'week', 'posteam'],
    right_on=['season', 'week', 'team'],
)
df = df.drop_duplicates()
drop_extras(df)  # removes the right-hand 'team' key and any *_x / *_y columns
df.shape

(202773, 22)

In [11]:
# Attach the defense power of the defending team to every play.
# NOTE(review): this is an inner join, so plays with no matching
# (season, week, team) row in defense_powers_df are not kept — confirm intended.
df = df.merge(
    defense_powers_df,
    how='inner',
    left_on=['season', 'week', 'defteam'],
    right_on=['season', 'week', 'team'],
)
df = df.drop_duplicates()
drop_extras(df)  # removes the right-hand 'team' key and any *_x / *_y columns
df.shape

(202773, 23)

In [12]:
# Preview the merged frame, now carrying both offense_power and defense_power.
df.head(n=5)

Unnamed: 0,row_id,season,week,game_id,drive,play_counter,posteam,posteam_score,posteam_score_post,defteam,...,next_starting_score,down,ydstogo,yards_to_goal,game_seconds_remaining,action,yards_gained,points_gained,offense_power,defense_power
0,3,2016,1,2016_01_DET_IND,1.0,55.0,IND,0,0,DET,...,0.0,1.0,10.0,75,3600.0,pass,6.0,0,31.511925,16.006158
1,36,2016,1,2016_01_DET_IND,1.0,142.0,IND,0,0,DET,...,0.0,2.0,6.0,61,3454.0,rush,2.0,0,31.511925,16.006158
2,80,2016,1,2016_01_DET_IND,1.0,241.0,IND,0,0,DET,...,,3.0,15.0,51,3295.0,pass,3.0,0,31.511925,16.006158
3,199,2016,1,2016_01_DET_IND,3.0,532.0,IND,0,0,DET,...,0.0,1.0,10.0,75,2983.0,pass,0.0,0,31.511925,16.006158
4,219,2016,1,2016_01_DET_IND,3.0,577.0,IND,0,0,DET,...,,3.0,8.0,73,2902.0,pass,0.0,0,31.511925,16.006158


In [13]:
# Columns kept for the play-calling dataset, in output order.
play_call_columns = [
    'season', 'week', 'drive', 'play_counter',
    'posteam', 'defteam',
    'point_differential', 'down', 'ydstogo', 'yards_to_goal',
    'game_seconds_remaining',
    'action', 'yards_gained', 'points_gained',
    'defense_power', 'offense_power',
]
pbp_play_calls_df = df.loc[:, play_call_columns]
pbp_play_calls_df.head(n=5)

Unnamed: 0,season,week,drive,play_counter,posteam,defteam,point_differential,down,ydstogo,yards_to_goal,game_seconds_remaining,action,yards_gained,points_gained,defense_power,offense_power
0,2016,1,1.0,55.0,IND,DET,0.0,1.0,10.0,75,3600.0,pass,6.0,0,16.006158,31.511925
1,2016,1,1.0,142.0,IND,DET,0.0,2.0,6.0,61,3454.0,rush,2.0,0,16.006158,31.511925
2,2016,1,1.0,241.0,IND,DET,0.0,3.0,15.0,51,3295.0,pass,3.0,0,16.006158,31.511925
3,2016,1,3.0,532.0,IND,DET,-7.0,1.0,10.0,75,2983.0,pass,0.0,0,16.006158,31.511925
4,2016,1,3.0,577.0,IND,DET,-7.0,3.0,8.0,73,2902.0,pass,0.0,0,16.006158,31.511925


In [14]:
print("validate expected results from one drive")

# Pick out a single known drive: 2016, week 1, BAL in possession, drive 7.
drive_filter = (
    pbp_play_calls_df.season.eq(2016)
    & pbp_play_calls_df.week.eq(1)
    & pbp_play_calls_df.drive.eq(7)
    & pbp_play_calls_df.posteam.eq('BAL')
)
test_df = pbp_play_calls_df.loc[drive_filter].sort_values('play_counter')

# Expected totals for that drive.
assert test_df.points_gained.sum() == 7
assert len(test_df) == 5
assert test_df.yards_gained.sum() == 80

merged_shape = pbp_play_calls_df.shape
print(merged_shape)
test_df

validate expected results from one drive
(202773, 16)


Unnamed: 0,season,week,drive,play_counter,posteam,defteam,point_differential,down,ydstogo,yards_to_goal,game_seconds_remaining,action,yards_gained,points_gained,defense_power,offense_power
395,2016,1,7.0,1004.0,BAL,BUF,3.0,1.0,10.0,85,2579.0,pass,0.0,0,18.977319,31.920248
368,2016,1,7.0,1108.0,BAL,BUF,3.0,1.0,15.0,80,2492.0,rush,10.0,0,18.977319,31.920248
356,2016,1,7.0,1129.0,BAL,BUF,3.0,2.0,5.0,70,2457.0,pass,4.0,0,18.977319,31.920248
357,2016,1,7.0,1153.0,BAL,BUF,3.0,3.0,1.0,66,2422.0,pass,66.0,6,18.977319,31.920248
369,2016,1,7.0,1173.0,BAL,BUF,9.0,0.0,0.0,15,2411.0,extra_point,0.0,1,18.977319,31.920248


#### save features dataset

In [15]:
%%time
# Write the play-call dataset (with power scores) to the data directory.
full_path = os.path.join(data_directory, "nfl_pbp_play_calls.parquet")
pbp_play_calls_df.to_parquet(path=full_path)

CPU times: user 70.2 ms, sys: 9.45 ms, total: 79.6 ms
Wall time: 78.6 ms
