In [36]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import glob
import joblib
from xgboost import plot_importance
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

preds = '../outputs/predictions/xgboost_optuna_vals.csv'
test = '../data/processed/processed_all.csv'

df = pd.read_csv(test)
supp = pd.read_csv(preds)

df = df[df['season'] == 2023]
df = df[(df['down'] == 3) | (df['down'] == 4)]
df["log_ydstogo"] = np.log1p(df["ydstogo"])
df["log_yardline_100"] = np.log1p(df["yardline_100"])

data = df.merge(supp, left_index=True, right_on="orig_index").drop(columns=["Unnamed: 0", "orig_index"])

In [24]:
xgb_model = joblib.load('../models/xgboost_optuna.joblib')

In [25]:
preproc = xgb_model.named_steps["preproc"]
model = xgb_model.named_steps["model"]

feature_names = preproc.get_feature_names_out()
importances = model.feature_importances_

df_xgb = (
    pd.DataFrame({
        "feature": feature_names,
        "importance": importances
    })
    .sort_values("importance", ascending=False)
)

df_xgb

Unnamed: 0,feature,importance
4,remainder__ydstogo,0.149213
2,cat__offense_formation_UNDER_CENTER,0.062938
1,cat__offense_formation_SHOTGUN,0.040432
3,remainder__down,0.031186
13,remainder__DB,0.028048
9,remainder__WR,0.02729
5,remainder__yardline_100,0.026104
18,remainder__qtr,0.026078
17,remainder__total_line,0.02528
20,remainder__QB1,0.024302


In [26]:
rf_model = joblib.load('../models/random_forest_optuna.joblib')

rf = rf_model.named_steps["model"]
rf_importances = rf.feature_importances_
rf_features = rf_model.named_steps["preproc"].get_feature_names_out()

df_rf = pd.DataFrame({
    "feature": rf_features,
    "importance": rf_importances
}).sort_values("importance", ascending=False)

display(df_rf)

Unnamed: 0,feature,importance
4,remainder__ydstogo,0.34699
19,remainder__quarter_seconds_remaining,0.071807
5,remainder__yardline_100,0.067208
17,remainder__total_line,0.042687
15,remainder__score_differential,0.039319
16,remainder__spread,0.03851
20,remainder__QB1,0.030203
27,remainder__TE1,0.030072
31,remainder__pass_run_ratio,0.02746
22,remainder__RB1,0.026726


In [27]:
log_model = joblib.load('../models/logistic_optuna.joblib')

logreg = log_model.named_steps["model"]
coef = logreg.coef_[0]

features = log_model.named_steps["preproc"].get_feature_names_out()

df_log = pd.DataFrame({
    "feature": features,
    "coef": coef,
    "abs_coef": np.abs(coef)
}).sort_values("abs_coef", ascending=False)

display(df_log)

Unnamed: 0,feature,coef,abs_coef
0,num__ydstogo,-0.509458,0.509458
22,cat__offense_formation_UNDER_CENTER,0.149836,0.149836
21,cat__offense_formation_SHOTGUN,-0.092909,0.092909
4,num__total_line,0.083432,0.083432
5,num__QB1,0.064479,0.064479
1,num__yardline_100,0.048999,0.048999
11,num__WR3,0.036099,0.036099
8,num__RB2,0.033014,0.033014
13,num__TE2,0.032111,0.032111
15,num__pass_yards_season,-0.026466,0.026466


In [28]:
df_xgb = df_xgb[['feature','importance']].rename(columns={'importance':'xgb'})
df_rf  = df_rf[['feature','importance']].rename(columns={'importance':'rf'})
df_log = df_log[['feature','abs_coef']].rename(columns={'abs_coef':'logreg'})

In [29]:
df_all = (
    df_xgb
    .merge(df_rf, on='feature', how='outer')
    .merge(df_log, on='feature', how='outer')
)

In [30]:
df_all = df_all.fillna(0)

In [33]:
df_all = df_all.sort_values('rf', ascending=False)

In [34]:
display(df_all)

Unnamed: 0,feature,xgb,rf,logreg
54,remainder__ydstogo,0.149213,0.34699,0.0
46,remainder__quarter_seconds_remaining,0.023481,0.071807,0.0
53,remainder__yardline_100,0.026104,0.067208,0.0
52,remainder__total_line,0.02528,0.042687,0.0
49,remainder__score_differential,0.022726,0.039319,0.0
50,remainder__spread,0.022831,0.03851,0.0
27,remainder__QB1,0.024302,0.030203,0.0
33,remainder__TE1,0.023439,0.030072,0.0
43,remainder__pass_run_ratio,0.023307,0.02746,0.0
30,remainder__RB1,0.022087,0.026726,0.0


In [39]:
pass_mask = data['play_type'] == 'pass'

precision_pass = precision_score(
    data[pass_mask]['y_true'],
    data[pass_mask]['y_pred']
)

recall_pass = recall_score(
    data[pass_mask]['y_true'],
    data[pass_mask]['y_pred']
)

precision_pass, recall_pass

(0.5049283154121864, 0.47552742616033755)

In [40]:
pass_mask = data['play_type'] == 'run'

precision_pass = precision_score(
    data[pass_mask]['y_true'],
    data[pass_mask]['y_pred']
)

recall_pass = recall_score(
    data[pass_mask]['y_true'],
    data[pass_mask]['y_pred']
)

precision_pass, recall_pass

(0.6540880503144654, 0.8041237113402062)