In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.metrics import precision_recall_curve
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import classification_report


# Pin the working directory to the parent of the notebook folder so that
# relative paths such as data/... resolve the same way on every run.
import os

# NOTEBOOK_DIR is captured only the first time this cell runs in a kernel;
# re-running the cell therefore never climbs more than one level up.
try:
    NOTEBOOK_DIR
except NameError:
    NOTEBOOK_DIR = os.getcwd()
    print('Notebook Directory Set:', NOTEBOOK_DIR)

os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))
print("Current working directory:", os.getcwd())

Notebook Directory Set: /shared_folder/eas_508_project/notebooks
Current working directory: /shared_folder/eas_508_project


In [2]:
# Load the play-by-play feature file. The path is relative to the current
# working directory.
pbp_path = 'data/interim/pbp/'

pbp_name = 'featured_11_12.csv'

# personnel_num is read with the pandas 'string' dtype instead of being inferred.
# low_memory=False makes pandas infer every other column's dtype from the whole
# file rather than chunk by chunk. The saved output of this cell shows a warning
# raised on this read_csv call (presumably the mixed-types DtypeWarning -- TODO
# confirm), which chunked inference is the usual cause of.
df = pd.read_csv(
    pbp_path + pbp_name,
    dtype={'personnel_num': 'string'},
    low_memory=False,
)
shape = df.shape
columns = df.columns

# Quick sanity summary of what was loaded.
print('✔✔✔ File Loaded\n')
print(f'Shape: {shape}\n')
print(f'Column Names: {list(columns)}\n')

df.head()

  df = pd.read_csv(pbp_path + pbp_name, dtype={'personnel_num': 'string'})


✔✔✔ File Loaded

Shape: (307576, 107)

Column Names: ['nflverse_game_id', 'play_id', 'possession_team', 'offense_formation', 'offense_personnel', 'defenders_in_box', 'defense_personnel', 'n_offense', 'n_defense', 'home_team', 'away_team', 'season_type', 'week', 'posteam', 'posteam_type', 'defteam', 'side_of_field', 'yardline_100', 'game_date', 'quarter_seconds_remaining', 'half_seconds_remaining', 'game_seconds_remaining', 'game_half', 'quarter_end', 'drive', 'sp', 'qtr', 'down', 'goal_to_go', 'yrdln', 'ydstogo', 'ydsnet', 'play_type', 'yards_gained', 'shotgun', 'no_huddle', 'run_location', 'run_gap', 'score_differential', 'no_score_prob', 'fg_prob', 'td_prob', 'ep', 'epa', 'wp', 'def_wp', 'home_wp', 'away_wp', 'wpa', 'third_down_converted', 'third_down_failed', 'fourth_down_converted', 'fourth_down_failed', 'penalty', 'field_goal_attempt', 'complete_pass', 'passing_yards', 'receiving_yards', 'rushing_yards', 'season', 'order_sequence', 'stadium', 'weather', 'nfl_api_id', 'fixed_drive'

Unnamed: 0,nflverse_game_id,play_id,possession_team,offense_formation,offense_personnel,defenders_in_box,defense_personnel,n_offense,n_defense,home_team,...,personnel_num,ydstosuccess,fp_success,smoothed_fp_success,fp_epa,f_success,smoothed_f_success,f_epa,yard_group,previous_success
0,2016_01_BUF_BAL,58,BAL,UNDER_CENTER,"2 RB, 1 TE, 2 WR",8.0,"4 DL, 3 LB, 4 DB",11,11,BAL,...,12,4.0,0.42623,0.426601,0.01953,0.429168,0.429205,0.010042,0to5,0.0
1,2016_01_BUF_BAL,85,BAL,SHOTGUN,"2 RB, 1 TE, 2 WR",6.0,"4 DL, 3 LB, 4 DB",11,11,BAL,...,12,2.4,0.4644,0.462669,0.057628,0.441437,0.441424,0.021173,0to5,1.0
2,2016_01_BUF_BAL,109,BAL,SHOTGUN,"2 RB, 1 TE, 2 WR",7.0,"4 DL, 3 LB, 4 DB",11,11,BAL,...,12,4.0,0.4644,0.462669,0.057628,0.441437,0.441424,0.021173,0to5,1.0
3,2016_01_BUF_BAL,130,BAL,UNDER_CENTER,"2 RB, 1 TE, 2 WR",7.0,"4 DL, 3 LB, 4 DB",11,11,BAL,...,12,6.0,0.42623,0.426601,0.01953,0.429168,0.429205,0.010042,5to10,0.0
4,2016_01_BUF_BAL,154,BAL,UNDER_CENTER,"1 RB, 2 TE, 2 WR",8.0,"4 DL, 3 LB, 4 DB",11,11,BAL,...,21,1.0,0.420136,0.420369,0.006901,0.429168,0.429205,0.010042,0to5,1.0


In [3]:
# Columns used for modelling. `features` deliberately includes the target so it
# travels with the predictors through one-hot encoding; it is separated out
# again when the data is split below.
features = ['down','yard_group','offense_formation','success']
cat_features = ['yard_group','offense_formation']
target = 'success'

# One-hot encode the categorical predictors. Selecting with a list of labels
# already produces a new frame, so no additional .copy() is required.
model_df = pd.get_dummies(data=df[features], columns=cat_features)

# 80/20 split, stratified on the target so both parts keep the same class
# balance. The target column is referenced through `target` everywhere so that
# changing it in one place is enough.
X_train, X_test, y_train, y_test = train_test_split(
    model_df.drop(columns=target),
    model_df[target],
    train_size=.8,
    random_state=42,
    stratify=model_df[target],
)

In [19]:
# K-nearest-neighbours baseline (k=50) using the Hamming distance.
model = KNeighborsClassifier(metric='hamming', n_neighbors=50)
model.fit(X_train, y_train)

# Hard labels from predict(), plus the per-class probability matrix.
y_preds = model.predict(X_test)
y_probs = model.predict_proba(X_test)

# Alternative labels: 1 wherever the second probability column exceeds 0.425.
second_class_prob = y_probs[:, 1]
y_probs_preds = (second_class_prob > .425).astype(int)

# First report: predict() labels. Second report: the 0.425 cut-off labels.
print(classification_report(y_test, y_preds))
print(classification_report(y_test, y_probs_preds))

              precision    recall  f1-score   support

         0.0       0.59      0.78      0.67     34651
         1.0       0.52      0.32      0.40     26865

    accuracy                           0.58     61516
   macro avg       0.56      0.55      0.53     61516
weighted avg       0.56      0.58      0.55     61516

              precision    recall  f1-score   support

         0.0       0.62      0.47      0.53     34651
         1.0       0.48      0.63      0.54     26865

    accuracy                           0.54     61516
   macro avg       0.55      0.55      0.54     61516
weighted avg       0.56      0.54      0.54     61516



In [None]:
# Same k=50 baseline, but with KNeighborsClassifier's default distance metric.
model = KNeighborsClassifier(n_neighbors=50).fit(X_train, y_train)

y_preds = model.predict(X_test)        # labels chosen by the classifier itself
y_probs = model.predict_proba(X_test)  # one probability column per class

# Re-label using a 0.425 cut-off on the second probability column.
y_probs_preds = (y_probs[:, 1] > .425).astype(int)

# Report for predict() labels first, then for the 0.425 cut-off labels.
print(classification_report(y_test, y_preds))
print(classification_report(y_test, y_probs_preds))

              precision    recall  f1-score   support

         0.0       0.59      0.78      0.67     34651
         1.0       0.52      0.32      0.40     26865

    accuracy                           0.58     61516
   macro avg       0.56      0.55      0.53     61516
weighted avg       0.56      0.58      0.55     61516

              precision    recall  f1-score   support

         0.0       0.62      0.47      0.53     34651
         1.0       0.48      0.63      0.54     26865

    accuracy                           0.54     61516
   macro avg       0.55      0.55      0.54     61516
weighted avg       0.56      0.54      0.54     61516



In [1]:
# Fit and evaluate a separate KNN model for each value of `down`, using the
# rows of the existing train/test split that belong to that down.
# Requires model_df, X_train, X_test, y_train and y_test from the split cell.
for down in np.sort(model_df['down'].unique()):
    # `down` is constant inside each subset, so it is removed from the predictors.
    idx_train = X_train['down'] == down
    temp_X_train = X_train[idx_train].drop(columns='down')
    temp_y_train = y_train[idx_train]

    idx_test = X_test['down'] == down
    temp_X_test = X_test[idx_test].drop(columns='down')
    temp_y_test = y_test[idx_test]

    temp = KNeighborsClassifier(n_neighbors=20, metric='hamming')
    temp.fit(temp_X_train, temp_y_train)

    # Labels as chosen by predict(), reported below as "Threshold=.5".
    temp_y_pred = temp.predict(temp_X_test)

    # Labels at a 0.45 cut-off on the second probability column, so the
    # "Threshold=.45" report is computed from actual predictions.
    temp_y_probs = temp.predict_proba(temp_X_test)[:,1]
    temp_y_pred_45 = (temp_y_probs > .45).astype(int)

    print(f'Down: {down}\n')
    # Mean of the test target for this down, i.e. the share of positive rows.
    print(f'Baseline: {temp_y_test.mean()}\n')
    print('Threshold=.5\n', classification_report(temp_y_test, temp_y_pred))
    print('Threshold=.45\n', classification_report(temp_y_test, temp_y_pred_45))
    print('\n')

NameError: name 'model_df' is not defined

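I don't think KNN is the right approach for this problem, but the cells below still check how the encoded feature values are distributed in the `down2` and `down1` subsets.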

In [28]:
# Show the value distribution of every column in down2, one column at a time.
for col_name in down2.columns:
    col_counts = down2[col_name].value_counts()
    print(col_counts)

yard_group_0to5
True    10930
Name: count, dtype: int64
yard_group_10to15
False    10930
Name: count, dtype: int64
yard_group_15to20
False    10930
Name: count, dtype: int64
yard_group_20toinf
False    10930
Name: count, dtype: int64
yard_group_5to10
False    10930
Name: count, dtype: int64
offense_formation_PISTOL
False    10930
Name: count, dtype: int64
offense_formation_SHOTGUN
True     6370
False    4560
Name: count, dtype: int64
offense_formation_UNDER_CENTER
False    6370
True     4560
Name: count, dtype: int64


In [27]:
# Show the value distribution of every column in down1, one column at a time.
for col_name in down1.columns:
    col_counts = down1[col_name].value_counts()
    print(col_counts)

yard_group_0to5
False    737
Name: count, dtype: int64
yard_group_10to15
False    737
Name: count, dtype: int64
yard_group_15to20
False    737
Name: count, dtype: int64
yard_group_20toinf
False    737
Name: count, dtype: int64
yard_group_5to10
True    737
Name: count, dtype: int64
offense_formation_PISTOL
False    737
Name: count, dtype: int64
offense_formation_SHOTGUN
True    737
Name: count, dtype: int64
offense_formation_UNDER_CENTER
False    737
Name: count, dtype: int64
