In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import matplotlib.animation as animation
from IPython.display import HTML
from IPython.display import Image

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score


#Code to always set working directory one level up so we can access /data easily
import os
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
    print('Notebook Directory Set:', os.getcwd())

os.chdir(os.path.join(NOTEBOOK_DIR, ".."))
print("Current working directory:", os.getcwd())

Notebook Directory Set: /shared_folder/eas_508_project/notebooks
Current working directory: /shared_folder/eas_508_project


In [2]:
pbp_path = 'data/raw/pbp_data/'

pbp_name = 'pbp_2016.csv'

fb_df = pd.read_csv(pbp_path + pbp_name)
shape = fb_df.shape
columns = fb_df.columns

print('✔✔✔ File Loaded\n')
print(f'Shape: {shape}\n')
print(f'Column Names: {list(columns)}\n')

fb_df.head()

  fb_df = pd.read_csv(pbp_path + pbp_name)


✔✔✔ File Loaded

Shape: (47651, 389)

Column Names: ['nflverse_game_id', 'play_id', 'possession_team', 'offense_formation', 'offense_personnel', 'defenders_in_box', 'defense_personnel', 'number_of_pass_rushers', 'players_on_play', 'offense_players', 'defense_players', 'n_offense', 'n_defense', 'ngs_air_yards', 'time_to_throw', 'was_pressure', 'route', 'defense_man_zone_type', 'defense_coverage_type', 'old_game_id', 'home_team', 'away_team', 'season_type', 'week', 'posteam', 'posteam_type', 'defteam', 'side_of_field', 'yardline_100', 'game_date', 'quarter_seconds_remaining', 'half_seconds_remaining', 'game_seconds_remaining', 'game_half', 'quarter_end', 'drive', 'sp', 'qtr', 'down', 'goal_to_go', 'time', 'yrdln', 'ydstogo', 'ydsnet', 'desc', 'play_type', 'yards_gained', 'shotgun', 'no_huddle', 'qb_dropback', 'qb_kneel', 'qb_spike', 'qb_scramble', 'pass_length', 'pass_location', 'air_yards', 'yards_after_catch', 'run_location', 'run_gap', 'field_goal_result', 'kick_distance', 'extra_poin

Unnamed: 0,nflverse_game_id,play_id,possession_team,offense_formation,offense_personnel,defenders_in_box,defense_personnel,number_of_pass_rushers,players_on_play,offense_players,...,out_of_bounds,home_opening_kickoff,qb_epa,xyac_epa,xyac_mean_yardage,xyac_median_yardage,xyac_success,xyac_fd,xpass,pass_oe
0,2016_01_BUF_BAL,1,,,,,,,,,...,0,1,0.0,,,,,,,
1,2016_01_BUF_BAL,36,BUF,,,,,,40078;40151;37977;36060;40494;40053;40253;4011...,00-0030041;00-0030073;00-0029799;00-0030433;00...,...,0,1,-0.955114,,,,,,,
2,2016_01_BUF_BAL,58,BAL,I_FORM,"2 RB, 1 TE, 2 WR",8.0,"4 DL, 3 LB, 4 DB",,38540;41302;40078;35553;38582;43295;40053;4336...,00-0029892;00-0027714;00-0032965;00-0029893;00...,...,1,1,0.336466,,,,,,0.460759,-46.075901
3,2016_01_BUF_BAL,85,BAL,SHOTGUN,"2 RB, 1 TE, 2 WR",6.0,"4 DL, 3 LB, 4 DB",3.0,41302;40078;38540;35553;43295;38582;40053;4336...,00-0029892;00-0027714;00-0032965;00-0029893;00...,...,0,1,0.64454,1.567466,9.753489,8.0,0.63767,0.530295,0.426425,57.357508
4,2016_01_BUF_BAL,109,BAL,SHOTGUN,"2 RB, 1 TE, 2 WR",7.0,"4 DL, 3 LB, 4 DB",,40078;38540;41302;35553;43295;38582;40053;4336...,00-0029892;00-0027714;00-0032965;00-0029893;00...,...,0,1,-0.546209,,,,,,0.456597,-45.659652


#### Extracting useful columns from dataframe and grouping them by category

In [3]:
#Extracting Useful Columns
#play info
play_state = ['nflverse_game_id','play_id','posteam','posteam_type','defteam','yardline_100','game_seconds_remaining','down','goal_to_go','ydstogo',
              'desc','posteam_score','defteam_score','score_differential','wp','ep']

#Formation for pre-snap
formation_info = ['offense_formation','offense_personnel','defenders_in_box','defense_personnel','n_offense','n_defense']

#outputs to test on
output_info = ['epa','wpa','success']

#Additional info for deep dives
side_info = ['home_timeouts_remaining','home_timeouts_remaining','timeout','timeout_team','posteam_score','defteam_score','roof','surface','temp','wind','total_line','spread_line']

#yardage in games, do not use for modeling to prevent leakage
in_play = ['rush_attempt','pass_attempt','passing_yards','receiving_yards','rushing_yards','lateral_receiving_yards','lateral_rushing_yards']

#remove kneel, spike == 1 and play_type not pass or run
to_remove = ['qb_kneel', 'qb_spike', 'play_type']

In [4]:
to_pull = play_state + formation_info + side_info + to_remove + in_play + output_info

df = fb_df[to_pull]

#### NaN plays are in DF, these represent no plays or special team plays or end of quarters/halfs. Making a function to remove such plays as well as kneels, spikes.

In [5]:
#Create Remove Plays func for clean_data func
def remove_plays(dataframe):
    df = dataframe.copy()

    formation = df['offense_formation'].notna()
    personnel = df['offense_personnel'].notna()
    no_play = df['play_type'] != 'no_play'
    kneel = df['qb_kneel'] == 0
    spike = df['qb_spike'] == 0

    df = df[formation & personnel & no_play & kneel & spike].drop(columns=['qb_kneel','qb_spike'])

    return df

In [6]:
df = remove_plays(df)

#Check Data

In [7]:
df['play_type'].value_counts()

play_type
pass    20115
run     13344
Name: count, dtype: int64

In [8]:
#Look at formation_info set
df[formation_info].head()

Unnamed: 0,offense_formation,offense_personnel,defenders_in_box,defense_personnel,n_offense,n_defense
2,I_FORM,"2 RB, 1 TE, 2 WR",8.0,"4 DL, 3 LB, 4 DB",11,11
3,SHOTGUN,"2 RB, 1 TE, 2 WR",6.0,"4 DL, 3 LB, 4 DB",11,11
4,SHOTGUN,"2 RB, 1 TE, 2 WR",7.0,"4 DL, 3 LB, 4 DB",11,11
5,I_FORM,"2 RB, 1 TE, 2 WR",7.0,"4 DL, 3 LB, 4 DB",11,11
6,I_FORM,"1 RB, 2 TE, 2 WR",8.0,"4 DL, 3 LB, 4 DB",11,11


In [9]:
#Check for low offense and defense counts:
print(df['n_offense'].value_counts(), df['n_defense'].value_counts())

n_offense
11    31318
10     2127
9        13
0         1
Name: count, dtype: int64 n_defense
11    31278
10     2093
9        74
8        13
0         1
Name: count, dtype: int64


In [10]:
#Need to remove values less than 11 as this is random noise and will mess with personnel data
df = df[(df['n_offense'] == 11) & (df['n_defense'] == 11)].drop(columns=['n_offense','n_defense'])

In [11]:
#We have personnel counts 
df['offense_personnel'].value_counts()

offense_personnel
1 RB, 1 TE, 3 WR                17007
1 RB, 2 TE, 2 WR                 5115
2 RB, 1 TE, 2 WR                 2145
1 RB, 3 TE, 1 WR                  862
1 RB, 0 TE, 4 WR                  724
2 RB, 2 TE, 1 WR                  558
6 OL, 1 RB, 1 TE, 2 WR            512
6 OL, 1 RB, 2 TE, 1 WR            441
0 RB, 1 TE, 4 WR                  408
2 RB, 0 TE, 3 WR                  394
6 OL, 2 RB, 1 TE, 1 WR            253
6 OL, 2 RB, 2 TE, 0 WR            208
6 OL, 1 RB, 0 TE, 3 WR            133
0 RB, 2 TE, 3 WR                  113
6 OL, 2 RB, 0 TE, 2 WR             59
0 RB, 0 TE, 5 WR                   58
6 OL, 1 RB, 3 TE, 0 WR             41
0 RB, 3 TE, 2 WR                   30
2 RB, 3 TE, 0 WR                   25
3 RB, 1 TE, 1 WR                   25
2 QB, 1 RB, 1 TE, 2 WR             16
7 OL, 1 RB, 1 TE, 1 WR             11
1 RB, 1 TE, 4 WR                   11
6 OL, 1 RB, 2 TE, 0 WR,1 DL        10
7 OL, 1 RB, 2 TE, 0 WR              9
3 RB, 0 TE, 2 WR                

In [12]:
#Convert Offense Personnel to Dictionary Storage for easy recall
df['personnel_dict'] = df['offense_personnel'].apply(lambda x: {pos: int(count) for count, pos in (p.strip().split(' ') for p in x.split(','))})

df['QB'] = 1
df['OL'] = 5

personnel_titles = ['RB','TE','WR','OTHER']

for pos in personnel_titles:
    df[f'{pos}'] = 0

def update_personnel_counts(row):
    for k,v in row['personnel_dict'].items():
        if k in personnel_titles:
            row[k] += v
        else:
            if k == 'QB':
                row[k] = v
            elif k == 'OL':
                row[k] = v
            else:
                row['OTHER'] += v
    return row

df = df.apply(update_personnel_counts, axis=1)

In [13]:
df.columns

Index(['nflverse_game_id', 'play_id', 'posteam', 'posteam_type', 'defteam',
       'yardline_100', 'game_seconds_remaining', 'down', 'goal_to_go',
       'ydstogo', 'desc', 'posteam_score', 'defteam_score',
       'score_differential', 'wp', 'ep', 'offense_formation',
       'offense_personnel', 'defenders_in_box', 'defense_personnel',
       'home_timeouts_remaining', 'home_timeouts_remaining', 'timeout',
       'timeout_team', 'posteam_score', 'defteam_score', 'roof', 'surface',
       'temp', 'wind', 'total_line', 'spread_line', 'play_type',
       'rush_attempt', 'pass_attempt', 'passing_yards', 'receiving_yards',
       'rushing_yards', 'lateral_receiving_yards', 'lateral_rushing_yards',
       'epa', 'wpa', 'success', 'personnel_dict', 'QB', 'OL', 'RB', 'TE', 'WR',
       'OTHER'],
      dtype='object')

In [14]:
feature_cols = ['posteam_type','yardline_100','game_seconds_remaining','down','ydstogo','score_differential',
                'offense_formation','TE','RB','OL','OTHER','defenders_in_box','QB','WR']
output_cols = ['success']

model_1_df = df[feature_cols + output_cols]

model_1_df.dtypes

posteam_type               object
yardline_100              float64
game_seconds_remaining    float64
down                      float64
ydstogo                     int64
score_differential        float64
offense_formation          object
TE                          int64
RB                          int64
OL                          int64
OTHER                       int64
defenders_in_box          float64
QB                          int64
WR                          int64
success                   float64
dtype: object

In [15]:
#Need to convert categorical columns to numerical and one hot encode formations
model_1_df['posteam_type'] = model_1_df['posteam_type'].map({'home':1, 'away':0})

encoded_form = pd.get_dummies(model_1_df['offense_formation'], prefix='formation').astype(int)
model_1_df = pd.concat([model_1_df, encoded_form], axis=1)
model_1_df.drop(columns=['offense_formation'], inplace=True)

model_1_df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_1_df['posteam_type'] = model_1_df['posteam_type'].map({'home':1, 'away':0})


In [16]:
X = model_1_df.drop(columns=['success'])
y = model_1_df['success']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

for k in range(1, 30):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    
    y_pred = knn.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    
    print(f"\nK = {k} | Test Accuracy: {acc:.3f}")
    cm = confusion_matrix(y_test, y_pred)


K = 1 | Test Accuracy: 0.526

K = 2 | Test Accuracy: 0.551

K = 3 | Test Accuracy: 0.534

K = 4 | Test Accuracy: 0.547

K = 5 | Test Accuracy: 0.532

K = 6 | Test Accuracy: 0.548

K = 7 | Test Accuracy: 0.544

K = 8 | Test Accuracy: 0.552

K = 9 | Test Accuracy: 0.549

K = 10 | Test Accuracy: 0.555

K = 11 | Test Accuracy: 0.548

K = 12 | Test Accuracy: 0.552

K = 13 | Test Accuracy: 0.543

K = 14 | Test Accuracy: 0.552

K = 15 | Test Accuracy: 0.551

K = 16 | Test Accuracy: 0.551

K = 17 | Test Accuracy: 0.554

K = 18 | Test Accuracy: 0.562

K = 19 | Test Accuracy: 0.552

K = 20 | Test Accuracy: 0.556

K = 21 | Test Accuracy: 0.555

K = 22 | Test Accuracy: 0.557

K = 23 | Test Accuracy: 0.551

K = 24 | Test Accuracy: 0.558

K = 25 | Test Accuracy: 0.556

K = 26 | Test Accuracy: 0.558

K = 27 | Test Accuracy: 0.558

K = 28 | Test Accuracy: 0.559

K = 29 | Test Accuracy: 0.560


In [17]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.59      0.75      0.66      3286
         1.0       0.49      0.31      0.38      2544

    accuracy                           0.56      5830
   macro avg       0.54      0.53      0.52      5830
weighted avg       0.55      0.56      0.54      5830



In [18]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(max_iter=1000, solver='lbfgs', class_weight='balanced')
log_reg.fit(X_train_scaled, y_train)

y_pred = log_reg.predict(X_test_scaled)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.61      0.56      0.58      3286
         1.0       0.48      0.54      0.51      2544

    accuracy                           0.55      5830
   macro avg       0.55      0.55      0.55      5830
weighted avg       0.55      0.55      0.55      5830



In [19]:
coef_df = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': log_reg.coef_[0]
}).sort_values('Coefficient', ascending=False)

print(coef_df)

                   Feature  Coefficient
13         formation_EMPTY     0.066451
1             yardline_100     0.062972
2   game_seconds_remaining     0.038711
0             posteam_type     0.025393
17       formation_SHOTGUN     0.018256
15         formation_JUMBO     0.018143
19       formation_WILDCAT    -0.005224
18    formation_SINGLEBACK    -0.015781
9                    OTHER    -0.025813
5       score_differential    -0.039492
14        formation_I_FORM    -0.043169
16        formation_PISTOL    -0.046692
11                      QB    -0.048795
10        defenders_in_box    -0.080865
3                     down    -0.105639
8                       OL    -0.193045
7                       RB    -0.220279
4                  ydstogo    -0.305446
6                       TE    -0.399130
12                      WR    -0.517767


In [29]:
X_2 = model_1_df.drop(columns=['QB','RB','TE','OL','WR','OTHER','success'])
y_2 = model_1_df['success']

X_2_train, X_2_test, y_2_train, y_2_test = train_test_split(X_2, y_2, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_2_train_scaled = scaler.fit_transform(X_2_train)
X_2_test_scaled = scaler.transform(X_2_test)

print("Train scaled shape:", X_2_train_scaled.shape)
print("Test scaled shape:", X_2_test_scaled.shape)

log_reg_2 = LogisticRegression(max_iter=1000, solver='lbfgs', class_weight='balanced')
log_reg_2.fit(X_2_train_scaled, y_2_train)
y_2_pred = log_reg_2.predict(X_2_test_scaled)

print(classification_report(y_2_test, y_2_pred))

import pandas as pd
pd.DataFrame({
    'Feature': X_2.columns,
    'Coefficient': log_reg_2.coef_[0]
}).sort_values('Coefficient', ascending=False)

Train scaled shape: (23319, 14)
Test scaled shape: (5830, 14)
              precision    recall  f1-score   support

         0.0       0.61      0.56      0.58      3286
         1.0       0.49      0.54      0.51      2544

    accuracy                           0.55      5830
   macro avg       0.55      0.55      0.55      5830
weighted avg       0.56      0.55      0.55      5830



Unnamed: 0,Feature,Coefficient
7,formation_EMPTY,0.062973
1,yardline_100,0.062526
2,game_seconds_remaining,0.040764
0,posteam_type,0.024706
9,formation_JUMBO,0.016862
11,formation_SHOTGUN,0.014008
13,formation_WILDCAT,-0.010298
12,formation_SINGLEBACK,-0.020642
8,formation_I_FORM,-0.026559
5,score_differential,-0.038287


In [42]:
X_3 = model_1_df.drop(columns=['QB','RB','TE','OL','WR','OTHER','success','formation_I_FORM','formation_JUMBO','formation_PISTOL','formation_EMPTY','formation_SINGLEBACK','formation_WILDCAT','formation_SHOTGUN'])
y_3 = model_1_df['success']

X_3_train, X_3_test, y_3_train, y_3_test = train_test_split(X_3, y_3, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_3_train_scaled = scaler.fit_transform(X_3_train)
X_3_test_scaled = scaler.transform(X_3_test)

print("Train scaled shape:", X_3_train_scaled.shape)
print("Test scaled shape:", X_3_test_scaled.shape)

log_reg_3 = LogisticRegression(max_iter=1000, solver='lbfgs', class_weight='balanced')
log_reg_3.fit(X_3_train_scaled, y_3_train)
y_3_pred = log_reg_3.predict(X_3_test_scaled)

print(classification_report(y_3_test, y_3_pred))

pd.DataFrame({
    'Feature': X_3.columns,
    'Coefficient': log_reg_3.coef_[0]
}).sort_values('Coefficient', ascending=False)

Train scaled shape: (23319, 7)
Test scaled shape: (5830, 7)
              precision    recall  f1-score   support

         0.0       0.61      0.54      0.57      3286
         1.0       0.48      0.55      0.51      2544

    accuracy                           0.54      5830
   macro avg       0.54      0.54      0.54      5830
weighted avg       0.55      0.54      0.55      5830



Unnamed: 0,Feature,Coefficient
1,yardline_100,0.054129
2,game_seconds_remaining,0.038825
0,posteam_type,0.025
5,score_differential,-0.042293
3,down,-0.093003
6,defenders_in_box,-0.107847
4,ydstogo,-0.301869


In [47]:
X_4 = model_1_df[['yardline_100','ydstogo','down','defenders_in_box']]
y_4 = model_1_df['success']

X_4_train, X_4_test, y_4_train, y_4_test = train_test_split(
    X_4, y_4, test_size=0.2, random_state=42, stratify=y_4
)

scaler_4 = StandardScaler()
X_4_train_scaled = scaler_4.fit_transform(X_4_train)
X_4_test_scaled = scaler_4.transform(X_4_test)

print("Train scaled shape:", X_4_train_scaled.shape)
print("Test scaled shape:", X_4_test_scaled.shape)

log_reg_4 = LogisticRegression(max_iter=1000, solver='lbfgs', class_weight='balanced')
log_reg_4.fit(X_4_train_scaled, y_4_train)

y_4_pred = log_reg_4.predict(X_4_test_scaled)

print(classification_report(y_4_test, y_4_pred))

Train scaled shape: (23319, 4)
Test scaled shape: (5830, 4)
              precision    recall  f1-score   support

         0.0       0.60      0.56      0.58      3286
         1.0       0.48      0.53      0.50      2544

    accuracy                           0.54      5830
   macro avg       0.54      0.54      0.54      5830
weighted avg       0.55      0.54      0.55      5830



In [48]:
coef_df_4 = pd.DataFrame({
    'Feature': X_4.columns,
    'Coefficient': log_reg_4.coef_[0]
}).sort_values('Coefficient', ascending=False)

coef_df_4

Unnamed: 0,Feature,Coefficient
0,yardline_100,0.054726
2,down,-0.096515
3,defenders_in_box,-0.113512
1,ydstogo,-0.304396
