In [53]:
import pandas as pd
import numpy as np
import pymc as pm

## Speed
Our response will be the predicted get-off time
    - Need to decide how we want to measure this
    - Option 1 is the amount of time needed to cover a certain distance (say, 1.5 yards)
    - Option 2 is the predicted yards covered in a certain amount of time
For predictors:
    - Height/weight of the tackle
    - Quarter
    - Whether it's an obvious passing situation
    - Left vs right tackle?

## Power
Our response here is some variation of the percentage of the distance to the QB that the tackle gives up once engaged with the defensive end
For predictors:
    - Height/weight of both the tackle and the end
    - Obvious passing situation
    - Time to throw for the QB
    - Quarter

In [54]:
# Frame-level tracking data, read straight from the project repository.
clustered_csv_url = 'https://media.githubusercontent.com/media/cnickol26/BigDataBowl2023/main/clustered.csv'
model_data = pd.read_csv(clustered_csv_url)

In [55]:
# List the column labels of the loaded frame.
model_data.columns

Index(['uniqueplayId', 'QB_Flip', 'rushType', 'Power_prob', 'Speed_prob',
       'football_x', 'football_y', 'gameId', 'playId', 'frameId', 'time',
       'playDirection', 'event', 'ball_snap_frame', 'end_frame',
       'play_length_frames', 'Right_end', 'Left_end', 'QB_nflId',
       'QB_jerseyNumber', 'QB_team', 'QB_x', 'QB_y', 'QB_s', 'QB_a', 'QB_dis',
       'QB_o', 'QB_dir', 'QB_pff_role', 'QB_pff_positionLinedUp',
       'QB_pff_nflIdBlockedPlayer', 'QB_pff_blockType', 'QB_new_x', 'QB_new_y',
       'ball_nflId', 'ball_jerseyNumber', 'ball_team', 'ball_x', 'ball_y',
       'ball_s', 'ball_a', 'ball_dis', 'ball_o', 'ball_dir', 'ball_pff_role',
       'ball_pff_positionLinedUp', 'ball_pff_nflIdBlockedPlayer',
       'ball_pff_blockType', 'ball_new_x', 'ball_new_y', 'ball_Flip',
       'Tackle_nflId', 'Tackle_jerseyNumber', 'Tackle_team', 'Tackle_x',
       'Tackle_y', 'Tackle_s', 'Tackle_a', 'Tackle_dis', 'Tackle_o',
       'Tackle_dir', 'Tackle_pff_role', 'Tackle_pff_positionLined

In [56]:
# Columns kept for modelling, grouped by the entity they describe.
# The concatenation order below is the column order of model_data2.
play_cols = ['uniqueplayId', 'QB_Flip', 'rushType', 'Power_prob', 'Speed_prob',
             'football_x', 'football_y', 'gameId', 'playId', 'frameId', 'time',
             'event', 'play_length_frames']
qb_cols = ['QB_s', 'QB_a', 'QB_dis', 'QB_o', 'QB_dir', 'QB_pff_role',
           'QB_pff_positionLinedUp', 'QB_new_x', 'QB_new_y']
ball_cols = ['ball_new_x', 'ball_new_y', 'ball_Flip', 'ball_snap_frame']
tackle_cols = ['Tackle_nflId', 'Tackle_s', 'Tackle_a', 'Tackle_dis', 'Tackle_o',
               'Tackle_dir', 'Tackle_pff_role', 'Tackle_pff_positionLinedUp',
               'Tackle_pff_nflIdBlockedPlayer', 'Tackle_pff_blockType',
               'Tackle_new_x', 'Tackle_new_y', 'Tackle_Flip']
end_cols = ['End_nflId', 'End_s', 'End_a', 'End_dis', 'End_o', 'End_dir',
            'End_pff_role', 'End_pff_positionLinedUp', 'End_new_x', 'End_new_y',
            'End_Flip']
matchup_cols = ['tackle_end_dist', 'tackle_end_facing']

model_data2 = model_data[play_cols + qb_cols + ball_cols + tackle_cols + end_cols + matchup_cols]

In [57]:
# Player table, read from the project repository.
players_csv_url = 'https://media.githubusercontent.com/media/cnickol26/BigDataBowl2023/main/nfl-big-data-bowl-2023/players.csv'
players = pd.read_csv(players_csv_url)

In [58]:
# Keep only the physical attributes, the join key and the display name.
player_cols = ['height', 'weight', 'nflId', 'displayName']
players = players[player_cols]

In [59]:
# Attach the tackle's player record to every frame (left join keeps all frames).
model_data3 = pd.merge(
    model_data2,
    players,
    how='left',
    left_on='Tackle_nflId',
    right_on='nflId',
)

In [60]:
# Prefix the joined player columns with "tackle_" and discard the duplicate join key.
tackle_labels = {
    'height': 'tackle_height',
    'weight': 'tackle_weight',
    'displayName': 'tackle_name',
}
model_data4 = model_data3.rename(columns=tackle_labels).drop(columns='nflId')

In [61]:
# Attach the end's player record to every frame (left join keeps all frames).
model_data5 = pd.merge(
    model_data4,
    players,
    how='left',
    left_on='End_nflId',
    right_on='nflId',
)

In [62]:
# Prefix the joined player columns with "end_" and discard the duplicate join key.
end_labels = {
    'height': 'end_height',
    'weight': 'end_weight',
    'displayName': 'end_name',
}
model_data6 = model_data5.rename(columns=end_labels).drop(columns='nflId')

In [63]:
# Play-level table, read from the project repository.
plays_csv_url = 'https://media.githubusercontent.com/media/cnickol26/BigDataBowl2023/main/nfl-big-data-bowl-2023/plays.csv'
plays = pd.read_csv(plays_csv_url)

In [64]:
# A play is flagged as an obvious pass on 2nd down with more than 10 to go,
# 3rd down with more than 4 to go, or 4th down with more than 2 to go.
second_and_long = plays['down'].eq(2) & plays['yardsToGo'].gt(10)
third_and_medium = plays['down'].eq(3) & plays['yardsToGo'].gt(4)
fourth_and_short_plus = plays['down'].eq(4) & plays['yardsToGo'].gt(2)
plays['obvious_pass'] = second_and_long | third_and_medium | fourth_and_short_plus

In [65]:
# Store the flag as 0/1 integers instead of booleans.
obvious_pass_flag = plays['obvious_pass'].astype(int)
plays['obvious_pass'] = obvious_pass_flag

In [66]:
# Keep only the join keys and the two play-level predictors.
play_level_cols = ['gameId', 'playId', 'quarter', 'obvious_pass']
plays = plays[play_level_cols]

In [67]:
# Show the current contents of `plays`.
plays

Unnamed: 0,gameId,playId,quarter,obvious_pass
0,2021090900,97,1,0
1,2021090900,137,1,0
2,2021090900,187,1,0
3,2021090900,282,1,0
4,2021090900,349,1,1
...,...,...,...,...
8553,2021110100,4310,4,1
8554,2021110100,4363,4,0
8555,2021110100,4392,4,0
8556,2021110100,4411,4,1


In [68]:
# Bring quarter and obvious_pass onto every frame via the (gameId, playId) key.
model_data7 = pd.merge(
    model_data6,
    plays,
    how='left',
    on=['gameId', 'playId'],
)

In [69]:
# A frame counts as "engaged" when tackle_end_facing is at most 40 and
# tackle_end_dist is at most 1.
facing_each_other = model_data7['tackle_end_facing'].le(40)
within_reach = model_data7['tackle_end_dist'].le(1)
model_data7['tackle_end_engaged'] = facing_each_other & within_reach

In [70]:
# Store the engagement flag as 0/1 integers instead of booleans.
engaged_flag = model_data7['tackle_end_engaged'].astype(int)
model_data7['tackle_end_engaged'] = engaged_flag

In [71]:
# Count how many frames are flagged engaged (1) versus not engaged (0).
model_data7['tackle_end_engaged'].value_counts()

0    217991
1     70416
Name: tackle_end_engaged, dtype: int64

In [72]:
def dist(x_1, y_1, x_2, y_2):
    """Return the Euclidean distance between (x_1, y_1) and (x_2, y_2).

    The arguments may be scalars or equal-length array-likes; array-like
    inputs are combined element-wise and the result is a NumPy array.
    """
    squared_offsets = [(x_1 - x_2) ** 2, (y_1 - y_2) ** 2]
    # Stack the two squared offsets and add them along the stacking axis.
    return np.sqrt(np.asarray(squared_offsets).sum(axis=0))

In [73]:
# Per-frame distance between the tackle and the QB.
model_data7['tackle_qb_dist'] = dist(
    x_1=model_data7['Tackle_new_x'],
    y_1=model_data7['Tackle_new_y'],
    x_2=model_data7['QB_new_x'],
    y_2=model_data7['QB_new_y'],
)

In [74]:
# Per-frame distance between the end and the QB.
model_data7['end_qb_dist'] = dist(
    x_1=model_data7['End_new_x'],
    y_1=model_data7['End_new_y'],
    x_2=model_data7['QB_new_x'],
    y_2=model_data7['QB_new_y'],
)

In [171]:
# Frames belonging to rushes labelled 'Speed'.
is_speed_rush = model_data7['rushType'].eq('Speed')
speed = model_data7[is_speed_rush]
speed

Unnamed: 0,uniqueplayId,QB_Flip,rushType,Power_prob,Speed_prob,football_x,football_y,gameId,playId,frameId,...,tackle_weight,tackle_name,end_height,end_weight,end_name,quarter,obvious_pass,tackle_end_engaged,tackle_qb_dist,end_qb_dist
0,202109090097,1.0,Speed,0.07851,0.92149,41.56,23.92,2021090900,97,6,...,322,Tristan Wirfs,6-3,245,Micah Parsons,1,0,0,4.554492,2.944300
1,202109090097,1.0,Speed,0.07851,0.92149,41.56,23.92,2021090900,97,7,...,322,Tristan Wirfs,6-3,245,Micah Parsons,1,0,0,4.610878,3.008920
2,202109090097,1.0,Speed,0.07851,0.92149,41.56,23.92,2021090900,97,8,...,322,Tristan Wirfs,6-3,245,Micah Parsons,1,0,0,4.651021,3.091035
3,202109090097,1.0,Speed,0.07851,0.92149,41.56,23.92,2021090900,97,9,...,322,Tristan Wirfs,6-3,245,Micah Parsons,1,0,0,4.667987,3.220761
4,202109090097,1.0,Speed,0.07851,0.92149,41.56,23.92,2021090900,97,10,...,322,Tristan Wirfs,6-3,245,Micah Parsons,1,0,0,4.714106,3.402029
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
288331,20211017071173,0.0,Speed,,,57.90,24.08,2021101707,1173,27,...,305,Charles Leno,6-0,232,Nick Bolton,2,0,0,4.893107,6.758913
288332,20211017071173,0.0,Speed,,,57.90,24.08,2021101707,1173,28,...,305,Charles Leno,6-0,232,Nick Bolton,2,0,0,4.787839,6.807437
288333,20211017071173,0.0,Speed,,,57.90,24.08,2021101707,1173,29,...,305,Charles Leno,6-0,232,Nick Bolton,2,0,0,4.674409,6.866513
288334,20211017071173,0.0,Speed,,,57.90,24.08,2021101707,1173,30,...,305,Charles Leno,6-0,232,Nick Bolton,2,0,0,4.555052,6.941823


In [76]:
# Frames belonging to rushes labelled 'Power' ...
is_power_rush = model_data7['rushType'].eq('Power')
power = model_data7[is_power_rush]
# ... restricted to frames where the tackle is closer to the QB than the end is.
tackle_is_closer = power['tackle_qb_dist'].lt(power['end_qb_dist'])
power_filt = power[tackle_is_closer]

In [77]:
# Count the frames remaining in power_filt per rush type.
power_filt['rushType'].value_counts()

Power    178518
Name: rushType, dtype: int64

In [78]:
# Only the frames in which the tackle and end are flagged as engaged.
is_engaged = power_filt['tackle_end_engaged'].eq(1)
subset = power_filt[is_engaged]

In [79]:
# First engaged frame of every (play, side) matchup.
# GroupBy.first() returns the first NON-NULL value of each column separately,
# so it can stitch values from different frames together, and it silently
# relies on the rows already being in frame order. Sorting by frameId and
# keeping the first whole row per matchup makes the "earliest engaged frame"
# explicit. Rows with a missing key are dropped, as groupby does by default.
matchup_keys = ['uniqueplayId', 'QB_Flip']
subset2 = (
    subset
    .dropna(subset=matchup_keys)
    .sort_values(matchup_keys + ['frameId'])
    .drop_duplicates(subset=matchup_keys, keep='first')
    .reset_index(drop=True)
)

In [80]:
# Tackle-to-QB distance on the row kept for each matchup in subset2.
engaged_cols = ['uniqueplayId', 'QB_Flip', 'tackle_qb_dist']
distance_when_engaged = subset2[engaged_cols]
distance_when_engaged

Unnamed: 0,uniqueplayId,QB_Flip,tackle_qb_dist
0,202109120163,0.0,3.927467
1,202109120163,1.0,5.853136
2,202109120578,1.0,4.997960
3,202109120676,0.0,4.894865
4,202109120676,1.0,3.900820
...,...,...,...
5547,20211025003660,1.0,5.355978
5548,20211025003904,0.0,3.735305
5549,20211025003904,1.0,4.439279
5550,20211025003926,0.0,4.787118


In [81]:
# Last frame of every (play, side) matchup in power_filt.
# GroupBy.last() returns the last NON-NULL value of each column separately
# and relies on implicit row order; sort by frameId and keep the final whole
# row per matchup instead. Rows with a missing key are dropped, as groupby
# does by default.
matchup_keys = ['uniqueplayId', 'QB_Flip']
end_distance = (
    power_filt
    .dropna(subset=matchup_keys)
    .sort_values(matchup_keys + ['frameId'])
    .drop_duplicates(subset=matchup_keys, keep='last')
    .reset_index(drop=True)
)

In [82]:
# Keep only the matchup keys and the tackle-to-QB distance.
ending_cols = ['uniqueplayId', 'QB_Flip', 'tackle_qb_dist']
end_distance = end_distance[ending_cols]
end_distance

Unnamed: 0,uniqueplayId,QB_Flip,tackle_qb_dist
0,202109120163,0.0,2.438237
1,202109120163,1.0,5.594569
2,202109120288,1.0,6.659084
3,202109120578,1.0,2.407094
4,202109120676,0.0,4.827132
...,...,...,...
6218,20211025003660,1.0,3.138057
6219,20211025003904,0.0,2.595149
6220,20211025003904,1.0,4.460717
6221,20211025003926,0.0,8.396076


In [83]:
# Relabel the distance so it does not clash with the engaged-frame distance when merged.
end_distance = end_distance.rename({'tackle_qb_dist': 'ending_distance'}, axis='columns')

In [84]:
# Put the engaged-frame distance and the ending distance side by side per matchup.
distance = pd.merge(
    distance_when_engaged,
    end_distance,
    how='left',
    on=['uniqueplayId', 'QB_Flip'],
)
distance

Unnamed: 0,uniqueplayId,QB_Flip,tackle_qb_dist,ending_distance
0,202109120163,0.0,3.927467,2.438237
1,202109120163,1.0,5.853136,5.594569
2,202109120578,1.0,4.997960,2.407094
3,202109120676,0.0,4.894865,4.827132
4,202109120676,1.0,3.900820,1.730665
...,...,...,...,...
5547,20211025003660,1.0,5.355978,3.138057
5548,20211025003904,0.0,3.735305,2.595149
5549,20211025003904,1.0,4.439279,4.460717
5550,20211025003926,0.0,4.787118,8.396076


In [85]:
# Share of the engaged-frame distance that was lost by the ending frame
# (negative when the ending distance is larger).
distance_lost = distance['tackle_qb_dist'].sub(distance['ending_distance'])
distance['pct_given_up'] = distance_lost.div(distance['tackle_qb_dist'])
distance

Unnamed: 0,uniqueplayId,QB_Flip,tackle_qb_dist,ending_distance,pct_given_up
0,202109120163,0.0,3.927467,2.438237,0.379183
1,202109120163,1.0,5.853136,5.594569,0.044176
2,202109120578,1.0,4.997960,2.407094,0.518385
3,202109120676,0.0,4.894865,4.827132,0.013838
4,202109120676,1.0,3.900820,1.730665,0.556333
...,...,...,...,...,...
5547,20211025003660,1.0,5.355978,3.138057,0.414102
5548,20211025003904,0.0,3.735305,2.595149,0.305238
5549,20211025003904,1.0,4.439279,4.460717,-0.004829
5550,20211025003926,0.0,4.787118,8.396076,-0.753890


In [86]:
# The engaged-frame distance is the starting point of the pct_given_up measure.
distance = distance.rename({'tackle_qb_dist': 'starting_distance'}, axis='columns')

In [87]:
# Broadcast the per-matchup distance summary back onto every power frame.
model_data8 = pd.merge(
    power_filt,
    distance,
    how='left',
    on=['uniqueplayId', 'QB_Flip'],
)
model_data8

Unnamed: 0,uniqueplayId,QB_Flip,rushType,Power_prob,Speed_prob,football_x,football_y,gameId,playId,frameId,...,end_weight,end_name,quarter,obvious_pass,tackle_end_engaged,tackle_qb_dist,end_qb_dist,starting_distance,ending_distance,pct_given_up
0,202109120163,0.0,Power,0.98633,0.01367,34.03,23.76,2021091201,63,6,...,242,Alex Highsmith,1,0,0,4.820290,6.804594,3.927467,2.438237,0.379183
1,202109120163,0.0,Power,0.98633,0.01367,34.03,23.76,2021091201,63,7,...,242,Alex Highsmith,1,0,0,4.816638,6.835671,3.927467,2.438237,0.379183
2,202109120163,0.0,Power,0.98633,0.01367,34.03,23.76,2021091201,63,8,...,242,Alex Highsmith,1,0,0,4.790918,6.876053,3.927467,2.438237,0.379183
3,202109120163,0.0,Power,0.98633,0.01367,34.03,23.76,2021091201,63,9,...,242,Alex Highsmith,1,0,0,4.766424,6.936195,3.927467,2.438237,0.379183
4,202109120163,0.0,Power,0.98633,0.01367,34.03,23.76,2021091201,63,10,...,242,Alex Highsmith,1,0,0,4.738143,7.041023,3.927467,2.438237,0.379183
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178513,20211021002175,1.0,Power,,,92.41,29.85,2021102100,2175,25,...,315,Jordan Elliott,3,0,0,5.114626,6.142353,4.666155,4.936801,-0.058002
178514,20211021002175,1.0,Power,,,92.41,29.85,2021102100,2175,26,...,315,Jordan Elliott,3,0,0,5.084191,6.162735,4.666155,4.936801,-0.058002
178515,20211021002175,1.0,Power,,,92.41,29.85,2021102100,2175,27,...,315,Jordan Elliott,3,0,0,5.049287,6.169935,4.666155,4.936801,-0.058002
178516,20211021002175,1.0,Power,,,92.41,29.85,2021102100,2175,28,...,315,Jordan Elliott,3,0,0,5.002799,6.168249,4.666155,4.936801,-0.058002


In [88]:
## Remove any power plays whose pct_given_up is missing
has_pct_given_up = model_data8['pct_given_up'].notna()
model_data8 = model_data8[has_pct_given_up]

In [111]:
# One rush label per (end, play, side), then per-end shares of each label.
rush_per_matchup = (
    model_data7
    .groupby(['end_name', 'uniqueplayId', 'QB_Flip'])
    .agg({'rushType': 'first'})
    .reset_index()
)
rushes_by_end = rush_per_matchup.groupby('end_name')['rushType']
power_speed_perc = rushes_by_end.agg(
    power_perc=lambda rushes: rushes.eq('Power').sum() / rushes.count(),
    speed_perc=lambda rushes: rushes.eq('Speed').sum() / rushes.count(),
    total='count',
).reset_index()

In [112]:
# Overall share of each rush label, counting every (play, side) matchup once.
rush_per_play = model_data7.groupby(['uniqueplayId', 'QB_Flip'])['rushType'].first()
overall_perc_avs = rush_per_play.value_counts(normalize=True)
overall_perc_avs

Power    0.704846
Speed    0.295154
Name: rushType, dtype: float64

In [139]:
# Overall shares are looked up by label. Integer keys on a string-labelled
# Series fall back to positional access, which is deprecated in pandas 2 and
# only gave the right value because value_counts happened to sort 'Power' first.
power_speed_perc['Av_Power'] = overall_perc_avs['Power']
power_speed_perc['Av_Speed'] = overall_perc_avs['Speed']
power_speed_perc['num_plays'] = power_speed_perc['total'].sum()

# Blend each end's own share with the overall share, weighting the end's own
# share by total / SHRINKAGE_PLAYS. The weight is capped at 1 so an end with
# more than SHRINKAGE_PLAYS rushes cannot produce a share outside [0, 1].
SHRINKAGE_PLAYS = 500
own_weight = (power_speed_perc['total'] / SHRINKAGE_PLAYS).clip(upper=1)
power_speed_perc['end_power_perc'] = (
    power_speed_perc['power_perc'] * own_weight
    + power_speed_perc['Av_Power'] * (1 - own_weight)
)
power_speed_perc['end_speed_perc'] = (
    power_speed_perc['speed_perc'] * own_weight
    + power_speed_perc['Av_Speed'] * (1 - own_weight)
)

In [172]:
# Attach each end's blended power/speed shares to the speed frames.
# The cell rebinds `speed`, so drop any copies left by an earlier run first;
# otherwise re-running it yields end_power_perc_x / _y columns and the plain
# names used downstream disappear.
end_share_cols = ['end_power_perc', 'end_speed_perc']
speed = (
    speed
    .drop(columns=end_share_cols, errors='ignore')
    .merge(power_speed_perc[['end_name'] + end_share_cols], how='left', on='end_name')
)

In [173]:
# Running total of Tackle_dis within each (play, side) matchup.
frames_by_matchup = speed.groupby(['uniqueplayId', 'QB_Flip'])
speed['dist_moved'] = frames_by_matchup['Tackle_dis'].cumsum()

In [174]:
# Aggregate per group to prepare data for modeling - one row per tackle/end instance
# The row should have the distance of the tackle in one second, and multiple predictors
# The predictors are speed/power percent of the end, weight and height of end, one-hot offensive lineman, obvious passing situation, and maybe quarter
def full_agg(grouped_df, frames_after_snap=10):
    """Extract the modelling row for one group of frames.

    Parameters
    ----------
    grouped_df : pandas.DataFrame
        Frames of a single group. Must contain 'frameId', 'ball_snap_frame',
        'dist_moved' and the predictor columns listed below.
    frames_after_snap : int, default 10
        Offset added to 'ball_snap_frame' to pick the frame to read. The
        default keeps the original behaviour; the notes above describe it as
        the distance "in one second" (presumably 10 frames per second --
        confirm against the tracking data).

    Returns
    -------
    pandas.DataFrame
        The frame(s) where frameId == ball_snap_frame + frames_after_snap,
        keeping their original index, with 'dist_moved' exposed as
        'one_sec_dist'. Empty when the group has no such frame.
    """
    target_frame = grouped_df['ball_snap_frame'] + frames_after_snap
    row = grouped_df[grouped_df['frameId'] == target_frame]
    kept_cols = [
        'dist_moved',
        'end_power_perc',
        'end_speed_perc',
        'end_height',
        'end_weight',
        'tackle_name',
        'obvious_pass',
        'quarter',
    ]
    return row[kept_cols].rename(columns={'dist_moved': 'one_sec_dist'})

# Run full_agg on every (play, side) matchup, then flatten the resulting index;
# 'level_2' holds the original row label and is not needed.
rows_per_matchup = speed.groupby(['uniqueplayId', 'QB_Flip'], group_keys=True).apply(full_agg)
speed_data = rows_per_matchup.reset_index().drop(columns=['level_2'])

In [175]:
# Convert "feet-inches" strings (e.g. "6-3") to total inches.
# The cell overwrites end_height in place, so only parse while the column is
# still text; a second run would otherwise fail on the .str accessor.
if not pd.api.types.is_numeric_dtype(speed_data['end_height']):
    feet_inches = speed_data['end_height'].str.extract(r'^(\d+)-(\d+)$').astype(int)
    speed_data['end_height'] = feet_inches[0] * 12 + feet_inches[1]
# Literal (non-regex) replacement so names can be used as column labels without spaces.
speed_data['tackle_name'] = speed_data['tackle_name'].str.replace(" ", "_", regex=False)
speed_data

Unnamed: 0,uniqueplayId,QB_Flip,one_sec_dist,end_power_perc,end_speed_perc,end_height,end_weight,tackle_name,obvious_pass,quarter
0,202109090097,1.0,1.46,0.699278,0.300722,75,245,Tristan_Wirfs,0,1
1,202109120288,0.0,1.36,0.698062,0.301938,75,275,Mekhi_Becton,0,1
2,202109120578,0.0,0.76,0.702890,0.297110,77,262,Laremy_Tunsil,0,1
3,202109120776,1.0,1.44,0.705348,0.294652,77,265,David_Quessenberry,1,1
4,202109121176,1.0,1.79,0.715515,0.284485,77,277,Ryan_Ramczyk,0,1
...,...,...,...,...,...,...,...,...,...,...
2610,20211025003506,0.0,0.87,0.708722,0.291278,76,279,Terron_Armstead,1,4
2611,20211025003536,0.0,1.84,0.696291,0.303709,76,260,Terron_Armstead,0,4
2612,20211025003536,1.0,1.98,0.704062,0.295938,76,330,Ryan_Ramczyk,0,4
2613,20211025003660,0.0,1.95,0.696291,0.303709,76,260,Terron_Armstead,1,4


In [186]:
# One indicator column per tackle, named after the tackle (no prefix).
# dtype=int pins the indicators to 0/1: pandas >= 2.0 otherwise emits
# True/False, which no longer matches the integer columns shown below.
speed_dummies = pd.get_dummies(
    speed_data,
    prefix="",
    prefix_sep='',
    columns=['tackle_name'],
    dtype=int,
)
# get_dummies consumes the encoded column; add the name back for reference.
speed_dummies['tackle_name'] = speed_data['tackle_name']
speed_dummies

Unnamed: 0,uniqueplayId,QB_Flip,one_sec_dist,end_power_perc,end_speed_perc,end_height,end_weight,obvious_pass,quarter,Alejandro_Villanueva,...,Trent_Williams,Trenton_Brown,Tristan_Wirfs,Ty_Sambrailo,Tyre_Phillips,Tyron_Smith,Yasir_Durant,Yodny_Cajuste,Yosuah_Nijman,tackle_name
0,202109090097,1.0,1.46,0.699278,0.300722,75,245,0,1,0,...,0,0,1,0,0,0,0,0,0,Tristan_Wirfs
1,202109120288,0.0,1.36,0.698062,0.301938,75,275,0,1,0,...,0,0,0,0,0,0,0,0,0,Mekhi_Becton
2,202109120578,0.0,0.76,0.702890,0.297110,77,262,0,1,0,...,0,0,0,0,0,0,0,0,0,Laremy_Tunsil
3,202109120776,1.0,1.44,0.705348,0.294652,77,265,1,1,0,...,0,0,0,0,0,0,0,0,0,David_Quessenberry
4,202109121176,1.0,1.79,0.715515,0.284485,77,277,0,1,0,...,0,0,0,0,0,0,0,0,0,Ryan_Ramczyk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2610,20211025003506,0.0,0.87,0.708722,0.291278,76,279,1,4,0,...,0,0,0,0,0,0,0,0,0,Terron_Armstead
2611,20211025003536,0.0,1.84,0.696291,0.303709,76,260,0,4,0,...,0,0,0,0,0,0,0,0,0,Terron_Armstead
2612,20211025003536,1.0,1.98,0.704062,0.295938,76,330,0,4,0,...,0,0,0,0,0,0,0,0,0,Ryan_Ramczyk
2613,20211025003660,0.0,1.95,0.696291,0.303709,76,260,1,4,0,...,0,0,0,0,0,0,0,0,0,Terron_Armstead
