In [2]:
#import standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
#import needed function
from functions import add_column_from_other_df

In [4]:
#read necessary csvs
tracking_football = pd.read_csv('tracking_football.csv')
tracking_simple = pd.read_csv('tracking_simple.csv')
df_player_play_motion = pd.read_csv('player_play.csv')
df_player = pd.read_csv('players.csv')

In [5]:
#get the frameId of the ball snap and the frame the player was last moving before the ball snap 
#note: using frame before ball snap rather than ball snap, as some players not in motion moved exactly as the ball was snapped
last_motion_frame = (
    tracking_simple[tracking_simple['in_motion'] == 1]
    .groupby(['gameId', 'playId', 'nflId'])['frameId']
    .max()
    .sub(1)
    .reset_index()
)
snap_frame = (
    tracking_simple
    .groupby(['gameId', 'playId', 'nflId'])['frameId']
    .max()
    .reset_index()
)

df_player_play_motion = add_column_from_other_df(df_player_play_motion, last_motion_frame, 'frameId', 'last_motion_frameId')
df_player_play_motion = add_column_from_other_df(df_player_play_motion, snap_frame, 'frameId', 'snap_frameId')

In [6]:
#find the frame gap between the snap and the last 'in_motion' frame
df_player_play_motion['snap_motion_gap'] = df_player_play_motion['snap_frameId'] - df_player_play_motion['last_motion_frameId'] - 1 #account for adjustment above

In [7]:
#get initial y and final y coordinates (relative to the ball)
initial_rel_y_dis = (
    tracking_simple
    .groupby(['gameId', 'playId', 'nflId'])['dis_fb_y']
    .first()
    .reset_index()
)
end_rel_y_dis = (
    tracking_simple
    .groupby(['gameId', 'playId', 'nflId'])['dis_fb_y']
    .last()
    .reset_index()
)

df_player_play_motion = add_column_from_other_df(df_player_play_motion, initial_rel_y_dis, 'dis_fb_y', 'init_rel_y_dis')
df_player_play_motion = add_column_from_other_df(df_player_play_motion, end_rel_y_dis, 'dis_fb_y', 'end_rel_y_dis')

#calculate overall change in player's y coordinate
df_player_play_motion['overall_y_change'] = df_player_play_motion['init_rel_y_dis'] - df_player_play_motion['end_rel_y_dis']

#same steps but for x
initial_rel_x_dis = (
    tracking_simple
    .groupby(['gameId', 'playId', 'nflId'])['dis_fb_x']
    .first()
    .reset_index()
)
end_rel_x_dis = (
    tracking_simple
    .groupby(['gameId', 'playId', 'nflId'])['dis_fb_x']
    .last()
    .reset_index()
)

df_player_play_motion = add_column_from_other_df(df_player_play_motion, initial_rel_x_dis, 'dis_fb_x', 'init_rel_x_dis')
df_player_play_motion = add_column_from_other_df(df_player_play_motion, end_rel_x_dis, 'dis_fb_x', 'end_rel_x_dis')

df_player_play_motion['overall_x_change'] = (df_player_play_motion['init_rel_x_dis'] - df_player_play_motion['end_rel_x_dis'])

In [8]:
#find player's average speed during their motion
motion_speed_avg = (
    tracking_simple[tracking_simple['in_motion'] == 1]
    .groupby(['gameId', 'playId', 'nflId'])['s_std']
    .mean()
    .reset_index()
)
df_player_play_motion = add_column_from_other_df(df_player_play_motion, motion_speed_avg, 's_std', 'motion_s_avg')

In [9]:
#find player's total time in motion
motion_sum = (
    tracking_simple
    .groupby(['gameId', 'playId', 'nflId'])['in_motion']
    .sum()
    .reset_index()
)
df_player_play_motion = add_column_from_other_df(df_player_play_motion, motion_sum, 'in_motion', 'frames_in_motion')

In [10]:
# #Need some way to tell when a player switched direction

#get sign of first motion y frame
y_change_sign = (
    tracking_simple[tracking_simple['in_motion'] == 1]
    .groupby(['gameId', 'playId', 'nflId'])['y_change']
    .first()
    .apply(lambda x: 1 if x > 0 else -1)
    .reset_index(name='y_change_sign')
)

#merge initial sign back to the main dataframe
tracking_simple = tracking_simple.merge(y_change_sign, on=['gameId', 'playId', 'nflId'], how='left')

In [11]:
#column for if sign is different 
tracking_simple['y_sign_diff'] = (tracking_simple['y_change'] * tracking_simple['y_change_sign'] < 0)*1

#find earliest motion occurrence of the sign being different, return the y location
y_swap_loc = (
    tracking_simple[tracking_simple['y_sign_diff'] == 1]
    .groupby(['gameId', 'playId', 'nflId'])['dis_fb_y']
    .first()
    .reset_index(name='y_change_sign')
)

#if none, get the location at the snap
y_swap_loc_fallback = (
    tracking_simple.groupby(['gameId', 'playId', 'nflId'])['dis_fb_y']
    .last()
    .reset_index(name='y_change_sign')
)

#merge the two dataframes to ensure we get the y_change_sign values
y_swap_loc_merge = pd.merge(
    y_swap_loc_fallback, y_swap_loc,
    on=['gameId', 'playId', 'nflId'], how='left', suffixes=('_fallback', '_final')
)

#add an additional boolean column to note if there was any switch that occurred on the play
y_swap_loc_merge['y_flip_bool'] = (
    (y_swap_loc_merge['y_change_sign_final'].notna()) & 
    (y_swap_loc_merge['y_change_sign_fallback'].sub(y_swap_loc_merge['y_change_sign_final']).abs() >= 100) #has to go at least a foot in other direction
).astype(int)

#if 'y_change_sign_final' is NaN, replace with 'y_change_sign_fallback'
y_swap_loc_merge['y_change_sign'] = y_swap_loc_merge['y_change_sign_final'].fillna(y_swap_loc_merge['y_change_sign_fallback'])

#drop the extra columns
y_swap_loc_merge = y_swap_loc_merge[['gameId', 'playId', 'nflId', 'y_change_sign','y_flip_bool']]

In [12]:
#add the column to the dataframe
df_player_play_motion = add_column_from_other_df(df_player_play_motion, y_swap_loc_merge, 'y_change_sign', 'y_reverse_loc')
df_player_play_motion = add_column_from_other_df(df_player_play_motion, y_swap_loc_merge, 'y_flip_bool', 'y_reverse_bool')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small.rename(columns={col: 'placeholder_column_x09ds623n'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small.rename(columns={col: 'placeholder_column_x09ds623n'}, inplace=True)


In [13]:
#convert inMotionAtBallSnap to numeric column
df_player_play_motion['inMotionAtBallSnap'] = df_player_play_motion['inMotionAtBallSnap'] * 1
df_player_play_motion['shiftSinceLineset'] = df_player_play_motion['shiftSinceLineset'] * 1
df_player_play_motion['motionSinceLineset'] = df_player_play_motion['motionSinceLineset'] * 1

In [14]:
#start by getting the sums of the x_change sign for every player
x_dis_travel_pos = tracking_simple[tracking_simple['x_change'] > 1].groupby(['gameId', 'playId', 'nflId'])['x_change'].sum().abs().reset_index()
x_dis_travel_neg = tracking_simple[tracking_simple['x_change'] < 1].groupby(['gameId', 'playId', 'nflId'])['x_change'].sum().abs().reset_index()

#add to df_player_play_motion
df_player_play_motion = add_column_from_other_df(df_player_play_motion, x_dis_travel_pos, 'x_change', 'x_dis_travel_pos')
df_player_play_motion = add_column_from_other_df(df_player_play_motion, x_dis_travel_neg, 'x_change', 'x_dis_travel_neg')

In [15]:
#create a column to see if a player changes direction on a play#

#start by getting the sums of the y_change sign for every 
y_dis_travel_pos = tracking_simple[tracking_simple['y_change'] > 1].groupby(['gameId','playId','nflId'])['y_change'].sum().abs().reset_index()
y_dis_travel_neg = tracking_simple[tracking_simple['y_change'] < 1].groupby(['gameId','playId','nflId'])['y_change'].sum().abs().reset_index()

#add to df_player_play_motion
df_player_play_motion = add_column_from_other_df(df_player_play_motion, y_dis_travel_pos, 'y_change', 'y_dis_travel_pos')
df_player_play_motion = add_column_from_other_df(df_player_play_motion, y_dis_travel_neg, 'y_change', 'y_dis_travel_neg')

In [16]:
#find the y distance a player started from the tackles and how far they ended from them

#get the min/max coordinates for the tackles
tracking_tackles = pd.read_csv('tracking_tackles.csv')
tracking_tackles = (
    tracking_tackles
    .groupby(['gameId', 'playId'], as_index=False)
    .agg(y_std_t_min=('y_std', 'min'), y_std_t_max=('y_std', 'max'))
)

#add the coordinates to df_player_play_motion
df_player_play_motion = df_player_play_motion.merge(tracking_tackles, on=['gameId', 'playId'], how='inner')

In [17]:
#get initial y and final y coordinates (not relative to the ball)
initial_y_dis = (
    tracking_simple
    .groupby(['gameId', 'playId', 'nflId'])['y_std']
    .first()
    .reset_index()
)
end_y_dis = (
    tracking_simple
    .groupby(['gameId', 'playId', 'nflId'])['y_std']
    .last()
    .reset_index()
)

df_player_play_motion = add_column_from_other_df(df_player_play_motion, initial_y_dis, 'y_std', 'init_y_std')
df_player_play_motion = add_column_from_other_df(df_player_play_motion, end_y_dis, 'y_std', 'end_y_std')


In [18]:
#initial distance from tackle
t_diff_min = df_player_play_motion['y_std_t_min'] - df_player_play_motion['init_y_std']
t_diff_max = df_player_play_motion['init_y_std'] - df_player_play_motion['y_std_t_max']

df_player_play_motion['init_tackle_dis'] = np.where(
    abs(t_diff_min) < abs(t_diff_max),
    t_diff_min,
    t_diff_max
)

#final distance from tackle
t_diff_min = df_player_play_motion['y_std_t_min'] - df_player_play_motion['end_y_std']
t_diff_max = df_player_play_motion['end_y_std'] - df_player_play_motion['y_std_t_max']

df_player_play_motion['end_tackle_dis'] = np.where(
    abs(t_diff_min) < abs(t_diff_max),
    t_diff_min,
    t_diff_max
)

In [19]:
df_player_play_motion.head()

Unnamed: 0,gameId,playId,nflId,teamAbbr,hadRushAttempt,rushingYards,hadDropback,passingYards,sackYardsAsOffense,hadPassReception,...,x_dis_travel_pos,x_dis_travel_neg,y_dis_travel_pos,y_dis_travel_neg,y_std_t_min,y_std_t_max,init_y_std,end_y_std,init_tackle_dis,end_tackle_dis
0,2022090800,80,47857,BUF,0,0,0,0,0,0,...,2,411,1071,288,8732,9412,8062,8846,670,-114
1,2022090800,101,53079,BUF,0,0,0,0,0,0,...,240,28,539,211,8720,9395,9036,9367,-316,-28
2,2022090800,191,53079,BUF,0,0,0,0,0,0,...,76,6,4,71,9321,9955,9081,9024,240,297
3,2022090800,236,52536,BUF,0,0,0,0,0,1,...,31,12,474,0,9347,9481,7991,8472,1356,875
4,2022090800,299,43399,LA,0,0,0,0,0,0,...,178,47,24,514,2071,2703,2381,1898,-310,173


In [20]:
len(df_player_play_motion)

4484

In [21]:
#if y started off negative, flip y values so it starts off positive (makes same type of motion on left or right side register the same)
mask = df_player_play_motion['init_rel_y_dis'] < 0
df_player_play_motion['y_flipped'] = mask.astype(int)
df_player_play_motion.loc[mask, ['init_rel_y_dis', 'end_rel_y_dis', 'overall_y_change','y_reverse_loc']] *= -1

#also swap 'y_dis_travel_pos' and 'y_dis_travel_neg' for rows where the mask is true
df_player_play_motion.loc[mask, ['y_dis_travel_pos', 'y_dis_travel_neg']] = \
    df_player_play_motion.loc[mask, ['y_dis_travel_neg', 'y_dis_travel_pos']].values

In [22]:
#bool if player swapped direction
ratio = df_player_play_motion['y_dis_travel_pos'] / (df_player_play_motion['y_dis_travel_pos'] + df_player_play_motion['y_dis_travel_neg'])
df_player_play_motion['swapped_y_dir'] = ((ratio >= 0.15) & (ratio <= 0.85)).astype(int)

In [23]:
#column for their position
df_positions = df_player[['nflId','position']]
df_player_play_motion = df_player_play_motion.merge(df_positions, on=['nflId'], how='inner')

In [24]:
#note if player swapped sides of the field on the play
df_player_play_motion['swapped_side'] = (
    ((df_player_play_motion['init_rel_y_dis'] * df_player_play_motion['end_rel_y_dis'] < 0) &
     (df_player_play_motion['overall_y_change'] >= 150)) #ignore small fidgiting as swapping sides
    .astype(int)
)

In [25]:
#create dataframe with only columns used for clustering
df_clustering_cols = df_player_play_motion[[
    'shiftSinceLineset', 'motionSinceLineset', 'snap_motion_gap', 
    'init_rel_y_dis', 'end_rel_y_dis', 'overall_y_change', 
    'init_rel_x_dis', 'end_rel_x_dis', 'overall_x_change', 
    'motion_s_avg', 'frames_in_motion', 'swapped_side',
    'y_dis_travel_pos', 'y_dis_travel_neg','swapped_y_dir',
    'x_dis_travel_pos', 'x_dis_travel_neg',
    'inMotionAtBallSnap','init_tackle_dis','end_tackle_dis'
]]

In [26]:
#export dataframes
df_clustering_cols.to_csv('clustering_columns.csv', index=False)
df_player_play_motion.to_csv('df_player_play_motion.csv',index=False)