In [20]:
import pandas as pd
import numpy as np
import math
import pickle

In [3]:
# Load the per-season tracking CSVs (2018-2020) from the relative data_sets/ folder.
# The three frames are passed together to process_data / process_tracking_data below.
df_track_2020 = pd.read_csv("data_sets/tracking2020.csv")
df_track_2019 = pd.read_csv("data_sets/tracking2019.csv")
df_track_2018 = pd.read_csv("data_sets/tracking2018.csv")

In [4]:
# Load the game-level and play-level tables; they are joined to the tracking
# data on gameId / playId later in the notebook.
df_games = pd.read_csv("data_sets/games.csv")
df_plays = pd.read_csv("data_sets/plays.csv")

In [53]:
def process_play_data(df_plays):
    """Select returned kickoffs that have exactly one returner and a return yardage.

    Parameters
    ----------
    df_plays : pandas.DataFrame
        Play table containing at least ``specialTeamsPlayType``,
        ``specialTeamsResult``, ``gameId``, ``playId``, ``possessionTeam``,
        ``returnerId`` and ``kickReturnYardage``.

    Returns
    -------
    pandas.DataFrame
        New frame (the input is never modified) with the columns ``gameId``,
        ``playId``, ``possessionTeam``, ``returnerId`` (integer) and
        ``kickReturnYardage``.
    """
    is_returned_kickoff = (
        (df_plays["specialTeamsPlayType"] == "Kickoff")
        & (df_plays["specialTeamsResult"] == "Return")
    )
    keep_columns = ["gameId", "playId", "possessionTeam", "returnerId", "kickReturnYardage"]
    # Explicit copy: everything below works on an independent frame, so no
    # SettingWithCopyWarning and no accidental writes into df_plays.
    kick_plays_df = df_plays.loc[is_returned_kickoff, keep_columns].copy()

    # Get rid of kicks with no returner listed (e.g. onside kicks) and of
    # kicks with no return yards listed.
    kick_plays_df = kick_plays_df.dropna(subset=["returnerId", "kickReturnYardage"])

    # Get rid of kicks with multiple returners (ids joined by ';').
    # astype(str) keeps this working when the column was parsed as numeric,
    # in which case the .str accessor would otherwise raise.
    has_multiple_returners = (
        kick_plays_df["returnerId"].astype(str).str.contains(";", regex=False)
    )
    kick_plays_df = kick_plays_df.loc[~has_multiple_returners].copy()

    # to_numeric handles both "12345" strings and 12345.0 floats.
    kick_plays_df["returnerId"] = pd.to_numeric(kick_plays_df["returnerId"]).astype('int')

    return kick_plays_df
    

In [54]:
def process_tracking_data(df_track_2018, df_track_2019, df_track_2020):
    """Build one player-level snapshot per kickoff at the ``kick_received`` event.

    The three season frames are stacked, reduced to rows whose ``event`` is
    ``"kick_received"`` and whose ``team`` is not ``"football"``, passed through
    ``standardize_tracking_data`` and ``add_velocity_vectors_tracking_data``,
    and trimmed to the columns needed downstream.

    Parameters
    ----------
    df_track_2018, df_track_2019, df_track_2020 : pandas.DataFrame
        Raw tracking frames; the inputs are not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``gameId, playId, nflId, team, x, y, v_x, v_y`` with ``nflId``
        cast to integer.
    """
    # ignore_index=True: each season frame carries its own 0..n-1 index, so the
    # stacked frame would otherwise contain duplicate index labels.
    tracking_df = pd.concat(
        [df_track_2018, df_track_2019, df_track_2020], ignore_index=True
    )

    # Filter with a boolean mask rather than drop(<index labels>): dropping by
    # label on a frame with duplicate labels also removes player rows from
    # other seasons that happen to share a label with a football row.
    at_reception = tracking_df["event"] == "kick_received"
    is_player = tracking_df["team"] != "football"  # Get rid of tracking data on football
    # .copy() so the in-place helpers below write to an independent frame.
    tracking_df = tracking_df.loc[at_reception & is_player].copy()

    tracking_df['dir'] = np.mod(90 - tracking_df['dir'], 360)  # Change 0 degrees to be pointing downfield
    standardize_tracking_data(tracking_df)
    add_velocity_vectors_tracking_data(tracking_df)

    tracking_df["nflId"] = tracking_df["nflId"].astype('int')
    tracking_df = tracking_df[['gameId', 'playId', 'nflId', 'team', 'x', 'y', 'v_x', 'v_y']]

    return tracking_df

In [55]:
def process_game_data(df_games):
    """Return only the game id and the home/visitor team abbreviations.

    The result is a new frame holding the ``gameId``, ``homeTeamAbbr`` and
    ``visitorTeamAbbr`` columns of ``df_games``, in that order.
    """
    wanted_columns = ['gameId', 'homeTeamAbbr', 'visitorTeamAbbr']
    return df_games.loc[:, wanted_columns]

In [56]:
def standardize_tracking_data(df_tracking):
    """Mirror, in place, every row whose ``playDirection`` is ``"right"``.

    For those rows ``x`` becomes ``120 - x``, ``y`` becomes ``160/3 - y`` and
    ``dir`` is rotated by 180 degrees (modulo 360). Other rows are untouched.
    The intent is for the returning team to always go from left to right.
    Returns nothing; ``df_tracking`` itself is modified.
    """
    moving_right = df_tracking['playDirection'] == "right"

    mirrored_x = 120 - df_tracking.loc[moving_right, 'x']
    mirrored_y = 160/3 - df_tracking.loc[moving_right, 'y']
    rotated_dir = np.mod(180 + df_tracking.loc[moving_right, 'dir'], 360)

    df_tracking.loc[moving_right, 'x'] = mirrored_x
    df_tracking.loc[moving_right, 'y'] = mirrored_y
    df_tracking.loc[moving_right, 'dir'] = rotated_dir
    
    

In [57]:
def add_velocity_vectors_tracking_data(df_tracking):
    """Add ``v_x`` and ``v_y`` columns to ``df_tracking`` in place.

    ``v_x = s * cos(dir)`` and ``v_y = s * sin(dir)``, where ``dir`` is read as
    an angle in degrees. Returns nothing; ``df_tracking`` itself is modified.

    The conversion is vectorized with NumPy (one radians conversion, reused for
    both components) instead of chaining element-wise ``Series.apply`` calls.
    """
    heading_rad = np.radians(df_tracking["dir"].astype(float))
    df_tracking["v_x"] = df_tracking["s"] * np.cos(heading_rad)
    df_tracking["v_y"] = df_tracking["s"] * np.sin(heading_rad)

In [58]:
def merge_tables(df_games, df_kick_plays, df_tracking):
    """Inner-join games, kickoff plays and tracking rows into one frame.

    Parameters
    ----------
    df_games : pandas.DataFrame
        Game table; joined to the plays on ``gameId``.
    df_kick_plays : pandas.DataFrame
        Play table; must contain ``gameId`` and ``playId``.
    df_tracking : pandas.DataFrame
        Tracking rows; joined to the game/play rows on ``gameId`` and ``playId``.

    Returns
    -------
    pandas.DataFrame
        One row per tracking row whose play and game are present in all three
        inputs.

    Notes
    -----
    The join keys are spelled out. Without ``on=``, pandas joins on every
    column name the two frames share, so an unrelated shared column would
    silently become part of the key and could discard rows.
    """
    game_play_merge = pd.merge(df_games, df_kick_plays, how='inner', on='gameId')
    all_merge = pd.merge(
        game_play_merge, df_tracking, how='inner', on=['gameId', 'playId']
    )
    return all_merge

In [59]:
def add_player_side(play_track_df):
    """Label each tracking row with its team abbreviation and role, in place.

    Adds two columns to ``play_track_df`` and returns nothing:

    * ``team_abbr``   - ``homeTeamAbbr`` where ``team`` is ``"home"``, otherwise
      ``visitorTeamAbbr``.
    * ``player_side`` - ``"returner"`` where ``nflId`` equals ``returnerId``;
      otherwise ``"kicking_team"`` where ``team_abbr`` equals
      ``possessionTeam``; otherwise ``"return_team"``.
    """
    plays_at_home = play_track_df["team"] == "home"
    play_track_df["team_abbr"] = np.where(
        plays_at_home,
        play_track_df["homeTeamAbbr"],
        play_track_df["visitorTeamAbbr"],
    )

    is_returner = (play_track_df["returnerId"] == play_track_df["nflId"]).to_numpy()
    on_kicking_team = (
        play_track_df["team_abbr"] == play_track_df["possessionTeam"]
    ).to_numpy()

    # Conditions are checked in order, so the returner label takes precedence.
    play_track_df["player_side"] = np.select(
        [is_returner, on_kicking_team],
        ["returner", "kicking_team"],
        default="return_team",
    )
    

In [60]:
def process_data(df_track_2018, df_track_2019, df_track_2020, df_games, df_plays):
    """Run the whole preprocessing pipeline and return the player-level frame.

    Plays, games and tracking data are each reduced by their ``process_*``
    helper, joined with ``merge_tables``, labelled with ``add_player_side``,
    and finally restricted to plays that have exactly 22 tracked players.
    """
    plays_slim = process_play_data(df_plays)
    games_slim = process_game_data(df_games)
    tracking_slim = process_tracking_data(df_track_2018, df_track_2019, df_track_2020)

    merged = merge_tables(games_slim, plays_slim, tracking_slim)
    add_player_side(merged)

    def _has_full_roster(play_rows):
        # Get rid of plays where there was not 22 players on field
        return play_rows['nflId'].count() == 22

    return merged.groupby(["gameId", "playId"]).filter(_has_full_roster)
    

### Convert Data Frames to Model Inputs

In [61]:
def create_X_tensor(play_track_df):
    """Build the feature tensor of shape ``(n_plays, 11, 10, 10)``.

    Plays are taken in ``groupby(["gameId", "playId"])`` order. For play ``i``,
    kicking-team player ``j`` and return-team (non-returner) player ``k``, the
    last axis holds, in order:

    ==== =====================================================
    0-1  return player velocity minus kicker velocity (v_x, v_y)
    2-3  return player position minus kicker position (x, y)
    4-5  kicker velocity minus returner velocity (v_x, v_y)
    6-7  kicker position minus returner position (x, y)
    8-9  kicker velocity (v_x, v_y)
    ==== =====================================================

    Parameters
    ----------
    play_track_df : pandas.DataFrame
        Needs ``gameId``, ``playId``, ``player_side``, ``x``, ``y``, ``v_x`` and
        ``v_y``; ``player_side`` takes the values ``"returner"``,
        ``"kicking_team"`` and ``"return_team"``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_plays, 11, 10, 10)``.

    Raises
    ------
    ValueError
        If a play does not have exactly 1 returner, 11 kicking-team players
        and 10 other return-team players.

    Notes
    -----
    Features 0-3 previously subtracted the kicker's ``[x, y, v_x, v_y]`` from
    the return players' ``[v_x, v_y, x, y]``, mixing positions and velocities.
    Both operands now use the same ``[v_x, v_y, x, y]`` column order.
    """
    n_kicking, n_return, n_features = 11, 10, 10
    feature_cols = ['v_x', 'v_y', 'x', 'y']

    grouped_df = play_track_df.groupby(["gameId", "playId"])
    print(len(grouped_df))
    train_x = np.zeros([len(grouped_df), n_kicking, n_return, n_features])

    for i, (name, group) in enumerate(grouped_df):
        # Work on plain arrays so the result does not depend on index labels.
        side = group["player_side"].to_numpy()
        values = group[feature_cols].to_numpy(dtype=float)

        returner = values[side == "returner"]          # expected (1, 4)
        kicking = values[side == "kicking_team"]       # expected (11, 4)
        returning = values[side == "return_team"]      # expected (10, 4)

        if (returner.shape[0] != 1
                or kicking.shape[0] != n_kicking
                or returning.shape[0] != n_return):
            raise ValueError(
                "play %r has %d returner(s), %d kicking-team and %d return-team "
                "players; expected 1, %d and %d"
                % (name, returner.shape[0], kicking.shape[0], returning.shape[0],
                   n_kicking, n_return)
            )

        # (1, 10, 4) - (11, 1, 4) broadcasts to every kicker/return-player pair.
        train_x[i, :, :, :4] = returning[np.newaxis, :, :] - kicking[:, np.newaxis, :]
        # Kicker-relative-to-returner features are constant along the return axis.
        train_x[i, :, :, 4:8] = (kicking - returner)[:, np.newaxis, :]
        # Kicker's own velocity (v_x, v_y).
        train_x[i, :, :, 8:] = kicking[:, np.newaxis, :2]

    return train_x
    
    
    

In [62]:
def create_y_train(play_track_df, min_idx_y=71, max_idx_y=150, yard_offset=99):
    """Build the per-play label frame from the player-level frame.

    Parameters
    ----------
    play_track_df : pandas.DataFrame
        Needs ``gameId``, ``playId`` and ``kickReturnYardage``.
    min_idx_y, max_idx_y : int, optional
        Inclusive bounds used to clip ``YardIndex``. They should match the
        bounds later passed to ``create_y_tensor``. Defaults: 71 and 150.
    yard_offset : int, optional
        Constant added to the yardage to obtain ``YardIndex`` (default 99).
        # NOTE(review): presumably chosen so -99 yards maps to index 0 - confirm.

    Returns
    -------
    pandas.DataFrame
        One row per ``(gameId, playId)`` with the columns ``gameId``,
        ``playId``, ``kickReturnYardage`` (group mean), ``YardIndex`` and
        ``YardIndexClipped``.

    Also prints the min/max of both index columns as a sanity check.
    """
    train_y = (
        play_track_df.groupby(["gameId", "playId"])["kickReturnYardage"]
        .mean()
        .reset_index()
    )

    train_y['YardIndex'] = train_y["kickReturnYardage"] + yard_offset
    train_y['YardIndexClipped'] = train_y['YardIndex'].clip(
        lower=min_idx_y, upper=max_idx_y
    )

    print('max yardIndex: ', train_y.YardIndex.max())
    print('max yardIndexClipped: ', train_y.YardIndexClipped.max())
    print('min yardIndex: ', train_y.YardIndex.min())
    print('min yardIndexClipped: ', train_y.YardIndexClipped.min())

    return train_y
    

In [69]:
def create_y_tensor(train_y, min_idx_y, max_idx_y):
    """One-hot encode ``train_y["YardIndexClipped"]``.

    Parameters
    ----------
    train_y : pandas.DataFrame
        Must contain a numeric ``YardIndexClipped`` column.
    min_idx_y, max_idx_y : int
        Inclusive range of yard indices; class ``c`` corresponds to the yard
        index ``min_idx_y + c``.

    Returns
    -------
    numpy.ndarray
        ``int32`` array of shape ``(len(train_y), max_idx_y - min_idx_y + 1)``
        with a single 1 per row.

    Raises
    ------
    IndexError
        If any value is non-finite or falls outside ``[min_idx_y, max_idx_y]``
        after truncation to an integer. A value below ``min_idx_y`` would
        otherwise produce a negative index that wraps around and marks the
        wrong class without any error.
    """
    num_classes_y = max_idx_y - min_idx_y + 1
    y_vals = np.asarray(train_y["YardIndexClipped"].values, dtype=float)

    # Non-finite values are mapped just below the valid range so the single
    # bounds check below rejects them too.
    finite = np.isfinite(y_vals)
    class_idx = np.where(finite, y_vals, min_idx_y - 1).astype(np.int64) - min_idx_y

    out_of_range = (class_idx < 0) | (class_idx >= num_classes_y)
    if out_of_range.any():
        first_bad = int(np.flatnonzero(out_of_range)[0])
        raise IndexError(
            "YardIndexClipped value %r at position %d is outside [%d, %d]"
            % (y_vals[first_bad], first_bad, min_idx_y, max_idx_y)
        )

    y_tensor = np.zeros((len(y_vals), num_classes_y), np.int32)
    y_tensor[np.arange(len(y_vals)), class_idx] = 1

    return y_tensor

In [71]:
# NOTE(review): `train_y` comes from the `train_y = create_y_train(play_track_df)` cell,
# which sits further down in this notebook; on a fresh kernel that cell must run first.
# 71 and 150 must equal the min_idx_y / max_idx_y clipping bounds used by create_y_train.
y_tensor = create_y_tensor(train_y, 71, 150)

In [73]:
# Sanity check: (number of plays, max_idx_y - min_idx_y + 1 classes).
y_tensor.shape

(2764, 80)

In [64]:
# Run the full preprocessing pipeline: one row per tracked player per kept kickoff play.
play_track_df = process_data(df_track_2018, df_track_2019, df_track_2020, df_games, df_plays)

In [65]:
# Build the model input tensor; create_X_tensor also prints the number of plays.
X_tensor = create_X_tensor(play_track_df)

2764


In [66]:
# Sanity check: (plays, 11 kicking-team players, 10 return-team players, 10 features).
X_tensor.shape

(2764, 11, 10, 10)

In [67]:
# Per-play label frame (return yardage plus raw and clipped yard index);
# create_y_train prints the min/max of both index columns.
train_y = create_y_train(play_track_df)

max yardIndex:  203.0
max yardIndexClipped:  150.0
min yardIndex:  85.0
min yardIndexClipped:  85.0


In [None]:
# Eyeball the first 25 player rows of the merged frame.
play_track_df.head(25)

In [19]:
# Persist the label DataFrame (including gameId / playId) next to the notebook.
train_y.to_pickle("train_y_df.data")

In [74]:
# Serialize the feature and label arrays with pickle for later use.
# NOTE(review): pickle files should only ever be loaded from trusted sources.
with open('X_tensor.data', 'wb') as f:
    pickle.dump(X_tensor, f)
with open('y_tensor.data', 'wb') as f:
    pickle.dump(y_tensor, f)