In [2]:
import pandas as pd
import numpy as np
import math
import pickle
from sklearn.model_selection import train_test_split

In [3]:
df_track_2020 = pd.read_csv("data_sets/tracking2020.csv")
df_track_2019 = pd.read_csv("data_sets/tracking2019.csv")
df_track_2018 = pd.read_csv("data_sets/tracking2018.csv")

In [4]:
df_games = pd.read_csv("data_sets/games.csv")
df_plays = pd.read_csv("data_sets/plays.csv")

In [5]:
def process_play_data(df_plays, kickoff):
    '''
    Get the necessary columns from the plays dataframe and select only the rows that are for kickoffs where there
    was actually a return on the play
    '''
    if kickoff:       
        kick_plays_df = df_plays[(df_plays["specialTeamsPlayType"] == "Kickoff") & (df_plays["specialTeamsResult"] == "Return")]
    else:
        kick_plays_df = df_plays[(df_plays["specialTeamsPlayType"] == "Punt") & (df_plays["specialTeamsResult"] == "Return")]
    kick_plays_df = kick_plays_df[["gameId", "playId", "possessionTeam", "returnerId", "kickReturnYardage"]]  
    # Get rid of onside kicks
    kick_plays_df.dropna(axis=0, how='any', subset=['returnerId'], inplace=True)  
    # Get rid of kicks with multiple returners
    kick_plays_df.drop(kick_plays_df[kick_plays_df['returnerId'].str.contains(';')].index, inplace = True)  
    kick_plays_df["returnerId"] = kick_plays_df["returnerId"].astype('int')
    # Get rid of kicks with no return yards listed (likely because of fumble)
    kick_plays_df.dropna(axis=0, how='any', subset=['kickReturnYardage'], inplace=True) 
    
    return kick_plays_df
    

In [47]:
def process_tracking_data(df_track_2018, df_track_2019, df_track_2020, kickoff):
    '''
    Get the necessary data from the player tracking dataframes, standardize directions,
    and add additional features (velocity)
    '''
    tracking_df = pd.concat([df_track_2018, df_track_2019, df_track_2020])
    if kickoff:
        tracking_df = tracking_df[tracking_df["event"] == "kick_received"]
    else:
        tracking_df = tracking_df[tracking_df["event"] == "punt_received"]
    # Get rid of tracking data on football
    tracking_df.drop(tracking_df[tracking_df['team'] == "football"].index, inplace = True) 
    
    tracking_df['dir'] = np.mod(90 - tracking_df['dir'], 360)  # Change 0 degrees to be pointing downfield
    standardize_tracking_data(tracking_df)
    add_velocity_vectors_tracking_data(tracking_df)
    
    tracking_df["nflId"] = tracking_df["nflId"].astype('int')
    tracking_df = tracking_df[['gameId','playId','nflId','team', 'x', 'y', 'v_x', 'v_y']]
        
    return tracking_df

In [7]:
def standardize_tracking_data(df_tracking):
    '''
    Standardize the positions and directions of the play so that the returning team is always
    going from left to right
    '''
    df_tracking.loc[df_tracking['playDirection'] == "right", 'x'] = 120-df_tracking.loc[df_tracking['playDirection'] == "right", 'x']
    df_tracking.loc[df_tracking['playDirection'] == "right", 'y'] = 160/3-df_tracking.loc[df_tracking['playDirection'] == "right", 'y']
    df_tracking.loc[df_tracking['playDirection'] == "right", 'dir'] = np.mod(180 + df_tracking.loc[df_tracking['playDirection'] == "right", 'dir'], 360)

In [8]:
def add_velocity_vectors_tracking_data(df_tracking):
    '''
    Use speed and direction to get the player's velocity in the x, y direction and add these as columns to df
    '''
    df_tracking["v_x"] = df_tracking["s"] * df_tracking["dir"].apply(math.radians).apply(math.cos)
    df_tracking["v_y"] = df_tracking["s"] * df_tracking["dir"].apply(math.radians).apply(math.sin)

In [9]:
def process_game_data(df_games):
    '''
    Select the necessary columns from the games df
    '''
    df_games_slim = df_games[['gameId', 'homeTeamAbbr', 'visitorTeamAbbr']]
    return df_games_slim

In [10]:
def merge_tables(df_games, df_kick_plays, df_tracking):
    '''
    Merged the processed games, plays, and tracking data frames together
    '''
    game_play_merge = pd.merge(df_games, df_kick_plays, how='inner')
    all_merge = pd.merge(game_play_merge, df_tracking, how='inner')
    return all_merge

In [11]:
def add_player_side(play_track_df):
    '''
    Add column saying whether player is on kicking_team, returning_team, or is the returner
    '''
    play_track_df["team_abbr"] = np.where(play_track_df["team"] == "home", play_track_df["homeTeamAbbr"], play_track_df["visitorTeamAbbr"])
    play_track_df["player_side"] = np.where(play_track_df["team_abbr"] == play_track_df["possessionTeam"], "kicking_team", "return_team")
    play_track_df["player_side"] = np.where(play_track_df["returnerId"] == play_track_df["nflId"], "returner", play_track_df["player_side"])
    

In [48]:
def process_data(df_track_2018, df_track_2019, df_track_2020, df_games, df_plays, kickoff=True):
    '''
    Process all data to create a single dataframe with all necessary information for the model
    '''
    kick_plays_df = process_play_data(df_plays, kickoff)
    df_games_slim = process_game_data(df_games)
    tracking_df = process_tracking_data(df_track_2018, df_track_2019, df_track_2020, kickoff)    
    play_track_df = merge_tables(df_games_slim, kick_plays_df, tracking_df)
    add_player_side(play_track_df)
    
    # Get rid of plays where there was not 22 players on field (Sillie billlies)
    grouped_df = play_track_df.groupby(["gameId", "playId"]) 
    play_track_df = grouped_df.filter(lambda x: x['nflId'].count() == 22)
    
    return play_track_df
    

In [49]:
play_track_df = process_data(df_track_2018, df_track_2019, df_track_2020, df_games, df_plays)

In [50]:
punt_play_track_df = process_data(df_track_2018, df_track_2019, df_track_2020, df_games, df_plays, kickoff= False)

In [73]:
play_track_df.to_csv("play_track_kickoff.csv", index=False)
punt_play_track_df.to_csv("play_track_punt.csv", index=False)

## Convert Data Frames to Model Inputs

In [55]:
def create_X_tensor(play_track_df):
    '''
    Creates X input tensor for model. 
    Returns numpy array of shape (num_plays, 11, 10, 10)
    '''
    grouped_df = play_track_df.groupby(["gameId", "playId"])
    print(len(grouped_df))
    train_x = np.zeros([len(grouped_df),11,10,10])
    
    i = 0
    for name, group in grouped_df:
        [[returner_x, returner_y, returner_Vx, returner_Vy]] = group.loc[group.player_side=="returner",['x', 'y','v_x','v_y']].values

        kick_team_ids = group[group.player_side == "kicking_team"].index
        return_team_ids = group[group.player_side == "return_team"].index

        for j, kick_team_id in enumerate(kick_team_ids):
            [kick_team_x, kick_team_y, kick_team_Vx, kick_team_Vy] = group.loc[kick_team_id,['x', 'y','v_x','v_y']].values

            [kick_team_returner_x, kick_team_returner_y] = group.loc[kick_team_id,['x', 'y']].values - np.array([returner_x, returner_y])
            [kick_team_returner_Vx, kick_team_returner_Vy] = group.loc[kick_team_id,['v_x', 'v_y']].values - np.array([returner_Vx, returner_Vy])

            train_x[i,j,:,:4] = group.loc[return_team_ids,['v_x','v_y','x', 'y']].values - np.array([kick_team_x, kick_team_y, kick_team_Vx, kick_team_Vy])
            train_x[i,j,:,-6:] = [kick_team_returner_Vx, kick_team_returner_Vy, kick_team_returner_x, kick_team_returner_y, kick_team_Vx, kick_team_Vy]
        i += 1
    
    return train_x
    
    
    

In [56]:
def create_y_train(play_track_df):
    '''
    Create dataframe of y data for the model. 
    Adds 99 yards so that we can never have negative yards on play and also 
    "clips" yard gained so it can never be above or below certain values
    '''
    # These were set by original model, maybe I want to change these
    min_idx_y = 71    # Min yards gained = 28
    max_idx_y = 150   # Max yards gained = 51
    
    train_y = play_track_df.groupby(["gameId", "playId"])["kickReturnYardage"].mean()
    train_y = train_y.to_frame()
    train_y.reset_index(level=["gameId", "playId"], inplace=True)
    
    train_y['YardIndex'] = train_y["kickReturnYardage"].apply(lambda val: val + 99)
    train_y['YardIndexClipped'] = train_y['YardIndex'].apply(
        lambda val: min_idx_y if val < min_idx_y else max_idx_y if val > max_idx_y else val)
    
    print('max yardIndex: ', train_y.YardIndex.max())
    print('max yardIndexClipped: ', train_y.YardIndexClipped.max())
    print('min yardIndex: ', train_y.YardIndex.min())
    print('min yardIndexClipped: ', train_y.YardIndexClipped.min())
    
    return train_y
    

In [57]:
def create_y_tensor(train_y, min_idx_y, max_idx_y):
    '''
    For each play, create a one-hot encoded vector for yards gained
    Returns numpy array of shape (num_plays, num_y_classes) where num_y_classes
    is the number of different yards that can be predicted
    '''
    num_classes_y = max_idx_y - min_idx_y + 1
    y_vals = train_y["YardIndexClipped"].values
    y_tensor = np.zeros((len(y_vals), num_classes_y), np.int32)
    for i, yards in enumerate(y_vals):
        y_tensor[(i, yards.astype(np.int32) - min_idx_y)] = 1
    
    return y_tensor

In [76]:
def get_input_data(play_track_df, output_tag, test_size=0.1, save_tensors=True):
    X_tensor = create_X_tensor(play_track_df)
    print("X Shape", X_tensor.shape)
    
    train_y = create_y_train(play_track_df)
    y_tensor = create_y_tensor(train_y, 71, 150)
    print("y Shape", y_tensor.shape)
    
    X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor = train_test_split(X_tensor, y_tensor, test_size=test_size)
    
    print("Train X", X_train_tensor.shape)
    print("Test X", X_test_tensor.shape)
    print("Train y", y_train_tensor.shape)
    print("Test y", y_test_tensor.shape)
    
    if save_tensors:
        with open(f'input_tensors/X_tensor_{output_tag}_train.data', 'wb') as f:
            pickle.dump(X_train_tensor, f)
        with open(f'input_tensors/X_tensor_{output_tag}_test.data', 'wb') as f:
            pickle.dump(X_test_tensor, f)
        with open(f'input_tensors/y_tensor_{output_tag}_train.data', 'wb') as f:
            pickle.dump(y_train_tensor, f)
        with open(f'input_tensors/y_tensor_{output_tag}_test.data', 'wb') as f:
            pickle.dump(y_test_tensor, f)

    
    return X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor

In [77]:
def get_input_data_combo(kick_play_track_df, punt_play_track_df, output_tag, test_size=0.1):
    X_train_kick_tensor, X_test_kick_tensor, y_train_kick_tensor, y_test_kick_tensor = get_input_data(kick_play_track_df, "kick", test_size=test_size, save_tensors=False)
    X_train_punt_tensor, X_test_punt_tensor, y_train_punt_tensor, y_test_punt_tensor = get_input_data(punt_play_track_df, "punt", test_size=test_size, save_tensors=False)
    
    X_train_comb_tensor = np.concatenate((X_train_kick_tensor, X_train_punt_tensor))
    X_test_comb_tensor = np.concatenate((X_test_kick_tensor, X_test_punt_tensor))
    y_train_comb_tensor = np.concatenate((y_train_kick_tensor, y_train_punt_tensor))
    y_test_comb_tensor = np.concatenate((y_test_kick_tensor, y_test_punt_tensor))
    
    print("X_train_combo", X_train_comb_tensor.shape)
    print("X_test_combo", X_test_comb_tensor.shape)
    print("y_train_combo", y_train_comb_tensor.shape)
    print("y_test_combo", y_test_comb_tensor.shape)
    
    shuffle = np.arange(len(X_train_comb_tensor))
    np.random.shuffle(shuffle)
    X_train_comb_tensor = X_train_comb_tensor[shuffle]
    y_train_comb_tensor = y_train_comb_tensor[shuffle]
    
    print("X_train_combo Shuffled", X_train_comb_tensor.shape)
    print("y_train_combo Shuffled", y_train_comb_tensor.shape)
    
    with open(f'input_tensors/X_tensor_comb_{output_tag}_train.data', 'wb') as f:
        pickle.dump(X_train_comb_tensor, f)
    with open(f'input_tensors/X_tensor_comb_{output_tag}_test.data', 'wb') as f:
        pickle.dump(X_test_comb_tensor, f)
    with open(f'input_tensors/y_tensor_comb_{output_tag}_train.data', 'wb') as f:
        pickle.dump(y_train_comb_tensor, f)
    with open(f'input_tensors/y_tensor_comb_{output_tag}_test.data', 'wb') as f:
        pickle.dump(y_test_comb_tensor, f)
    
    
    return X_train_comb_tensor, X_test_comb_tensor, y_train_comb_tensor, y_test_comb_tensor
    

### Inputs For Kickoff Only

In [63]:
X_tensor = create_X_tensor(play_track_df)
X_tensor.shape

2764


(2764, 11, 10, 10)

In [18]:
train_y = create_y_train(play_track_df)
y_tensor = create_y_tensor(train_y, 71, 150)
y_tensor.shape

max yardIndex:  203.0
max yardIndexClipped:  150.0
min yardIndex:  85.0
min yardIndexClipped:  85.0


(2764, 80)

In [19]:
with open('X_tensor.data', 'wb') as f:
    pickle.dump(X_tensor, f)
with open('y_tensor.data', 'wb') as f:
    pickle.dump(y_tensor, f)

In [41]:
X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor = train_test_split(X_tensor, y_tensor, test_size=0.1)

In [42]:
print(X_train_tensor.shape)
print(X_test_tensor.shape)
print(y_train_tensor.shape)
print(y_test_tensor.shape)

(2487, 11, 10, 10)
(277, 11, 10, 10)
(2487, 80)
(277, 80)


In [43]:
with open('X_tensor_train.data', 'wb') as f:
    pickle.dump(X_train_tensor, f)
with open('X_tensor_test.data', 'wb') as f:
    pickle.dump(X_test_tensor, f)
with open('y_tensor_train.data', 'wb') as f:
    pickle.dump(y_train_tensor, f)
with open('y_tensor_test.data', 'wb') as f:
    pickle.dump(y_test_tensor, f)

## Inputs for Punts Only

In [58]:
X_punt_tensor = create_X_tensor(punt_play_track_df)
X_punt_tensor.shape

2259


(2259, 11, 10, 10)

In [59]:
train_punt_y = create_y_train(punt_play_track_df)
y_punt_tensor = create_y_tensor(train_punt_y, 71, 150)
y_punt_tensor.shape

max yardIndex:  198.0
max yardIndexClipped:  150.0
min yardIndex:  86.0
min yardIndexClipped:  86.0


(2259, 80)

In [60]:
X_train_punt_tensor, X_test_punt_tensor, y_train_punt_tensor, y_test_punt_tensor = train_test_split(X_punt_tensor, y_punt_tensor, test_size=0.1)

In [61]:
print(X_train_punt_tensor.shape)
print(X_test_punt_tensor.shape)
print(y_train_punt_tensor.shape)
print(y_test_punt_tensor.shape)

(2033, 11, 10, 10)
(226, 11, 10, 10)
(2033, 80)
(226, 80)


In [62]:
with open('input_tensors/X_tensor_punt_train.data', 'wb') as f:
    pickle.dump(X_train_punt_tensor, f)
with open('input_tensors/X_tensor_punt_test.data', 'wb') as f:
    pickle.dump(X_test_punt_tensor, f)
with open('input_tensors/y_tensor_punt_train.data', 'wb') as f:
    pickle.dump(y_train_punt_tensor, f)
with open('input_tensors/y_tensor_punt_test.data', 'wb') as f:
    pickle.dump(y_test_punt_tensor, f)

## Inputs for Combined Punt Kick

In [64]:
X_tensor = create_X_tensor(play_track_df)
print(X_tensor.shape)
train_y = create_y_train(play_track_df)
y_tensor = create_y_tensor(train_y, 71, 150)
print(y_tensor.shape)
X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor = train_test_split(X_tensor, y_tensor, test_size=0.1)

2764
(2764, 11, 10, 10)
max yardIndex:  203.0
max yardIndexClipped:  150.0
min yardIndex:  85.0
min yardIndexClipped:  85.0
(2764, 80)


In [65]:
print(X_train_tensor.shape)
print(X_test_tensor.shape)
print(y_train_tensor.shape)
print(y_test_tensor.shape)

(2487, 11, 10, 10)
(277, 11, 10, 10)
(2487, 80)
(277, 80)


In [66]:
X_punt_tensor = create_X_tensor(punt_play_track_df)
print(X_punt_tensor.shape)
train_punt_y = create_y_train(punt_play_track_df)
y_punt_tensor = create_y_tensor(train_punt_y, 71, 150)
print(y_punt_tensor.shape)
X_train_punt_tensor, X_test_punt_tensor, y_train_punt_tensor, y_test_punt_tensor = train_test_split(X_punt_tensor, y_punt_tensor, test_size=0.1)

2259
(2259, 11, 10, 10)
max yardIndex:  198.0
max yardIndexClipped:  150.0
min yardIndex:  86.0
min yardIndexClipped:  86.0
(2259, 80)


In [70]:
print(X_train_punt_tensor.shape)
print(X_test_punt_tensor.shape)
print(y_train_punt_tensor.shape)
print(y_test_punt_tensor.shape)

(2033, 11, 10, 10)
(226, 11, 10, 10)
(2033, 80)
(226, 80)


In [68]:
X_train_comb_tensor = np.concatenate((X_train_tensor, X_train_punt_tensor))
X_test_comb_tensor = np.concatenate((X_test_tensor, X_test_punt_tensor))
y_train_comb_tensor = np.concatenate((y_train_tensor, y_train_punt_tensor))
y_test_comb_tensor = np.concatenate((y_test_tensor, y_test_punt_tensor))

In [69]:
print(X_train_comb_tensor.shape)
print(X_test_comb_tensor.shape)
print(y_train_comb_tensor.shape)
print(y_test_comb_tensor.shape)

(4520, 11, 10, 10)
(503, 11, 10, 10)
(4520, 80)
(503, 80)


In [71]:
with open('input_tensors/X_tensor_comb_train.data', 'wb') as f:
    pickle.dump(X_train_comb_tensor, f)
with open('input_tensors/X_tensor_comb_test.data', 'wb') as f:
    pickle.dump(X_test_comb_tensor, f)
with open('input_tensors/y_tensor_comb_train.data', 'wb') as f:
    pickle.dump(y_train_comb_tensor, f)
with open('input_tensors/y_tensor_comb_test.data', 'wb') as f:
    pickle.dump(y_test_comb_tensor, f)

In [74]:
len(X_train_comb_tensor)

4520

In [None]:
np.arange()

In [78]:
X_train_comb_tensor, X_test_comb_tensor, y_train_comb_tensor, y_test_comb_tensor = get_input_data_combo(play_track_df, punt_play_track_df, "", test_size=0.1)

2764
X Shape (2764, 11, 10, 10)
max yardIndex:  203.0
max yardIndexClipped:  150.0
min yardIndex:  85.0
min yardIndexClipped:  85.0
y Shape (2764, 80)
Train X (2487, 11, 10, 10)
Test X (277, 11, 10, 10)
Train y (2487, 80)
Test y (277, 80)
2259
X Shape (2259, 11, 10, 10)
max yardIndex:  198.0
max yardIndexClipped:  150.0
min yardIndex:  86.0
min yardIndexClipped:  86.0
y Shape (2259, 80)
Train X (2033, 11, 10, 10)
Test X (226, 11, 10, 10)
Train y (2033, 80)
Test y (226, 80)
X_train_combo (4520, 11, 10, 10)
X_test_combo (503, 11, 10, 10)
y_train_combo (4520, 80)
y_test_combo (503, 80)
X_train_combo Shuffled (4520, 11, 10, 10)
y_train_combo Shuffled (4520, 80)
