In [1]:
import math
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw player-tracking CSVs, one file per season (2018-2020).
# These frames are later passed to process_first_contact_data.
df_track_2020 = pd.read_csv("data_sets/tracking2020.csv")
df_track_2019 = pd.read_csv("data_sets/tracking2019.csv")
df_track_2018 = pd.read_csv("data_sets/tracking2018.csv")

In [3]:
# Load the game-level and play-level tables.
# NOTE(review): neither frame is referenced by any other cell visible in this
# notebook - confirm whether these loads are still needed.
df_games = pd.read_csv("data_sets/games.csv")
df_plays = pd.read_csv("data_sets/plays.csv")

In [8]:
# Load the per-play tracking frames for kickoffs and punts. Unlike the raw data,
# these are read from the working directory rather than data_sets/.
# NOTE(review): presumably produced by an earlier preprocessing step - TODO confirm.
kick_play_track_df = pd.read_csv("play_track_kickoff.csv")
punt_play_track_df = pd.read_csv("play_track_punt.csv")

In [53]:
def process_contact_tracking_data(df_track_2018, df_track_2019, df_track_2020):
    '''
    Extract each player's standardized x position at the moment of first contact.

    Only rows whose ``event`` equals "first_contact" are kept and rows tracking
    the football (``team == "football"``) are discarded. Directions are rotated,
    coordinates are standardized with standardize_tracking_data, and velocity
    columns are added with add_velocity_vectors_tracking_data before the result
    is reduced to the columns listed below.

    Parameters
    ----------
    df_track_2018, df_track_2019, df_track_2020 : pandas.DataFrame
        Per-season tracking frames. Each must provide the columns ``event``,
        ``team``, ``dir``, ``s``, ``x``, ``y``, ``playDirection``, ``gameId``,
        ``playId`` and ``nflId``.

    Returns
    -------
    pandas.DataFrame
        Columns ``gameId``, ``playId``, ``nflId`` (cast to int) and
        ``contact_x`` (the standardized ``x`` at first contact), with a fresh
        unique integer index.
    '''
    # Filter every season with boolean masks BEFORE concatenating. The previous
    # implementation concatenated first and then called
    # drop(<football rows>.index): because the seasons share index labels,
    # that also removed player rows from other seasons that happened to carry
    # the same label. Filtering first also keeps the concatenated frame small.
    contact_frames = []
    for season_df in (df_track_2018, df_track_2019, df_track_2020):
        is_first_contact = season_df["event"] == "first_contact"
        is_player = season_df["team"] != "football"
        contact_frames.append(season_df[is_first_contact & is_player])

    # concat builds a new frame (safe to mutate); ignore_index gives unique labels.
    tracking_df = pd.concat(contact_frames, ignore_index=True)

    tracking_df['dir'] = np.mod(90 - tracking_df['dir'], 360)  # Change 0 degrees to be pointing downfield
    standardize_tracking_data(tracking_df)
    # NOTE(review): dir / v_x / v_y are not part of the returned columns; only
    # the standardized x position survives the column selection below.
    add_velocity_vectors_tracking_data(tracking_df)

    tracking_df["nflId"] = tracking_df["nflId"].astype('int')
    tracking_df["contact_x"] = tracking_df["x"]

    return tracking_df[['gameId', 'playId', 'nflId', 'contact_x']]

In [19]:
def standardize_tracking_data(df_tracking):
    '''
    Mirror every row whose ``playDirection`` is "right" so that all plays share
    a single orientation.

    For those rows ``x`` becomes ``120 - x``, ``y`` becomes ``160/3 - y`` and
    ``dir`` is rotated by 180 degrees (modulo 360). Rows with any other
    ``playDirection`` are left untouched.

    The frame is modified in place; nothing is returned.
    '''
    # The mask depends only on playDirection, which is never modified here.
    heading_right = df_tracking['playDirection'] == "right"

    mirrored_x = 120 - df_tracking.loc[heading_right, 'x']
    df_tracking.loc[heading_right, 'x'] = mirrored_x

    mirrored_y = 160/3 - df_tracking.loc[heading_right, 'y']
    df_tracking.loc[heading_right, 'y'] = mirrored_y

    rotated_dir = np.mod(180 + df_tracking.loc[heading_right, 'dir'], 360)
    df_tracking.loc[heading_right, 'dir'] = rotated_dir

In [20]:
def add_velocity_vectors_tracking_data(df_tracking):
    '''
    Add ``v_x`` and ``v_y`` columns to ``df_tracking`` in place.

    ``dir`` is interpreted as an angle in degrees and ``s`` as the magnitude:
    ``v_x = s * cos(dir)`` and ``v_y = s * sin(dir)``. Nothing is returned.
    '''
    # Convert the heading to radians once and reuse it for both components.
    heading_rad = df_tracking["dir"].apply(math.radians)
    speed = df_tracking["s"]

    df_tracking["v_x"] = speed * heading_rad.apply(math.cos)
    df_tracking["v_y"] = speed * heading_rad.apply(math.sin)

In [90]:
def process_first_contact_data(catch_df, df_track_2018, df_track_2019, df_track_2020):
    '''
    Attach first-contact positions to the per-play tracking rows in ``catch_df``.

    The first-contact frame from process_contact_tracking_data is inner-merged
    with ``catch_df`` (no ``on`` is given, so pandas joins on the columns the
    two frames have in common). A ``yards_to_contact`` column is added as
    ``contact_x - x``, and only plays with exactly 22 non-null ``nflId`` rows
    are kept.

    Returns the filtered, merged DataFrame.
    '''
    def has_full_roster(play_rows):
        # A play is usable only when all 22 players have a row.
        return play_rows['nflId'].count() == 22

    contact_positions = process_contact_tracking_data(df_track_2018, df_track_2019, df_track_2020)
    with_contact = pd.merge(catch_df, contact_positions, how='inner')

    with_contact['yards_to_contact'] = with_contact['contact_x'] - with_contact['x']

    plays = with_contact.groupby(["gameId", "playId"])
    return plays.filter(has_full_roster)
    
    

In [79]:
def create_X_tensor(play_track_df):
    '''
    Creates X input tensor for model.

    Plays are taken in the order produced by ``groupby(["gameId", "playId"])``.
    Axis 1 indexes kicking-team players, axis 2 indexes return-team players
    (returner excluded), and axis 3 holds ten features:

        0-1  return-team player velocity minus kicking player velocity (v_x, v_y)
        2-3  return-team player position minus kicking player position (x, y)
        4-5  kicking player velocity minus returner velocity (v_x, v_y)
        6-7  kicking player position minus returner position (x, y)
        8-9  kicking player velocity (v_x, v_y)

    Parameters
    ----------
    play_track_df : pandas.DataFrame
        Needs columns ``gameId``, ``playId``, ``player_side`` ("returner",
        "kicking_team" or "return_team"), ``x``, ``y``, ``v_x`` and ``v_y``.

    Returns
    -------
    numpy.ndarray
        Array of shape (num_plays, 11, 10, 10).

    Raises
    ------
    ValueError
        If a play does not contain exactly one returner row, or its return-team
        rows cannot be broadcast into the 10 available slots.
    IndexError
        If a play contains more than 11 kicking-team rows.
    '''
    # A single column order is used for every player so that velocities are
    # only ever subtracted from velocities and positions from positions. The
    # previous code subtracted [x, y, v_x, v_y] from [v_x, v_y, x, y].
    feature_cols = ['v_x', 'v_y', 'x', 'y']

    grouped_df = play_track_df.groupby(["gameId", "playId"])
    print(len(grouped_df))
    train_x = np.zeros([len(grouped_df), 11, 10, 10])

    for i, (_, group) in enumerate(grouped_df):
        side = group["player_side"]
        # Tuple-unpacking enforces "exactly one returner" (ValueError otherwise).
        (returner,) = group.loc[side == "returner", feature_cols].to_numpy(dtype=float)
        kickers = group.loc[side == "kicking_team", feature_cols].to_numpy(dtype=float)
        blockers = group.loc[side == "return_team", feature_cols].to_numpy(dtype=float)

        for j, kicker in enumerate(kickers):
            # Return-team players relative to this kicking-team player.
            train_x[i, j, :, :4] = blockers - kicker
            # This kicking-team player relative to the returner.
            train_x[i, j, :, 4:8] = kicker - returner
            # This kicking-team player's own velocity.
            train_x[i, j, :, 8:] = kicker[:2]

    return train_x

In [80]:
def create_y_train_contact(play_track_df, min_idx_y=71, max_idx_y=150):
    '''
    Create dataframe of y data for the model, one row per returner.

    ``YardIndex`` is ``yards_to_contact + 99``, an offset chosen so the index is
    not negative for the yardages expected here. ``YardIndexClipped`` is
    ``YardIndex`` limited to the closed range [min_idx_y, max_idx_y].

    Parameters
    ----------
    play_track_df : pandas.DataFrame
        Needs columns ``player_side``, ``gameId``, ``playId`` and
        ``yards_to_contact``. It is not modified.
    min_idx_y : int, default 71
        Lower clip bound; with the +99 offset this corresponds to -28 yards.
    max_idx_y : int, default 150
        Upper clip bound; with the +99 offset this corresponds to +51 yards.

    Returns
    -------
    pandas.DataFrame
        Columns ``gameId``, ``playId``, ``yards_to_contact``, ``YardIndex`` and
        ``YardIndexClipped``. The min/max of both index columns are printed as
        a sanity check.
    '''
    is_returner = play_track_df['player_side'] == 'returner'
    # .copy() so the new columns are written to an independent frame rather
    # than to a slice of the caller's data (avoids SettingWithCopyWarning).
    train_y = play_track_df.loc[is_returner, ["gameId", "playId", "yards_to_contact"]].copy()

    train_y['YardIndex'] = train_y["yards_to_contact"] + 99
    train_y['YardIndexClipped'] = train_y['YardIndex'].clip(lower=min_idx_y, upper=max_idx_y)

    print('max yardIndex: ', train_y.YardIndex.max())
    print('max yardIndexClipped: ', train_y.YardIndexClipped.max())
    print('min yardIndex: ', train_y.YardIndex.min())
    print('min yardIndexClipped: ', train_y.YardIndexClipped.min())

    return train_y

In [81]:
def create_y_tensor(train_y, min_idx_y, max_idx_y):
    '''
    One-hot encode the ``YardIndexClipped`` column of ``train_y``.

    Each value is cast to int32 (fractional parts are truncated) and shifted by
    ``min_idx_y`` to obtain its class column.

    Returns an int32 numpy array of shape (len(train_y), num_classes) where
    num_classes = max_idx_y - min_idx_y + 1, with a single 1 per row.
    '''
    class_count = max_idx_y - min_idx_y + 1
    clipped = train_y["YardIndexClipped"].values

    one_hot = np.zeros((len(clipped), class_count), np.int32)
    class_columns = clipped.astype(np.int32) - min_idx_y
    # Set one cell per row: (row k, class_columns[k]).
    one_hot[np.arange(len(clipped)), class_columns] = 1

    return one_hot

In [95]:
def get_input_data(play_track_df, output_tag, test_size=0.1, save_tensors=True, random_state=None):
    '''
    Build the model inputs/targets for one play type and split them into
    train and test sets.

    Parameters
    ----------
    play_track_df : pandas.DataFrame
        Per-player rows as expected by create_X_tensor and
        create_y_train_contact.
    output_tag : str
        Inserted into the pickle file names when ``save_tensors`` is True.
    test_size : float, default 0.1
        Forwarded to sklearn's ``train_test_split``.
    save_tensors : bool, default True
        When True, pickle the four arrays under ``input_tensors/``.
    random_state : int or None, default None
        Forwarded to ``train_test_split``; pass an int for a reproducible
        split. None keeps the previous (unseeded) behaviour.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, X_test, y_train, y_test)``.
    '''
    # Clip bounds for the one-hot yard classes (yards_to_contact + 99).
    min_idx_y, max_idx_y = 71, 150

    X_tensor = create_X_tensor(play_track_df)
    print("X Shape", X_tensor.shape)

    # The notebook defines create_y_train_contact; the previous call to
    # create_y_train raised NameError on a fresh kernel.
    train_y = create_y_train_contact(play_track_df)
    # create_X_tensor walks plays in sorted (gameId, playId) groupby order, so
    # put the targets in the same order to keep X[i] and y[i] on the same play.
    train_y = train_y.sort_values(["gameId", "playId"])
    y_tensor = create_y_tensor(train_y, min_idx_y, max_idx_y)
    print("y Shape", y_tensor.shape)

    X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor = train_test_split(
        X_tensor, y_tensor, test_size=test_size, random_state=random_state)

    print("Train X", X_train_tensor.shape)
    print("Test X", X_test_tensor.shape)
    print("Train y", y_train_tensor.shape)
    print("Test y", y_test_tensor.shape)

    if save_tensors:
        # open(..., 'wb') does not create missing directories.
        os.makedirs('input_tensors', exist_ok=True)
        with open(f'input_tensors/X_tensor_{output_tag}_train.data', 'wb') as f:
            pickle.dump(X_train_tensor, f)
        with open(f'input_tensors/X_tensor_{output_tag}_test.data', 'wb') as f:
            pickle.dump(X_test_tensor, f)
        with open(f'input_tensors/y_tensor_{output_tag}_train.data', 'wb') as f:
            pickle.dump(y_train_tensor, f)
        with open(f'input_tensors/y_tensor_{output_tag}_test.data', 'wb') as f:
            pickle.dump(y_test_tensor, f)

    return X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor

In [100]:
def get_input_data_combo(kick_play_track_df, punt_play_track_df, output_tag, test_size=0.1):
    '''
    Build one combined kickoff + punt dataset.

    Each play type is split into train/test separately through get_input_data
    (without saving), then the matching pieces are concatenated with kickoff
    rows first. Only the combined training set is shuffled, using numpy's
    global random state. All four combined arrays are pickled under
    ``input_tensors/`` with ``output_tag`` in the file name.

    Returns ``(X_train, X_test, y_train, y_test)`` for the combined data.
    '''
    kick_splits = get_input_data(kick_play_track_df, "kick", test_size=test_size, save_tensors=False)
    punt_splits = get_input_data(punt_play_track_df, "punt", test_size=test_size, save_tensors=False)

    # Both tuples are ordered (X_train, X_test, y_train, y_test); pair them up
    # position by position and stack kickoff rows ahead of punt rows.
    X_train_comb_tensor, X_test_comb_tensor, y_train_comb_tensor, y_test_comb_tensor = [
        np.concatenate(pair) for pair in zip(kick_splits, punt_splits)
    ]

    print("X_train_combo", X_train_comb_tensor.shape)
    print("X_test_combo", X_test_comb_tensor.shape)
    print("y_train_combo", y_train_comb_tensor.shape)
    print("y_test_combo", y_test_comb_tensor.shape)

    # Apply one shared permutation so X and y rows stay aligned.
    row_order = np.arange(len(X_train_comb_tensor))
    np.random.shuffle(row_order)
    X_train_comb_tensor = X_train_comb_tensor[row_order]
    y_train_comb_tensor = y_train_comb_tensor[row_order]

    print("X_train_combo Shuffled", X_train_comb_tensor.shape)
    print("y_train_combo Shuffled", y_train_comb_tensor.shape)

    outputs = (
        (f'input_tensors/X_tensor_comb_{output_tag}_train.data', X_train_comb_tensor),
        (f'input_tensors/X_tensor_comb_{output_tag}_test.data', X_test_comb_tensor),
        (f'input_tensors/y_tensor_comb_{output_tag}_train.data', y_train_comb_tensor),
        (f'input_tensors/y_tensor_comb_{output_tag}_test.data', y_test_comb_tensor),
    )
    for path, tensor in outputs:
        with open(path, 'wb') as f:
            pickle.dump(tensor, f)

    return X_train_comb_tensor, X_test_comb_tensor, y_train_comb_tensor, y_test_comb_tensor
    
    
    
    
    
    
    

In [97]:
# Attach first-contact positions to the kickoff plays; plays without exactly
# 22 player rows after the merge are dropped by process_first_contact_data.
kick_contact_df = process_first_contact_data(kick_play_track_df, df_track_2018, df_track_2019, df_track_2020)

In [92]:
# Preview the merged kickoff rows, including the new contact_x and
# yards_to_contact columns.
kick_contact_df.head(30)

Unnamed: 0,gameId,homeTeamAbbr,visitorTeamAbbr,playId,possessionTeam,returnerId,kickReturnYardage,nflId,team,x,y,v_x,v_y,team_abbr,player_side,contact_x,yards_to_contact
0,2018090600,PHI,ATL,677,ATL,44979,30.0,37267,away,53.07,37.21,-6.592228,1.66809,ATL,kicking_team,42.51,-10.56
1,2018090600,PHI,ATL,677,ATL,44979,30.0,38707,away,44.11,24.92,-9.304336,3.798924,ATL,kicking_team,28.36,-15.75
2,2018090600,PHI,ATL,677,ATL,44979,30.0,40191,away,43.9,21.75,-8.566566,3.099363,ATL,kicking_team,29.46,-14.44
3,2018090600,PHI,ATL,677,ATL,44979,30.0,40408,home,39.07,41.27,-0.822248,2.456015,PHI,return_team,37.84,-1.23
4,2018090600,PHI,ATL,677,ATL,44979,30.0,42450,away,45.07,49.79,-9.179993,0.011215,ATL,kicking_team,28.81,-16.26


In [93]:
# Same first-contact processing as for kickoffs, applied to the punt plays.
punt_contact_df = process_first_contact_data(punt_play_track_df, df_track_2018, df_track_2019, df_track_2020)

In [102]:
# Sanity check: (rows, columns) of the punt frame; rows = 22 players x plays.
punt_contact_df.shape

(40018, 17)

In [98]:
# Build the kickoff tensors, hold out 10% for testing and pickle all four
# arrays under input_tensors/ with the "contact_kick" tag.
X_train_kick_tensor, X_test_kick_tensor, y_train_kick_tensor, y_test_kick_tensor = get_input_data(kick_contact_df, "contact_kick", test_size=0.1, save_tensors=True)

2333
X Shape (2333, 11, 10, 10)
max yardIndex:  180.01
max yardIndexClipped:  150.0
min yardIndex:  96.07
min yardIndexClipped:  96.07
y Shape (2333, 80)
Train X (2099, 11, 10, 10)
Test X (234, 11, 10, 10)
Train y (2099, 80)
Test y (234, 80)


In [None]:
# Punt counterpart of the kickoff cell: build, split 90/10 and pickle with the
# "contact_punt" tag.
# NOTE(review): this cell has no execution count in the saved notebook, so it
# does not appear to have been run in that session.
X_train_punt_tensor, X_test_punt_tensor, y_train_punt_tensor, y_test_punt_tensor = get_input_data(punt_contact_df, "contact_punt", test_size=0.1, save_tensors=True)

In [101]:
# Build the combined kickoff + punt dataset; each play type is split 90/10
# separately, then the pieces are concatenated and pickled with the "contact" tag.
X_train_comb_tensor, X_test_comb_tensor, y_train_comb_tensor, y_test_comb_tensor = get_input_data_combo(kick_contact_df, punt_contact_df, "contact", test_size=0.1)

2333
X Shape (2333, 11, 10, 10)
max yardIndex:  180.01
max yardIndexClipped:  150.0
min yardIndex:  96.07
min yardIndexClipped:  96.07
y Shape (2333, 80)
Train X (2099, 11, 10, 10)
Test X (234, 11, 10, 10)
Train y (2099, 80)
Test y (234, 80)
1819
X Shape (1819, 11, 10, 10)
max yardIndex:  165.19
max yardIndexClipped:  150.0
min yardIndex:  91.12
min yardIndexClipped:  91.12
y Shape (1819, 80)
Train X (1637, 11, 10, 10)
Test X (182, 11, 10, 10)
Train y (1637, 80)
Test y (182, 80)
X_train_combo (3736, 11, 10, 10)
X_test_combo (416, 11, 10, 10)
y_train_combo (3736, 80)
y_test_combo (416, 80)
X_train_combo Shuffled (3736, 11, 10, 10)
y_train_combo Shuffled (3736, 80)
