# Import Libraries

In [69]:
import pandas as pd
import numpy as np
import os
import pickle
import math
# data: https://www.kaggle.com/c/nfl-big-data-bowl-2021/data

# Functions

In [70]:
def file_paths(path='./nfl-big-data-bowl-2021/weeks/'):
    """Return the sorted paths of the raw weekly NFL data files found in `path`.

    Entries whose names begin with '.' are ignored. Each returned path is
    written relative to the current working directory with a leading './'.
    """
    visible = [name for name in os.listdir(path) if not name.startswith('.')]
    if not visible:
        return []

    base = '.' + os.sep + os.path.relpath(path) + os.sep
    return sorted(base + name for name in visible)

In [71]:
def add_team(x):
    """Look up the team abbreviation for one tracking row. Level 1

    Intended for row-wise apply: reads the row's 'gameId' and 'team' and
    indexes the module-level home_away dict, whose entry for a game holds the
    home team at position 0 and the away team at position 1. Any 'team' value
    other than 'home' or 'away' yields None.
    """
    teams = home_away[x['gameId']]
    side = x['team']
    if side == 'home':
        return teams[0]
    return teams[1] if side == 'away' else None

In [72]:
def add_possession(x):
    """Get the possession for a given play via apply from possession data dict. Level 1

    Reads 'gameId' and 'playId' from the row and looks them up in the
    module-level off_def dict (gameId -> playId -> possession value).

    Returns:
        The stored possession value, or None when either the game or the
        play is missing from off_def.
    """
    # Fall back to an empty mapping so an unknown gameId yields None instead
    # of an AttributeError from calling .get() on None.
    plays = off_def.get(x['gameId']) or {}
    return plays.get(x['playId'])

In [73]:
def ball_loc(x):
    """Fetch the stored ball values for one tracking row. Level 1

    Intended for row-wise apply: indexes the module-level ball_specs dict by
    the row's 'gameId' and 'playId', then looks up its 'frameId'. Frames with
    no stored entry fall back to a tuple of five zeros.
    """
    game, play, frame = x['gameId'], x['playId'], x['frameId']
    per_frame = ball_specs[game][play]
    return per_frame.get(frame, (0, 0, 0, 0, 0))

In [74]:
def invert_x_play_right(x):
    """Mirror the row's x coordinate (120 - x) when playDirection is 'right'. Level 1

    Rows with any other playDirection keep their x value unchanged.
    """
    return 120 - x['x'] if x['playDirection'] == 'right' else x['x']

def invert_y_play_right(x):
    """Mirror the row's y coordinate (53.3 - y) when playDirection is 'right'. Level 1

    Rows with any other playDirection keep their y value unchanged.
    """
    return 53.3 - x['y'] if x['playDirection'] == 'right' else x['y']

def invert_o_play_right(x):
    """Replace the row's 'o' value with 360 - o when playDirection is 'right'. Level 1

    Rows with any other playDirection keep their 'o' value unchanged.
    """
    # NOTE(review): x and y are both flipped for 'right' plays (a 180 degree
    # rotation), which would normally pair with (o + 180) % 360 rather than
    # 360 - o -- confirm which convention the downstream model expects.
    return 360 - x['o'] if x['playDirection'] == 'right' else x['o']

def invert_dir_play_right(x):
    """Replace the row's 'dir' value with 360 - dir when playDirection is 'right'. Level 1

    Rows with any other playDirection keep their 'dir' value unchanged.
    """
    # NOTE(review): x and y are both flipped for 'right' plays (a 180 degree
    # rotation), which would normally pair with (dir + 180) % 360 rather than
    # 360 - dir -- confirm which convention the downstream model expects.
    return 360 - x['dir'] if x['playDirection'] == 'right' else x['dir']

def invert_ball_x_play_right(x):
    """Mirror the row's ball_x coordinate (120 - ball_x) when playDirection is 'right'. Level 1

    Rows with any other playDirection keep their ball_x value unchanged.
    """
    return 120 - x['ball_x'] if x['playDirection'] == 'right' else x['ball_x']

def invert_ball_y_play_right(x):
    """Mirror the row's ball_y coordinate (53.3 - ball_y) when playDirection is 'right'. Level 1

    Rows with any other playDirection keep their ball_y value unchanged.
    """
    return 53.3 - x['ball_y'] if x['playDirection'] == 'right' else x['ball_y']

In [75]:
def get_dist(x1, y1, x2, y2):
    """Return the Euclidean distance between the points (x1, y1) and (x2, y2)."""
    dist = math.sqrt((x2 - x1)**2 + (y2 - y1)**2)
    return dist

# Element-wise version of get_dist for array inputs. otypes is set explicitly
# so that empty inputs return an empty float array instead of raising, and so
# numpy does not need a trial call on the first element to infer the type.
calc_distances = np.vectorize(get_dist, otypes=[float])

In [76]:
def dis_to_ball(x):
    """Return the distance between the row's player and the ball. Level 2

    Uses the row's 'x'/'y' as the player position and 'ball_x'/'ball_y' as the
    ball position, and delegates the arithmetic to get_dist.
    """
    player_xy = (x['x'], x['y'])
    ball_xy = (x['ball_x'], x['ball_y'])
    return get_dist(*player_xy, *ball_xy)

# Data Generators

In [77]:
def gen_week_data(start=1, stop=17, level=0):
    """Yield one DataFrame per week for weeks `start` through `stop` inclusive.

    Args:
        start: first week number to load.
        stop: last week number to load (inclusive).
        level: 0 reads the raw weekly files listed by file_paths(); any other
            value reads './data/level_{level}/l{level}_week_{week}.csv'
            (zip-compressed).

    Yields:
        The week's data. For level 0 a 'week' column holding the week number
        is added, and weeks with no matching raw file are skipped.
    """
    if level == 0:
        # List the raw directory once, and only when the raw files are needed.
        paths = file_paths()
        for week in range(start, stop + 1):
            target = f'week{week}.csv'
            # first listed path containing the week's file name, if any
            match = next((p for p in paths if target in p), None)
            if match is None:
                continue

            data = pd.read_csv(match)
            data['week'] = week
            yield data
    else:
        for week in range(start, stop + 1):
            path = f'./data/level_{level}/l{level}_week_{week}.csv'
            data = pd.read_csv(path, compression='zip')
            yield data

# Data Storage

Ball locations

In [78]:
# Ball lookup consumed by ball_loc (indexed gameId -> playId -> frameId).
# NOTE: pickle runs arbitrary code on load; only read stores this project wrote.
path = './data/stores/ball_specs.pkl'
with open(path, mode='rb') as f:
    ball_specs = pickle.load(f)

Game location dictionary

In [79]:
# Team lookup consumed by add_team (gameId -> entry with home at 0, away at 1).
path = './data/stores/home_away.pkl'
with open(path, mode='rb') as f:
    home_away = pickle.load(f)

Possession dictionary

In [80]:
# Possession lookup consumed by add_possession (gameId -> playId -> value).
path = './data/stores/possession.pkl'
with open(path, mode='rb') as f:
    off_def = pickle.load(f)

Targeted on play dictionary

In [81]:
# Targeted-on-play store; its layout is not exercised in this notebook section.
path = './data/stores/targeted.pkl'
with open(path, mode='rb') as f:
    targeted = pickle.load(f)

In [82]:
# Pass outcome lookup, read below as pass_complete[gameId][playId]['pass'].
path = './data/stores/pass_complete.pkl'
with open(path, mode='rb') as f:
    pass_complete = pickle.load(f)

# Data subset

Isolating only the rows necessary for making predictions on the web portal. No predictions can be made before the ball is snapped or after the pass is thrown. The goal is to predict every frame for every eligible receiver on the field between these two points in time.

In [83]:
def snap_to_pass(play_data):
    """Return the rows of one play from the ball snap through the pass outcome.

    Args:
        play_data: tracking rows of a single play; must have 'event' and
            'frameId' columns. The caller's frame is not modified.

    Returns:
        Rows whose frameId lies between the earliest 'ball_snap' frame and the
        earliest frame whose event starts with 'pass_outcome' (both inclusive),
        re-indexed from 0. An empty frame with the same columns is returned
        when the play has no 'pass_forward', no 'ball_snap', or no
        'pass_outcome*' event.
    """
    # Work on a re-indexed copy instead of resetting the caller's index in place.
    play_data = play_data.reset_index(drop=True)

    events = play_data['event']
    is_snap = events == 'ball_snap'
    # na=False keeps the mask strictly boolean when an event value is missing
    is_outcome = events.str.startswith('pass_outcome', na=False)

    has_window = (events == 'pass_forward').any() and is_snap.any() and is_outcome.any()
    if not has_window:
        return pd.DataFrame(data=[], columns=play_data.columns)

    # NOTE(review): the window ends at the pass outcome, not at 'pass_forward';
    # confirm that frames with the ball in flight are meant to be kept.
    start = play_data.loc[is_snap, 'frameId'].min()
    stop = play_data.loc[is_outcome, 'frameId'].min()

    in_window = play_data['frameId'].between(start, stop)
    return play_data[in_window].reset_index(drop=True)

Subset the data

In [84]:
# Keep, for every week, only the snap-to-pass frames of plays that contain a
# forward pass and no tipped pass, and write them to one file per week.
weeks = gen_week_data(level=2)

for week in weeks:
    w = week['week'].iloc[0]
    clips = []  # one frame per qualifying play; concatenated once per week

    # groupby(sort=False) walks games and plays in order of first appearance
    # (the same order as iterating .unique()) without rescanning the whole
    # week for every play.
    for game, game_rows in week.groupby('gameId', sort=False):
        for play_id, play_rows in game_rows.groupby('playId', sort=False):
            events = play_rows['event'].unique()

            if 'pass_forward' in events and 'pass_tipped' not in events:
                clip = snap_to_pass(play_rows)
                if not clip.empty:
                    clips.append(clip)

    if clips:
        passes = pd.concat(clips, ignore_index=True)
    else:
        # keep the header so the file can still be read back with read_csv
        passes = pd.DataFrame(columns=week.columns)

    passes.to_csv(f'./data/subset/web/week_{w}_passes.csv', compression='zip', index=False)
    print(f'Week {w} complete')

Week 1 complete
Week 2 complete
Week 3 complete
Week 4 complete
Week 5 complete
Week 6 complete
Week 7 complete
Week 8 complete
Week 9 complete
Week 10 complete
Week 11 complete
Week 12 complete
Week 13 complete
Week 14 complete
Week 15 complete
Week 16 complete
Week 17 complete


Processing all 17 weeks will take an exceptionally long time. I will spin up a few Google Colab notebooks and process five weeks at a time in parallel. Splitting the 17 iterations of the following code block across notebooks will significantly reduce the processing time. I will then save the processed .csv files to my local hard drive for the final processing steps.

In [87]:
def _nearest_opponent_stats(players, opponents):
    """Pair each row of `players` with its closest row of `opponents`.

    Returns one row per player holding the player's gameId, playId, frameId
    and displayName followed by the closest opponent's s, o, a, dis and dir
    and the distance to that opponent, all prefixed with 'opp_'.
    Both inputs must be non-empty.
    """
    id_cols = ['gameId', 'playId', 'frameId', 'displayName']
    stat_cols = ['s', 'o', 'a', 'dis', 'dir']

    # Pairwise distance matrix: one row per player, one column per opponent.
    dx = opponents['x'].to_numpy(dtype=float)[np.newaxis, :] - players['x'].to_numpy(dtype=float)[:, np.newaxis]
    dy = opponents['y'].to_numpy(dtype=float)[np.newaxis, :] - players['y'].to_numpy(dtype=float)[:, np.newaxis]
    dist = np.sqrt(dx ** 2 + dy ** 2)
    nearest = dist.argmin(axis=1)  # first minimum wins on ties

    stats = opponents[stat_cols].iloc[nearest].astype(float).reset_index(drop=True)
    stats['dist'] = dist[np.arange(len(nearest)), nearest]

    ids = players[id_cols].reset_index(drop=True)
    return pd.concat([ids, stats.add_prefix('opp_')], axis=1)


# For every frame of every pass play, attach to each player the motion stats
# of (and distance to) the closest player on the other side, plus an all-zero
# placeholder row for the football, and write one file per week.
for w in range(1, 18):
    week = pd.read_csv(f'./data/subset/web/week_{w}_passes.csv', compression='zip')
    i = week['week'].iloc[0]
    print(f'Starting week {i}')

    games = week['gameId'].unique()
    per_frame = []  # one small frame per processed frame; concatenated once per week

    # Nested groupby(sort=False) visits games, plays and frames in order of
    # first appearance without filtering the whole week for every frame.
    for g, (game, game_rows) in enumerate(week.groupby('gameId', sort=False), start=1):
        for play, play_rows in game_rows.groupby('playId', sort=False):
            for frame, df in play_rows.groupby('frameId', sort=False):
                on_field = df[df['displayName'] != 'Football']
                p_0 = on_field[on_field['possession'] == 0]  # the defense in this notebook's naming
                p_1 = on_field[on_field['possession'] == 1]  # the offense in this notebook's naming

                # a frame missing either side is skipped entirely
                if p_0.empty or p_1.empty:
                    continue

                def_oppo_stats = _nearest_opponent_stats(p_0, p_1)
                off_oppo_stats = _nearest_opponent_stats(p_1, p_0)

                # merge all
                football = pd.DataFrame([[game, play, frame, 'Football', 0, 0, 0, 0, 0, 0]], columns=off_oppo_stats.columns)
                per_frame.append(pd.concat([def_oppo_stats, off_oppo_stats, football], ignore_index=True))
        print(f'Week {w} - Game {g}/{len(games)} completed')

    if per_frame:
        output = pd.concat(per_frame, ignore_index=True)
    else:
        # keep the header so the file can still be read back with read_csv
        output = pd.DataFrame(columns=['gameId', 'playId', 'frameId', 'displayName',
                                       'opp_s', 'opp_o', 'opp_a', 'opp_dis', 'opp_dir', 'opp_dist'])

    output.to_csv(f'./data/subset/web/processed_week_{w}_passes.csv', index=False, compression='zip')


Starting week 14
Week 14 - Game 1/16 completed
Week 14 - Game 2/16 completed
Week 14 - Game 3/16 completed
Week 14 - Game 4/16 completed
Week 14 - Game 5/16 completed
Week 14 - Game 6/16 completed
Week 14 - Game 7/16 completed
Week 14 - Game 8/16 completed
Week 14 - Game 9/16 completed
Week 14 - Game 10/16 completed
Week 14 - Game 11/16 completed
Week 14 - Game 12/16 completed
Week 14 - Game 13/16 completed
Week 14 - Game 14/16 completed
Week 14 - Game 15/16 completed
Week 14 - Game 16/16 completed
Starting week 15
Week 15 - Game 1/16 completed
Week 15 - Game 2/16 completed
Week 15 - Game 3/16 completed
Week 15 - Game 4/16 completed
Week 15 - Game 5/16 completed
Week 15 - Game 6/16 completed
Week 15 - Game 7/16 completed
Week 15 - Game 8/16 completed
Week 15 - Game 9/16 completed
Week 15 - Game 10/16 completed
Week 15 - Game 11/16 completed
Week 15 - Game 12/16 completed
Week 15 - Game 13/16 completed
Week 15 - Game 14/16 completed
Week 15 - Game 15/16 completed
Week 15 - Game 16/16 co

Join the nearest opponent locations with the original row. The data will be ready for modeling after dealing with missing values.

In [42]:
# For each week: flag every tracking row with whether its play's pass was
# completed, then join the nearest-opponent stats onto the tracking rows.
for n in range(1, 18):

    week = pd.read_csv(f'./data/subset/web/week_{n}_passes.csv', compression='zip')
    week.reset_index(drop=True, inplace=True)
    week['pass_complete'] = 0

    games = week['gameId'].unique()
    for game in games:
        in_game = week['gameId'] == game
        plays = week.loc[in_game, 'playId'].unique()

        for play in plays:
            # collapse the stored outcome to a 0/1 flag
            completion = 1 if pass_complete[game][play]['pass'] else 0
            in_play = in_game & (week['playId'] == play)
            week.loc[in_play, 'pass_complete'] = completion

    opponents = pd.read_csv(f'./data/subset/web/processed_week_{n}_passes.csv', compression='zip')

    join_keys = ['gameId', 'playId', 'frameId', 'displayName']
    output = pd.merge(left=opponents, right=week, left_on=join_keys, right_on=join_keys, how='inner')

    output.to_csv(f'./data/subset/web/week_{n}.csv', index=False, compression='zip')
    w = week['week'].iloc[0]
    print(f'Week {w} complete')

Week 1 complete
Week 2 complete
Week 3 complete
Week 4 complete
Week 5 complete
Week 6 complete
Week 7 complete
Week 8 complete
Week 9 complete
Week 10 complete
Week 11 complete
Week 12 complete
Week 13 complete
Week 14 complete
Week 15 complete
Week 16 complete
Week 17 complete


A larger subset of data is now cleaned and ready to be predicted by the model. These predictions will be for the catch percentage. The catch percentage will then be multiplied by the estimated expected points added (EPA). With estimated EPA * catch percentage, a prediction for the "optimal" receiver can be made.

For example,

Let's say two receivers are both 10 yards down the field, both with an estimated expected points added (EPA) of +1.0 points. Receiver A has an estimated catch percentage of 65%. Receiver B has an estimated catch percentage of 35%. The adjusted estimated EPA for Receiver A will be +0.65 points because he has a 65% chance of increasing the expected points of the given drive by 1 point. Receiver B will have an adjusted estimated EPA of +0.35 points. Therefore, Receiver A will be the "optimal receiver" for the given play.