# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import math
# data: https://www.kaggle.com/c/nfl-big-data-bowl-2021/data

## Helper Functions

Original Data's file path

In [2]:
def file_paths(path='./nfl-big-data-bowl-2021/weeks/'):
    """get file paths of all raw nfl game data split by week"""
    
    return sorted(['.' + os.sep + os.path.relpath(path) + os.sep + p for p in os.listdir(path) if not p.startswith('.')])

Find team name abbreviation from home_away data store

In [3]:
def add_team(x):
    """Get the home/away team abbrev for a given play via apply from home_away data dict. Level 1"""
    
    g_id = x['gameId']
    t_id = home_away[g_id]  # 0 for home, 1 for away
    if x['team'] == 'home':
        return t_id[0]
    if x['team'] == 'away':
        return t_id[1]

Differentiate between offense and defense. Requires off_def data store

In [4]:
def add_possession(x):
    """Get the possession for a given play via apply from possession data dict. Level 1"""

    possession = off_def.get(x['gameId'])
    possession = possession.get(x['playId'])
    return possession

Apply ball specs to all player rows. Requires ball_specs data store

In [5]:
def ball_loc(x):
    """Get the ball specs for a given play via apply from ball_specs data dict. Level 1"""
    
    gameId = x['gameId']
    playId = x['playId']
    frameId = x['frameId']
    vals = ball_specs[gameId][playId].get(frameId, (0, 0, 0, 0, 0))
    return vals

Hemogenize play direction for modeling and consistency of graphing.

In [6]:
def invert_x_play_right(x):
    """Invert x coord of plays with direction = 'right'. Level 1"""

    if x['playDirection'] == 'right':
        return 120 - x['x']
    else:
        return x['x']

def invert_y_play_right(x):
    """Invert y coord of plays with direction = 'right'. Level 1"""

    if x['playDirection'] == 'right':
        return 53.3 - x['y']
    else:
        return x['y']

def invert_o_play_right(x):
    """Invert o metric of plays with direction = 'right'. Level 1"""

    if x['playDirection'] == 'right':
        return 360 - x['o']
    else:
        return x['o']

def invert_dir_play_right(x):
    """Invert dir metric of plays with direction = 'right'. Level 1"""

    if x['playDirection'] == 'right':
        return 360 - x['dir']
    else:
        return x['dir']

def invert_ball_x_play_right(x):
    """Invert x coord of ball for plays with direction = 'right'. Level 1"""

    if x['playDirection'] == 'right':
        return 120 - x['ball_x']
    else:
        return x['ball_x']

def invert_ball_y_play_right(x):
    """Invert y coord of ball for plays with direction = 'right'. Level 1"""
    
    if x['playDirection'] == 'right':
        return 53.3 - x['ball_y']
    else:
        return x['ball_y']

Linear distance calculation

In [7]:
def get_dist(x1, y1, x2, y2):
    """Returns linear distance between 2 pairs of x, y coordinates."""
    dist = math.sqrt((x2 - x1)**2 + (y2 - y1)**2)
    return dist

# Vectorized version of the get_dist function. Used when calculating nearest opponent's distance
calc_distances = np.vectorize(get_dist)

Makes a new column containing the distance of the ball

In [8]:
def dis_to_ball(x):
    """Calculate given player's distance to the ball. Level 1"""

    # player
    x1 = x['x']
    y1 = x['y']

    # ball
    x2 = x['ball_x']
    y2 = x['ball_y']

    dist = get_dist(x1, y1, x2, y2)
    return dist

Add targeted receiver to all rows. Used for modeling and metric calculations down stream.

In [9]:
def apply_target(x):
    """Build targeted column from targeted data dictonary. Level 2"""
    
    x.reset_index(drop=True, inplace=True)
    games = x['gameId'].unique()
    for i, game in enumerate(games):
        plays = x[x['gameId']==game]['playId'].unique()                                                           # iterate thru games
        for play in plays:                                                # iterate thru plays
            x.loc[(x['gameId'] == game) & (x['playId'] == play), 'target_loc'] = targeted[game][play]        # add targeted receiver to all rows of each play
    return x

## Data Generator

Week data

In [10]:
def gen_week_data(start=1, stop=17, level=0):
    paths = file_paths()                                                                                            # paths of all .csv files
    weeks = list(range(start, (stop+1)))                                                                            # convert week range to list

    if level == 0:
        for week in weeks:                                                                                          # iterate over lists
            if any([(f'week{str(week)}.csv') in p for p in paths]):                                                 # check if file path exists for week

                data = pd.read_csv(file_paths()[np.array([(f'week{str(week)}.csv') in p for p in paths]).argmax()]) # get index location from list containing all file paths of specified week
                data['week'] = week                                                                                 # assign week column to output 
                yield data                                                                                          # return data for given week
    else:
        for week in weeks:
            path=f'./data/level_{level}/l{level}_week_{week}.csv'
            data = pd.read_csv(path, compression='zip')
            yield data

## Data Imports

In [11]:
games = pd.read_csv('./nfl-big-data-bowl-2021/games.csv')
games.head(2)

Unnamed: 0,gameId,gameDate,gameTimeEastern,homeTeamAbbr,visitorTeamAbbr
0,2018090600,9/6/18,20:20:00,PHI,ATL
1,2018090901,9/9/18,13:00:00,CLE,PIT


In [12]:
plays = pd.read_csv('./nfl-big-data-bowl-2021/plays.csv')
plays.head(2)

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,possessionTeam,playType,yardlineSide,yardlineNumber,...,preSnapHomeScore,gameClock,absoluteYardlineNumber,penaltyCodes,penaltyJerseyNumbers,passResult,offensePlayResult,playResult,epa,isDefensivePI
0,2018090600,75,(15:00) M.Ryan pass short right to J.Jones pus...,1,1,15,ATL,play_type_pass,ATL,20,...,0.0,15:00:00,90.0,,,C,10,10,0.261827,False
1,2018090600,146,(13:10) M.Ryan pass incomplete short right to ...,1,1,10,ATL,play_type_pass,PHI,39,...,0.0,13:10:00,49.0,,,I,0,0,-0.37236,False


## Data Storage

Level 1 Metrics (Non-conditional). All can be taken from original data.

Ball location dictionary. Very slow. 15 mil rows.

In [13]:
weeks = gen_week_data()
loc_cols = ['x', 'y', 's', 'a', 'dis']
_ = {}
w=0
for week in weeks:
    w += 1
    ball_specs = week[week['displayName']=='Football'].groupby(by=['gameId', 'playId', 'frameId'], as_index=False)[loc_cols].mean()
            # filter by game, play, and frame. 
            # Then select location specs for ball for all frames in all plays in all games in all weeks generator
    for g in ball_specs['gameId'].unique():
        _[g] = {}
        for p in ball_specs[ball_specs['gameId']==g]['playId'].unique():
            _[g][p] = {}
            for f in ball_specs[(ball_specs['gameId']==g) & (ball_specs['playId']==p)]['frameId'].unique():
                __ = np.array(ball_specs[(ball_specs['gameId']==g) & (ball_specs['playId']==p) & (ball_specs['frameId']==f)][loc_cols]).reshape(-1)
                _[g][p][f] = __

    print(f'{w} weeks completed') 
ball_specs = _      # rename for clarity

# pickle it
path = './data/stores/ball_specs.pkl'
with open(path, 'wb') as f:
    pickle.dump(ball_specs, f)

1 weeks completed
2 weeks completed
3 weeks completed
4 weeks completed
5 weeks completed
6 weeks completed
7 weeks completed
8 weeks completed
9 weeks completed
10 weeks completed
11 weeks completed
12 weeks completed
13 weeks completed
14 weeks completed
15 weeks completed
16 weeks completed
17 weeks completed


In [14]:
path = './data/stores/ball_specs.pkl'
with open(path, 'rb') as f:
    ball_specs = pickle.load(f)

Game location dictionary. Home/Away

In [15]:
home_away = dict(zip(
    games['gameId'], zip(games['homeTeamAbbr'], games['visitorTeamAbbr'])   # dict of gameIds (keys) with home/away abbrev as values
    ))

# pickle it
path = './data/stores/home_away.pkl'
with open(path, 'wb') as f:
    pickle.dump(home_away, f)

In [16]:
path = './data/stores/home_away.pkl'
with open(path, 'rb') as f:
    home_away = pickle.load(f)

Possession play dictionary. Offense/Defense

In [17]:
off_def = {}                                                                        # nested dict. key = gameId, key2 = playId, value = possession
for k, v, t in zip(plays['gameId'], plays['playId'], plays['possessionTeam']):
    if k in off_def.keys():                                                         # if gameId exists in off_def dict, add key/value pair of playId & possession
        off_def[k][v] = t
    else:                                                                           # if gameId is not in off_def dict, create new dict containing k/v of playId & possession
        off_def[k] = {v:t}

# pickle it
path = './data/stores/possession.pkl'
with open(path, 'wb') as f:
    pickle.dump(off_def, f)

In [18]:
path = './data/stores/possession.pkl'
with open(path, 'rb') as f:
    off_def = pickle.load(f)

# Process Data

## Level 1: Basics

In [19]:
weeks = gen_week_data()
for week in weeks:
    w = week['week'].iloc[0]
    print(f'Processing week {w}')

    # ball specs
    week['ball_x'] = week.apply(ball_loc, axis=1).map(lambda x: x[0])   # x
    week['ball_y'] = week.apply(ball_loc, axis=1).map(lambda x: x[1])   # y
    week['ball_s'] = week.apply(ball_loc, axis=1).map(lambda x: x[2])   # s
    week['ball_a'] = week.apply(ball_loc, axis=1).map(lambda x: x[3])   # a
    week['ball_dis'] = week.apply(ball_loc, axis=1).map(lambda x: x[4]) # dis
    print(f'Week {w} ball location extracted')

    # team details
    week['team_name'] = week.apply(add_team, axis=1)                    # team abbrev
    week['possession'] = week.apply(add_possession, axis=1)             # offense/defense
    week['possession'] = np.where(week['team_name'] == week['possession'], 1, 0)    # binarize
    print(f'Week {w} team metrics extracted')

    # play direction
    week['x'] = week.apply(invert_x_play_right, axis=1)                 # for graphing/modling
    week['y'] = week.apply(invert_y_play_right, axis=1)
    week['o'] = week.apply(invert_o_play_right, axis=1)
    week['dir'] = week.apply(invert_dir_play_right, axis=1)
    print(f'Week {w} directionality normalized')

    # get distance to ball
    week['dist_ball'] = week.apply(dis_to_ball, axis=1)                 # calculate distance to ball for all players

    # export
    week.to_csv(f'./data/level_1/l1_week_{w}.csv', index=False, compression='zip')
    print(f'Week {w} Exported')
    print('=' * 100)
    

Processing week 1
Week 1 ball location extracted
Week 1 team metrics extracted
Week 1 directionality normalized
Week 1 Exported
Processing week 2
Week 2 ball location extracted
Week 2 team metrics extracted
Week 2 directionality normalized
Week 2 Exported
Processing week 3
Week 3 ball location extracted
Week 3 team metrics extracted
Week 3 directionality normalized
Week 3 Exported
Processing week 4
Week 4 ball location extracted
Week 4 team metrics extracted
Week 4 directionality normalized
Week 4 Exported
Processing week 5
Week 5 ball location extracted
Week 5 team metrics extracted
Week 5 directionality normalized
Week 5 Exported
Processing week 6
Week 6 ball location extracted
Week 6 team metrics extracted
Week 6 directionality normalized
Week 6 Exported
Processing week 7
Week 7 ball location extracted
Week 7 team metrics extracted
Week 7 directionality normalized
Week 7 Exported
Processing week 8
Week 8 ball location extracted
Week 8 team metrics extracted
Week 8 directionality nor

## Data Store: Estimate Pass Target

In [20]:
weeks = gen_week_data(level=1)  # use level 1 data
pass_events = ['pass_arrived', 'pass_outcome_incomplete', 'pass_outcome_interception', 'pass_outcome_caught', 'pass_outcome_touchdown']
targeted = {}

for week in weeks:                                                  # iterate weeks
    w = week['week'].iloc[0]
    games = week['gameId'].unique()

    for game in games:                                              # iterate games
        plays = week[week['gameId'] == game]['playId'].unique()
        
        for play in plays:                                          # iterate through plays
            _ = week[
                (week['gameId'] == game) &
                (week['playId'] == play) &
                (week['possession'] == 1) & 
                (week['event'].isin(pass_events))
            ].copy()                                                # get play data
            _.reset_index(drop=True, inplace=True)

            if _.shape[0] > 0:
                i = _['dist_ball'].idxmin()                             # find offensive player with min dist to ball @ time ball arrives
                p_name = _.loc[i,'displayName']
            else:
                p_name = 'qb sacked'

            if game not in targeted.keys():                         # export to dict
                targeted[game] = {play: p_name}
            else:
                targeted[game][play] = p_name
    
    print(f'Week {w} complete')

# pickle it
path = './data/stores/targeted.pkl'
with open(path, 'wb') as f:
    pickle.dump(targeted, f)

Week 1 complete
Week 2 complete
Week 3 complete
Week 4 complete
Week 5 complete
Week 6 complete
Week 7 complete
Week 8 complete
Week 9 complete
Week 10 complete
Week 11 complete
Week 12 complete
Week 13 complete
Week 14 complete
Week 15 complete
Week 16 complete
Week 17 complete


In [21]:
path = './data/stores/targeted.pkl'
with open(path, 'rb') as f:
    targeted = pickle.load(f)

## Level 2: Targeted Receiver Metrics

Add targeted receiver to each row

In [22]:
weeks = gen_week_data(level=1)

for week in weeks:
    w = week['week'].iloc[0]    

    week['target_loc'] = ''
    week = apply_target(week)
    print(f'Week {w} targeted players assigned')

    # export
    week.to_csv(f'./data/level_2/l2_week_{w}.csv', index=False, compression='zip')
    print(f'Week {w} Exported.')
    print('=' * 50)

Week 1 targeted players assigned
Week 1 Exported.
Week 2 targeted players assigned
Week 2 Exported.
Week 3 targeted players assigned
Week 3 Exported.
Week 4 targeted players assigned
Week 4 Exported.
Week 5 targeted players assigned
Week 5 Exported.
Week 6 targeted players assigned
Week 6 Exported.
Week 7 targeted players assigned
Week 7 Exported.
Week 8 targeted players assigned
Week 8 Exported.
Week 9 targeted players assigned
Week 9 Exported.
Week 10 targeted players assigned
Week 10 Exported.
Week 11 targeted players assigned
Week 11 Exported.
Week 12 targeted players assigned
Week 12 Exported.
Week 13 targeted players assigned
Week 13 Exported.
Week 14 targeted players assigned
Week 14 Exported.
Week 15 targeted players assigned
Week 15 Exported.
Week 16 targeted players assigned
Week 16 Exported.
Week 17 targeted players assigned
Week 17 Exported.


## Data Store: Pass Outcome

In [23]:
weeks = gen_week_data(level=1)

pass_complete = dict()

pass_caught = {'pass_outcome_caught', 'pass_outcome_touchdown'}
for week in weeks:
    w = week['week'].iloc[0]
    games = week['gameId'].unique()
    g = 0
    
    for game in games:
        g += 1
        plays = week[week['gameId'] == game]['playId'].unique()
        
        for play in plays:
            play_events = set(
                week[(week['gameId'] == game) & (week['playId'] == play)]['event']
                )
            
            snap = week[
                (week['gameId'] == game) & 
                (week['playId'] == play) &
                (week['event'] == 'ball_snap')]['frameId'].iloc[0]                  # frameId where ball snap occurs
            
            los = week[
                (week['gameId'] == game) & 
                (week['playId'] == play) &
                (week['frameId'] == snap) &
                (week['displayName'] == 'Football')]['x'].iloc[0]                   # x coordinate of ball when snap occurs (los)


            pass_forward = week[
                (week['gameId'] == game) & 
                (week['playId'] == play) &
                (week['frameId'] > snap) &
                (week['displayName'] == 'Football') &
                (week['x'] < los)]['frameId']                                       # moment ball passes the los
            

            if pass_forward.shape[0] > 0:
                pass_forward = pass_forward.min()
            else:
                pass_forward = week[
                    (week['gameId'] == game) & 
                    (week['playId'] == play)]['frameId'].max()
            
            # label pass
            complete = False
            if play_events.intersection(pass_caught):
                complete = True
            
            if game not in pass_complete.keys():
                pass_complete[game] = {play: {'pass': complete, 'los': los, 'pass_frame': pass_forward}}
            else:
                pass_complete[game][play] = {'pass': complete, 'los': los, 'pass_frame': pass_forward}
        print(f'Week {w} - {g}/{len(games)} complete')
        
# pickle it
path = './data/stores/pass_complete.pkl'
with open(path, 'wb') as f:
    pickle.dump(pass_complete, f)

Week 1 - 1/13 complete
Week 1 - 2/13 complete
Week 1 - 3/13 complete
Week 1 - 4/13 complete
Week 1 - 5/13 complete
Week 1 - 6/13 complete
Week 1 - 7/13 complete
Week 1 - 8/13 complete
Week 1 - 9/13 complete
Week 1 - 10/13 complete
Week 1 - 11/13 complete
Week 1 - 12/13 complete
Week 1 - 13/13 complete
Week 2 - 1/16 complete
Week 2 - 2/16 complete
Week 2 - 3/16 complete
Week 2 - 4/16 complete
Week 2 - 5/16 complete
Week 2 - 6/16 complete
Week 2 - 7/16 complete
Week 2 - 8/16 complete
Week 2 - 9/16 complete
Week 2 - 10/16 complete
Week 2 - 11/16 complete
Week 2 - 12/16 complete
Week 2 - 13/16 complete
Week 2 - 14/16 complete
Week 2 - 15/16 complete
Week 2 - 16/16 complete
Week 3 - 1/16 complete
Week 3 - 2/16 complete
Week 3 - 3/16 complete
Week 3 - 4/16 complete
Week 3 - 5/16 complete
Week 3 - 6/16 complete
Week 3 - 7/16 complete
Week 3 - 8/16 complete
Week 3 - 9/16 complete
Week 3 - 10/16 complete
Week 3 - 11/16 complete
Week 3 - 12/16 complete
Week 3 - 13/16 complete
Week 3 - 14/16 comp

In [24]:
path = './data/stores/pass_complete.pkl'
with open(path, 'rb') as f:
    pass_complete = pickle.load(f)

### Data subset for model building:

In [25]:
# def pass_to_catch(play_data):
#     play_data.reset_index(drop=True, inplace=True)
#     pass_data = pd.DataFrame(data=[], columns=play_data.columns)
    
#     if 'pass_forward' in play_data['event'].unique() and any(play_data['event'].str.startswith('pass_outcome')):
#         # get frameId where pass originates
#         start = play_data[play_data['event'].str.startswith('pass_forward')]['frameId'].idxmin()
#         start = play_data.loc[start, 'frameId']
#         # get frameId where pass outcome occurs
#         stop = play_data[play_data['event'].str.startswith('pass_outcome')]['frameId'].idxmin()
#         stop = play_data.loc[stop, 'frameId']
#         # filter pass data
#         pass_data = play_data[(play_data['frameId'] >= start) & (play_data['frameId'] <= stop)]
#         pass_data.reset_index(drop=True, inplace=True)
        
#     return pass_data

Export Subset

In [26]:
# weeks = gen_week_data(level=2)

# for week in weeks:
#     passes = pd.DataFrame()
#     w = week['week'].iloc[0]
#     games = week['gameId'].unique()
#     for game in games:
#         plays = week[week['gameId'] == game]['playId'].unique()
#         for play in plays:
#             play = week[(week['gameId'] == game) & (week['playId'] == play)]
            
#             if 'pass_forward' in play['event'].unique() and 'pass_tipped' not in play['event'].unique():
#                 play = pass_to_catch(play)
#                 passes = pd.concat([passes, play], ignore_index=True)
                
#     passes.to_csv(f'./data/subset/week_{w}_passes.csv', compression='zip', index=False)
#     print(f'Week {w} complete')

## Level 3: Calculate Nearest Opponent Distance

Processing all 17 weeks will take an exceptionally long time. Will spin up google colab notebooks and parallel process 5 at a time. With 17 iterations of the following code block processing time will be significantly reduced. Will then save the processed .csv files to my local HD and continue pre-processing the data.

In [27]:
for w in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]:
    output = pd.DataFrame()
    week = pd.read_csv(f'./data/level_2/l2_week_{w}.csv', compression='zip')
    # week = pd.read_csv(f'./data/subset/week_{w}_passes.csv', compression='zip')
    i = week['week'].iloc[0]
    print(f'Starting week {i}')

    games = week['gameId'].unique()
    g = 0

    for game in games:
        g += 1
        plays = week[week['gameId'] == game]['playId'].unique()

        for play in plays:
            frames = week[(week['gameId'] == game) & (week['playId'] == play)]['frameId'].unique()

            for frame in frames:
                
                # Data Subset at a given frame
                df = week[(week['gameId'] == game) & (week['playId'] == play) & (week['frameId'] == frame)].copy()
                df.reset_index(drop=True, inplace=True)

                # Football DF
                football = df[df['displayName'] == 'Football']
                football.reset_index(drop=True, inplace=True)

                # Defensive Players DF
                p_0 = df.loc[(df['possession'] == 0) & (df['displayName'] != 'Football'),:].copy()
                p_0.reset_index(drop=True, inplace=True)

                # Offensive Players DF
                p_1 = df.loc[(df['possession'] == 1) & (df['displayName'] != 'Football'),:].copy()
                p_1.reset_index(drop=True, inplace=True)

                if p_0.shape[0] > 0 and p_1.shape[0] > 0:
                    # Calculate offenses distance to defenders
                    closest_to_def = []
                    for def_player in range(p_0.shape[0]):
                        
                        loc_x = p_0.iloc[def_player]['x']
                        loc_y = p_0.iloc[def_player]['y']

                        opp_loc_x = p_1['x']
                        opp_loc_y = p_1['y']

                        # Vectorized distance calculator
                        dist_vect = calc_distances(
                            np.full_like(opp_loc_x, loc_x), # build vect to calculate X distances for player vs all oppon
                            np.full_like(opp_loc_y, loc_y), # build vect to calculate Y distances for player vs all oppon
                            opp_loc_x,
                            opp_loc_y
                        )
                        closest_idx = dist_vect.argmin()        # find the closest defender's index
                        closest_dist = dist_vect[closest_idx]

                        closest_stats = p_1.loc[closest_idx, ['s', 'o', 'a', 'dis', 'dir']]
                        closest_stats['dist'] = closest_dist
                        
                        closest_to_def.append(closest_stats.to_list())
                        
                    def_oppo_stats = pd.DataFrame(closest_to_def, columns=closest_stats.index).add_prefix('opp_')
                    def_oppo_stats = pd.concat([p_0[['gameId', 'playId', 'frameId', 'displayName']], def_oppo_stats], axis=1)

                    # Calculate defenders distance to offense
                    closest_to_off = []
                    for off_player in range(p_1.shape[0]):
                        
                        loc_x = p_1.iloc[off_player]['x']
                        loc_y = p_1.iloc[off_player]['y']

                        opp_loc_x = p_0['x']
                        opp_loc_y = p_0['y']

                        dist_vect = calc_distances(
                            np.full_like(opp_loc_x, loc_x),
                            np.full_like(opp_loc_y, loc_y),
                            opp_loc_x,
                            opp_loc_y
                        )

                        closest_idx = dist_vect.argmin()
                        closest_dist = dist_vect[closest_idx]

                        closest_stats = p_0.loc[closest_idx, ['s', 'o', 'a', 'dis', 'dir']]
                        closest_stats['dist'] = closest_dist
                        
                        closest_to_off.append(closest_stats.to_list())
                        
                    off_oppo_stats = pd.DataFrame(closest_to_off, columns=closest_stats.index).add_prefix('opp_')
                    off_oppo_stats = pd.concat([p_1[['gameId', 'playId', 'frameId', 'displayName']], off_oppo_stats], axis=1)

                    # merge all
                    football = pd.DataFrame([[game, play, frame, 'Football', 0, 0, 0, 0, 0, 0]], columns=off_oppo_stats.columns)
                    both_teams = pd.concat([def_oppo_stats, off_oppo_stats, football], ignore_index=True)   # recompile frame with dist

                    output = pd.concat([output, both_teams], ignore_index=True)                             # save
        print(f'Week {w} - Game {g}/{len(games)} completed')  

    output.to_csv(f'./data/level_3/l3_week_{w}.csv', index=False, compression='zip')


Starting week 1
Week 1 - Game 1/13 completed
Week 1 - Game 2/13 completed
Week 1 - Game 3/13 completed
Week 1 - Game 4/13 completed
Week 1 - Game 5/13 completed
Week 1 - Game 6/13 completed
Week 1 - Game 7/13 completed
Week 1 - Game 8/13 completed
Week 1 - Game 9/13 completed
Week 1 - Game 10/13 completed
Week 1 - Game 11/13 completed
Week 1 - Game 12/13 completed
Week 1 - Game 13/13 completed
Starting week 2
Week 2 - Game 1/16 completed
Week 2 - Game 2/16 completed
Week 2 - Game 3/16 completed
Week 2 - Game 4/16 completed
Week 2 - Game 5/16 completed
Week 2 - Game 6/16 completed
Week 2 - Game 7/16 completed
Week 2 - Game 8/16 completed
Week 2 - Game 9/16 completed
Week 2 - Game 10/16 completed
Week 2 - Game 11/16 completed
Week 2 - Game 12/16 completed
Week 2 - Game 13/16 completed
Week 2 - Game 14/16 completed
Week 2 - Game 15/16 completed
Week 2 - Game 16/16 completed
Starting week 3
Week 3 - Game 1/16 completed
Week 3 - Game 2/16 completed
Week 3 - Game 3/16 completed
Week 3 - Game

Re-Inspect opponent distance frame exported above.

In [28]:
o = 1
opponents = pd.read_csv(f'./data/level_3/l3_week_{w}.csv', compression='zip')
opponents.head()

Unnamed: 0,gameId,playId,frameId,displayName,opp_s,opp_o,opp_a,opp_dis,opp_dir,opp_dist
0,2018123002,51,1,Clay Matthews,0.0,279.5,0.0,0.0,61.2,3.182766
1,2018123002,51,1,Tramon Williams,0.0,267.89,0.0,0.0,248.82,15.523427
2,2018123002,51,1,Bashaud Breeland,0.0,267.89,0.0,0.0,248.82,2.710295
3,2018123002,51,1,Blake Martinez,0.0,279.5,0.0,0.0,61.2,5.828979
4,2018123002,51,1,Kyler Fackrell,0.0,267.89,0.0,0.0,248.82,3.589721


Load all nearest opponents data

In [29]:
_ = pd.DataFrame()
for o in range(1, 18):
    opponents = pd.read_csv(f'./data/level_3/l3_week_{w}.csv', compression='zip')
    _ = pd.concat([_, opponents])

Shape of opponent location data

In [30]:
_.shape

(17810560, 10)

Original data shape

In [33]:
__ = pd.concat(gen_week_data(level=2))

In [35]:
__.shape

(18309388, 29)

Number of football observations (not included in dist calc above)

In [34]:
football = __[__['displayName'] == 'Football']
football.shape

(1247642, 29)

Looks like we could potentially have error values in 748,000+ observations or about 4% of the data. Will continue on but come back to clear this up at a later date. Its possible the errors exist in the source data. Its possible one of the merges is being done imporperly. \
Possible Solutions:
* Make unique identifier column for game/play, game/play/frame, and game/play/frame/player. This could help reduce loops in some cases.
* Data validation in the original loop


In [38]:
_.shape[0] - (__.shape[0] - football.shape[0])

748814

Merge nearest opponent info into subset for modeling

In [39]:
for n in range(1, 18):

    week = pd.read_csv(f'./data/level_2/l2_week_{n}.csv', compression='zip')
    week = week.drop_duplicates()
    week.reset_index(drop=True, inplace=True)
    games = week['gameId'].unique()

    week['pass_complete'] = 0

    print(f'Input shape: {week.shape}')

    # Add pass result
    for game in games:
        plays = week[week['gameId'] == game]['playId'].unique()

        for play in plays:
            completion = pass_complete[game][play]['pass']
            if completion:
                completion = 1
            else:
                completion = 0
            
            compl_shape = week.loc[(week['gameId'] == game) & (week['playId'] == play)]['pass_complete']

            week.loc[(week['gameId'] == game) & (week['playId'] == play), 'pass_complete'] = np.full_like(compl_shape, completion)
    
    # Merge opponent data and week data
    opponents = pd.read_csv(f'./data/level_3/l3_week_{n}.csv', compression='zip')

    if opponents.shape[0] != week.shape[0]:
        print(f'Week {n} has inconsistent data.')
        print(f'Week shape: {week.shape}')
        print(f'Opponents shape: {opponents.shape}')

    output = pd.merge(left=opponents, right=week, left_on=['gameId', 'playId', 'frameId', 'displayName'], right_on=['gameId', 'playId', 'frameId', 'displayName'], how='inner')
    print(f'Output shape: {output.shape}')
    print('='*50)

    output.to_csv(f'./data/level_4/l4_week_{n}.csv', index=False, compression='zip')
    w = week['week'].iloc[0]
    print(f'Week {w} complete')

Input shape: (986022, 30)
Week 1 has inconsistent data.
Week shape: (986022, 30)
Opponents shape: (986012, 10)
Output shape: (986012, 36)
Week 1 complete
Input shape: (1230925, 30)
Week 2 has inconsistent data.
Week shape: (1230925, 30)
Opponents shape: (1231793, 10)
Output shape: (1231793, 36)
Week 2 complete
Input shape: (1168345, 30)
Week 3 has inconsistent data.
Week shape: (1168345, 30)
Opponents shape: (1166768, 10)
Output shape: (1166760, 36)
Week 3 complete
Input shape: (1205527, 30)
Week 4 has inconsistent data.
Week shape: (1205527, 30)
Opponents shape: (1204638, 10)
Output shape: (1206010, 36)
Week 4 complete
Input shape: (1171908, 30)
Week 5 has inconsistent data.
Week shape: (1171908, 30)
Opponents shape: (1171900, 10)
Output shape: (1171900, 36)
Week 5 complete
Input shape: (1072563, 30)
Week 6 has inconsistent data.
Week shape: (1072563, 30)
Opponents shape: (1070026, 10)
Output shape: (1069999, 36)
Week 6 complete
Input shape: (982583, 30)
Week 7 has inconsistent data.


Input and output shapes are very close in size. Will want to go back and clean up a few things, for sure. Will see what effect this has on the predictions and turn back to clean up some of the missing data at a later point. Unique identifiers is likely the way to go.

This was quite the undertaking. Parsing thru this data was incredibly computationally expensive. Parallel processing was needed often. A different streamlined version of the data cleaning process will be built (with data store pickled data). I will need to rebuild all these features for the final web based product. I will not be able to subset the data if I want the whole play to be available on the web app.