In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nfl-big-data-bowl-2025/players.csv
/kaggle/input/nfl-big-data-bowl-2025/tracking_week_7.csv
/kaggle/input/nfl-big-data-bowl-2025/tracking_week_9.csv
/kaggle/input/nfl-big-data-bowl-2025/tracking_week_6.csv
/kaggle/input/nfl-big-data-bowl-2025/games.csv
/kaggle/input/nfl-big-data-bowl-2025/tracking_week_8.csv
/kaggle/input/nfl-big-data-bowl-2025/player_play.csv
/kaggle/input/nfl-big-data-bowl-2025/tracking_week_4.csv
/kaggle/input/nfl-big-data-bowl-2025/tracking_week_3.csv
/kaggle/input/nfl-big-data-bowl-2025/tracking_week_5.csv
/kaggle/input/nfl-big-data-bowl-2025/tracking_week_1.csv
/kaggle/input/nfl-big-data-bowl-2025/plays.csv
/kaggle/input/nfl-big-data-bowl-2025/tracking_week_2.csv


In [3]:
# This is where my own code starts

# Import the first two raw data files
plays_raw = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2025/plays.csv')
games = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2025/games.csv')

# Add the game by game info to the play data
plays_raw = pd.merge(plays_raw, games, on='gameId', how='left')

# Add the player-specific info
players = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2025/players.csv')
player_play = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2025/player_play.csv')

# Testing with the Week 1-3 tracking data
track_1 = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2025/tracking_week_1.csv')


In [4]:
track_keep_cols = ['gameId', 'playId', 'nflId',
                   'time', 'event'
                  ]

track_1 = track_1[track_keep_cols]
track_1['Week'] = 1

track_2 = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2025/tracking_week_2.csv')
track_2 = track_2[track_keep_cols]
track_2['Week'] = 2

track_3 = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2025/tracking_week_3.csv')
track_3 = track_3[track_keep_cols]
track_3['Week'] = 3

In [5]:
track_4 = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2025/tracking_week_4.csv')
track_4 = track_4[track_keep_cols]
track_4['Week'] = 4

track_5 = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2025/tracking_week_5.csv')
track_5 = track_5[track_keep_cols]
track_5['Week'] = 5

track_6 = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2025/tracking_week_6.csv')
track_6 = track_6[track_keep_cols]
track_6['Week'] = 6

track_7 = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2025/tracking_week_7.csv')
track_7 = track_7[track_keep_cols]
track_7['Week'] = 7

track_8 = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2025/tracking_week_8.csv')
track_8 = track_8[track_keep_cols]
track_8['Week'] = 8

track_1_8 = pd.concat([track_1, track_2, track_3, track_4, track_5,
                      track_6, track_7, track_8], axis=0).reset_index(drop=True)

testing_df = track_1_8

In [6]:
# Link the players df so we can tell what position the motion comes from
testing_df = pd.merge(testing_df, players[['position','nflId']], on='nflId', how='left')

# Add an indicator for offense or defense (special teams not included)

offense_pos = ['QB', 'WR', 'RB', 'TE', 'C', 'G', 'T', 'FB']
defense_pos = ['DT', 'NT', 'DE', 'CB', 'FS', 'SS', 'ILB', 'MLB', 'OLB']

testing_df['posType'] = np.where(testing_df['position'].isin(offense_pos), 'O', 'D')

We want to take the player tracking data, which shows each player's frame-by-frame movements, and link it with the play-by-play data. This will allow us to see what the outcome of the play was. The rest of these two cells help clean and modify the data for my purposes.

In [7]:
# Filtering offense data
offense_track_raw = testing_df[testing_df['posType'] == 'O']

# Merge this dataframe with the Player plays_raw table, showing motion, route, etc. indicators
link_cols = ['gameId', 'playId', 'nflId']
add_cols = ['inMotionAtBallSnap', 'motionSinceLineset', 
            'shiftSinceLineset','wasRunningRoute'] # TWO TYPES OF MOTION: at the snap, and any time pre-snap

offense_track_raw = pd.merge(offense_track_raw, player_play[link_cols + add_cols], \
                           on = link_cols)

# Make new columns for the player IDs that go in motion/shift/routes, so we can find the number of 
# unique ones for each play
offense_track_raw['playerMotion'] = np.where(offense_track_raw['motionSinceLineset'] == True,
                                             offense_track_raw['nflId'], "")
offense_track_raw['playerShift'] = np.where(offense_track_raw['shiftSinceLineset'] == True, 
                                            offense_track_raw['nflId'], "")
offense_track_raw['playerSnapMotion'] = np.where(offense_track_raw['inMotionAtBallSnap'] == True, 
                                                 offense_track_raw['nflId'], "")
offense_track_raw['playerRoute'] = np.where((offense_track_raw['motionSinceLineset'] == True) 
                                            & (offense_track_raw['wasRunningRoute'] == True),
                                            offense_track_raw['nflId'], "")

In [8]:
# The data didn't include a column for the offensive team's score, only the home and away team's
# score, so I needed to create that column

plays_raw['possTeamScore'] = np.where(plays_raw['possessionTeam'] == plays_raw['homeTeamAbbr'],
                                  plays_raw['preSnapHomeScore'],
                                  plays_raw['preSnapVisitorScore'])
plays_raw['nonPossTeamScore'] = np.where(plays_raw['possessionTeam'] == plays_raw['homeTeamAbbr'],
                                  plays_raw['preSnapVisitorScore'],
                                  plays_raw['preSnapHomeScore'])

# Same thing for win probability added

plays_raw['winProbAdded'] = np.where(plays_raw['possessionTeam'] == plays_raw['homeTeamAbbr'],
                                  plays_raw['homeTeamWinProbabilityAdded'],
                                  plays_raw['visitorTeamWinProbilityAdded'])

# Converting the game clock column to sub-columns for more specificity

plays_raw['minute'] = plays_raw['gameClock'].str[:2].astype(int)
plays_raw['second'] = plays_raw['gameClock'].str[3:].astype(int)
plays_raw['totalSecElapsed'] = (((plays_raw['quarter'] - 1) * (15 * 60)) +
                                ((15 * 60) - (plays_raw['minute'] * 60) - plays_raw['second']))

The function below is to find the number of players that went in motion on a play, and the number of these players who ran routes. While this can be important, I also created an indicator for whether *any* player went in motion.

In [9]:
def find_motions_and_routes(tracking_data):
    ''' 
    Creates two dataframes with player motion data for each play.
    
    Inputs: a dataframe (which should be the player tracking table) that needs to have 
    the columns identifying the unique game, play, and player, and needs to have an indicator
    for motion for each player.
    
    Outputs: the dataframe with the numbers for each play, and the modified version 
    of the tracking dataframe.
    '''
    
    tracking_data = pd.merge(tracking_data, plays_raw[['playId', 'gameId', 
                                                       'passResult', 'possessionTeam']], 
                             on=['playId', 'gameId'], how='left')
        
    motion_players = (
        tracking_data.groupby(['playId', 'gameId'])[['playerMotion','playerShift','playerSnapMotion','playerRoute']]
        .nunique()
        .reset_index()
    )

    motion_players.columns = ['playId', 'gameId', 'uniqueMotionPlayers', 'uniqueShiftPlayers', 'uniqueSnapMotionPlayers', 'motionsAndRoutes']
    
    # Since the empty string "" counts as a unique value, subtract 1 from each entry
    motion_players['uniqueMotionPlayers'] -= 1
    motion_players['uniqueSnapMotionPlayers'] -= 1
    motion_players['uniqueShiftPlayers'] -= 1
    motion_players['motionsAndRoutes'] -= 1
    
    motion_players['anyMotion'] = motion_players['uniqueMotionPlayers'] >= 1
    motion_players['anySnapMotion'] = motion_players['uniqueSnapMotionPlayers'] >= 1
    motion_players['anyShift'] = motion_players['uniqueShiftPlayers'] >= 1


    return motion_players, tracking_data

The following section is very important: this is how we determine which plays to categorize as 'hurry up' plays. 

Things that identify no-huddle:
- No indicator for 'huddle_break_offense'
- Late in second quarter
- Late in fourth quarter and possession team trailing or tied
- Marked as No Huddle in the play description
- High time remaining on play clock (not including this one yet)


Things that mean the play should not be considered:
- There was a timeout before the play
- Ball went out of bounds or incomplete on previous play
- The two-minute warning was before the play
- The play is a two-point conversion
- It's the second quarter and the team has given up on scoring

In [10]:
def modify_plays_for_hurry_up(play_data):
    ''' 
    Filter out irrelevant plays, modify the df for my purposes, and determine which
    plays should be classified as hurry up 

    Input: raw play-by-play dataframe

    Outputs: A modified version of the df, with a reduced overall number of columns but an
    indicator for hurryUp
    '''

    # I want to limit this analysis to only regulation time, because overtime games represent
    # a small proportion of plays and can have atypical patterns
    play_data = play_data[play_data['quarter'] <= 4]
    play_data = play_data[play_data['playNullifiedByPenalty'] == 'N']

    play_data = play_data.sort_values(['gameId', 'totalSecElapsed'])

    play_data['winProbAdded'] = play_data['winProbAdded'] * 100
    
    trailing_fourth_q = [(play_data['totalSecElapsed'] >= (60 * 55)), # less than 5 min in game
                         (play_data['possTeamScore'] - play_data['nonPossTeamScore'] <= 0), # poss team losing/tied
                         (play_data['possTeamScore'] - play_data['nonPossTeamScore'] >= -8), # one score game
                         (play_data['qbKneel'] == False),
                         (play_data['qbSpike'] == False)
                        ]
    
    late_second_q = [(play_data['totalSecElapsed'] > (60 * 28)),
                     (play_data['totalSecElapsed'] < (60 * 30)),
                     (play_data['qbKneel'] == False),
                     (play_data['qbSpike'] == False)
                    ]
    
    #high_play_clock = [(play_data['playClockAtSnap'] >= 25)]
    
    manual = [(play_data['playDescription'].str.contains('No Huddle'))]
    
    hu_conditions = [trailing_fourth_q, late_second_q, manual]
                     
    play_data['hurryUp'] = False
    
    for sub_conditions in hu_conditions:
        play_data['hurryUp'] |= pd.concat(sub_conditions, axis=1).all(axis=1)
                                             
    keep_cols = ['gameId', 'playId', # linking data
                 'possessionTeam', 'receiverAlignment', 
                 'passResult', 'dropbackType', # columns to filter on
                 'pff_runPassOption', 'pff_passCoverage', 'pff_manZone', # columns to filter on
                 'playDescription', 'quarter',  'totalSecElapsed', 'down', 'yardlineNumber', # descriptive
                 'possTeamScore','nonPossTeamScore', # score data
                 'playClockAtSnap', # timing info
                 'expectedPointsAdded', 'winProbAdded', # success metrics
                 'hurryUp']

                 
    return play_data[keep_cols]

In [11]:
play_by_play = modify_plays_for_hurry_up(plays_raw)

In [12]:
play_motion_players, offense_track = find_motions_and_routes(offense_track_raw)
offense_track = pd.merge(offense_track, play_by_play[['gameId','playId','possessionTeam',
                                               'hurryUp', 'passResult']], 
                       on=['gameId','playId'], how='left')

Below, I'm finding the exact times that certain important events for each play occurred, based on the player tracking df. The first two lists represent these important events. I remove all events that aren't in those lists, and find the time the play actually began (which sometimes isn't explicitly listed).

In [13]:
main_events = ['line_set', 'man_in_motion', 'ball_snap', 'play_over']
play_over_events = ['pass_outcome_incomplete',
                   'pass_outcome_touchdown', 'touchdown', 'safety'
                    'qb_sack', 'qb_slide', 'qb_spike',
                   'tackle',
                   'touchback', 'out_of_bounds']

offense_track['event'] = np.where(offense_track['event'].isin(play_over_events),
                               'play_over',
                               offense_track['event'])

# Find the first entry for each play, and the corresponding time. Represents start of play
play_starts = offense_track[['gameId', 'playId', 'time']].drop_duplicates(['gameId', 'playId'])
play_starts['event'] = 'play_start'

event_times = offense_track[(offense_track['event'].isin(main_events))].drop_duplicates(
    ['gameId', 'playId', 'event'])[['gameId', 'playId', 'event', 'time']]

event_times = pd.concat([event_times, play_starts]).sort_values('time')

# Add milliseconds if absent
event_times['time'] = event_times['time'].apply(lambda x: x if '.' in x else x + '.0')

event_times['time'] = pd.to_datetime(event_times['time'])

event_times

Unnamed: 0,gameId,playId,event,time
3198096,2022090800,56,play_start,2022-09-09 00:23:48.200
3198171,2022090800,56,line_set,2022-09-09 00:23:55.700
3198241,2022090800,56,ball_snap,2022-09-09 00:24:02.700
3198283,2022090800,56,play_over,2022-09-09 00:24:06.900
3200208,2022090800,80,play_start,2022-09-09 00:24:24.500
...,...,...,...,...
22719563,2022103100,3727,line_set,2022-11-01 03:05:45.400
22719577,2022103100,3727,ball_snap,2022-11-01 03:05:46.800
22720291,2022103100,3748,play_start,2022-11-01 03:06:18.000
22720335,2022103100,3748,line_set,2022-11-01 03:06:22.400


Now I have the exact times that a player went in motion, the ball was snapped, and the play was over. Based on these I can calculate how much time elapsed between each of these events. I do this, and then merge the time differences dataframe with the one that shows whether there was motion on the play and the one that has the play-by-play descriptive data. This means the time_diffs will have essentially all the important info for my analysis after this cell runs.

In [14]:
# Find the time elapsed between line set, motion, and ball snap
time_diffs = event_times.pivot(index=['gameId', 'playId'], columns='event', values='time').assign(
    startToSnap=lambda x: (x['ball_snap'] - x['play_start']).dt.total_seconds(), 
    motionToSnap=lambda x: (x['ball_snap'] - x['man_in_motion']).dt.total_seconds(),
    playDuration=lambda x: (x['play_over'] - x['ball_snap']).dt.total_seconds()
).reset_index()[['gameId', 'playId', 'startToSnap', 'motionToSnap', 'playDuration']]

# Add the motion info for each play, which I calculated earlier
time_diffs = pd.merge(time_diffs, 
                      play_motion_players[['gameId', 'playId', 
                                           'anyMotion','anyShift', 'anySnapMotion']], 
                      on=['gameId', 'playId'], 
                      how='left')

# Add descriptive info for each play, directly from the play-by-play file
time_diffs = pd.merge(time_diffs,
                     play_by_play[['gameId', 'playId', 
                                   'possessionTeam', 'pff_manZone', 'receiverAlignment',
                                   'dropbackType', 'passResult',# Data to aggregate by
                                  'hurryUp', 'playClockAtSnap','totalSecElapsed', # Timing info
                                  'expectedPointsAdded', 'winProbAdded']],
                     on=['gameId', 'playId'],
                     how='left')

time_diffs

Unnamed: 0,gameId,playId,startToSnap,motionToSnap,playDuration,anyMotion,anyShift,anySnapMotion,possessionTeam,pff_manZone,receiverAlignment,dropbackType,passResult,hurryUp,playClockAtSnap,totalSecElapsed,expectedPointsAdded,winProbAdded
0,2022090800,56,14.5,,4.2,True,False,False,BUF,Zone,2x2,TRADITIONAL,C,False,12.0,0.0,0.004420,0.003121
1,2022090800,80,8.7,2.7,5.0,True,False,True,BUF,Zone,3x2,SCRAMBLE,R,True,16.0,31.0,0.823571,1.349665
2,2022090800,101,10.5,0.9,3.9,True,False,True,BUF,Zone,2x1,,,False,13.0,66.0,0.562363,1.484972
3,2022090800,122,11.2,,6.1,False,False,False,BUF,Zone,2x1,TRADITIONAL,C,False,9.0,105.0,0.399209,1.903171
4,2022090800,167,10.1,,3.9,False,False,False,BUF,Zone,3x2,TRADITIONAL,C,False,9.0,186.0,1.373441,3.186998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14584,2022103100,3596,8.8,,4.5,False,False,False,CLE,Man,3x0,,,False,4.0,3401.0,-0.660984,-0.001246
14585,2022103100,3674,8.0,,5.0,False,False,False,CLE,Man,2x1,,,False,18.0,3458.0,0.562885,0.002257
14586,2022103100,3697,7.4,,2.7,False,False,False,CLE,Man,2x1,QB_SNEAK,,False,21.0,3480.0,0.806727,-0.323867
14587,2022103100,3727,5.6,,,False,False,False,CLE,,,UNKNOWN,,False,1.0,3524.0,0.000000,0.009934


In [15]:
# Add the column to help calculate completion percentage
time_diffs['completion'] = np.where(time_diffs['passResult'] == 'C', 1, 
                                    np.where(time_diffs['passResult'] == 'I', 0,
                                    np.where(time_diffs['passResult'] == 'IN', 0, np.nan)))


# I don't care whether the directions of these plays were left or right,
# so I'm making a new column to remove the differences
time_diffs['dropback'] = np.where(
    time_diffs['dropbackType'].str.startswith('DESIGNED_ROLLOUT'), 'DESIGNED_ROLLOUT',
    np.where(time_diffs['dropbackType'].str.startswith('SCRAMBLE_ROLLOUT'), 'SCRAMBLE_ROLLOUT',
    time_diffs['dropbackType'])
)


# Create the variable for the amount of time from the end of one play
# to the start of another. The end of the last play is calculated through
# adding the seconds elapsed at the start of the last play to the duration of the play.

time_diffs = time_diffs.sort_values(['gameId', 'totalSecElapsed'])

time_diffs['endOfLastPlay'] = (time_diffs['totalSecElapsed'] + time_diffs['playDuration']).shift(1)
time_diffs['totalTimeToSnap'] = time_diffs['totalSecElapsed'] - time_diffs['endOfLastPlay']

time_diffs['totalTimeToSnap'] = np.where(
    time_diffs['gameId'] != time_diffs['gameId'].shift(1),  # Check if game_id changes
    np.nan,                                   # Set NaN if true
    time_diffs['totalTimeToSnap']                # Otherwise keep the calculated value
)    

time_diffs = time_diffs.drop(columns=['totalSecElapsed','endOfLastPlay'])


only_hurryUp = time_diffs[time_diffs['hurryUp'] == True]
only_normal = time_diffs[time_diffs['hurryUp'] == False]

# This will be useful when I disaggregate the data later
hurry_up_sets = [only_hurryUp, only_normal]

The following code is used to examine the timing and effectiveness differences between plays that run motion and hurry-up. I can measure effectiveness in multiple ways: EPA, win probability added, completion rate, and more. 

In [16]:
time_diffs.groupby('hurryUp')[['anyMotion', 'startToSnap', 'totalTimeToSnap']].mean().reset_index()

Unnamed: 0,hurryUp,anyMotion,startToSnap,totalTimeToSnap
0,False,0.334529,10.980868,26.854143
1,True,0.193237,7.984811,17.455058


In [17]:
time_diffs.groupby('hurryUp')[['expectedPointsAdded', 'winProbAdded', 'completion']].mean().reset_index()

Unnamed: 0,hurryUp,expectedPointsAdded,winProbAdded,completion
0,False,-0.018266,0.056,0.658818
1,True,-0.052246,-0.11286,0.608696


These two cells show:
1. In hurry-up situations, there is far less motion (19% of plays compared to 33%), and teams take less time to prepare for each play. startToSnap and totalTimeToSnap are two different ways of measuring the amount of time they take to prepare, and both measured in seconds.
2. In hurry-up situations, teams are significantly less effective, no matter which metric you look at. Completion percentage (which automatically only includes passing plays) drops from 65.9% to 60.9%. Meanwhile, EPA and WPA per play (which include all plays) drop by substantial amounts.

In [24]:
by_team_success = time_diffs.groupby(['possessionTeam']).agg(
    meanEPA=('expectedPointsAdded', 'mean'),
    meanWPA=('winProbAdded', 'mean'),
    pctMotion=('anyMotion', 'mean'),
    pctHurryUp=('hurryUp', 'mean'),
    passComp=('completion', 'mean'),
    count=('possessionTeam', 'count')
).reset_index()

print(by_team_success['pctMotion'].corr(by_team_success['meanEPA']))
print(by_team_success['pctMotion'].corr(by_team_success['meanWPA']))

#by_team_success

0.25545330710775666
0.3351523569613505


This demonstrates that there is a relatively large correlation between the amount of motion that teams run and their success. 0.255 and 0.335 are pretty substantial considering the number of factors that go into a football play, and the fact that EPA and WPA are the main success metrics that contribute to a team winning. However, the correlation makes sense because running motion has recently gained a lot of popularity as coaches have realized the benefits of it. 

If we take this info together with the previous two cells, it's pretty clear that in hurry up situations, teams will not be able to be as effective as normal. They are under time pressure and run motion at around half the frequency as normal, so they cannot fully utilize it as they would normally. This is broad takeaway from this project: there is clear evidence of motion being effective and how it is dampened in hurry up situations. It is slightly ironic because these situations are when teams need to be *most* effective as they seek to win or tie the game. Nonetheless, it highlights the importance of good time management by head coaches and quarterbacks.

In the next few cells, I'll look at more ways to break this data down, and explore how motion is more or less successful depending on other features of a play.

In [19]:
def find_motion_effectiveness(dfList, indepVariable, depVariable):
    ''' 
    Disaggregates my dataframe based on specific features of a play, and
    calculates how motion improves teams' success rates accordingly.
    
    Inputs: 
    - a list of dataframes (should be one for hurry-up plays and one for normal).
    - an independent variable (the one I want to disaggregate by).
    - a dependent variable (the success metric for a play -- EPA or WPA)

    Outputs:
    - strings that show the percentage change in the dependent variable 
    when motion is run versus when it is not, and the p-value for the
    one-sided independent t-test. 
    '''

    for df in dfList:
        for level in np.unique(df[indepVariable].astype(str)):  

            # Filter the df for only the observations meeting one criteria
            subset = df[df[indepVariable] == level]

            motion = subset[subset['anyMotion'] == True]
            noMotion = subset[subset['anyMotion'] == False]
        
            sampleSize = min(len(motion), len(noMotion))

            # Dismiss the edge case categories due to a lack of reliable data
            if sampleSize < 10:
               continue;

            # Find the percentage change in the dependent variable
            # between when motion was run on the play and when it wasn't
            pctChange = (np.mean(motion[depVariable]) -
                         np.mean(noMotion[depVariable])) / abs(np.mean(noMotion[depVariable]))

            # Run a one-sided independent t-test to test if the difference in means is significant
            t_stat, p_value = ttest_ind(motion[depVariable], noMotion[depVariable])

            speed = np.unique(df['hurryUp'])[0]

            # Print out the results of the tests
            print(f'[Hurry Up: {speed}, {indepVariable}: {level}]: {pctChange*100:.2f}% change, p-value: {p_value / 2:.2f}, {sampleSize}')


In [20]:
# Separate by type of defensive coverage
time_diffs[time_diffs['pff_manZone'].isin(['Man', 'Zone'])].groupby(
    ['pff_manZone', 'hurryUp', 'anySnapMotion']).agg(
    meanEPA = ('expectedPointsAdded', 'mean'),
    meanWPA = ('winProbAdded', 'mean'),
    count = ('expectedPointsAdded', 'count')).reset_index()

find_motion_effectiveness(hurry_up_sets, 'pff_manZone', 'expectedPointsAdded')


[Hurry Up: True, pff_manZone: Man]: 352.10% change, p-value: 0.05, 127
[Hurry Up: True, pff_manZone: Other]: -9.56% change, p-value: 0.47, 25
[Hurry Up: True, pff_manZone: Zone]: -216.59% change, p-value: 0.23, 328
[Hurry Up: False, pff_manZone: Man]: 286.26% change, p-value: 0.00, 1088
[Hurry Up: False, pff_manZone: Other]: -31.15% change, p-value: 0.41, 192
[Hurry Up: False, pff_manZone: Zone]: 189.38% change, p-value: 0.00, 2728


Here, we can see motion is typically more effective when facing man coverage by the defense (when each defensive player follows one offensive player, rather than guarding a pre-determined zone of the field). This is in line with conventional football knowledge: one of the main reasons for running motion in the first place is to identify when the defense is playing man coverage, and man coverage is typically easier to beat if you know it's coming, relative to zone. If the defense is in man and a receiver motions before the snap, a defensive player will follow them, and that can give the defensive coverage away.

However, even in zone coverage, running motion has a significant improvement on performance *when the team is not in a hurry up situation*. The p-value for both man and zone coverage in non-hurry-up situations is below 0.05, and there is a positive change in EPA per play as well. However, in hurry up situations, the p-values are all above 0.05. This suggests we are not seeing any significant effect of motion there, even in man coverage, which is supposed to be the main source of its success. The results here provide further evidence for the wide gap between hurry up and normal situations.

In [21]:
# Separate by type of dropback
by_dropback_EPA = time_diffs.groupby(['dropback','hurryUp','anyMotion']).agg(
    meanEPA=('expectedPointsAdded', 'mean'),
    meanWPA=('winProbAdded', 'mean'),
    passComp=('completion', 'mean'),
    countAll=('dropback', 'count')
).reset_index()

find_motion_effectiveness(hurry_up_sets, 'dropback', 'expectedPointsAdded')

[Hurry Up: True, dropback: DESIGNED_ROLLOUT]: -184.83% change, p-value: 0.23, 91
[Hurry Up: True, dropback: SCRAMBLE]: 20.30% change, p-value: 0.44, 28
[Hurry Up: True, dropback: TRADITIONAL]: 81.42% change, p-value: 0.24, 357
[Hurry Up: False, dropback: DESIGNED_ROLLOUT]: 129.71% change, p-value: 0.00, 1312
[Hurry Up: False, dropback: DESIGNED_RUN]: -23.77% change, p-value: 0.46, 33
[Hurry Up: False, dropback: SCRAMBLE]: 2023.96% change, p-value: 0.27, 154
[Hurry Up: False, dropback: SCRAMBLE_ROLLOUT]: 87.56% change, p-value: 0.23, 63
[Hurry Up: False, dropback: TRADITIONAL]: 294.42% change, p-value: 0.00, 2414
[Hurry Up: False, dropback: UNKNOWN]: -72.59% change, p-value: 0.10, 26


This section focused on the action the quarterback takes after receiving the snap. It's not super important to understand these designations, but I would draw your attention to the two sections with the largest sample sizes: DESIGNED_ROLLOUT and TRADITIONAL. These situations have significant positive changes when motion is run in normal situations, but not in hurry up situations. In fact, it is difficult to find any disaggregation of the data where there is a significant effect of motion on team success in hurry up situations.

Running motion seems to help teams more when they run a traditional dropback, rather than a designed rollout or run. Scrambles by definition are unplanned, so it's difficult to make conclusions about how teams could change their strategy based on that data. But overall, they should focus on using motion for traditional dropback passing plays.

In [22]:
# Separate by type of personnel grouping
by_alignment = time_diffs.groupby(['receiverAlignment', 'hurryUp', 'anyMotion']).agg(
    averageEPA=('expectedPointsAdded', 'mean'),
    count_all=('expectedPointsAdded', 'size')
).reset_index()

find_motion_effectiveness(hurry_up_sets, 'receiverAlignment', 'expectedPointsAdded')

[Hurry Up: True, receiverAlignment: 2x1]: 108.87% change, p-value: 0.06, 22
[Hurry Up: True, receiverAlignment: 2x2]: 334.22% change, p-value: 0.21, 220
[Hurry Up: True, receiverAlignment: 3x1]: 85.77% change, p-value: 0.32, 176
[Hurry Up: True, receiverAlignment: 3x2]: -1038.64% change, p-value: 0.01, 54
[Hurry Up: False, receiverAlignment: 1x1]: 301.92% change, p-value: 0.06, 23
[Hurry Up: False, receiverAlignment: 2x0]: -38500.90% change, p-value: 0.26, 15
[Hurry Up: False, receiverAlignment: 2x1]: 216.38% change, p-value: 0.18, 461
[Hurry Up: False, receiverAlignment: 2x2]: 361.37% change, p-value: 0.00, 1652
[Hurry Up: False, receiverAlignment: 3x1]: 166.66% change, p-value: 0.00, 1475
[Hurry Up: False, receiverAlignment: 3x2]: 334.64% change, p-value: 0.04, 336
[Hurry Up: False, receiverAlignment: 4x1]: 81.52% change, p-value: 0.10, 42


Receiver alignment refers to the number of receivers on each side of the field, so 3x1 would be three receivers on the left side and one on the right.

Similar situation here, where most of the receiver alignments have significant benefits from motion in normal situations but not in hurry up. The exception to this is the 3x2 alignment, but this actually leads to a negative change when you run motion. I'm pretty sure that this alignment in a hurry up situation would basically only used in desperate times where the offense is using as many receivers as possible and just trying to throw the ball as far as possible, so I wouldn't draw any large conclusions from that. We see pretty substantial improvements in normal situations when using motion across the board, with the largest changes coming in 2x2 and 3x2 alignments. These are more typical of passing plays than run plays, indicating motion is more effective on passing plays (which is definitiely in line with intuition).

In [23]:
# One-sided independent t-test to ensure that the motion differences in hurry up situations is 
# not significant, while in normal situations it is.

for df in hurry_up_sets:
    motion = df[df['anyMotion'] == True]
    noMotion = df[df['anyMotion'] == False]

    sampleSize = min(len(motion), len(noMotion))

    pctChange = (np.mean(motion['expectedPointsAdded']) - np.mean(noMotion['expectedPointsAdded'])) / abs(np.mean(noMotion['expectedPointsAdded']))
            
    t_stat, p_value = ttest_ind(motion['expectedPointsAdded'], noMotion['expectedPointsAdded'])
        
    speed = np.unique(df['hurryUp'])[0]
    print(f'[Hurry Up: {speed}]: {pctChange*100:.2f}% change, p-value: {p_value / 2:.2f}, {sampleSize}')


[Hurry Up: True]: 51.17% change, p-value: 0.34, 480
[Hurry Up: False]: 225.88% change, p-value: 0.00, 4010


Here I'm just making sure that the fine-grained findings from above are also true on a broad level. They are, because we see a statistically significant positive effect of running motion in normal situations but not in hurry up situations.

The next two cells are supposed to graph an example play, with different colors corresponding to the movements of various players across the field during the play. However, I didn't include some of these columns in my analysis above, because they took up unnecessary space. Since I'm going to continue working on this project after the class, I'm leaving them in here for now.

In [None]:
in_motion = offense_track[offense_track['motionSinceLineset'] == True]

in_motion = in_motion[['gameId', 'playId', 'nflId', 'displayName', 'frameType',
                      'time', 'x', 'y', 's', 'a', 'dis', 'o', 'dir', 'event','position']]

print((in_motion[['gameId', 'playId', 'nflId']].drop_duplicates()))

# This is to set up a plot with each player's movement (below)

jeudy = in_motion[(in_motion['playId'] == 401) & (in_motion['nflId'] == 52423) & (in_motion['gameId'] == 2022091200)]# & (in_motion['frameType'] == 'AFTER_SNAP')]
#fant = in_motion[(in_motion['playId'] == 401) & (in_motion['nflId'] == 47803) & (in_motion['gameId'] == 2022091200)]# & (in_motion['frameType'] == 'AFTER_SNAP')]
wilson = offense_track[(offense_track['playId'] == 401) & (offense_track['nflId'] == 38605) & (offense_track['gameId'] == 2022091200)]# & (offense_track['frameType'] == 'AFTER_SNAP')]
sutton = offense_track[(offense_track['playId'] == 401) & (offense_track['nflId'] == 46109) & (offense_track['gameId'] == 2022091200)]# & (offense_track['frameType'] == 'AFTER_SNAP')]
center = offense_track[(offense_track['playId'] == 401) & (offense_track['nflId'] == 52491) & (offense_track['gameId'] == 2022091200)]# & (offense_track['frameType'] == 'AFTER_SNAP')]

In [None]:
plt.figure(figsize=(8, 6))
plt.plot(jeudy['y'], jeudy['x'], marker='o', linestyle='-', color='b', label="jeudy")
plt.plot(wilson['y'], wilson['x'], marker='o', linestyle='-', color='g', label="wilson")
plt.plot(sutton['y'], sutton['x'], marker='o', linestyle='-', color='r', label="sutton")
plt.plot(center['y'], center['x'], marker='o', linestyle='-', color='y', label="center")
#plt.plot(penny['y'], penny['x'], marker='o', linestyle='-', color='brown', label="penny")

plt.title("Path Traveled in 2D Plane")
plt.xlabel("Y Coordinate")
plt.ylabel("X Coordinate")
plt.xlim(0, 53.3)
plt.gca().invert_yaxis()
plt.legend()
plt.grid(True)
plt.show()