In [13]:
#NFL Big Data Bowl Mini Lab: 

#In week 1 of the 2023 season, how far is the defender from the receiver in catch and incomplete scenarios?

In [29]:
#-----------STEP 1: Getting Setup----------- 
#os, glob, json, pathlib are all Python utility modules, not analysis tools per se.
#They’re mostly used to load, find, and organize your Big Data Bowl files before you start analysis.
#pandas allows us to see raw data as structured tables 
#numpy allows for fast methematical computation 

import os, glob, json, pathlib
import pandas as pd
import numpy as np
from pathlib import Path

In [12]:
#-----------STEP 2: Create your Kaggle API and get the input/output data----------------

#Download the kaggle json from your Kaggle account
#Go to https://www.kaggle.com/
#Click your profile icon (top-right) → choose “Settings.”
#Scroll to the section “API.”
#Click the blue button “Create New API Token.”
#Kaggle downloads a file automatically called kaggle.json (it might go into your Downloads folder).
#I put mine in a folder entitled .kaggle on my Desktop. Do the same if you want to follow the code explicitly. 

cfg_dir = r"/Users/grantstarnes/Desktop/IUI Grad/TESM-S 501/Lab10-15/kaggle"
# Point Kaggle to this folder and test to ensure you have access. If you are error free, you are good to go! 
os.environ["KAGGLE_CONFIG_DIR"] = cfg_dir

# The next command prints the current version of the Kaggle command-line tool.
# If you see something like "Kaggle API 1.x.x", it’s working.
# If you see "command not found", you may need to install Kaggle (`pip install kaggle`)
!kaggle --version

# This lists all files available in the NFL Big Data Bowl 2026 competition.
# If your API key is correct, you'll see a list of CSV and ZIP files.
# If you see "401: Unauthorized", your kaggle.json file isn’t linked correctly.
!kaggle competitions files -c nfl-big-data-bowl-2026-analytics

# This command downloads all competition files into a folder called "data"
# inside your current working directory.
# The "-p data" flag means “put the files into the ‘data’ folder”.
# If "data" doesn’t exist, it will be created automatically.
!kaggle competitions download -c nfl-big-data-bowl-2026-analytics -p data

Kaggle API 1.7.4.5
Next Page Token = CfDJ8IaGWDgvvrBFtGGva9hUIY78wPvqm5xVYG6iaKymUz5SDr7L8toFyMO2T3a0L7NE4Jg9Dz0KPUtrEsp-2NwLPPc
name                                                                                    size  creationDate                
--------------------------------------------------------------------------------  ----------  --------------------------  
114239_nfl_competition_files_published_analytics_final/train/input_2023_w01.csv     48950314  2025-09-23 18:36:28.263000  
114239_nfl_competition_files_published_analytics_final/train/input_2023_w02.csv     49485029  2025-09-23 18:36:28.263000  
114239_nfl_competition_files_published_analytics_final/train/input_2023_w03.csv     51062128  2025-09-23 18:36:28.263000  
114239_nfl_competition_files_published_analytics_final/train/input_2023_w04.csv     46685806  2025-09-23 18:36:28.263000  
114239_nfl_competition_files_published_analytics_final/train/input_2023_w05.csv     43574971  2025-09-23 18:36:28.263000  
114239_nfl

In [31]:
# Universal way to locate the training data folder in which you just saved your kaggle information 
from pathlib import Path

# Start in the folder where Kaggle downloads files
data_dir = Path("data")

# Automatically find the "train" subfolder no matter what its parent folder is called
train_path = Path("/Users/grantstarnes/Desktop/IUI Grad/TESM-S 501/Lab10-15/data")

# Build file paths
input_file = train_path / "input_2023_w01.csv"
output_file = train_path / "output_2023_w01.csv"

print("Input file found at:", input_file)
print("Output file found at:", output_file)

data_path = Path("/Users/grantstarnes/Desktop/IUI Grad/TESM-S 501/Lab10-15/data/*train*")

# Load week 1 files
input_file = os.path.join(data_path, "input_2023_w01.csv")
output_file = os.path.join(data_path, "output_2023_w01.csv")

Input file found at: /Users/grantstarnes/Desktop/IUI Grad/TESM-S 501/Lab10-15/data/input_2023_w01.csv
Output file found at: /Users/grantstarnes/Desktop/IUI Grad/TESM-S 501/Lab10-15/data/output_2023_w01.csv


In [32]:
# Read a sample and the shape of the data you have created 
input_df = pd.read_csv(input_file)
output_df = pd.read_csv(output_file)

print(f"Rows: {input_df.shape[0]:,}, Columns: {input_df.shape[1]:,}")
print("Columns:", list(input_df.columns[:12]), "...")  # show first 12 columns
print(f"Rows: {output_df.shape[0]:,}, Columns: {output_df.shape[1]:,}")
print("Columns:", list(output_df.columns[:12]), "...")

FileNotFoundError: [Errno 2] No such file or directory: '/Users/grantstarnes/Desktop/IUI Grad/TESM-S 501/Lab10-15/data/*train*/input_2023_w01.csv'

In [21]:
#---STEP 3: Let's go get the supplementary data---# 

# This block loads the Big Data Bowl supplementary data file from your Downloads folder.
# It works for either CSV or Parquet formats and automatically adjusts based on the file type.

# Import required Python libraries.

from pathlib import Path    # Modern way to handle file paths (works on Windows/Mac/Linux)
import pandas as pd         # Pandas is used to load and manipulate data tables


# Define where the file is stored on your computer.

# Path.home() → gives you your user’s home directory (e.g., C:\Users\<yourname>\ on Windows)
# / "Downloads" → navigates into your Downloads folder
# / "supplementary_data.csv" → adds the actual filename
# If your file has a different name (e.g., supplementary_data_week1.csv), change it here.

supp_path = Path.home() / "Downloads" / "supplementary_data.csv"

# Load the data based on its file type.

# The .suffix attribute extracts the file extension (e.g., ".csv", ".parquet", ".xlsx", etc.)
# The code checks which kind of file it is and chooses the right pandas reader.

if supp_path.suffix == ".csv":
    supp = pd.read_csv(supp_path)          # loads a comma-separated text file
elif supp_path.suffix == ".parquet":
    supp = pd.read_parquet(supp_path)      # loads a Parquet file (common for large datasets)
else:
    raise ValueError(f"Unsupported file type: {supp_path.suffix}")  # stops if unknown file type

# Preview the first few rows of the dataset.

# head() prints the first five rows so you can verify it loaded correctly.
# It’s a quick sanity check before merging or analyzing the data.

#Filter to Week 1 and make an explicit COPY 
supp_week1 = supp.loc[supp["week"] == 1].copy()
supp_week1.head()

  supp = pd.read_csv(supp_path)          # loads a comma-separated text file


Unnamed: 0,game_id,season,week,game_date,game_time_eastern,home_team_abbr,visitor_team_abbr,play_id,play_description,quarter,...,team_coverage_type,penalty_yards,pre_penalty_yards_gained,yards_gained,expected_points,expected_points_added,pre_snap_home_team_win_probability,pre_snap_visitor_team_win_probability,home_team_win_probability_added,visitor_team_win_probility_added
0,2023090700,2023,1,09/07/2023,20:20:00,KC,DET,3461,(10:46) (Shotgun) J.Goff pass deep left to J.R...,4,...,COVER_2_ZONE,,18,18,-0.664416,2.945847,0.834296,0.165704,-0.081149,0.081149
1,2023090700,2023,1,09/07/2023,20:20:00,KC,DET,461,(7:30) J.Goff pass short right to J.Reynolds t...,1,...,COVER_6_ZONE,,21,21,1.926131,1.345633,0.544618,0.455382,-0.029415,0.029415
2,2023090700,2023,1,09/07/2023,20:20:00,KC,DET,1940,(:09) (Shotgun) J.Goff pass incomplete deep ri...,2,...,COVER_2_ZONE,,0,0,0.281891,-0.081964,0.771994,0.228006,0.000791,-0.000791
3,2023090700,2023,1,09/07/2023,20:20:00,KC,DET,1711,"(:45) (No Huddle, Shotgun) P.Mahomes pass deep...",2,...,COVER_2_ZONE,,26,26,3.452352,2.342947,0.663187,0.336813,0.041843,-0.041843
4,2023090700,2023,1,09/07/2023,20:20:00,KC,DET,1588,(1:54) (Shotgun) P.Mahomes pass incomplete dee...,2,...,COVER_4_ZONE,,0,0,1.921525,-0.324035,0.615035,0.384965,6.1e-05,-6.1e-05


In [23]:
#---STEP 4: Standardize data types for looking them up later---# 

# This step ensures your joins and merges across datasets (input, output, supplementary)
# will work reliably and not break due to mismatched data types.

# Loop through each of the main dataframes.

# We have three key datasets:
#   - input_df : tracking data BEFORE the throw
#   - output_df : tracking data AFTER the throw
#   - supp_week1 : play-level metadata (pass results, yards, coverage, etc.)
#
# This loop runs the same cleanup on each one.

for df in [input_df, output_df, supp_week1]:
    
    # For each dataset, check for key columns: game_id, play_id
   
    # These columns identify unique plays and are critical for merges later.
    # Some files might be missing one of them, so we check "if k in df.columns" first.

    for k in ["game_id", "play_id"]:
        if k in df.columns:

            # Convert the column to a numeric type (Int64)
          
            # pd.to_numeric(..., errors="coerce") forces everything to numeric;
            # any invalid values become NaN instead of breaking.
            # .astype("Int64") ensures a consistent integer type that supports NaN.
            #
            # This avoids common merge issues where one dataframe has strings ("2023090700")
            # and another has integers (2023090700).

            df.loc[:, k] = pd.to_numeric(df[k], errors="coerce").astype("Int64")

# ✅ At this point:
# All "game_id" and "play_id" columns across input_df, output_df, and supp_week1
# are now numeric and standardized (safe for merging).

# Extract a reference table of player metadata from input_df.

# The input data contains static information about each player on a play,
# such as which side they’re on (Offense/Defense), their role (Targeted Receiver, Passer, etc.),
# and their name.
#
# We'll keep just these identifiers for easy merging later when analyzing output_df.

roles = (
    input_df[["game_id", "play_id", "nfl_id", "player_side", "player_role", "player_name"]]
    .drop_duplicates()  # removes duplicate rows so we have one record per player per play
)

# roles now acts like a lookup table:
# You can merge it with output_df to know each player's role and side during post-throw analysis.

In [24]:
#---STEP 5: Let's dig in on each pass play---#

# Create play-level reference points for passing plays from the input tracking data:
# (a) where the quarterback threw from,
# (b) who the intended receiver was,
# (c) where that receiver was positioned at the throw,
# (d) where the ball was expected to land.
# These features let us later connect spatial data to outcomes (caught vs. not caught).

# (a) THROW SNAPSHOT → last frame before the ball is thrown for the passer

qb_throw = (
    input_df.loc[input_df["player_role"] == "Passer"]              # only quarterback rows
            .sort_values(["game_id","play_id","frame_id"])         # order frames chronologically
            .groupby(["game_id","play_id"], as_index=False)        # group by unique play
            .tail(1)[["game_id","play_id","nfl_id","x","y"]]       # keep the last frame (throw moment)
            .rename(columns={"nfl_id":"passer_nfl_id",             # rename columns for clarity
                             "x":"throw_x","y":"throw_y"})
)
# → qb_throw now holds each play’s QB location (x, y) at the exact throw frame.



# (b) TARGETED RECEIVER ID → identify who the pass was intended for

target_rec = (
    input_df.loc[input_df["player_role"] == "Targeted Receiver",   # filter to intended receivers
                 ["game_id","play_id","nfl_id"]]                   # keep identifiers only
            .drop_duplicates(["game_id","play_id"])                # ensure one receiver per play
            .rename(columns={"nfl_id":"target_nfl_id"})            # rename for clarity
)
# → target_rec is a simple lookup table linking each play to its targeted receiver’s nfl_id.


# (c) RECEIVER SNAPSHOT AT THROW → where the target receiver was pre-throw

rec_at_throw = (
    input_df.loc[input_df["player_role"] == "Targeted Receiver"]   # receiver’s tracking data
            .sort_values(["game_id","play_id","frame_id"])         # order frames chronologically
            .groupby(["game_id","play_id"], as_index=False)
            .tail(1)[["game_id","play_id","nfl_id","x","y","s","a","dir"]]  # last frame before throw
            .rename(columns={"nfl_id":"target_nfl_id",             # rename for clarity and joinability
                             "x":"rec_throw_x","y":"rec_throw_y",
                             "s":"rec_throw_s","a":"rec_throw_a","dir":"rec_throw_dir"})
)
# → rec_at_throw adds spatial (x,y) and movement features (speed, acceleration, direction)
#   for the targeted receiver at the moment the ball leaves the QB’s hand.


# (d) BALL LANDING LOCATION → where the ball is expected to land

ball_land = (
    input_df[["game_id","play_id","ball_land_x","ball_land_y"]]    # select landing coordinates
             .dropna(subset=["ball_land_x","ball_land_y"])         # remove rows missing landing data
             .drop_duplicates(["game_id","play_id"])               # one row per play
)
#   ball_land gives one landing coordinate per play, used to compare with receiver position
#   and later evaluate accuracy or completion probability.

In [25]:
#---STEP 6: Extract key outcome-level metadata for each play---#

# We'll keep only the columns that describe *what happened* on the play, not the tracking details.

# Select the outcome-level columns from supp_week1

# supp_week1 is your Week 1 version of the supplementary file, one row per pass play.
# Columns include contextual and result information (pass result, yards gained, EPA, etc.).

play_meta = supp_week1[
    ["game_id",                 # unique game identifier
     "play_id",                 # unique play identifier (within each game)
     "pass_result",             # categorical outcome: 'C' (complete), 'I' (incomplete), 'IN' (interception), etc.
     "yards_gained",            # numeric measure of play outcome in yards
     "expected_points_added"]   # EPA: expected change in scoring probability due to the play
]
# Result:

# play_meta is now a compact DataFrame containing one record per passing play
# with the most important *result* variables.
# This smaller table can be merged onto your tracking data (by game_id + play_id)
# to attach outcome labels to each play.

# Example shape and columns:
# >>> play_meta.head()
#     game_id   play_id pass_result  yards_gained  expected_points_added
# 0  2023090700     3461          C            18               2.945847
# 1  2023090700      461          C            21               1.345633
# 2  2023090700     1940          I             0              -0.081964

In [26]:
#---STEP 7: Tag each player’s tracking data with side/role, then isolate post-throw trajectories for the targeted receiver and all defenders---#

# Combine post-throw tracking data with player metadata (offense/defense, roles),
# then isolate trajectories for (a) the targeted receiver and (b) all defenders.
# These will be used to measure distances and movement patterns after the ball is thrown.

# (1) Annotate output rows with player side and role

out_tagged = output_df.merge(
    roles,                              # lookup table from input_df with player_side, player_role, and player_name
    on=["game_id","play_id","nfl_id"],  # join on the shared identifiers
    how="left",                         # keep all tracking rows from output_df
    validate="many_to_one"              # ensures many tracking frames link to one player-role record
)
# Result: each row in output_df now has added columns like:
#     player_side ('Offense' or 'Defense')
#     player_role ('Targeted Receiver', 'Passer', etc.)
# This provides context for interpreting movements after the throw.

# (2) Extract the targeted receiver's trajectory (after throw)

receiver_traj = (
    out_tagged
        .merge(target_rec, on=["game_id","play_id"], how="inner")  # attach target_nfl_id for each play
        .query("nfl_id == target_nfl_id")                          # keep only rows for that targeted player
        [["game_id","play_id","frame_id","nfl_id","x","y"]]         # keep essential tracking columns
        .rename(columns={"x":"rec_x","y":"rec_y"})                  # rename for clarity (receiver coordinates)
)
# Result: receiver_traj contains x/y positions of the targeted receiver across frames
# after the throw — useful for modeling the receiver’s movement toward the catch point.

# (3) Extract all defenders’ trajectories

defenders_traj = (
    out_tagged
        .query("player_side == 'Defense'")                         # filter to defensive players
        [["game_id","play_id","frame_id","nfl_id","x","y"]]         # retain tracking identifiers and coordinates
        .rename(columns={"x":"def_x","y":"def_y","nfl_id":"def_nfl_id"})  # rename for clarity
)
# Result: defenders_traj contains x/y positions for every defender at each frame
# after the throw — used to calculate nearest-defender distances to the receiver.

In [27]:
#---STEP 8: Build play-level anchors from pre-throw tracking---#

# (a) QB throw location, (b) targeted receiver ID,
# (c) receiver’s location/motion at throw, (d) ball landing spot.

# (a) THROW SNAPSHOT — QB’s last pre-throw frame (the release moment)
qb_throw = (
    input_df.loc[input_df["player_role"] == "Passer"]              # keep only QB rows
            .sort_values(["game_id", "play_id", "frame_id"])       # order frames within each play
            .groupby(["game_id", "play_id"], as_index=False)       # group per play
            .tail(1)                                               # take the last pre-throw frame
            [["game_id", "play_id", "nfl_id", "x", "y"]]           # keep IDs + position
            .rename(columns={                                       # rename for clarity
                "nfl_id": "passer_nfl_id",
                "x": "throw_x",
                "y": "throw_y"
            })
)
# Result: one row per play with QB’s (x,y) at the throw.

# (b) TARGETED RECEIVER ID — who the pass was intended for (one per play)
target_rec = (
    input_df.loc[input_df["player_role"] == "Targeted Receiver",   # intended receiver rows
                 ["game_id", "play_id", "nfl_id"]]
            .drop_duplicates(["game_id", "play_id"])                # ensure 1 receiver per play
            .rename(columns={"nfl_id": "target_nfl_id"})            # clearer name for later joins
)
# Result: lookup table (game_id, play_id) -> target_nfl_id.

# (c) RECEIVER SNAPSHOT AT THROW — target’s last pre-throw frame (position & motion)
rec_at_throw = (
    input_df.loc[input_df["player_role"] == "Targeted Receiver"]   # target receiver rows
            .sort_values(["game_id", "play_id", "frame_id"])       # order frames
            .groupby(["game_id", "play_id"], as_index=False)
            .tail(1)                                               # last pre-throw frame for target
            [["game_id","play_id","nfl_id","x","y","s","a","dir"]] # pos + kinematics
            .rename(columns={
                "nfl_id": "target_nfl_id",
                "x": "rec_throw_x",
                "y": "rec_throw_y",
                "s": "rec_throw_s",
                "a": "rec_throw_a",
                "dir": "rec_throw_dir"
            })
)
# Result: target’s (x,y,s,a,dir) at throw time per play.

# (d) BALL LANDING — expected landing coordinates (one per play, first non-null)
ball_land = (
    input_df[["game_id", "play_id", "ball_land_x", "ball_land_y"]] # select landing cols
            .dropna(subset=["ball_land_x", "ball_land_y"])         # require both coords
            .drop_duplicates(["game_id", "play_id"])               # one landing per play
)
# Result: (game_id, play_id) -> ball_land_x, ball_land_y.

In [30]:
#---STEP 9 Set up distance calculations---#

# Combine post-throw tracking data with player metadata, then separate trajectories for
# (a) the targeted receiver and (b) all defenders. This sets up distance calculations later.

# (1) Annotate each row of output_df with player side and role
out_tagged = output_df.merge(
    roles,                              # lookup of player info from input_df
    on=["game_id","play_id","nfl_id"],  # join keys for matching records
    how="left",                         # keep all tracking rows from output_df
    validate="many_to_one"              # ensures each player maps to one role record
)
#  out_tagged = output_df + context columns like player_side and player_role

# (2) Receiver trajectory after throw (target only)
receiver_traj = (
    out_tagged
        .merge(target_rec, on=["game_id","play_id"], how="inner")  # adds target_nfl_id per play
        .query("nfl_id == target_nfl_id")                          # keeps only the targeted receiver
        [["game_id","play_id","frame_id","nfl_id","x","y"]]         # select position data
        .rename(columns={"x":"rec_x","y":"rec_y"})                  # rename for clarity
)
#  receiver_traj holds the x/y positions of the targeted receiver across all post-throw frames.

# (3) Defenders’ trajectories (all defenders)
defenders_traj = (
    out_tagged
        .query("player_side == 'Defense'")                         # filter to defensive players
        [["game_id","play_id","frame_id","nfl_id","x","y"]]         # select core tracking columns
        .rename(columns={"x":"def_x","y":"def_y","nfl_id":"def_nfl_id"})
)
#  defenders_traj holds x/y positions for every defensive player per frame, ready to compare to receiver_traj.

In [33]:
#---STEP 10 Find the nearest defender---#

# Pair each receiver frame with all defenders in the same frame ---
# This creates all possible defender–receiver pairs for each frame of each play.
pairwise = defenders_traj.merge(
    receiver_traj,                      # receiver trajectory (target receiver only)
    on=["game_id", "play_id", "frame_id"],  # join on game, play, and frame
    how="inner"                         # keep only frames that appear in both datasets
)

# Calculate Euclidean distance between defender and receiver ---
# np.hypot computes sqrt(dx**2 + dy**2) safely and efficiently.
pairwise["dist_to_receiver"] = np.hypot(
    pairwise["def_x"] - pairwise["rec_x"],  # horizontal distance difference
    pairwise["def_y"] - pairwise["rec_y"]   # vertical distance difference
)

# Identify the nearest defender per frame ---
# Group by frame to find the minimum distance (closest defender) for that receiver.
nearest_per_frame = (
    pairwise
    .groupby(["game_id", "play_id", "frame_id"], as_index=False)
    .agg(min_def_dist=("dist_to_receiver", "min"))  # get smallest defender distance
)

# nearest_per_frame now contains:
# one row per frame with the minimum distance between the receiver and the closest defender.

In [37]:
#---STEP 11: Get the closest defender at the final frame---#

# For each play, find the maximum frame_id (the final frame)
last_frames = (
    receiver_traj
    .groupby(["game_id", "play_id"], as_index=False)["frame_id"]
    .max()
    .rename(columns={"frame_id": "last_output_frame"})
)

# Get the closest defender distance (min_def_dist) at the *final frame* of each play.
final_min = (
    nearest_per_frame
    .merge(
        last_frames,                             # contains last frame info per play (with last_output_frame)
        on=["game_id", "play_id"],               # join by game and play IDs
        how="inner"                              # only keep plays that appear in both
    )
    .query("frame_id == last_output_frame")      # keep only the *last frame* for each play
    .rename(columns={"min_def_dist": "final_min_def_dist"})  # rename for clarity
    [["game_id", "play_id", "final_min_def_dist"]]            # keep only relevant columns
)

# Compute the *average* closest-defender distance throughout each play.
mean_min = (
    nearest_per_frame
    .groupby(["game_id", "play_id"], as_index=False)["min_def_dist"]
    .mean()                                     # mean distance per play across all frames
    .rename(columns={"min_def_dist": "mean_min_def_dist"})   # rename for clarity
)


# Find how many frames (moments) each play has — useful for weighting or pacing.
frame_count = (
    receiver_traj
    .groupby(["game_id", "play_id"], as_index=False)["frame_id"]
    .nunique()                                  # number of unique frames for that receiver in each play
    .rename(columns={"frame_id": "n_output_frames"})          # rename for clarity
)

# Combine all summary metrics into one table 
# Merge mean distance, final-frame distance, and frame count into a single DataFrame.
play_dist_summ = (
    mean_min
    .merge(final_min, on=["game_id", "play_id"], how="left")  # add final-frame min distance
    .merge(frame_count, on=["game_id", "play_id"], how="left")# add frame count per play
)

# play_dist_summ now includes:
#  - mean_min_def_dist: average closest-defender distance across frames
#  - final_min_def_dist: distance at the last frame of the play
#  - n_output_frames: total frames in that play

In [38]:
#---STEP 11: Combine into 1 Table---#

# Merge coverage metrics (distances) with metadata and event-level info.
play_level = (
    play_dist_summ
    .merge(play_meta, on=["game_id", "play_id"], how="left")      # adds play outcomes (e.g., result, yards gained)
    .merge(qb_throw, on=["game_id", "play_id"], how="left")       # adds QB throw location (x, y at release)
    .merge(rec_at_throw, on=["game_id", "play_id"], how="left")   # adds receiver location at time of throw
    .merge(ball_land, on=["game_id", "play_id"], how="left")      # adds ball landing point (where pass ends)
    .merge(target_rec, on=["game_id", "play_id"], how="left")     # adds receiver target ID or metadata
)

# Goal: Measure straight-line distance between the QB's throw point and where the ball lands.
# np.hypot(dx, dy) computes the Euclidean distance sqrt(dx^2 + dy^2).
play_level["air_yards_est"] = np.hypot(
    play_level["ball_land_x"] - play_level["throw_x"],   # horizontal distance (x-axis)
    play_level["ball_land_y"] - play_level["throw_y"],   # vertical distance (y-axis)
)

# play_level now contains:
#   • mean_min_def_dist : average closest-defender distance per play
#   • final_min_def_dist : final-frame defender distance
#   • n_output_frames : number of frames analyzed
#   • play_meta fields :  play outcomes (yards, completion, etc.)
#   • qb_throw, rec_at_throw, ball_land : spatial event coordinates
#   • air_yards_est : estimated air yards from throw to landing

In [39]:
#---STEP 12: Results Summary---# 

summary = (
    play_level
    # Create a new Boolean column 'caught' 
    # .assign() adds or modifies columns; lambda d: d[...] references the DataFrame itself.
    # .eq("C") checks whether 'pass_result' equals "C" (caught), returning True/False.
    .assign(caught=lambda d: d["pass_result"].eq("C"))

    # Group plays by whether the pass was caught 
    # This separates plays into caught (True) and not caught (False) groups.
    .groupby("caught")[["final_min_def_dist", "mean_min_def_dist", "air_yards_est", "n_output_frames"]]

    # Describe each group statistically
    # .describe() computes count, mean, std, min, quartiles, and max for each numeric column.
    .describe()
    .round(3)  # round all numeric results to 3 decimal places for readability
)

# --- Display the results ---
print(summary)


       final_min_def_dist                                                    \
                    count   mean    std    min    25%    50%    75%     max   
caught                                                                        
False               251.0  2.079  1.942  0.020  0.851  1.386  2.623  14.281   
True                502.0  3.591  2.413  0.261  1.812  3.087  4.686  14.830   

       mean_min_def_dist         ... air_yards_est         n_output_frames  \
                   count   mean  ...           75%     max           count   
caught                           ...                                         
False              251.0  2.608  ...        31.598  61.840           251.0   
True               502.0  4.219  ...        23.764  51.271           502.0   

                                                    
          mean    std  min  25%   50%    75%   max  
caught                                              
False   13.339  6.226  5.0  9.0  11.0  16.00  34.0  
T