In [1]:
# Imports
from pathlib import Path
import sys
import pandas as pd

In [None]:
"""Must download the dataset from Kaggle and place it in the parent directory as 
'nfl_playbyPlay_dataset_2009_2018.csv' before running this script."""

# Location of the raw play-by-play CSV, resolved relative to the current working directory.
csv_path = "../nfl_playbyPlay_dataset_2009_2018.csv"

print('Loading CSV (this may take a while for large files)...')
# low_memory=False: parse the file as a whole instead of in internal chunks, which
# avoids mixed-type inference within a column at the cost of higher peak memory.
df = pd.read_csv(filepath_or_buffer=csv_path, low_memory=False)
print("Shape of original dataset: ", df.shape)

Loading CSV (this may take a while for large files)...
Shape of original dataset:  (449371, 255)


In [11]:
# Distribution of play types over the full dataset.
# NOTE: value_counts() skips missing values by default, so rows with a null
# play_type are not counted and the percentages are relative to non-null plays.
play_dist = df['play_type'].value_counts()
# Derive the percentages from the counts instead of scanning the column a second time.
play_dist_pct = play_dist / play_dist.sum() * 100

# Counts and percentages side by side, most frequent play type first.
dist_df = pd.DataFrame({
    'Count': play_dist,
    'Percentage': play_dist_pct.round(2),
}).sort_values('Count', ascending=False)

print("Play Type Distribution:")
print("-" * 50)
# float_format is only applied to float values, i.e. the Percentage column here;
# the integer Count column is rendered with pandas' default integer formatting.
print(dist_df.to_string(float_format="{:,.2f}".format))

Play Type Distribution:
--------------------------------------------------
              Count  Percentage
play_type                      
pass         186677       42.77
run          132692       30.40
no_play       42431        9.72
kickoff       25552        5.85
punt          23914        5.48
extra_point   10934        2.50
field_goal     9777        2.24
qb_kneel       3830        0.88
qb_spike        690        0.16


In [12]:
# Keep only rows whose play_type is 'pass' or 'run'. Every other play type is
# dropped, as are rows with a missing play_type (isin() is False for nulls).
keep_types = {'pass', 'run'}
is_kept = df['play_type'].isin(keep_types)

# .copy() makes `filtered` an independent frame rather than a selection of `df`.
filtered = df.loc[is_kept].copy()
print(filtered.shape)

(319369, 255)


In [13]:
# Distribution of play types in the filtered frame (sanity check of the filter).
# NOTE: value_counts() skips missing values by default, so the percentages are
# relative to rows with a non-null play_type.
play_dist = filtered['play_type'].value_counts()
# Derive the percentages from the counts instead of scanning the column a second time.
play_dist_pct = play_dist / play_dist.sum() * 100

# Counts and percentages side by side, most frequent play type first.
dist_df = pd.DataFrame({
    'Count': play_dist,
    'Percentage': play_dist_pct.round(2),
}).sort_values('Count', ascending=False)

print("Play Type Distribution:")
print("-" * 50)
# float_format is only applied to float values, i.e. the Percentage column here;
# the integer Count column is rendered with pandas' default integer formatting.
print(dist_df.to_string(float_format="{:,.2f}".format))

Play Type Distribution:
--------------------------------------------------
            Count  Percentage
play_type                    
pass       186677       58.45
run        132692       41.55


In [15]:
# Persist the filtered frame as a CSV in the parent directory of the working
# directory. index=False leaves the DataFrame's row index out of the file.
out = Path("..") / "filtered_dataset.csv"
filtered.to_csv(path_or_buf=out, index=False)