# 01 — Data Acquisition

Fetch all NFL data from `nfl_data_py` and cache locally as parquet files.

**Data sources:**
- Weekly player stats (2015-2024)
- Seasonal rosters (2015-2024)
- Contract data (from Over The Cap)
- Snap counts (2015-2024)
- Player directory (`players`) and player ID crosswalk (`ids`)
- PFR seasonal stats — passing, rushing, receiving, defense (2018-2024)

In [1]:
import sys

# Put the parent directory on the import path so the `src` package resolves
# (assumes the kernel's working directory is this notebook's folder).
# Guarded so that re-running the cell does not stack duplicate entries.
if '..' not in sys.path:
    sys.path.insert(0, '..')

from src.data_loader import (
    load_weekly_stats, load_rosters, load_contracts, load_snap_counts,
    load_players, load_ids, load_pfr_stats, load_all, DATA_DIR
)

# Show which data directory the loader module is configured with.
print(f"Data directory: {DATA_DIR}")

E:\Python\Python38-32\lib\site-packages\numpy\.libs\libopenblas.D6ALFJ4QQDWP6YNOQJNPYL27LRE6SILT.gfortran-win32.dll
E:\Python\Python38-32\lib\site-packages\numpy\.libs\libopenblas_v0.3.21-gcc_8_3_0.dll


Data directory: G:\ai\nfl\data


## Fetch All Datasets

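Set `FORCE = True` in the cell below (it is passed to `load_all` as `force_refresh`) to re-download everything from the internet; with `FORCE = False` the cached parquet files are used.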

In [2]:
# Flip to True to re-download everything instead of using the local cache.
FORCE = False

# load_all returns a mapping of dataset name -> DataFrame.
datasets = load_all(force_refresh=FORCE)

# One summary line per dataset: name, row count, column count.
print("\n=== Dataset Summary ===")
for name in datasets:
    df = datasets[name]
    print(f"{name:20s}: {df.shape[0]:>8,} rows x {df.shape[1]:>3} cols")

Loading cached weekly_stats from G:\ai\nfl\data\weekly_stats.parquet
Loading cached rosters from G:\ai\nfl\data\rosters.parquet
Loading cached contracts from G:\ai\nfl\data\contracts.parquet
Loading cached snap_counts from G:\ai\nfl\data\snap_counts.parquet
Loading cached players from G:\ai\nfl\data\players.parquet
Loading cached ids from G:\ai\nfl\data\ids.parquet
Loading cached pfr_pass from G:\ai\nfl\data\pfr_pass.parquet
Loading cached pfr_rush from G:\ai\nfl\data\pfr_rush.parquet
Loading cached pfr_rec from G:\ai\nfl\data\pfr_rec.parquet
Loading cached pfr_def from G:\ai\nfl\data\pfr_def.parquet

=== Dataset Summary ===
weekly_stats        :   54,479 rows x  53 cols
rosters             :   30,050 rows x  37 cols
contracts           :   50,164 rows x  25 cols
snap_counts         :  250,336 rows x  16 cols
players             :   24,356 rows x  39 cols
ids                 :   12,187 rows x  35 cols
pfr_pass            :      750 rows x  37 cols
pfr_rush            :    2,471 rows x 

## Explore Each Dataset

In [3]:
# Weekly player stats: column inventory, season coverage, and a sample ID.
weekly = datasets['weekly_stats']
print("Weekly Stats columns:")
print(weekly.columns.tolist())
# .tolist() converts NumPy scalars to plain Python ints so the list prints as
# [2015, 2016, ...] on every NumPy version (NumPy 2 would otherwise show
# np.int64(2015) for each element).
print(f"\nSeasons: {sorted(weekly['season'].unique().tolist())}")
print(f"Sample player_id: {weekly['player_id'].iloc[0]}")
weekly.head(3)

Weekly Stats columns:
['player_id', 'player_name', 'player_display_name', 'position', 'position_group', 'headshot_url', 'recent_team', 'season', 'week', 'season_type', 'opponent_team', 'completions', 'attempts', 'passing_yards', 'passing_tds', 'interceptions', 'sacks', 'sack_yards', 'sack_fumbles', 'sack_fumbles_lost', 'passing_air_yards', 'passing_yards_after_catch', 'passing_first_downs', 'passing_epa', 'passing_2pt_conversions', 'pacr', 'dakota', 'carries', 'rushing_yards', 'rushing_tds', 'rushing_fumbles', 'rushing_fumbles_lost', 'rushing_first_downs', 'rushing_epa', 'rushing_2pt_conversions', 'receptions', 'targets', 'receiving_yards', 'receiving_tds', 'receiving_fumbles', 'receiving_fumbles_lost', 'receiving_air_yards', 'receiving_yards_after_catch', 'receiving_first_downs', 'receiving_epa', 'receiving_2pt_conversions', 'racr', 'target_share', 'air_yards_share', 'wopr', 'special_teams_tds', 'fantasy_points', 'fantasy_points_ppr']

Seasons: [2015, 2016, 2017, 2018, 2019, 2020, 202

Unnamed: 0,player_id,player_name,player_display_name,position,position_group,headshot_url,recent_team,season,week,season_type,...,receiving_first_downs,receiving_epa,receiving_2pt_conversions,racr,target_share,air_yards_share,wopr,special_teams_tds,fantasy_points,fantasy_points_ppr
0,00-0007091,,Matt Hasselbeck,QB,QB,https://static.www.nfl.com/image/private/f_aut...,IND,2015,4,REG,...,0.0,,0,,,,,0.0,15.38,15.38
1,00-0007091,,Matt Hasselbeck,QB,QB,https://static.www.nfl.com/image/private/f_aut...,IND,2015,5,REG,...,0.0,,0,,,,,0.0,16.219999,16.219999
2,00-0007091,,Matt Hasselbeck,QB,QB,https://static.www.nfl.com/image/private/f_aut...,IND,2015,11,REG,...,0.0,,0,,,,,0.0,13.32,13.32


In [4]:
# Contracts: column inventory, ID / cap-percentage coverage, and the five
# rows with the largest apy_cap_pct.
contracts = datasets['contracts']
print("Contracts columns:")
print(contracts.columns.tolist())
print(f"\nRows with gsis_id: {contracts['gsis_id'].notna().sum():,} / {len(contracts):,}")
print(f"Rows with apy_cap_pct: {contracts['apy_cap_pct'].notna().sum():,}")

# Columns shown in the top-5 preview.
top_contract_cols = ['player', 'position', 'team', 'apy_cap_pct', 'apy', 'year_signed', 'gsis_id']
contracts.nlargest(5, 'apy_cap_pct')[top_contract_cols]

Contracts columns:
['player', 'position', 'team', 'is_active', 'year_signed', 'years', 'value', 'apy', 'guaranteed', 'apy_cap_pct', 'inflated_value', 'inflated_apy', 'inflated_guaranteed', 'player_page', 'otc_id', 'gsis_id', 'date_of_birth', 'height', 'weight', 'college', 'draft_year', 'draft_round', 'draft_overall', 'draft_team', 'cols']

Rows with gsis_id: 46,685 / 50,164
Rows with apy_cap_pct: 50,164


Unnamed: 0,player,position,team,apy_cap_pct,apy,year_signed,gsis_id
0,Joe Burrow,QB,Bengals,0.245,55.0,2023,00-0036442
1,Aaron Rodgers,QB,GB/NYJ,0.241,50.271667,2022,00-0023459
2,Josh Allen,QB,Bills,0.236,43.0,2021,00-0034857
3,Russell Wilson,QB,Broncos,0.235,49.0,2022,00-0029263
4,Dak Prescott,QB,Cowboys,0.235,60.0,2024,00-0033077


In [5]:
# Snap counts: column inventory, season coverage, and distinct player count.
snaps = datasets['snap_counts']
print("Snap Count columns:")
print(snaps.columns.tolist())
# .tolist() converts NumPy scalars to plain Python ints so the season list
# prints as [2015, 2016, ...] regardless of the installed NumPy version.
print(f"\nSeasons: {sorted(snaps['season'].unique().tolist())}")
print(f"Unique players: {snaps['pfr_player_id'].nunique():,}")
snaps.head(3)

Snap Count columns:
['game_id', 'pfr_game_id', 'season', 'game_type', 'week', 'player', 'pfr_player_id', 'position', 'team', 'opponent', 'offense_snaps', 'offense_pct', 'defense_snaps', 'defense_pct', 'st_snaps', 'st_pct']

Seasons: [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
Unique players: 5,905


Unnamed: 0,game_id,pfr_game_id,season,game_type,week,player,pfr_player_id,position,team,opponent,offense_snaps,offense_pct,defense_snaps,defense_pct,st_snaps,st_pct
0,2015_01_BAL_DEN,201509130den,2015,REG,1,Louis Vasquez,VasqLo20,G,DEN,BAL,70.0,1.0,0.0,0.0,5.0,0.17
1,2015_01_BAL_DEN,201509130den,2015,REG,1,Matt Paradis,ParaMa00,C,DEN,BAL,70.0,1.0,0.0,0.0,5.0,0.17
2,2015_01_BAL_DEN,201509130den,2015,REG,1,Ryan Harris,HarrRy20,T,DEN,BAL,70.0,1.0,0.0,0.0,5.0,0.17


In [6]:
# Player table: count rows that have a gsis_id, a pfr_id, or both, then
# preview the identifying columns.
players = datasets['players']
print("Players columns:")
print(players.columns.tolist())
print(f"\nWith gsis_id: {players['gsis_id'].notna().sum():,}")
print(f"With pfr_id: {players['pfr_id'].notna().sum():,}")
print(f"Both: {(players['gsis_id'].notna() & players['pfr_id'].notna()).sum():,}")

# Columns shown in the five-row preview.
player_preview_cols = ['gsis_id', 'display_name', 'position', 'pfr_id']
players[player_preview_cols].head(5)

Players columns:
['gsis_id', 'display_name', 'common_first_name', 'first_name', 'last_name', 'short_name', 'football_name', 'suffix', 'esb_id', 'nfl_id', 'pfr_id', 'pff_id', 'otc_id', 'espn_id', 'smart_id', 'birth_date', 'position_group', 'position', 'ngs_position_group', 'ngs_position', 'height', 'weight', 'headshot', 'college_name', 'college_conference', 'jersey_number', 'rookie_season', 'last_season', 'latest_team', 'status', 'ngs_status', 'ngs_status_short_description', 'years_of_experience', 'pff_position', 'pff_status', 'draft_year', 'draft_round', 'draft_pick', 'draft_team']

With gsis_id: 24,356
With pfr_id: 22,194
Both: 22,194


Unnamed: 0,gsis_id,display_name,position,pfr_id
0,00-0028830,Isaako Aaitui,NT,AaitIs00
1,00-0038389,Israel Abanikanda,RB,AbanIs00
2,00-0024644,Jon Abbate,LB,
3,ABB498348,Vince Abbott,K,abbotvin01
4,00-0031021,Jared Abbrederis,WR,AbbrJa00


In [7]:
# Summarise each PFR seasonal table: shape, season coverage, and columns.
for stat_type in ['pass', 'rush', 'rec', 'def']:
    df = datasets[f'pfr_{stat_type}']
    # .tolist() converts NumPy scalars to plain Python ints so the season list
    # prints as [2018, 2019, ...] regardless of the installed NumPy version.
    seasons = sorted(df['season'].unique().tolist())
    print(f"PFR {stat_type}: {df.shape[0]:>5} rows x {df.shape[1]:>2} cols | Seasons: {seasons}")
    print(f"  Columns: {df.columns.tolist()}")
    print()

PFR pass:   750 rows x 37 cols | Seasons: [2018, 2019, 2020, 2021, 2022, 2023, 2024]
  Columns: ['player', 'team', 'pass_attempts', 'throwaways', 'spikes', 'drops', 'drop_pct', 'bad_throws', 'bad_throw_pct', 'season', 'pfr_id', 'pocket_time', 'times_blitzed', 'times_hurried', 'times_hit', 'times_pressured', 'pressure_pct', 'batted_balls', 'on_tgt_throws', 'on_tgt_pct', 'rpo_plays', 'rpo_yards', 'rpo_pass_att', 'rpo_pass_yards', 'rpo_rush_att', 'rpo_rush_yards', 'pa_pass_att', 'pa_pass_yards', 'intended_air_yards', 'intended_air_yards_per_pass_attempt', 'completed_air_yards', 'completed_air_yards_per_completion', 'completed_air_yards_per_pass_attempt', 'pass_yards_after_catch', 'pass_yards_after_catch_per_completion', 'scrambles', 'scramble_yards_per_attempt']

PFR rush:  2471 rows x 19 cols | Seasons: [2018, 2019, 2020, 2021, 2022, 2023, 2024]
  Columns: ['season', 'player', 'pfr_id', 'tm', 'age', 'pos', 'g', 'gs', 'att', 'yds', 'td', 'x1d', 'ybc', 'ybc_att', 'yac', 'yac_att', 'brk_tkl

In [8]:
# Rosters: column inventory, distinct positions, and season coverage.
rosters = datasets['rosters']
print("Roster columns:")
print(rosters.columns.tolist())
# Missing positions are dropped before sorting.
print(f"\nPositions: {sorted(rosters['position'].dropna().unique())}")
# .tolist() converts NumPy scalars to plain Python ints so the season list
# prints as [2015, 2016, ...] regardless of the installed NumPy version.
print(f"Seasons: {sorted(rosters['season'].unique().tolist())}")

Roster columns:
['season', 'team', 'position', 'depth_chart_position', 'jersey_number', 'status', 'player_name', 'first_name', 'last_name', 'birth_date', 'height', 'weight', 'college', 'player_id', 'espn_id', 'sportradar_id', 'yahoo_id', 'rotowire_id', 'pff_id', 'pfr_id', 'fantasy_data_id', 'sleeper_id', 'years_exp', 'headshot_url', 'ngs_position', 'week', 'game_type', 'status_description_abbr', 'football_name', 'esb_id', 'gsis_it_id', 'smart_id', 'entry_year', 'rookie_year', 'draft_club', 'draft_number', 'age']

Positions: ['C', 'CB', 'DB', 'DE', 'DL', 'DT', 'FB', 'FS', 'G', 'ILB', 'K', 'LB', 'LS', 'MLB', 'NT', 'OL', 'OLB', 'P', 'PR', 'QB', 'RB', 'S', 'SS', 'T', 'TE', 'WR']
Seasons: [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]


## Verify Cached Files

In [9]:
import os

# List every parquet file in the data directory with its size in MB.
print("Cached files in data/:")
for f in sorted(DATA_DIR.glob('*.parquet')):
    size_mb = os.path.getsize(f) / 1024 / 1024
    print(f"  {f.name:30s}  {size_mb:.1f} MB")

# Only report success when every loaded dataset has a cache file on disk.
# NOTE(review): assumes cache files are named `<dataset>.parquet`, as the
# loader's "Loading cached ..." messages suggest — confirm against
# src.data_loader.
missing = [name for name in datasets if not (DATA_DIR / f"{name}.parquet").exists()]
if missing:
    print(f"\nMissing cache files for: {', '.join(missing)}")
else:
    print("\nAll datasets loaded and cached successfully!")

Cached files in data/:
  analysis_ready.parquet          0.9 MB
  contracts.parquet               3.2 MB
  ids.parquet                     1.4 MB
  pfr_def.parquet                 0.4 MB
  pfr_pass.parquet                0.1 MB
  pfr_rec.parquet                 0.2 MB
  pfr_rush.parquet                0.1 MB
  players.parquet                 3.4 MB
  rosters.parquet                 4.6 MB
  scored.parquet                  1.0 MB
  snap_counts.parquet             6.5 MB
  weekly_stats.parquet            3.1 MB

All datasets loaded and cached successfully!
