<a href="https://colab.research.google.com/github/fdac25/football/blob/main/football.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import kagglehub

In [24]:
# Import the dataset. Too big to include in GitHub so have to download it to colab

# Download latest version
path = kagglehub.dataset_download("philiphyde1/nfl-stats-1999-2022") # To access just use path variable

print("Path to dataset files:", path)

!dir {path}

Path to dataset files: C:\Users\chris\.cache\kagglehub\datasets\philiphyde1\nfl-stats-1999-2022\versions\17
 Volume in drive C is OS
 Volume Serial Number is 0C1F-C8E2

 Directory of C:\Users\chris\.cache\kagglehub\datasets\philiphyde1\nfl-stats-1999-2022\versions\17

11/11/2025  04:33 PM    <DIR>          .
11/11/2025  04:33 PM    <DIR>          ..
11/11/2025  04:33 PM        65,567,697 weekly_player_stats_defense.csv
11/11/2025  04:33 PM       105,635,176 weekly_player_stats_offense.csv
11/11/2025  04:33 PM         1,711,446 weekly_team_stats_defense.csv
11/11/2025  04:33 PM         3,692,917 weekly_team_stats_offense.csv
11/11/2025  04:33 PM        13,455,015 yearly_player_stats_defense.csv
11/11/2025  04:33 PM        27,959,466 yearly_player_stats_offense.csv
11/11/2025  04:33 PM           190,956 yearly_team_stats_defense.csv
11/11/2025  04:33 PM           450,662 yearly_team_stats_offense.csv
               8 File(s)    218,663,335 bytes
               2 Dir(s)  290,359,762,944 b

In [25]:
#Display the first few rows of the dataset to understand its structure and contents.
df_yearly_player_stats = pd.read_csv(path + "/yearly_player_stats_offense.csv")
df_yearly_player_stats.head()

Unnamed: 0,player_id,player_name,position,birth_year,draft_year,draft_round,draft_pick,draft_ovr,height,weight,...,delta_comp_pct,delta_int_pct,delta_pass_td_pct,delta_ypa,delta_yptarget,delta_ypr,delta_rush_td_pct,delta_ypc,delta_td_pct,delta_yptouch
0,00-0000865,Charlie Batch,QB,1974,1998.0,2.0,30.0,60.0,74.0,216.0,...,0.220036,-1.0,-1.0,0.025641,0.0,0.0,0.0,0.0,-0.909091,-0.388795
1,00-0004541,Donald Driver,WR,1975,1999.0,7.0,7.0,213.0,72.0,194.0,...,0.0,,,0.0,-0.172028,-0.068732,0.0,0.0,49.0,1.226852
2,00-0006101,Tony Gonzalez,TE,1976,1997.0,1.0,13.0,13.0,77.0,247.0,...,0.0,0.0,0.0,0.0,0.273859,-0.21617,0.0,0.0,3.833333,0.236242
3,00-0006101,Tony Gonzalez,TE,1976,1997.0,1.0,13.0,13.0,77.0,247.0,...,0.0,,,0.0,-0.165038,0.085776,0.0,0.0,-0.413793,0.085776
4,00-0007091,Matt Hasselbeck,QB,1975,1998.0,6.0,34.0,187.0,76.0,235.0,...,0.149281,-1.0,-1.0,-0.162698,0.0,0.0,-1.0,-0.337868,0.5,0.250509


In [26]:
df_yearly_team_stats = pd.read_csv(path + "/yearly_team_stats_offense.csv")
df_yearly_team_stats.head()

Unnamed: 0,team,season,season_type,shotgun,no_huddle,qb_dropback,qb_scramble,total_off_yards,pass_attempts,complete_pass,...,yptarget,ayptarget,ypr,rush_td_pct,ypc,touches,total_tds,td_pct,total_yards,yptouch
0,ARI,2012,REG,605.0,56.0,676.0,12.0,4587,587.0,337.0,...,5.76,8.28,10.04,0.028,3.42,1276.0,32.0,0.03,7970.0,6.25
1,ATL,2012,POST,64.0,3.0,80.0,2.0,894,74.0,54.0,...,8.73,12.43,11.96,0.0,5.06,177.0,12.0,0.07,1540.0,8.7
2,ATL,2012,REG,418.0,101.0,664.0,20.0,6116,601.0,422.0,...,7.85,8.33,11.18,0.032,3.69,1402.0,76.0,0.05,10835.0,7.73
3,BAL,2012,POST,97.0,53.0,134.0,2.0,1679,126.0,73.0,...,9.05,12.23,15.62,0.036,3.88,338.0,27.0,0.08,2819.0,8.34
4,BAL,2012,REG,452.0,257.0,608.0,9.0,5897,549.0,334.0,...,7.28,10.01,11.96,0.043,4.28,1327.0,63.0,0.05,9893.0,7.46


In [27]:
# Filter the dataframe to keep only the columns we need (Only year 2024, those not POST season, and things to calculate our score)
df_yearly_player_stats_filtered = df_yearly_player_stats.loc[(df_yearly_player_stats['season'] == 2024) & (df_yearly_player_stats['season_type'] != 'POST'),
                                                             ['player_name', 'position', 'season_complete_pass', 'season_pass_attempts', 'season_incomplete_pass', 'season_passing_yards',
                                                              'season_receiving_yards', 'season_rush_attempts', 'season_rushing_yards', 'season_fantasy_points_ppr', 'games_played_season',
                                                              'season_rush_touchdown', 'season_pass_touchdown', 'season_interception', 'season_fumble', 'season_fumble_lost',
                                                              'season_receptions', 'season_targets', 'season_receiving_touchdown'
                                                             ]]

# Examples
players = ['Amon-Ra St. Brown', 'Lamar Jackson', 'Dak Prescott', 'George Kittle', 'Sam LaPorta', 'Justin Jefferson', 'Bijan Robinson', 'Saquon Barkley', 'Cameron Dicker']
df_selected_players = df_yearly_player_stats_filtered[df_yearly_player_stats_filtered['player_name'].isin(players)]
df_selected_players

Unnamed: 0,player_name,position,season_complete_pass,season_pass_attempts,season_incomplete_pass,season_passing_yards,season_receiving_yards,season_rush_attempts,season_rushing_yards,season_fantasy_points_ppr,games_played_season,season_rush_touchdown,season_pass_touchdown,season_interception,season_fumble,season_fumble_lost,season_receptions,season_targets,season_receiving_touchdown
6630,Dak Prescott,QB,185.0,278.0,93.0,1978.0,0.0,13.0,54.0,137.52,8,1.0,11.0,8.0,4.0,1.0,0.0,0.0,0.0
6642,George Kittle,TE,0.0,0.0,0.0,0.0,939.0,0.0,0.0,176.9,12,0.0,4.0,1.0,0.0,0.0,61.0,71.0,4.0
6724,Lamar Jackson,QB,316.0,470.0,154.0,4172.0,0.0,140.0,915.0,505.38,17,4.0,41.0,4.0,10.0,5.0,0.0,0.0,0.0
6732,Saquon Barkley,RB,0.0,0.0,0.0,0.0,278.0,348.0,2005.0,348.3,16,13.0,2.0,0.0,2.0,1.0,33.0,43.0,2.0
6814,Justin Jefferson,WR,1.0,1.0,0.0,22.0,1533.0,1.0,3.0,307.48,17,0.0,10.0,5.0,1.0,0.0,103.0,149.0,10.0
6878,Amon-Ra St. Brown,WR,1.0,1.0,0.0,7.0,1263.0,2.0,6.0,327.18,17,0.0,15.0,1.0,1.0,1.0,115.0,140.0,14.0
6978,Bijan Robinson,RB,0.0,0.0,0.0,0.0,431.0,305.0,1456.0,339.7,17,14.0,1.0,0.0,1.0,0.0,61.0,72.0,1.0
7041,Sam LaPorta,TE,0.0,0.0,0.0,0.0,726.0,0.0,0.0,170.6,16,0.0,7.0,2.0,0.0,0.0,60.0,81.0,7.0


In [28]:
# Explore the positions we have in our dataset
print("Unique positions in dataset:")
unique_positions = df_yearly_player_stats_filtered['position'].unique()
print(unique_positions)

print("\nPlayers per position:")
position_counts = df_yearly_player_stats_filtered['position'].value_counts()
print(position_counts)

Unique positions in dataset:
['QB' 'TE' 'P' 'FB' 'WR' 'RB' 'SS' 'OLB' 'CB' 'DE' 'DT' 'FS' 'MLB']

Players per position:
position
WR     188
RB     109
TE     103
QB      74
FB       6
P        5
SS       5
CB       5
FS       2
OLB      1
DE       1
DT       1
MLB      1
Name: count, dtype: int64


In [29]:
# Clean data - removing players with missing data

# Remove any rows that don't have a player name, position, or games played
df_yearly_player_stats_filtered = df_yearly_player_stats_filtered.dropna(subset=(['player_name', 'position', 'games_played_season']))


# Replace any missing numbers with 0 (means they didn't do that stat)
numeric_columns = df_yearly_player_stats_filtered.select_dtypes(include=[np.number]).columns
df_yearly_player_stats_filtered[numeric_columns] = df_yearly_player_stats_filtered[numeric_columns].fillna(0)

# Reset the index after dropping rows
df_yearly_player_stats_filtered = df_yearly_player_stats_filtered.reset_index(drop=True)

print("\nAfter cleaning Players per position:")
position_counts = df_yearly_player_stats_filtered['position'].value_counts()
print(position_counts)


After cleaning Players per position:
position
WR     188
RB     109
TE     103
QB      74
FB       6
P        5
SS       5
CB       5
FS       2
OLB      1
DE       1
DT       1
MLB      1
Name: count, dtype: int64


In [30]:
# Only focusing on the main offensive positions, combining FB with RB
offensive_positions = ['QB', 'RB', 'WR', 'TE', 'FB']

df_yearly_player_stats_filtered = df_yearly_player_stats_filtered[df_yearly_player_stats_filtered['position'].isin(offensive_positions)]

# combine the fb positions into the rb column
df_yearly_player_stats_filtered['position'] = df_yearly_player_stats_filtered['position'].replace('FB', 'RB')

print(f"Total offensive position players: {len(df_yearly_player_stats_filtered)}")
print(df_yearly_player_stats_filtered['position'].value_counts())

Total offensive position players: 480
position
WR    188
RB    115
TE    103
QB     74
Name: count, dtype: int64
