In [1]:
import warnings 
import pandas as pd
import numpy as np
import nfl_data_py as nfl
import datetime as dt
import copy
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)
# Suppress FutureWarnings
warnings.filterwarnings("ignore", category=FutureWarning)


# Background/Ideas

- Features will be based off seasonal,weekly, and career based values
- Idea is that certain players which can be differentiated by career based values paired with weekly performance and or seasonal (team strength proxy) can be paired to build something relatively predictive.
- Interactivity can be dependent on clicking and choosing assortment of players and identifying/projecting current projections.
- Data seems to get updated weekly so these predictions would change over time as well.

# Data

This section focuses on pulling the data and prepping/aggregating the dependent variable. (Fantasy Points)

In [2]:
roster_data = nfl.import_seasonal_rosters([2024,2023,2022,2021,2020,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000,1999])
pbp_df = pd.DataFrame(nfl.import_pbp_data([2024,2023,2022,2021,2020,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000,1999]))
weekly_df = pd.DataFrame(nfl.import_weekly_data([2024,2023,2022,2021,2020,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000,1999]))
# injuries_df = pd.DataFrame(nfl.import_injuries([2024,2023,2022,2021,2020,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000]))
schedules_df = pd.DataFrame(nfl.import_schedules([2024,2023,2022,2021,2020,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000,1999]))

2024 done.
2023 done.
2022 done.
2021 done.
2020 done.
2019 done.
2018 done.
2017 done.
2016 done.
2015 done.
2014 done.
2013 done.
2012 done.
2011 done.
2010 done.
2009 done.
2008 done.
2007 done.
2006 done.
2005 done.
2004 done.
2003 done.
2002 done.
2001 done.
2000 done.
1999 done.
Downcasting floats.
Downcasting floats.


# Basic Player Related Stats 

**Game by Game**

0                None
1                None
2                None
3          00-0035228
4          00-0035228
              ...    
1202652    00-0011024
1202653    00-0011024
1202654          None
1202655    00-0011024
1202656          None
Name: passer_player_id, Length: 1202657, dtype: object

In [35]:
team['depth_chart_position'].unique()

array(['T', 'QB', 'P', 'K', 'TE', 'LS', 'DE', 'SS', 'NT', 'DT', 'C',
       'ILB', 'FS', 'CB', 'G', 'MLB', 'FB', 'WR', 'RB', 'OLB', 'DB', 'LB',
       'OT', 'OG', 'S', 'SAF', 'HB', 'DL', None, 'OL', 'PR'], dtype=object)

In [36]:
## Basic PBP Passing Stats

def get_opposing_team(df):
    if df['home_team'] == df['team']:
        val = df['away_team']
    elif df['away_team'] == df['team']:
        val = df['home_team']
    else:
        val = None

    return val

passing_stats = pbp_df[~pbp_df['passer_player_id'].isna()].copy()

passing_stats = pbp_df.groupby(['game_id', 'game_date','season', 'week', 'div_game', 'home_team', 'away_team', 'weather', 'location', 'stadium',  'spread_line', 'total_line', 'roof', 'surface', 'temp', 'wind', 'home_coach', 'away_coach', 'passer_player_id', 'passer_player_name']).agg({
    'pass_attempt': 'sum',
    'complete_pass': 'sum',
    'passing_yards': 'sum',
    'air_yards': 'sum',
    'pass_touchdown': 'sum',
    'interception': 'sum',
    'was_pressure': 'sum',
    'rush_attempt': 'sum',
    'rushing_yards': 'sum',# Sum passing yards
    'rush_touchdown': 'sum',
    'lateral_rush': 'sum',
    'fumble': 'sum'
}).reset_index()


## Grabbing seasonal info

team = roster_data[roster_data['depth_chart_position'] == 'QB'][['season','player_id','team','depth_chart_position']]

team['team'] = team['team'].replace({'OAK':'LV', 'STL':'LA', 'SD':'LAC','HST':'HOU', 'BLT':'BAL', 'CLV':'CLE','SL':'LA','ARZ':'ARI'})


team.rename(columns = {'player_id':'passer_player_id'},inplace = True)

passing_stats = passing_stats.merge(team, on = ['passer_player_id','season'], how = 'inner')


## Aggregate average score to opposition 

passing_stats['opponent_team'] = passing_stats.apply(get_opposing_team,axis = 1)

print('Number Missing Opponent:' + str(passing_stats[passing_stats['opponent_team'].isna()].shape[0]))
passing_stats = passing_stats[~passing_stats['opponent_team'].isna()]

home_teams = schedules_df[['season','home_team','away_score']].copy()

away_teams = schedules_df[['season','away_team','home_score']].copy()

home_teams.rename(columns = {'home_team':'team','away_score':'points_allowed'}, inplace = True)
away_teams.rename(columns = {'away_team':'team','home_score':'points_allowed'}, inplace = True)

points_allowed_df = pd.concat([home_teams,away_teams])


points_allowed_df['avg_points_allowed'] = points_allowed_df.groupby(['season','team'])['points_allowed'].transform('mean')


points_allowed_df.rename(columns = {'team':'opponent_team'},inplace = True)


kicker_stats = passing_stats.merge(points_allowed_df[['season','opponent_team','avg_points_allowed']].drop_duplicates(), on = ['opponent_team','season'], how = 'left')

#Checking the passing stats dataframe
passing_stats.head(2)

Number Missing Opponent:16


Unnamed: 0,game_id,game_date,season,week,div_game,home_team,away_team,weather,location,stadium,spread_line,total_line,roof,surface,temp,wind,home_coach,away_coach,passer_player_id,passer_player_name,pass_attempt,complete_pass,passing_yards,air_yards,pass_touchdown,interception,was_pressure,rush_attempt,rushing_yards,rush_touchdown,lateral_rush,fumble,team,depth_chart_position,opponent_team
0,2001_01_ATL_SF,2001-09-09,2001,1,1,SF,ATL,"partly cloudy Temp: 68° F, Humidity: 63%, Wind...",Home,3COM Park,3.5,46.0,outdoors,grass,68.0,12.0,Steve Mariucci,Dan Reeves,00-0002876,C.Chandler,20.0,11.0,121.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,ATL,QB,SF
1,2001_03_ATL_ARI,2001-09-30,2001,3,0,ARI,ATL,"Temp: 104° F, Humidity: 13%, Wind: East 20 mph",Home,Sun Devil Stadium,-3.0,42.0,outdoors,grass,104.0,20.0,Dave McGinnis,Dan Reeves,00-0002876,C.Chandler,30.0,20.0,286.0,0.0,3.0,0.0,0,0.0,0.0,0.0,0.0,0.0,ATL,QB,ARI


**Expanding it to Seasonal**

- Avg Passing Touchdowns per game
- Avg Interceptions per game
- Avg Rush Attempts
- Average Offensive Snaps Facing Pressure
- Proportion of Rush Touchdowns to Passing Touchdowns
- Average Rushing Touchdowns per game
- Average Rushing Yards 
- Average Passing Yards

In [37]:

## Avg Passing TD's
passing_stats['avg_ptd'] = passing_stats.groupby(['passer_player_id','season'])['pass_touchdown'].transform('mean')

## Avg Interceptions
passing_stats['avg_int'] = passing_stats.groupby(['passer_player_id','season'])['interception'].transform('mean')

## Avg Rush Attempts

passing_stats['avg_ra'] = passing_stats.groupby(['passer_player_id','season'])['rush_attempt'].transform('mean')


## Average Offensive Snaps Facing Pressure


passing_stats['avg_pressure_snaps'] = passing_stats.groupby(['passer_player_id','season'])['was_pressure'].transform('mean')


## Prop of Rush TD to Pass TD
passing_stats['total_rush_td'] = passing_stats.groupby(['passer_player_id','season'])['rush_touchdown'].transform('sum')
passing_stats['total_pass_td'] = passing_stats.groupby(['passer_player_id','season'])['pass_touchdown'].transform('sum')

passing_stats['rush_to_pass'] = passing_stats['total_rush_td'] / passing_stats['total_pass_td']


## Average Rushing TD's per game

passing_stats['avg_rush_td'] = passing_stats.groupby(['passer_player_id','season'])['rush_touchdown'].transform('mean')



## Avg Rushing Yards

passing_stats['avg_rush_yd'] = passing_stats.groupby(['passer_player_id','season'])['rushing_yards'].transform('mean')



## Avg Passing Yards

passing_stats['avg_pass_yd'] = passing_stats.groupby(['passer_player_id','season'])['passing_yards'].transform('mean')


## Avg Yards Per Pass Attempt

passing_stats['total_pass_attempts'] = passing_stats.groupby(['passer_player_id','season'])['pass_attempt'].transform('sum')
passing_stats['total_pass_yards'] = passing_stats.groupby(['passer_player_id','season'])['passing_yards'].transform('sum')
passing_stats['avg_yd_pass_att'] = passing_stats['total_pass_yards'] / passing_stats['total_pass_attempts']


## Fantasy Points (DV)


## Dependent Variable (Fantasy Points)

- Passing yards: 1 point for every 20–25 passing yards, or 0.04–0.05 points per passing yard
- Passing touchdowns: 4 points
- Rushing yards: 1 point for every 10 rushing yards
- Rushing touchdowns: 6 points 
- Other points that can be awarded include:
- Interceptions or fumbles lost -2 points
- Extra point: 1 point
- Field goal from 0–39 yards: 3 points
- Field goal from 40–49 yards: 4 points

In [41]:
passing_stats['fantasy_points'] = (( passing_stats['passing_yards'] * .05 ) + 
(passing_stats['pass_touchdown'] * 4) + 
(passing_stats['rushing_yards'] *.01) + (passing_stats['fumble'] * -2) + (passing_stats['interception'] * -2) + (passing_stats['rush_touchdown'] * 6)) 

In [43]:
passing_stats.sort_values('season', ascending = False).head(4)

Unnamed: 0,game_id,game_date,season,week,div_game,home_team,away_team,weather,location,stadium,spread_line,total_line,roof,surface,temp,wind,home_coach,away_coach,passer_player_id,passer_player_name,pass_attempt,complete_pass,passing_yards,air_yards,pass_touchdown,interception,was_pressure,rush_attempt,rushing_yards,rush_touchdown,lateral_rush,fumble,team,depth_chart_position,opponent_team,avg_ptd,avg_int,avg_ra,avg_pressure_snaps,total_rush_td,total_pass_td,rush_to_pass,avg_rush_td,avg_rush_yd,avg_pass_yd,total_pass_attempts,total_pass_yards,avg_yd_pass_att,fantasy_points
3903,2024_07_PHI_NYG,2024-10-20,2024,7,1,NYG,PHI,"Shallow Fog Temp: 72° F, Humidity: 28%, Wind: ...",Home,MetLife Stadium,-3.0,42.0,outdoors,fieldturf,71.0,4.0,Brian Daboll,Nick Sirianni,00-0038102,K.Pickett,2.0,0.0,0.0,5.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,PHI,QB,NYG,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,2.0,0.0,0.0,0.0
3775,2024_01_MIN_NYG,2024-09-08,2024,1,0,NYG,MIN,"A few clouds Temp: 64° F, Humidity: 48%, Wind:...",Home,MetLife Stadium,-1.0,42.5,outdoors,fieldturf,64.0,10.0,Brian Daboll,Kevin O'Connell,00-0035710,D.Jones,47.0,22.0,186.0,216.0,0.0,2.0,0,0.0,0.0,0.0,0.0,0.0,NYG,QB,MIN,0.75,0.625,0.0,0.0,0.0,6.0,0.0,0.0,0.0,213.25,306.0,1706.0,5.575163,5.3
3784,2024_02_SEA_NE,2024-09-15,2024,2,0,NE,SEA,"Sunny Temp: 77° F, Humidity: 50%, Wind: W 5 mph",Home,Gillette Stadium,-3.5,39.0,outdoors,fieldturf,77.0,5.0,Jerod Mayo,Mike Macdonald,00-0033119,J.Brissett,30.0,15.0,149.0,145.0,1.0,0.0,0,0.0,0.0,0.0,0.0,0.0,NE,QB,SEA,0.333333,0.166667,0.0,0.0,0.0,2.0,0.0,0.0,0.0,138.0,178.0,828.0,4.651685,11.450001
3783,2024_01_NE_CIN,2024-09-08,2024,1,0,CIN,NE,"Sunny Temp: 66° F, Humidity: 40%, Wind: NNE 5 mph",Home,Paycor Stadium,8.0,40.5,outdoors,fieldturf,66.0,5.0,Zac Taylor,Jerod Mayo,00-0033119,J.Brissett,25.0,15.0,121.0,154.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,NE,QB,CIN,0.333333,0.166667,0.0,0.0,0.0,2.0,0.0,0.0,0.0,138.0,178.0,828.0,4.651685,6.05


# Team Related Stats

**Game By Game**

- Proportion of Plays that are Runs
- Offensive Snaps Ratio (Does the team spend more time on defense or offense)

In [16]:
pbp_df['play_type'].unique()

array([None, 'kickoff', 'run', 'pass', 'extra_point', 'field_goal',
       'no_play', 'qb_kneel', 'punt', 'qb_spike'], dtype=object)

In [18]:
## Getting Team Related Run Plays


pass_rush_df = pbp_df[pbp_df['play_type'].isin(['run','pass'])]

## Seasonal Run Play Percentage

pass_rush_df['run_flag'] = np.where(pass_rush_df['play_type'] == 'run',1,0)

pass_rush_df['pass_flag'] = np.where(pass_rush_df['play_type'] == 'pass',1,0)

pass_rush_df['seasonal_run_total'] = pass_rush_df.groupby(['game_id',''])

pass_rush_df['seasonal_pass_total'] = np.where(pass_rush_df['play_type'] == 'pass',1,0)


# rushing_attempts_df.groupby(['game_id', 'game_date','season', 'week', 'div_game', 'home_team', 'away_team', 'weather', 'location', 'stadium',  'spread_line', 'total_line', 'roof', 'surface', 'temp', 'wind', 'home_coach', 'away_coach', 'passer_player_id', 'passer_player_name'])['play_type'].transform('count')

2           NaN
3          35.0
4          35.0
5           NaN
6           NaN
           ... 
1202648     NaN
1202649     NaN
1202652     NaN
1202653     NaN
1202655     NaN
Name: play_id, Length: 854589, dtype: float64

**Seasonal**

- Average Porportion of Plays that are Runs
- Offensive snaps Ratio (Seasonal)
- Average Points Scored (seasonal)
- Average Opponent Allowed Points Scored
- Average Fantasy PT's opponent allows QB 

# External Game Factor Stats