In [1]:
import sys
import os

# Add the parent of the current working directory to the import search path,
# so packages that live one level above this notebook can be imported.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(parent_dir)

import nfl_data_py as nfl
import pandas as pd
import numpy as np
from src import utils

import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

# 1. Football Analytics

The goal of this chapter is to get familiar with `nfl_data_py` and to analyze quarterback average depth of target (ADOT).

In [3]:
# Load play-by-play data for the 2023 season via nfl_data_py.
pbp = nfl.import_pbp_data([2023])

2023 done.
Downcasting floats.


In [45]:
# Keep only pass plays that have a recorded air-yards value.
filter_criteria = 'play_type=="pass" & air_yards.notnull()'

# One row per (passer, team): number of such passes, mean air yards, mean EPA.
qb_summary = (
    pbp.query(filter_criteria)
    .groupby(['passer', 'posteam'])
    .agg(
        air_yards_count=('air_yards', 'count'),
        air_yards_mean=('air_yards', 'mean'),
        epa_mean=('epa', 'mean'),
    )
)

# Require at least 100 qualifying passes, then rank by mean air yards (deepest first).
has_volume = qb_summary['air_yards_count'] >= 100
pass_pbp = (
    qb_summary[has_volume]
    .sort_values('air_yards_mean', ascending=False)
    .reset_index()
)
pass_pbp

Unnamed: 0,passer,posteam,air_yards_count,air_yards_mean,epa_mean
0,D.Watson,CLE,107,9.794393,-0.040751
1,R.Tannehill,TEN,157,9.375796,0.096253
2,J.Love,GB,234,9.333333,0.042378
3,D.Carr,NO,282,8.829787,0.102541
4,J.Hurts,PHI,282,8.765958,0.202284
5,M.Stafford,LA,275,8.574546,0.151826
6,J.Allen,BUF,285,8.470176,0.259472
7,J.Dobbs,ARI,266,8.263158,-0.022246
8,C.Stroud,HOU,232,8.232759,0.230595
9,B.Mayfield,TB,246,8.154471,0.088271


In [59]:
# Scatter of mean EPA vs. mean air yards per passer. Markers are drawn fully
# transparent (opacity=0); the passer labels and the team logos added below
# carry the visual information instead.
fig = px.scatter(
    pass_pbp,
    x='air_yards_mean',
    y='epa_mean',
    text='passer',
    color='posteam',
    color_discrete_map=utils.team_primary_colors,
    opacity=0,
)
fig.update_layout(
    title='QB EPA per Play by Mean ADOT',
    xaxis_title='Mean ADOT',
    yaxis_title='EPA per Play',
    width=1000,
    height=600,
)
fig.update_traces(showlegend=False, textposition='bottom center')

# Settings shared by every logo image; only source/x/y vary per passer.
scale = .16  # logo container size, applied to both sizex and sizey
logo_style = dict(
    xref="x",
    yref="y",
    sizex=scale,
    sizey=scale,
    sizing="contain",
    opacity=.6,
    xanchor="center",
    yanchor="middle",
    layer="below",
)

# Place each team's logo, centred on that passer's data point.
# NOTE(review): assumes every `posteam` code matches an ESPN logo file name —
# confirm for codes such as LA and WAS.
logo_points = zip(pass_pbp['posteam'], pass_pbp['air_yards_mean'], pass_pbp['epa_mean'])
for team, x_pos, y_pos in logo_points:
    fig.add_layout_image(
        dict(
            source=f'https://a.espncdn.com/i/teamlogos/nfl/500/{team}.png',
            x=x_pos,
            y=y_pos,
            **logo_style,
        )
    )

fig.show()

# 2. Stable vs Unstable QB Metrics

## Pass Data Prep

In [3]:
# Load play-by-play data for the 2016 through 2022 seasons (range end is exclusive).
# This rebinds `pbp`, replacing the 2023-only frame loaded earlier in the notebook.
seasons = range(2016, 2023)
pbp = pd.DataFrame(nfl.import_pbp_data(seasons))
pbp.shape

2016 done.
2017 done.
2018 done.
2019 done.
2020 done.
2021 done.
2022 done.
Downcasting floats.


(341697, 384)

In [4]:
# Restrict to pass plays that have an air-yards value.
# reset_index() keeps the original row labels in a new `index` column.
is_pass = pbp['play_type'] == 'pass'
has_air_yards = pbp['air_yards'].notnull()
pbp_p = pbp[is_pass & has_air_yards].reset_index()

In [5]:
pbp_p = pbp_p.assign(
    # label each pass 'long' when it travelled 20+ air yards, otherwise 'short'
    pass_length_air_yards=lambda plays: np.where(plays['air_yards'] >= 20, 'long', 'short'),
    # treat a missing passing_yards value as a 0-yard result
    passing_yards=lambda plays: plays['passing_yards'].fillna(0),
)

In [6]:
# Summary statistics of passing yards across all retained pass plays.
pbp_p.loc[:, 'passing_yards'].describe()

count    131606.000000
mean          7.192111
std           9.667647
min         -20.000000
25%           0.000000
50%           5.000000
75%          11.000000
max          98.000000
Name: passing_yards, dtype: float64

In [7]:
# Passing-yards summary for passes labelled 'long'.
pbp_p.loc[pbp_p['pass_length_air_yards'] == 'long', 'passing_yards'].describe()

count    15519.000000
mean        12.168761
std         17.923370
min          0.000000
25%          0.000000
50%          0.000000
75%         26.000000
max         98.000000
Name: passing_yards, dtype: float64

In [8]:
# Passing-yards summary for passes labelled 'short'.
pbp_p.loc[pbp_p['pass_length_air_yards'] == 'short', 'passing_yards'].describe()

count    116087.000000
mean          6.526812
std           7.695791
min         -20.000000
25%           0.000000
50%           5.000000
75%          10.000000
max          95.000000
Name: passing_yards, dtype: float64

In [9]:
# EPA summary for passes labelled 'long'.
pbp_p.loc[pbp_p['pass_length_air_yards'] == 'long', 'epa'].describe()

count    15519.000000
mean         0.382649
std          2.185549
min        -10.477921
25%         -0.827421
50%         -0.465344
75%          2.136431
max          8.789743
Name: epa, dtype: float64

In [10]:
# EPA summary for passes labelled 'short'.
pbp_p.loc[pbp_p['pass_length_air_yards'] == 'short', 'epa'].describe()

count    116086.000000
mean          0.119606
std           1.426229
min         -13.031219
25%          -0.606135
50%          -0.002100
75%           0.959107
max           8.241420
Name: epa, dtype: float64

In [11]:
# Histogram of passing yards. Note this reads the unfiltered `pbp` frame,
# not the pass-only `pbp_p` frame used by the neighbouring cells.
px.histogram(
    pbp,
    x='passing_yards',
    width=800,
    height=500,
)

In [12]:
# Box plot comparing passing yards between 'long' and 'short' passes.
px.box(pbp_p, x='pass_length_air_yards', y='passing_yards', width=500, height=600)

In [13]:
# Per passer-season: mean passing yards per pass (ypa) and number of passes (n).
passer_seasons = (
    pbp_p
    .groupby(['passer_id', 'passer', 'season'])
    .agg(
        ypa=('passing_yards', 'mean'),
        n=('passing_yards', 'count'),
    )
)

# Keep passer-seasons with at least 100 passes, best ypa first.
pbp_p_s = passer_seasons[passer_seasons['n'] >= 100].sort_values(by='ypa', ascending=False)
pbp_p_s.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ypa,n
passer_id,passer,season,Unnamed: 3_level_1,Unnamed: 4_level_1
00-0023682,R.Fitzpatrick,2018,9.617887,246
00-0026143,M.Ryan,2016,9.442155,631
00-0029701,R.Tannehill,2019,9.069971,343
00-0033537,D.Watson,2020,8.898524,542
00-0036212,T.Tagovailoa,2022,8.892231,399


## The Hypothesis

*Throwing deep passes is more valuable than throwing short passes, but it is difficult to say whether or not a quarterback is good at deep passes.*

In [16]:
# play-by-play, pass data, by season, by pass length
# One row per (passer, season, pass length) with mean passing yards per pass
# (ypa) and the number of passes (n).
pbp_p_s_pl = pbp_p.groupby(['passer_id', 'passer', 'season', 'pass_length_air_yards']).agg({'passing_yards': ['mean', 'count']})
# Flatten the two-level column index into 'passing_yards_mean' / 'passing_yards_count'.
pbp_p_s_pl.columns = list(map('_'.join, pbp_p_s_pl.columns))
pbp_p_s_pl = pbp_p_s_pl.rename(columns={
    'passing_yards_mean': 'ypa',
    'passing_yards_count': 'n',
})
# Turn the group keys back into ordinary columns so they can be filtered on.
pbp_p_s_pl = pbp_p_s_pl.reset_index()
# Sample-size floor: at least 100 short passes or at least 30 long passes.
criteria = '(n >= 100 & pass_length_air_yards=="short") | (n >= 30 & pass_length_air_yards=="long")'
# drop=True renumbers the rows without copying the pre-filter row positions
# into a stray `index` column.
pbp_p_s_pl = pbp_p_s_pl.query(criteria).reset_index(drop=True)

In [18]:
# Independent copy holding only the key columns and ypa.
keep_cols = ['passer_id', 'passer', 'season', 'pass_length_air_yards', 'ypa']
air_yards = pbp_p_s_pl.loc[:, keep_cols].copy()

In [19]:
# Build the prior-season lookup: ypa becomes ypa_last and each row's season is
# moved forward by one, so a row from season s carries the label s + 1.
air_yards_lag = (
    air_yards
    .rename(columns={'ypa': 'ypa_last'})
    .assign(season=lambda lagged: lagged['season'] + 1)
)

In [None]:
# join back to the main df
# Pair each passer-season / pass-length row with the same passer's ypa from the
# previous season (ypa_last). The inner join keeps only rows that have a
# prior-season counterpart; `air_yards` itself is left unchanged.
air_yards_both = air_yards.merge(
    air_yards_lag,
    how='inner',
    on=['passer_id', 'passer', 'season', 'pass_length_air_yards'],
)
air_yards_both.head()

# 3. Simple Linear Regression: Rushing Yards over Expected

# 4. Multiple Regression: Rushing Yards over Expected

# 5. Generalized Linear Models: Completion Percentage over Expected

# 6. Data Science for Sports Betting: Poisson Regression and Passing Touchdowns

# 7. Web Scraping: Obtaining and Analyzing Draft Picks

# 8. PCA and Clustering: Player Attributes

# 9. Advanced Tools and Next Steps