In [1]:
import nfl_data_py as nfl

In [2]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import requests

In [3]:
# Download historic NFL play-by-play data
pbp_1999_to_2023 = nfl.import_pbp_data(range(1999, 2024), downcast=True, cache=False)

1999 done.
2000 done.
2001 done.
2002 done.
2003 done.
2004 done.
2005 done.
2006 done.
2007 done.
2008 done.
2009 done.
2010 done.
2011 done.
2012 done.
2013 done.
2014 done.
2015 done.
2016 done.
2017 done.
2018 done.
2019 done.
2020 done.
2021 done.
2022 done.
2023 done.
Downcasting floats.


For some reason, the nfl_data_py package uses the wrong pbp section of the nflverse repo. To counter that, the 2024 data will be downloaded seperately. 

In [83]:
# Define current year for play-by-play data download
current_year = 2024

# URL of the CSV file
url = f'https://github.com/nflverse/nflverse-data/releases/download/pbp/play_by_play_{current_year}.csv'

# Send a GET request to the URL to fetch the CSV
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Read the content into a pandas DataFrame
    from io import StringIO
    csv_data = StringIO(response.text)
    pbp_2024_df = pd.read_csv(csv_data, low_memory=False)

# Use 538's ELO Rating system

In [84]:
class Forecast:
    @staticmethod
    def forecast_elo(df):
        """Calculates Elo ratings and win probabilities for each game."""
        teams = {}  # Store the Elo ratings of each team
        K = 20.0    # Speed at which Elo ratings change
        HFA = 65.0  # Home field advantage
        
        # Start by initializing team Elos based on 2020 season data
        for i, row in df.iterrows():
            if pd.notna(row['elo1']) and pd.notna(row['elo2']):
                teams[row['team1']] = row['elo1']
                teams[row['team2']] = row['elo2']

        # Iterate through games starting from 2021
        for i, row in df.iterrows():
            if pd.isna(row['elo1']) or pd.isna(row['elo2']):
                team1 = row['team1']
                team2 = row['team2']
                
                # Set default Elo if missing for new teams or from 2021
                if team1 not in teams:
                    teams[team1] = 1500  # Default Elo for new teams
                if team2 not in teams:
                    teams[team2] = 1500
                
                # Get the Elo ratings of both teams
                elo1 = teams[team1]
                elo2 = teams[team2]
                
                # Adjust for home field advantage
                elo_diff = elo1 - elo2 + (0 if row['neutral'] == 1 else HFA)
                
                # Calculate win probability for team1
                prob1 = 1.0 / (math.pow(10.0, (-elo_diff / 400.0)) + 1.0)
                
                # Store the Elo ratings and probability in the DataFrame
                df.at[i, 'elo1'] = elo1
                df.at[i, 'elo2'] = elo2
                df.at[i, 'elo_prob1'] = prob1
                
                # If the game is already played, update the Elo ratings based on the result
                if pd.notna(row['score1']):
                    pd_diff = abs(row['score1'] - row['score2'])
                    mult = math.log(max(pd_diff, 1) + 1.0) * (2.2 / (1.0 if row['result1'] == 0.5 else ((elo_diff if row['result1'] == 1.0 else -elo_diff) * 0.001 + 2.2)))

                    shift = (K * mult) * (row['result1'] - prob1)
                    
                    # Update Elo ratings for both teams
                    teams[team1] += shift
                    teams[team2] -= shift
        
        return df

In [85]:
# Get the 538 ELO ratings
data_538 = pd.read_csv("data_538/nfl_games.csv")

The 538 data uses the actual date of the game rather than the week the game took place. To correct this and bring it inline with the play-by-play data, the actual date will be converted to its corresponding NFL week. 

In [86]:
# Convert the date column to datetime format
data_538['date'] = pd.to_datetime(data_538['date'])

# Sort by date and season
data_538 = data_538.sort_values(by=['season', 'date'])

# Initialize the 'week' column with zeros
data_538['week'] = 0

# Group by each season
for season, group in data_538.groupby('season'):
    
    # Sort the group by date within the season
    group = group.sort_values(by='date')
    
    # Assign week numbers based on the date range
    # Week 1 starts at the first game
    week_num = 1
    first_game_date = group['date'].iloc[0]
    
    for idx, row in group.iterrows():
        # If more than 7 days have passed since the first game of the week, increment week number
        if (row['date'] - first_game_date).days >= 7:
            week_num += 1
            first_game_date = row['date']
        
        # Assign the current week number
        data_538.at[idx, 'week'] = week_num

In [87]:
# Sort DataFrame
data_538 = data_538[
    [
        'week',
        'season',
        'neutral',
        'playoff',
        'team1',
        'team2',
        'elo1',
        'elo2',
        'elo_prob1',
        'score1',
        'score2',
        'result1'
    ]
]

In [88]:
# Split out the 2020 ELO ratings
data_538_2020 = data_538[data_538['season'] == 2020]

## Use the Play-by-Play Data to Fill in the Missing Seasons

In [89]:
# Get season data
season_data_df = pbp_1999_to_2023[
   [
    'week',
    'season', 
    'location',
    'season_type',
    'home_team', 
    'away_team', 
    'home_score', 
    'away_score'
   ]
].drop_duplicates().reset_index(drop=True)

# Drop unneeded seasons for ELO creation
season_data_2021_to_2023_df = season_data_df[season_data_df['season'] >= 2021]

In [90]:
# Format the DataFrame to match the 538 DataFrame
season_data_2021_to_2023_df['neutral'] = season_data_2021_to_2023_df['location'].map({'Neutral' : 1, 'Home' : 0})
season_data_2021_to_2023_df['playoff'] = season_data_2021_to_2023_df['season_type'].map({'POST' : 1, 'REG' : 0})
season_data_2021_to_2023_df['result1'] = season_data_2021_to_2023_df['home_score'] > season_data_2021_to_2023_df['away_score']
season_data_2021_to_2023_df['result1'] = season_data_2021_to_2023_df['result1'].astype(int)

# Drop unneeded columns
season_data_2021_to_2023_df.drop(['location', 'season_type'], axis=1)

# Organize the columns of the DataFrame
season_data_2021_to_2023_df = season_data_2021_to_2023_df[
    [
        'week',
        'season', 
        'neutral',
        'playoff',
        'home_team', 
        'away_team', 
        'home_score', 
        'away_score',
        'result1'   
    ]
]

season_data_2021_to_2023_df

Unnamed: 0,week,season,neutral,playoff,home_team,away_team,home_score,away_score,result1
5849,1,2021,0,0,TEN,ARI,13,38,0
5850,1,2021,0,0,LV,BAL,33,27,1
5851,1,2021,0,0,LA,CHI,34,14,1
5852,1,2021,0,0,KC,CLE,33,29,1
5853,1,2021,0,0,TB,DAL,31,29,1
...,...,...,...,...,...,...,...,...,...
6698,20,2023,0,1,BUF,KC,24,27,0
6699,20,2023,0,1,DET,TB,31,23,1
6700,21,2023,0,1,SF,DET,34,31,1
6701,21,2023,0,1,BAL,KC,10,17,0


In [91]:
# Rename columns to match what the Forecast class expects
season_data_2021_to_2023_df.rename(columns={
    'home_team': 'team1',
    'away_team': 'team2',
    'home_score': 'score1',
    'away_score': 'score2'
}, inplace=True)

# Add ELO columns for team1 and team2, initialize them with NaNs
season_data_2021_to_2023_df['elo1'] = pd.NA 
season_data_2021_to_2023_df['elo2'] = pd.NA 
season_data_2021_to_2023_df['elo_prob1'] = pd.NA

# Change column ordering
season_data_2021_to_2023_df = season_data_2021_to_2023_df[
    [
        'week',
        'season',
        'neutral',
        'playoff',
        'team1',
        'team2',
        'elo1',
        'elo2',
        'elo_prob1',
        'score1',
        'score2',
        'result1'
    ]
]

In [98]:
# Perform a union of the DataFrames
elo_df = pd.concat([data_538_2020, season_data_2021_to_2023_df]).reset_index(drop=True)

In [99]:
# Apply the class to the ELO DataFrame
elo_df = Forecast.forecast_elo(elo_df)

In [100]:
# View the DataFrame
elo_df

Unnamed: 0,week,season,neutral,playoff,team1,...,elo2,elo_prob1,score1,score2,result1
0,1,2020,0,0,KC,...,1527.930047,0.761756,34,20,1.0
1,1,2020,0,0,ATL,...,1546.899069,0.575148,25,38,0.0
2,1,2020,0,0,BAL,...,1440.533332,0.819559,38,6,1.0
3,1,2020,0,0,CAR,...,1437.326108,0.564980,30,34,0.0
4,1,2020,0,0,JAX,...,1482.654778,0.529966,27,20,1.0
...,...,...,...,...,...,...,...,...,...,...,...
1118,20,2023,0,1,BUF,...,1703.704642,0.642802,24,27,0.0
1119,20,2023,0,1,DET,...,1560.432632,0.638348,31,23,1.0
1120,21,2023,0,1,SF,...,1609.349457,0.763842,34,31,1.0
1121,21,2023,0,1,BAL,...,1722.393960,0.603585,10,17,0.0


Now that I have the ELOs for the seasons that were not generated by 538. I will need to pull the ELO data for all of the seasons that there is play-by-play data for (1999 - current)

In [95]:
# Get the ELO ratings from 1999 to 2022
data_538_1999_2019 = data_538[(data_538['season'] >= 1999) & (data_538['season'] < 2020)]

In [101]:
# Perform a union of the DataFrames
elo_df = pd.concat([data_538_1999_2019, elo_df]).reset_index(drop=True)

In [102]:
# View DataFrame
elo_df

Unnamed: 0,week,season,neutral,playoff,team1,team2,elo1,elo2,elo_prob1,score1,score2,result1
0,1,1999,0,0,PHI,ARI,1367.636000,1476.375000,0.437385,24,25,0.0
1,1,1999,0,0,TEN,CIN,1522.743000,1377.417000,0.770429,36,35,1.0
2,1,1999,0,0,NYJ,NE,1611.759000,1516.205000,0.715902,28,30,0.0
3,1,1999,0,0,NO,CAR,1437.814000,1443.281000,0.584846,19,10,1.0
4,1,1999,0,0,JAX,SF,1572.739000,1599.935000,0.554191,41,3,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
6701,20,2023,0,1,BUF,KC,1740.771606,1703.704642,0.642802,24,27,0.0
6702,20,2023,0,1,DET,TB,1594.139273,1560.432632,0.638348,31,23,1.0
6703,21,2023,0,1,SF,DET,1748.269757,1609.349457,0.763842,34,31,1.0
6704,21,2023,0,1,BAL,KC,1730.429639,1722.393960,0.603585,10,17,0.0
