In [1]:
import warnings 
import pandas as pd
import numpy as np
import nfl_data_py as nfl
import datetime as dt
import copy
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)
# Suppress FutureWarnings
warnings.filterwarnings("ignore", category=FutureWarning)


# Background/Ideas

- Features will be based off seasonal,weekly, and career based values
- Idea is that certain players which can be differentiated by career based values paired with weekly performance and or seasonal (team strength proxy) can be paired to build something relatively predictive.
- Interactivity can be dependent on clicking and choosing assortment of players and identifying/projecting current projections.
- Data seems to get updated weekly so these predictions would change over time as well.

# Data

This section focuses on pulling the data and prepping/aggregating the dependent variable. (Fantasy Points)

In [2]:
roster_data = nfl.import_seasonal_rosters([2024,2023,2022,2021,2020,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000,1999])
pbp_df = pd.DataFrame(nfl.import_pbp_data([2024,2023,2022,2021,2020,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000,1999]))
weekly_df = pd.DataFrame(nfl.import_weekly_data([2024,2023,2022,2021,2020,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000,1999]))
# injuries_df = pd.DataFrame(nfl.import_injuries([2024,2023,2022,2021,2020,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000]))
schedules_df = pd.DataFrame(nfl.import_schedules([2024,2023,2022,2021,2020,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000,1999]))

2024 done.
2023 done.
2022 done.
2021 done.
2020 done.
2019 done.
2018 done.
2017 done.
2016 done.
2015 done.
2014 done.
2013 done.
2012 done.
2011 done.
2010 done.
2009 done.
2008 done.
2007 done.
2006 done.
2005 done.
2004 done.
2003 done.
2002 done.
2001 done.
2000 done.
1999 done.
Downcasting floats.
Downcasting floats.


In [3]:
def calc_agg_stats(group, fields, career=True):
    """
    Calculate aggregate statistics for each player over their career and season,
    including prior season means, rolling averages, and cumulative counts.

    Parameters:
    - group: DataFrame grouped by player or other identifier.
    - fields: List of fields to calculate statistics on.
    - career: Boolean indicating whether to calculate career-level stats.

    Returns:
    - DataFrame with calculated aggregate statistics.
    """
    # Ensure 'game_date' is datetime
    group['game_date'] = pd.to_datetime(group['game_date'], errors='coerce')
    
    # Sort the group chronologically
    group_sorted = group.sort_values('game_date')
    
    # Initialize the result DataFrame
    result = pd.DataFrame(index=group_sorted.index)
    
    # Calculate cumulative game counts
    if career:
        # Career game count (number of games up to current point, excluding current game)
        result['n_games_career'] = np.arange(len(group_sorted))
    
    # Season game count
    result['n_games_season'] = group_sorted.groupby('season').cumcount()

    # Loop over each field to calculate aggregate stats
    for field in fields:
        if career:
            # Career mean up to the previous game (excluding current game)
            result[f'{field}_mean_career'] = (
                group_sorted[field]
                .expanding()
                .mean()
                .shift()
            )
        
        # Season mean up to the previous game (excluding current game)
        result[f'{field}_mean_season'] = (
            group_sorted.groupby('season')[field]
            .expanding()
            .mean()
            .shift()
            .reset_index(level=0, drop=True)
        )
        
        # # Prior season mean (mean of the entire previous season)
        # result[f'{field}_mean_prior_season'] = (
        #     group_sorted.groupby('season')[field]
        #     .transform('mean')
        #     .shift()
        # )
        
        # Rolling mean for the last 5 games up to the previous game (excluding current game)
        result[f'{field}_mean_last5'] = (
            group_sorted[field]
            .rolling(window=5, min_periods=1)
            .mean()
            .shift()
        )
    
    # Combine the result with the original group_sorted DataFrame
    combined = pd.concat([group_sorted, result], axis=1)
    
    return combined

## Basic Kicking Related Variables

In [4]:
# Filter rows where 'kicker_player_name' is not null and the play type is relevant
df_kicker_pbp = pbp_df.loc[
    pbp_df['kicker_player_name'].notnull() & 
    pbp_df['play_type'].isin(['field_goal', 'extra_point', 'kickoff'])
].copy() 

# Ensure 'posteam' and 'defteam' columns exist
if 'posteam' in df_kicker_pbp.columns and 'defteam' in df_kicker_pbp.columns:
    # Create a mask for kickoff plays
    kickoff_mask = df_kicker_pbp['play_type'] == 'kickoff'

    # Log the number of kickoff plays being processed
    print(f"Swapping 'posteam' and 'defteam' for {kickoff_mask.sum()} kickoff plays...")

    # Swap values using the mask
    df_kicker_pbp.loc[kickoff_mask, ['posteam', 'defteam']] = (
        df_kicker_pbp.loc[kickoff_mask, ['defteam', 'posteam']].values
    )

    print("Swap complete.")
else:
    print("Error: Required columns 'posteam' and 'defteam' are missing from the DataFrame.")

# Convert 'game_date' column to datetime format, with error handling
try:
    df_kicker_pbp['game_date'] = pd.to_datetime(df_kicker_pbp['game_date'], errors='coerce')
    if df_kicker_pbp['game_date'].isnull().any():
        print("Warning: Some 'game_date' entries could not be converted and have been set to NaT.")
except Exception as e:
    print(f"An error occurred while converting 'game_date' to datetime: {e}")

# Final log for confirmation
print("Data processing for 'df_kicker_pbp' completed.")




# Set extra point distance based on year and create flags for XP attempts and success
df_kicker_pbp['xp_distance'] = np.where(df_kicker_pbp['game_date'].dt.year < 2015, 19, 33)
df_kicker_pbp["xp_attempt"] = df_kicker_pbp["extra_point_result"].notnull()
df_kicker_pbp["xp_made"] = df_kicker_pbp["extra_point_result"] == "good"

# Create flags for successful and attempted XPs by distance
df_kicker_pbp["xp_made_33y"] = df_kicker_pbp["xp_made"] & (df_kicker_pbp["xp_distance"] == 33)
df_kicker_pbp["xp_made_19y"] = df_kicker_pbp["xp_made"] & (df_kicker_pbp["xp_distance"] == 19)
df_kicker_pbp["xp_attempt_33y"] = df_kicker_pbp["xp_attempt"] & (df_kicker_pbp["xp_distance"] == 33)
df_kicker_pbp["xp_attempt_19y"] = df_kicker_pbp["xp_attempt"] & (df_kicker_pbp["xp_distance"] == 19)

# Field goal (FG) results and distance-based flags
df_kicker_pbp["50+_fg_made"] = (df_kicker_pbp["field_goal_result"] == "made") & (df_kicker_pbp["kick_distance"] >= 50)
df_kicker_pbp["40-49_fg_made"] = (df_kicker_pbp["field_goal_result"] == "made") & (df_kicker_pbp["kick_distance"].between(40, 49))
df_kicker_pbp["0-39_fg_made"] = (df_kicker_pbp["field_goal_result"] == "made") & (df_kicker_pbp["kick_distance"] < 40)

# Missed FG flags by distance
df_kicker_pbp["missed_fg_0-39"] = (df_kicker_pbp["field_goal_result"] == "missed") & (df_kicker_pbp["kick_distance"] < 40)
df_kicker_pbp["missed_fg_40-49"] = (df_kicker_pbp["field_goal_result"] == "missed") & (df_kicker_pbp["kick_distance"].between(40, 49))
df_kicker_pbp["missed_fg_50+"] = (df_kicker_pbp["field_goal_result"] == "missed") & (df_kicker_pbp["kick_distance"] >= 50)

# Total FGs made and missed
df_kicker_pbp["total_fg_made"] = df_kicker_pbp[["50+_fg_made", "40-49_fg_made", "0-39_fg_made"]].sum(axis=1)
df_kicker_pbp["total_fg_missed"] = df_kicker_pbp[["missed_fg_0-39", "missed_fg_40-49", "missed_fg_50+"]].sum(axis=1)

# Calculate fantasy points based on custom scoring system
df_kicker_pbp["fantasy_points"] = (
    df_kicker_pbp["50+_fg_made"] * 5 +
    df_kicker_pbp["40-49_fg_made"] * 4 +
    df_kicker_pbp["0-39_fg_made"] * 3 +
    df_kicker_pbp["xp_made"] * 1 +
    df_kicker_pbp["missed_fg_0-39"] * -2 +
    df_kicker_pbp["missed_fg_40-49"] * -1
)

# Optional: Drop any rows with NaN values in the calculated columns
# df_kicker_pbp.dropna(subset=["fantasy_points"], inplace=True)

# Log completion message
print("Kicker play-by-play data processing completed successfully.")
df_kicker_game_level_stadium = df_kicker_pbp.groupby(['game_id', 'game_date', 'week', 'season', 'stadium'], as_index=False).agg({
    # Game level
    'home_team': 'first',
    'roof': 'first',
    'temp': 'first',
    'wind': 'first',
}).sort_values(by=['game_date'], ascending=False)

df_kicker_game_level = df_kicker_pbp.groupby(['game_id', 'game_date', 'week', 'season', 'posteam', 'defteam', 'kicker_player_name', 'kicker_player_id'], as_index=False).agg({
    # Game level
    'home_team': 'first',
    'away_team': 'first',

    # Play level
    'fantasy_points': 'sum',
    'total_fg_made': 'sum',
    'total_fg_missed': 'sum',
    '50+_fg_made': 'sum',
    '40-49_fg_made': 'sum',
    '0-39_fg_made': 'sum',
    'missed_fg_0-39': 'sum',
    'missed_fg_40-49': 'sum',
    'missed_fg_50+': 'sum',
    'xp_attempt_19y': 'sum',
    'xp_made_19y': 'sum',
    'xp_attempt_33y': 'sum',
    'xp_made_33y': 'sum',
})

df_kicker_game_level["home"] = df_kicker_game_level["home_team"] == df_kicker_game_level["posteam"]
df_kicker_game_level.drop(columns=['home_team', 'away_team'], inplace=True)
# Define the fields for which you want to calculate aggregate statistics
kicker_fields = [
    'fantasy_points', 
    'total_fg_made', 
    'total_fg_missed', 
    '50+_fg_made', 
    '40-49_fg_made', 
    '0-39_fg_made', 
    'missed_fg_50+', 
    'missed_fg_40-49', 
    'missed_fg_0-39', 
    'xp_attempt_19y', 
    'xp_made_19y', 
    'xp_attempt_33y', 
    'xp_made_33y'
]

# Apply the 'calc_agg_stats' function to each kicker's data
df_kicker_game_level_agg = df_kicker_game_level.groupby(
    ['kicker_player_name', 'kicker_player_id'], 
    group_keys=False
).apply(
    calc_agg_stats, 
    fields=kicker_fields
).reset_index(drop=True).round(2)
df_kicker_game_level_agg = df_kicker_game_level_agg.drop(columns=df_kicker_game_level_agg.loc[:, "fantasy_points":"home"].columns)


df_kicker_game_level_agg_by_game = df_kicker_game_level.groupby(['game_id', 'game_date', 'week', 'season', 'posteam', 'defteam'], as_index=False).agg({
    # Play level
    'fantasy_points': 'sum',
    'total_fg_made': 'sum',
    'total_fg_missed': 'sum',
    '50+_fg_made': 'sum',
    '40-49_fg_made': 'sum',
    '0-39_fg_made': 'sum',
    'missed_fg_0-39': 'sum',
    'missed_fg_40-49': 'sum',
    'missed_fg_50+': 'sum',
    'xp_attempt_19y': 'sum',
    'xp_made_19y': 'sum',
    'xp_attempt_33y': 'sum',
    'xp_made_33y': 'sum',
})

# Group by 'defteam' and apply the 'calc_agg_stats' function
df_kicker_game_level_agg_by_def = df_kicker_game_level_agg_by_game.groupby(
    ['defteam'], 
    group_keys=False
).apply(
    calc_agg_stats, 
    fields=kicker_fields, 
    career=False 
).reset_index(drop=True).round(2)
df_kicker_game_level_agg_by_def = df_kicker_game_level_agg_by_def.drop(columns=df_kicker_game_level_agg_by_def.loc[:, "fantasy_points":"xp_made_33y"].columns)

# Merge kicker aggregate stats with defensive team stats
df_combined = pd.merge(
    df_kicker_game_level_agg,
    df_kicker_game_level_agg_by_def,
    on=['game_id', 'game_date', 'week', 'season', 'posteam', 'defteam'],
    how='left',
    suffixes=('_k', '_def')
)

# Merge with stadium data
df_combined = pd.merge(
    df_combined,
    df_kicker_game_level_stadium,
    on=['game_id', 'game_date', 'week', 'season'],
    how='left'
)

# Merge with original kicker game level data to include 'fantasy_points'
df_combined = pd.merge(
    df_combined,
    df_kicker_game_level[['game_id', 'fantasy_points', 'kicker_player_id']],
    on=['game_id', 'kicker_player_id'],
    how='left'
)

# Drop redundant columns if necessary
columns_to_drop = ['home_team']
df_combined.drop(columns=columns_to_drop, inplace=True, errors='ignore')


# Reset index
df_combined.reset_index(drop=True, inplace=True)

# Log completion message
print("DataFrames merged successfully into 'df_combined'.")


df_combined = df_combined.fillna(0)


Swapping 'posteam' and 'defteam' for 68390 kickoff plays...
Swap complete.
Data processing for 'df_kicker_pbp' completed.
Kicker play-by-play data processing completed successfully.
DataFrames merged successfully into 'df_combined'.


In [5]:

x_vars = ['n_games_career',
       'n_games_season_k', 'fantasy_points_mean_career',
       'fantasy_points_mean_season_k', 'fantasy_points_mean_last5_k',
       'total_fg_made_mean_career', 'total_fg_made_mean_season_k',
       'total_fg_made_mean_last5_k', 'total_fg_missed_mean_career',
       'total_fg_missed_mean_season_k', 'total_fg_missed_mean_last5_k',
       '50+_fg_made_mean_career', '50+_fg_made_mean_season_k',
       '50+_fg_made_mean_last5_k', '40-49_fg_made_mean_career',
       '40-49_fg_made_mean_season_k', '40-49_fg_made_mean_last5_k',
       '0-39_fg_made_mean_career', '0-39_fg_made_mean_season_k',
       '0-39_fg_made_mean_last5_k', 'missed_fg_50+_mean_career',
       'missed_fg_50+_mean_season_k', 'missed_fg_50+_mean_last5_k',
       'missed_fg_40-49_mean_career', 'missed_fg_40-49_mean_season_k',
       'missed_fg_40-49_mean_last5_k', 'missed_fg_0-39_mean_career',
       'missed_fg_0-39_mean_season_k', 'missed_fg_0-39_mean_last5_k',
       'xp_attempt_19y_mean_career', 'xp_attempt_19y_mean_season_k',
       'xp_attempt_19y_mean_last5_k', 'xp_made_19y_mean_career',
       'xp_made_19y_mean_season_k', 'xp_made_19y_mean_last5_k',
       'xp_attempt_33y_mean_career', 'xp_attempt_33y_mean_season_k',
       'xp_attempt_33y_mean_last5_k', 'xp_made_33y_mean_career',
       'xp_made_33y_mean_season_k', 'xp_made_33y_mean_last5_k',
       'n_games_season_def', 'fantasy_points_mean_season_def',
       'fantasy_points_mean_last5_def', 'total_fg_made_mean_season_def',
       'total_fg_made_mean_last5_def', 'total_fg_missed_mean_season_def',
       'total_fg_missed_mean_last5_def', '50+_fg_made_mean_season_def',
       '50+_fg_made_mean_last5_def', '40-49_fg_made_mean_season_def',
       '40-49_fg_made_mean_last5_def', '0-39_fg_made_mean_season_def',
       '0-39_fg_made_mean_last5_def', 'missed_fg_50+_mean_season_def',
       'missed_fg_50+_mean_last5_def', 'missed_fg_40-49_mean_season_def',
       'missed_fg_40-49_mean_last5_def', 'missed_fg_0-39_mean_season_def',
       'missed_fg_0-39_mean_last5_def', 'xp_attempt_19y_mean_season_def',
       'xp_attempt_19y_mean_last5_def', 'xp_made_19y_mean_season_def',
       'xp_made_19y_mean_last5_def', 'xp_attempt_33y_mean_season_def',
       'xp_attempt_33y_mean_last5_def', 'xp_made_33y_mean_season_def',
       'xp_made_33y_mean_last5_def', 'stadium', 'roof', 'temp', 'wind']

# Begin Modeling

In [6]:
from nfl_model import NFLModel

In [7]:
def get_dummy_variables(df, drop_first=True, dummy_na=False):
    """
    Converts non-numerical columns in a DataFrame to dummy variables.

    Parameters:
    - df: pandas DataFrame
        The input DataFrame containing the data.
    - drop_first: bool, default=False
        Whether to drop the first level of categorical variables to avoid the dummy variable trap.
    - dummy_na: bool, default=False
        Add a column to indicate NaNs, if False NaNs are ignored.

    Returns:
    - df_dummies: pandas DataFrame
        The DataFrame with non-numeric columns converted to dummy variables.
    """
    # Identify non-numeric columns
    non_numeric_cols = df.select_dtypes(exclude=['number', 'bool']).columns.tolist()

    # If there are no non-numeric columns, return the original DataFrame
    if not non_numeric_cols:
        print("No non-numerical columns to convert.")
        return df.copy()

    # Convert categorical variables to dummy variables
    df_dummies = pd.get_dummies(df, columns=non_numeric_cols, drop_first=drop_first, dummy_na=dummy_na)

    return df_dummies

In [8]:
columns_to_include = df_combined.columns.difference(['game_id', 'game_date', 'player_name'])
final_df = df_combined[x_vars + ['fantasy_points']].copy()
# final_df['player_id'] = final_df['player_id'].astype('category')
final_df = get_dummy_variables(final_df)

y_var = 'fantasy_points'

In [9]:
# Initialize the model
model = NFLModel(data=final_df, target_variable=y_var)

# Preprocess data
model.preprocess_data()

# Perform feature selection
model.feature_selection()

# Evaluate models
model.evaluate_models()

# Get and print the results
results_df = model.get_results()
print(results_df)

Data preprocessing completed.
Lasso selected features: ['n_games_career', 'fantasy_points_mean_career', 'fantasy_points_mean_season_k', 'fantasy_points_mean_last5_k', 'total_fg_missed_mean_career', 'missed_fg_40-49_mean_career', 'xp_made_19y_mean_season_k', 'xp_attempt_33y_mean_last5_k', 'xp_made_33y_mean_last5_k', 'n_games_season_def', 'fantasy_points_mean_season_def', 'wind', 'stadium_Arrowhead Stadium', 'roof_closed']
Elastic Net selected features: ['n_games_career', 'fantasy_points_mean_career', 'fantasy_points_mean_season_k', 'fantasy_points_mean_last5_k', 'total_fg_made_mean_career', 'total_fg_missed_mean_career', '40-49_fg_made_mean_career', '0-39_fg_made_mean_career', 'missed_fg_50+_mean_career', 'missed_fg_40-49_mean_career', 'xp_attempt_19y_mean_career', 'xp_attempt_19y_mean_season_k', 'xp_attempt_19y_mean_last5_k', 'xp_made_19y_mean_career', 'xp_made_19y_mean_season_k', 'xp_made_19y_mean_last5_k', 'xp_attempt_33y_mean_career', 'xp_attempt_33y_mean_season_k', 'xp_attempt_33y_

In [10]:
vals = model.elastic_net_features

In [11]:
print(len(final_df.columns))
print(len(vals))

180
57


In [12]:
list(vals)

['n_games_career',
 'fantasy_points_mean_career',
 'fantasy_points_mean_season_k',
 'fantasy_points_mean_last5_k',
 'total_fg_made_mean_career',
 'total_fg_missed_mean_career',
 '40-49_fg_made_mean_career',
 '0-39_fg_made_mean_career',
 'missed_fg_50+_mean_career',
 'missed_fg_40-49_mean_career',
 'xp_attempt_19y_mean_career',
 'xp_attempt_19y_mean_season_k',
 'xp_attempt_19y_mean_last5_k',
 'xp_made_19y_mean_career',
 'xp_made_19y_mean_season_k',
 'xp_made_19y_mean_last5_k',
 'xp_attempt_33y_mean_career',
 'xp_attempt_33y_mean_season_k',
 'xp_attempt_33y_mean_last5_k',
 'xp_made_33y_mean_career',
 'xp_made_33y_mean_season_k',
 'xp_made_33y_mean_last5_k',
 'n_games_season_def',
 'fantasy_points_mean_season_def',
 'fantasy_points_mean_last5_def',
 '50+_fg_made_mean_last5_def',
 '40-49_fg_made_mean_last5_def',
 'missed_fg_50+_mean_last5_def',
 'xp_attempt_19y_mean_season_def',
 'xp_made_19y_mean_season_def',
 'xp_made_33y_mean_season_def',
 'wind',
 'stadium_AT&T Stadium',
 'stadium_Alle

## Fitting Model Using Elastic Net Features

In [13]:
# Initialize the model
model2 = NFLModel(data=final_df[vals + [y_var]], target_variable=y_var)
model2.preprocess_data()

# model2.train_random_forest()
model2.tune_random_forest()
# model2.evaluate_random_forest()
# model2.build_lstm_model()
# model2.evaluate_lstm()
# model2.evaluate_ensemble()

# Get and print the results
results_df2 = model2.get_results()
print(results_df2)

Data preprocessing completed.
Fitting 5 folds for each of 432 candidates, totalling 2160 fits
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   1.8s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   1.8s
[CV] END max_depth=None, max_features=sqrt, min_samp

  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters found: {'max_depth': 10, 'max_features': 0.5, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
Best MAE score: 3.0059
Random Forest Test MAE: 2.91 fantasy points
     Random Forest
MAE       2.907351
MSE      15.400053
R2        0.334100
