In [1]:
import warnings 
import pandas as pd
import numpy as np
import nfl_data_py as nfl
import datetime as dt
import copy
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)
# Suppress FutureWarnings
warnings.filterwarnings("ignore", category=FutureWarning)


  from pandas.core import (


# Background/Ideas

- Features will be based off seasonal,weekly, and career based values
- Idea is that certain players which can be differentiated by career based values paired with weekly performance and or seasonal (team strength proxy) can be paired to build something relatively predictive.
- Interactivity can be dependent on clicking and choosing assortment of players and identifying/projecting current projections.
- Data seems to get updated weekly so these predictions would change over time as well.

# Data

This section focuses on pulling the data and prepping/aggregating the dependent variable. (Fantasy Points)

In [2]:
roster_data = nfl.import_seasonal_rosters([2024,2023,2022,2021,2020,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000,1999])
pbp_df = pd.DataFrame(nfl.import_pbp_data([2024,2023,2022,2021,2020,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000,1999]))
weekly_df = pd.DataFrame(nfl.import_weekly_data([2024,2023,2022,2021,2020,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000,1999]))
# injuries_df = pd.DataFrame(nfl.import_injuries([2024,2023,2022,2021,2020,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000]))
schedules_df = pd.DataFrame(nfl.import_schedules([2024,2023,2022,2021,2020,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000,1999]))

2024 done.
2023 done.
2022 done.
2021 done.
2020 done.
2019 done.
2018 done.
2017 done.
2016 done.
2015 done.
2014 done.
2013 done.
2012 done.
2011 done.
2010 done.
2009 done.
2008 done.
2007 done.
2006 done.
2005 done.
2004 done.
2003 done.
2002 done.
2001 done.
2000 done.
1999 done.
Downcasting floats.
Downcasting floats.


# Basic Player Related Stats 

**Game by Game**

In [3]:
## Basic PBP Passing Stats

def get_opposing_team(df):
    if df['home_team'] == df['team']:
        val = df['away_team']
    elif df['away_team'] == df['team']:
        val = df['home_team']
    else:
        val = None

    return val

passing_stats = pbp_df[~pbp_df['passer_player_id'].isna()].copy()


passing_stats['two_points'] = np.where(passing_stats['two_point_conv_result'] == 'success',1,0)


passing_stats = passing_stats.groupby(['game_id', 'game_date','season', 'week', 'div_game', 'home_team', 'away_team','posteam','defteam', 'weather', 'location', 'stadium',  'spread_line', 'total_line', 'roof', 'surface', 'temp', 'wind', 'home_coach', 'away_coach', 'passer_player_id', 'passer_player_name']).agg({
            'passing_yards': 'sum',
            'air_yards': 'sum',
            'pass_touchdown': 'sum', 
            'pass_attempt': 'sum',
            'complete_pass': 'sum',
            'interception': 'sum', #the passing stats are duplicated for receivers
            'rush_attempt': 'sum',
            'rushing_yards': 'sum',# Sum passing yards
            'rush_touchdown': 'sum',
            'lateral_rush': 'sum',
            'receiving_yards': 'sum',
            'yards_after_catch':'sum',
            'touchdown':'sum',
            'fumble': 'sum',
            'two_points': 'sum'
}).reset_index()

game_score_info = schedules_df[['season','home_score','away_score','game_id']].copy()




passing_stats = passing_stats.merge(game_score_info, on = ['game_id','season'], how = 'inner')

passing_stats.rename(columns = {'defteam':'opponent_team', 'passer_player_name':'player_name', 'passer_player_id':'player_id'} , inplace = True )

## Aggregate average score to opposition 

# passing_stats['opponent_team'] = passing_stats.apply(get_opposing_team,axis = 1)



# Creating Fantasy Points Column


Offensive Players:

- Passing Yards: 1 point per 25 yards
- Passing Touchdowns: 4 points
- Passing Interceptions: -2 points
- Rushing Yards: 1 point per 10 yards
- Rushing Touchdowns: 6 points
- Receptions: 1 points (only if using PPR scoring)
- Receiving Yards: 1 point per 10 yards
- Receiving Touchdowns: 6 points
- 2-Point Conversions: 2 points
- Fumbles Lost: -2 points
- Fumble Recovered for a Touchdown: 6 points

In [4]:
passing_stats['fantasy_points'] = ((passing_stats['passing_yards']/25 )
                                         + (passing_stats['pass_touchdown'] * 4) + 
                                         (passing_stats['interception'] * -2) +
                                         (passing_stats['touchdown'] * 6) +
                                         (passing_stats['receiving_yards'] * .1) +
                                         (passing_stats['fumble'] * -2) +
                                         (passing_stats['two_points'] * 2))

## Dependent Variable (Fantasy Points)

- Passing yards: 1 point for every 20–25 passing yards, or 0.04–0.05 points per passing yard
- Passing touchdowns: 4 points
- Rushing yards: 1 point for every 10 rushing yards
- Rushing touchdowns: 6 points 
- Other points that can be awarded include:
- Interceptions or fumbles lost -2 points
- Extra point: 1 point
- Field goal from 0–39 yards: 3 points
- Field goal from 40–49 yards: 4 points

# Prior Points

In [5]:
calculating_prior_points = passing_stats[['player_name','season','fantasy_points']].copy()

calculating_prior_points['prior_ssn_avg_fp'] = calculating_prior_points.groupby(['player_name','season'])['fantasy_points'].transform('mean')

calculating_prior_points = calculating_prior_points.drop_duplicates()


calculating_prior_points.rename(columns = {'season':'actual_season'}, inplace = True)

calculating_prior_points['season'] = calculating_prior_points['actual_season'] + 1

passing_stats = passing_stats.merge(calculating_prior_points[['player_name','season','prior_ssn_avg_fp']], how = 'left', on = ['player_name','season'])

passing_stats = passing_stats.drop_duplicates()



# External Game Factor Stats

In [6]:
## Home Away Flag 



passing_stats['home_flag'] = np.where(passing_stats['opponent_team'] == passing_stats['home_team'], 0,1)
# passing_stats['surface'] = passing_stats['surface'].str.strip()
# passing_stats = passing_stats.join(pd.get_dummies(data = passing_stats['surface'], prefix = 'flag'))


# passing_stats = passing_stats.join(pd.get_dummies(data = passing_stats['surface'], prefix = 'flag'))



# ## Interaction Vars with Wind and Temp on scorable attempts


# passing_stats['wind_interaction'] = passing_stats['wind'] * passing_stats['scorable_snaps']
# passing_stats['wind_interaction'] = passing_stats['wind_interaction'].fillna(0)

# passing_stats['temp_interaction'] = passing_stats['temp'] * passing_stats['scorable_snaps']
# passing_stats['temp_interaction'] = passing_stats['temp_interaction'].fillna(0)



# ## Stadium 

# Last 5

In [7]:
df_game_level = passing_stats.groupby(['game_id', 'game_date', 'week', 'season', 'posteam', 'opponent_team', 'player_name', 'player_id']).agg({
    # Game level
    'home_team': 'first',
    'away_team': 'first',
    # Play level
    'fantasy_points': 'sum',
    'passing_yards': 'sum',
    'air_yards': 'sum',
    'pass_touchdown': 'sum', 
    'pass_attempt': 'sum',
    'complete_pass': 'sum',
    'interception': 'sum',
    'rush_attempt': 'sum',
    'rushing_yards': 'sum',# Sum passing yards
    'rush_touchdown': 'sum',
    'lateral_rush': 'sum',
    'receiving_yards': 'sum',
    'yards_after_catch': 'sum',
    'touchdown':'sum',
    'fumble': 'sum',
    'two_points': 'sum'
})

In [8]:
def calc_agg_stats(group, fields, career=True):
    # Create a copy to avoid modifying the original
    # df = pd.DataFrame({'game_date': group['game_date']}, index=group.index)
    df = pd.DataFrame(index=group.index)
    
    # Sort chronologically
    group_sorted = group.sort_values('game_date')

    # Calculate the number of unique games for career, season, and prior season
    if career:
        df['n_games_career'] = range(len(group_sorted))

    df['n_games_season'] = group_sorted.groupby(
        group_sorted.index.get_level_values('season')
    ).cumcount()

    # df['n_games_prior_season'] = group_sorted.groupby(
    #     group_sorted.index.get_level_values('season')
    # ).transform('size').shift()



    # Calculate aggregate stats
    for field in fields:
        if career:
            # Career stats
            df[f'{field}_mean_career'] = group_sorted[field].transform(lambda x: x.expanding().mean().shift())
            df[f'{field}_total_career'] = group_sorted[field].transform(lambda x: x.expanding().sum().shift())
        
        # Season stats
        df[f'{field}_mean_season'] = group_sorted.groupby([group_sorted.index.get_level_values('season')])[field].transform(lambda x: x.expanding().mean().shift())
        df[f'{field}_total_season'] = group_sorted.groupby([group_sorted.index.get_level_values('season')])[field].transform(lambda x: x.expanding().sum().shift())

        # # Prior season stats
        # df[f'{field}_mean_prior_season'] = group_sorted.groupby([group_sorted.index.get_level_values('season') - 1])[field].transform('mean')
        
        # Last 5 games
        df[f'{field}_mean_last5'] = group_sorted[field].transform(lambda x: x.rolling(window=5, min_periods=1).mean().shift())
        df[f'{field}_total_last5'] = group_sorted[field].transform(lambda x: x.rolling(window=5, min_periods=1).sum().shift())
        # Last Game
        df[f'{field}_last'] = group_sorted[field].shift()
    return df

In [9]:
fields = ['fantasy_points','pass_attempt','interception','rush_attempt','rush_touchdown','complete_pass','rushing_yards','touchdown','receiving_yards','fumble','passing_yards','pass_touchdown','two_points']


# Apply the function
df_game_level = df_game_level.groupby(['player_name', 'player_id']).apply(calc_agg_stats, fields=fields)


In [10]:
df_game_level = df_game_level.reset_index(0).reset_index(0).drop(columns = ['player_name','player_id']).reset_index()

# Opponent Last Scored

In [11]:
schedules_df_copy = schedules_df[schedules_df['game_id'].isin(schedules_df['game_id'].unique()) & (schedules_df['gameday'] >= '2001-09-09')]
schedules_df_copy.rename(columns = {'gameday':'game_date'}, inplace = True)

home_teams = schedules_df_copy[['game_id', 'game_date','season','home_team','away_score','week']].copy()

away_teams = schedules_df_copy[['game_id', 'game_date','season','away_team','home_score','week']].copy()

home_teams.rename(columns = {'home_team':'team','away_score':'points_allowed'}, inplace = True)
away_teams.rename(columns = {'away_team':'team','home_score':'points_allowed'}, inplace = True)

points_allowed_df = pd.concat([home_teams,away_teams])

points_allowed_df = points_allowed_df.groupby(['game_id', 'game_date','season','week','team']).agg({'points_allowed':'sum'})

group_sorted = points_allowed_df.sort_values('week')

pa_df = group_sorted.groupby(['team']).apply(calc_agg_stats, fields=['points_allowed']).reset_index(0).drop(columns = 'team').reset_index()[['game_id','game_date','season','week','team','points_allowed_mean_season','points_allowed_mean_last5']]


pa_df.rename(columns = {'team':'opponent_team'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  schedules_df_copy.rename(columns = {'gameday':'game_date'}, inplace = True)


In [12]:

passing_stats = passing_stats.merge(df_game_level, how = 'inner' ,on = ['game_id','game_date','week','season','posteam','opponent_team','player_name','player_id'])
# rusher_receiver_features['opponent_team'] = np.where(rusher_receiver_features['team'] == rusher_receiver_features['home_team'],rusher_receiver_features['away_team'],rusher_receiver_features['home_team'])
passing_stats = passing_stats.merge(pa_df , how = 'inner',on = ['game_date','season','week','opponent_team','game_id'])


passing_stats = passing_stats.fillna(0)


df_combined = passing_stats.copy()

# Training Model

In [13]:
import nfl_model as NFLModel

In [14]:
x_vars = ['div_game','wind','prior_ssn_avg_fp',
 'home_flag',
 'n_games_career',
 'n_games_season',
 'fantasy_points_mean_career',
 'fantasy_points_total_career',
 'fantasy_points_mean_season',
 'fantasy_points_total_season',
 'fantasy_points_mean_last5',
 'fantasy_points_total_last5',
 'fantasy_points_last',
 'pass_attempt_mean_career',
 'pass_attempt_total_career',
 'pass_attempt_mean_season',
 'pass_attempt_total_season',
 'pass_attempt_mean_last5',
 'pass_attempt_total_last5',
 'pass_attempt_last',
 'interception_mean_career',
 'interception_total_career',
 'interception_mean_season',
 'interception_total_season',
 'interception_mean_last5',
 'interception_total_last5',
 'interception_last',
 'rush_attempt_mean_career',
 'rush_attempt_total_career',
 'rush_attempt_mean_season',
 'rush_attempt_total_season',
 'rush_attempt_mean_last5',
 'rush_attempt_total_last5',
 'rush_attempt_last',
 'rush_touchdown_mean_career',
 'rush_touchdown_total_career',
 'rush_touchdown_mean_season',
 'rush_touchdown_total_season',
 'rush_touchdown_mean_last5',
 'rush_touchdown_total_last5',
 'rush_touchdown_last',
 'complete_pass_mean_career',
 'complete_pass_total_career',
 'complete_pass_mean_season',
 'complete_pass_total_season',
 'complete_pass_mean_last5',
 'complete_pass_total_last5',
 'complete_pass_last',
 'rushing_yards_mean_career',
 'rushing_yards_total_career',
 'rushing_yards_mean_season',
 'rushing_yards_total_season',
 'rushing_yards_mean_last5',
 'rushing_yards_total_last5',
 'rushing_yards_last',
 'touchdown_mean_career',
 'touchdown_total_career',
 'touchdown_mean_season',
 'touchdown_total_season',
 'touchdown_mean_last5',
 'touchdown_total_last5',
 'touchdown_last',
 'receiving_yards_mean_career',
 'receiving_yards_total_career',
 'receiving_yards_mean_season',
 'receiving_yards_total_season',
 'receiving_yards_mean_last5',
 'receiving_yards_total_last5',
 'receiving_yards_last',
 'fumble_mean_career',
 'fumble_total_career',
 'fumble_mean_season',
 'fumble_total_season',
 'fumble_mean_last5',
 'fumble_total_last5',
 'fumble_last',
 'passing_yards_mean_career',
 'passing_yards_total_career',
 'passing_yards_mean_season',
 'passing_yards_total_season',
 'passing_yards_mean_last5',
 'passing_yards_total_last5',
 'passing_yards_last',
 'pass_touchdown_mean_career',
 'pass_touchdown_total_career',
 'pass_touchdown_mean_season',
 'pass_touchdown_total_season',
 'pass_touchdown_mean_last5',
 'pass_touchdown_total_last5',
 'pass_touchdown_last',
 'two_points_mean_career',
 'two_points_total_career',
 'two_points_mean_season',
 'two_points_total_season',
 'two_points_mean_last5',
 'two_points_total_last5',
 'two_points_last',
 'points_allowed_mean_season',
 'points_allowed_mean_last5']

In [15]:
def get_dummy_variables(df, drop_first=True, dummy_na=False):
    """
    Converts non-numerical columns in a DataFrame to dummy variables.

    Parameters:
    - df: pandas DataFrame
        The input DataFrame containing the data.
    - drop_first: bool, default=False
        Whether to drop the first level of categorical variables to avoid the dummy variable trap.
    - dummy_na: bool, default=False
        Add a column to indicate NaNs, if False NaNs are ignored.

    Returns:
    - df_dummies: pandas DataFrame
        The DataFrame with non-numeric columns converted to dummy variables.
    """
    # Identify non-numeric columns
    non_numeric_cols = df.select_dtypes(exclude=['number', 'bool']).columns.tolist()

    # If there are no non-numeric columns, return the original DataFrame
    if not non_numeric_cols:
        print("No non-numerical columns to convert.")
        return df.copy()

    # Convert categorical variables to dummy variables
    df_dummies = pd.get_dummies(df, columns=non_numeric_cols, drop_first=drop_first, dummy_na=dummy_na)

    return df_dummies

In [16]:
columns_to_include = df_combined.columns.difference(['game_id', 'game_date', 'player_name'])
final_df = df_combined[x_vars + ['fantasy_points']].copy()
# final_df['player_id'] = final_df['player_id'].astype('category')
final_df = get_dummy_variables(final_df)

y_var = 'fantasy_points'

No non-numerical columns to convert.


In [22]:
from nfl_model import NFLModel


In [24]:
# Initialize the model
model = NFLModel(data=final_df, target_variable=y_var)

# Preprocess data
model.preprocess_data()

# Perform feature selection
model.feature_selection()

# Evaluate models
model.evaluate_models()

# Get and print the results
results_df = model.get_results()
print(results_df)

Data preprocessing completed.
Lasso selected features: ['div_game', 'wind', 'prior_ssn_avg_fp', 'home_flag', 'fantasy_points_mean_season', 'interception_total_season', 'interception_total_last5', 'interception_last', 'rush_attempt_total_season', 'complete_pass_mean_career', 'complete_pass_mean_season', 'complete_pass_last', 'rushing_yards_total_career', 'rushing_yards_mean_last5', 'receiving_yards_mean_career', 'receiving_yards_mean_season', 'receiving_yards_mean_last5', 'fumble_total_season', 'fumble_total_last5', 'pass_touchdown_mean_career', 'pass_touchdown_total_career', 'pass_touchdown_total_last5', 'pass_touchdown_last', 'two_points_mean_career', 'two_points_total_season', 'two_points_total_last5', 'points_allowed_mean_season', 'points_allowed_mean_last5']
Elastic Net selected features: ['div_game', 'wind', 'prior_ssn_avg_fp', 'home_flag', 'n_games_season', 'fantasy_points_mean_career', 'fantasy_points_total_career', 'fantasy_points_mean_season', 'fantasy_points_mean_last5', 'fan

In [25]:
model.lasso_features

['div_game',
 'wind',
 'prior_ssn_avg_fp',
 'home_flag',
 'fantasy_points_mean_season',
 'interception_total_season',
 'interception_total_last5',
 'interception_last',
 'rush_attempt_total_season',
 'complete_pass_mean_career',
 'complete_pass_mean_season',
 'complete_pass_last',
 'rushing_yards_total_career',
 'rushing_yards_mean_last5',
 'receiving_yards_mean_career',
 'receiving_yards_mean_season',
 'receiving_yards_mean_last5',
 'fumble_total_season',
 'fumble_total_last5',
 'pass_touchdown_mean_career',
 'pass_touchdown_total_career',
 'pass_touchdown_total_last5',
 'pass_touchdown_last',
 'two_points_mean_career',
 'two_points_total_season',
 'two_points_total_last5',
 'points_allowed_mean_season',
 'points_allowed_mean_last5']