In [2]:
import warnings 
import pandas as pd
import numpy as np
import nfl_data_py as nfl
import datetime as dt
import copy
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', None)
# Suppress FutureWarnings
warnings.filterwarnings("ignore", category=FutureWarning)


# Background/Ideas

- Features will be based off seasonal,weekly,career, and last 5 based values
- Idea is that certain players which can be differentiated by career based values paired with weekly performance and or seasonal (team strength proxy) can be paired to build something relatively predictive.
- Interactivity can be dependent on clicking and choosing assortment of players and identifying/projecting current projections.
- Data seems to get updated weekly so these predictions would change over time as well.

# Data

This section focuses on pulling the data and prepping/aggregating the dependent variable. (Fantasy Points)

In [3]:
# roster_data = nfl.import_seasonal_rosters([2024,2023,2022,2021,2020,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000,1999])
# pbp_df = pd.DataFrame(nfl.import_pbp_data([2024,2023,2022,2021,2020,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000,1999]))
# weekly_df = pd.DataFrame(nfl.import_weekly_data([2024,2023,2022,2021,2020,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000,1999]))
# # injuries_df = pd.DataFrame(nfl.import_injuries([2024,2023,2022,2021,2020,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000]))
# schedules_df = pd.DataFrame(nfl.import_schedules([2024,2023,2022,2021,2020,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000,1999]))

In [4]:
# import os

# try:
#     # Create the data directory if it doesn't exist
#     if not os.path.exists('data'):
#         os.makedirs('data')

#     # Check if the Feather file exists
#     if not os.path.exists('data/pbp_1999_2024.feather'):
#         print("Downloading play-by-play data...")
#         # Fetch data from the source
#         df_pbp = pd.DataFrame(nfl.import_pbp_data([2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017, 
#                                                 2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009, 
#                                                 2008, 2007, 2006, 2005, 2004, 2003, 2002, 2001, 
#                                                 2000, 1999]))
#         # Save the DataFrame to a Feather file
#         df_pbp.to_feather("data/pbp_1999_2024.feather")
#         print("Data download complete. File saved to 'data/pbp_1999_2024.feather'.")
#     else:
#         print("Loading play-by-play data from local Feather file...")
#         # Read the data from the local Feather file
#         df_pbp = pd.read_feather('data/pbp_1999_2024.feather')
#         print("Data successfully loaded from 'data/pbp_1999_2024.feather'.")

# except Exception as e:
#     print(f"An error occurred: {e}")

In [5]:
# import os

# try:
#     # Create the data directory if it doesn't exist
#     if not os.path.exists('data'):
#         os.makedirs('data')

#     # Check if the Feather file exists
#     if not os.path.exists('data/pbp_1999_2024.csv'):
#         print("Downloading play-by-play data...")
#         # Fetch data from the source
#         df_pbp = pd.DataFrame(nfl.import_pbp_data([2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017, 
#                                                 2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009, 
#                                                 2008, 2007, 2006, 2005, 2004, 2003, 2002, 2001, 
#                                                 2000, 1999]))
#         # Save the DataFrame to a Feather file
#         df_pbp.to_csv("data/pbp_1999_2024.csv")
#         print("Data download complete. File saved to 'data/pbp_1999_2024.csv'.")
#     else:
#         print("Loading play-by-play data from local CSV file...")
#         # Read the data from the local Feather file
#         df_pbp = pd.read_csv('data/pbp_1999_2024.csv')
#         print("Data successfully loaded from 'data/pbp_1999_2024.csv'.")

# except Exception as e:
#     print(f"An error occurred: {e}")

In [6]:
import os

def load_data(file_name, download_fn):
    try:
        # Create the data directory if it doesn't exist
        if not os.path.exists('data'):
            os.makedirs('data')

        # Check if the Feather file exists
        if not os.path.exists(f'data/{file_name}'):
            print(f"Downloading {file_name}...")
            # Fetch data from the source
            df = pd.DataFrame(download_fn([2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017, 
                                                    2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009, 
                                                    2008, 2007, 2006, 2005, 2004, 2003, 2002, 2001, 
                                                    2000, 1999]))
            # Save the DataFrame to a Feather file
            df.to_csv(f"data/{file_name}")
            print(f"Data download complete. File saved to 'data/{file_name}'.")
        else:
            print(f"Loading {file_name} data from local CSV file...")
            # Read the data from the local Feather file
            df = pd.read_csv(f'data/{file_name}')
            print(f"Data successfully loaded from 'data/{file_name}'.")

        return df

    except Exception as e:
        print(f"An error occurred: {e}")

In [7]:
pbp_df = load_data("pbp_1999_2024.csv", nfl.import_pbp_data)
roster_data = load_data("roster_1999_2024.csv", nfl.import_seasonal_rosters)
schedules_df = load_data("schedules_1999_2024.csv", nfl.import_schedules)
weekly_df = load_data("weekly_1999_2024.csv", nfl.import_weekly_data)

Loading pbp_1999_2024.csv data from local CSV file...


  df = pd.read_csv(f'data/{file_name}')


Data successfully loaded from 'data/pbp_1999_2024.csv'.
Loading roster_1999_2024.csv data from local CSV file...


  df = pd.read_csv(f'data/{file_name}')


Data successfully loaded from 'data/roster_1999_2024.csv'.
Loading schedules_1999_2024.csv data from local CSV file...
Data successfully loaded from 'data/schedules_1999_2024.csv'.
Loading weekly_1999_2024.csv data from local CSV file...


  df = pd.read_csv(f'data/{file_name}')


Data successfully loaded from 'data/weekly_1999_2024.csv'.


In [8]:
# pbp_df = df_pbp

# Base Data Transformations

- Getting General RB and WR stats and creating complete dataframe.

In [9]:
## Basic PBP Passing Stats

def get_opposing_team(df):
    if df['home_team'] == df['team']:
        val = df['away_team']
    elif df['away_team'] == df['team']:
        val = df['home_team']
    else:
        val = None

    return val




# team = roster_data[roster_data['depth_chart_position'].isin(['RB','WR'])][['season','player_id','team','depth_chart_position']]

team = roster_data[['season','player_id','team','depth_chart_position']]

receiver_rusher_stats =  pbp_df[(pbp_df['receiver_player_id'].notnull()) | (pbp_df['rusher_player_id'].notnull())]
                         

receiver_rusher_stats['two_points'] = np.where(receiver_rusher_stats['two_point_conv_result'] == 'success',1,0)
                         
receiver_rusher_stats.rename(columns = {'complete_pass':'reception'},inplace = True)



receiver_stats= receiver_rusher_stats.groupby(['game_id', 'game_date', 'week','div_game','posteam','defteam', 'home_team', 'away_team', 'weather', 'stadium',  'spread_line', 'total_line', 'roof', 'surface', 'temp', 'wind', 'home_coach', 'away_coach', 'receiver_player_id', 'receiver_player_name','season']).agg({
    'passing_yards': 'sum',
    'air_yards': 'sum',
    'pass_touchdown': 'sum', 
    'pass_attempt': 'sum',
    'reception': 'sum',
    'interception': 'sum', #the passing stats are duplicated for receivers
    'rush_attempt': 'sum',
    'rushing_yards': 'sum',# Sum passing yards
    'rush_touchdown': 'sum',
    'lateral_rush': 'sum',
    'receiving_yards': 'sum',
    'yards_after_catch':'sum',
    'touchdown':'sum',
    'fumble': 'sum',
    'two_points': 'sum'
}).reset_index()


rushing_stats = receiver_rusher_stats.groupby(['game_id', 'game_date', 'week', 'div_game','posteam','defteam', 'home_team', 'away_team', 'weather', 'stadium',  'spread_line', 'total_line', 'roof', 'surface', 'temp', 'wind', 'home_coach', 'away_coach', 'rusher_player_id', 'rusher_player_name','season']).agg({
    'passing_yards': 'sum',
    'air_yards': 'sum',
    'pass_touchdown': 'sum', 
    'pass_attempt': 'sum',
    'reception': 'sum',
    'interception': 'sum',
    'rush_attempt': 'sum',
    'rushing_yards': 'sum',# Sum passing yards
    'rush_touchdown': 'sum',
    'lateral_rush': 'sum',
    'receiving_yards': 'sum',
    'yards_after_catch': 'sum',
    'touchdown':'sum',
    'fumble': 'sum',
    'two_points': 'sum'
}).reset_index()


## Grabbing seasonal info


team['team'] = team['team'].replace({'OAK':'LV', 'STL':'LA', 'SD':'LAC','HST':'HOU', 'BLT':'BAL', 'CLV':'CLE','SL':'LA','ARZ':'ARI'})


# team.rename(columns = {'player_id':'passer_player_id'},inplace = True)

## Standardizing Columns
rushing_stats.rename(columns = {'rusher_player_id':'player_id'}, inplace = True)

receiver_stats.rename(columns = {'receiver_player_id':'player_id'}, inplace = True)

rushing_stats.rename(columns = {'rusher_player_name':'player_name'}, inplace = True)

receiver_stats.rename(columns = {'receiver_player_name':'player_name'}, inplace = True)


rusher_receiver_df = pd.concat([receiver_stats,rushing_stats])




# rusher_receiver_df = rusher_receiver_df.merge(team, on = ['player_id','season'], how = 'inner')


## Aggregate average score to opposition 

# rusher_receiver_df['opponent_team'] = rusher_receiver_df.apply(get_opposing_team,axis = 1)

# print('Number Missing Opponent:' + str(rusher_receiver_df[rusher_receiver_df['opponent_team'].isna()].shape[0]))
# rusher_receiver_df = rusher_receiver_df[~rusher_receiver_df['opponent_team'].isna()]





game_score_info = schedules_df[['season','home_score','away_score','game_id']].copy()




rusher_receiver_df = rusher_receiver_df.merge(game_score_info, on = ['game_id','season'], how = 'inner')

# rusher_receiver_df.rename(columns = {'defteam':'opponent_team'}, inplace = True)


# rusher_receiver_df = rusher_receiver_df.groupby(['game_id', 'game_date', 'week', 'div_game', 'posteam','defteam','home_team', 'away_team', 'weather', 'stadium',  'spread_line', 'total_line', 'roof', 'surface', 'temp', 'wind', 'home_coach', 'away_coach', 'player_id', 'player_name','season','home_score','away_score','team','depth_chart_position','opponent_team']).agg({
#     'passing_yards': 'sum',
#     'air_yards': 'sum',
#     'pass_touchdown': 'sum', 
#     'pass_attempt': 'sum',
#     'reception': 'sum',
#     'interception': 'sum',
#     'rush_attempt': 'sum',
#     'rushing_yards': 'sum',# Sum passing yards
#     'rush_touchdown': 'sum',
#     'lateral_rush': 'sum',
#     'receiving_yards': 'sum',
#     'yards_after_catch': 'sum',
#     'touchdown':'sum',
#     'fumble': 'sum',
#     'two_points': 'sum'
# }).reset_index()



rusher_receiver_df = rusher_receiver_df.groupby(['game_id', 'game_date', 'week', 'div_game', 'posteam','defteam','home_team', 'away_team', 'weather', 'stadium',  'spread_line', 'total_line', 'roof', 'surface', 'temp', 'wind', 'home_coach', 'away_coach', 'player_id', 'player_name','season']).agg({
    'passing_yards': 'sum',
    'air_yards': 'sum',
    'pass_touchdown': 'sum', 
    'pass_attempt': 'sum',
    'reception': 'sum',
    'interception': 'sum',
    'rush_attempt': 'sum',
    'rushing_yards': 'sum',# Sum passing yards
    'rush_touchdown': 'sum',
    'lateral_rush': 'sum',
    'receiving_yards': 'sum',
    'yards_after_catch': 'sum',
    'touchdown':'sum',
    'fumble': 'sum',
    'two_points': 'sum'
}).reset_index()



rusher_receiver_df.rename(columns = {'defteam':'opponent_team'} , inplace = True )


# #Checking the passing stats dataframe
# rusher_receiver_df.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  receiver_rusher_stats['two_points'] = np.where(receiver_rusher_stats['two_point_conv_result'] == 'success',1,0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  receiver_rusher_stats.rename(columns = {'complete_pass':'reception'},inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team['team'] = team['team'].replace({'OAK':'LV', 'STL':'LA', 'SD'

In [10]:

rusher_receiver_df.shape

(84646, 36)

# Creating Fantasy Points Column


Offensive Players:

- Passing Yards: 1 point per 25 yards
- Passing Touchdowns: 4 points
- Passing Interceptions: -2 points
- Rushing Yards: 1 point per 10 yards
- Rushing Touchdowns: 6 points
- Receptions: 1 points (only if using PPR scoring)
- Receiving Yards: 1 point per 10 yards
- Receiving Touchdowns: 6 points
- 2-Point Conversions: 2 points
- Fumbles Lost: -2 points
- Fumble Recovered for a Touchdown: 6 points

In [11]:
rusher_receiver_df['fantasy_points'] = ((rusher_receiver_df['passing_yards']/25 )
                                         + (rusher_receiver_df['pass_touchdown'] * 4) + 
                                         (rusher_receiver_df['interception'] * -2) +
                                         (rusher_receiver_df['reception'] * 1) +
                                         (rusher_receiver_df['touchdown'] * 6) +
                                         (rusher_receiver_df['receiving_yards'] * .1) +
                                         (rusher_receiver_df['fumble'] * -2) +
                                         (rusher_receiver_df['two_points'] * 2))

In [12]:
rusher_receiver_df.head(5)['fantasy_points']

0    8.12
1    0.00
2    4.80
3    1.14
4    0.00
Name: fantasy_points, dtype: float64

# Last 5

In [13]:

df_rusher_receiver_game_level = rusher_receiver_df.groupby(['game_id', 'game_date', 'week', 'season', 'posteam', 'opponent_team', 'player_name', 'player_id']).agg({
    # Game level
    'home_team': 'first',
    'away_team': 'first',

    # Play level
    'fantasy_points': 'sum',
    'passing_yards': 'sum',
    'air_yards': 'sum',
    'pass_touchdown': 'sum', 
    'pass_attempt': 'sum',
    'reception': 'sum',
    'interception': 'sum',
    'rush_attempt': 'sum',
    'rushing_yards': 'sum',# Sum passing yards
    'rush_touchdown': 'sum',
    'lateral_rush': 'sum',
    'receiving_yards': 'sum',
    'yards_after_catch': 'sum',
    'touchdown':'sum',
    'fumble': 'sum',
    'two_points': 'sum'

})

df_rusher_receiver_game_level["home"] = df_rusher_receiver_game_level["home_team"] == df_rusher_receiver_game_level.index.get_level_values("posteam")
df_rusher_receiver_game_level.drop(columns=['home_team', 'away_team'], inplace=True)

In [14]:
def calc_agg_stats(group, fields, career=True):
    # Create a copy to avoid modifying the original
    # df = pd.DataFrame({'game_date': group['game_date']}, index=group.index)
    df = pd.DataFrame(index=group.index)
    
    # Sort chronologically
    group_sorted = group.sort_values('game_date')

    # Calculate the number of unique games for career, season, and prior season
    if career:
        df['n_games_career'] = range(len(group_sorted))

    df['n_games_season'] = group_sorted.groupby(
        group_sorted.index.get_level_values('season')
    ).cumcount()

    # df['n_games_prior_season'] = group_sorted.groupby(
    #     group_sorted.index.get_level_values('season')
    # ).transform('size').shift()



    # Calculate aggregate stats
    for field in fields:
        if career:
            # Career stats
            df[f'{field}_mean_career'] = group_sorted[field].transform(lambda x: x.expanding().mean().shift())
            df[f'{field}_total_career'] = group_sorted[field].transform(lambda x: x.expanding().sum().shift())
        
        # Season stats
        df[f'{field}_mean_season'] = group_sorted.groupby([group_sorted.index.get_level_values('season')])[field].transform(lambda x: x.expanding().mean().shift())
        df[f'{field}_total_season'] = group_sorted.groupby([group_sorted.index.get_level_values('season')])[field].transform(lambda x: x.expanding().sum().shift())

        # # Prior season stats
        # df[f'{field}_mean_prior_season'] = group_sorted.groupby([group_sorted.index.get_level_values('season') - 1])[field].transform('mean')
        
        # Last 5 games
        df[f'{field}_mean_last5'] = group_sorted[field].transform(lambda x: x.rolling(window=5, min_periods=1).mean().shift())
        df[f'{field}_total_last5'] = group_sorted[field].transform(lambda x: x.rolling(window=5, min_periods=1).sum().shift())
        # Last Game
        df[f'{field}_last'] = group_sorted[field].shift()
    return df

In [15]:
fields = ['fantasy_points','reception','rushing_yards','touchdown','receiving_yards','fumble','passing_yards','pass_touchdown','two_points']


# Apply the function
df_rusher_receiver_game_level = df_rusher_receiver_game_level.groupby(['player_name', 'player_id']).apply(calc_agg_stats, fields=fields)



In [17]:
df_rusher_receiver_game_level = df_rusher_receiver_game_level.reset_index(0).reset_index(0).drop(columns = ['player_name','player_id']).reset_index()

KeyError: "['player_name', 'player_id'] not found in axis"

In [19]:
df_rusher_receiver_game_level.sort_values('game_date', ascending = False).head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,n_games_career,n_games_season,fantasy_points_mean_career,fantasy_points_total_career,fantasy_points_mean_season,fantasy_points_total_season,fantasy_points_mean_last5,fantasy_points_total_last5,fantasy_points_last,reception_mean_career,reception_total_career,reception_mean_season,reception_total_season,reception_mean_last5,reception_total_last5,reception_last,rushing_yards_mean_career,rushing_yards_total_career,rushing_yards_mean_season,rushing_yards_total_season,rushing_yards_mean_last5,rushing_yards_total_last5,rushing_yards_last,touchdown_mean_career,touchdown_total_career,touchdown_mean_season,touchdown_total_season,touchdown_mean_last5,touchdown_total_last5,touchdown_last,receiving_yards_mean_career,receiving_yards_total_career,receiving_yards_mean_season,receiving_yards_total_season,receiving_yards_mean_last5,receiving_yards_total_last5,receiving_yards_last,fumble_mean_career,fumble_total_career,fumble_mean_season,fumble_total_season,fumble_mean_last5,fumble_total_last5,fumble_last,passing_yards_mean_career,passing_yards_total_career,passing_yards_mean_season,passing_yards_total_season,passing_yards_mean_last5,passing_yards_total_last5,passing_yards_last,pass_touchdown_mean_career,pass_touchdown_total_career,pass_touchdown_mean_season,pass_touchdown_total_season,pass_touchdown_mean_last5,pass_touchdown_total_last5,pass_touchdown_last,two_points_mean_career,two_points_total_career,two_points_mean_season,two_points_total_season,two_points_mean_last5,two_points_total_last5,two_points_last
game_id,game_date,week,season,posteam,opponent_team,player_name,player_id,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1
2024_09_WAS_NYG,2024-11-03,9,2024,WAS,NYG,Z.Ertz,00-0030061,119,7,15.095462,1796.36,11.46,80.22,12.124,60.62,17.78,4.890756,582.0,4.142857,29.0,4.4,22.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.336134,40.0,0.142857,1.0,0.2,1.0,0.0,52.369748,6232.0,46.142857,323.0,46.6,233.0,77.0,0.067227,8.0,0.142857,1.0,0.2,1.0,0.0,52.344538,6229.0,46.142857,323.0,46.6,233.0,77.0,0.336134,40.0,0.142857,1.0,0.2,1.0,0.0,0.008403,1.0,0.0,0.0,0.0,0.0,0.0
2024_09_LAC_CLE,2024-11-03,9,2024,CLE,LAC,J.Akins,00-0034364,38,6,6.193684,235.36,7.2,43.2,7.284,36.42,0.0,1.894737,72.0,2.5,15.0,2.4,12.0,0.0,0.078947,3.0,0.0,0.0,0.0,0.0,0.0,0.157895,6.0,0.166667,1.0,0.2,1.0,0.0,21.684211,824.0,21.666667,130.0,20.6,103.0,0.0,0.078947,3.0,0.0,0.0,0.0,0.0,0.0,21.684211,824.0,21.666667,130.0,20.6,103.0,0.0,0.131579,5.0,0.166667,1.0,0.2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2024_09_LAC_CLE,2024-11-03,9,2024,LAC,CLE,J.Palmer,00-0036988,11,2,6.785455,74.64,5.99,11.98,10.68,53.4,7.32,2.181818,24.0,2.0,4.0,3.0,15.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.181818,376.0,28.5,57.0,52.0,260.0,38.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.181818,376.0,28.5,57.0,52.0,260.0,38.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,1.0,0.0,0.0,0.2,1.0,0.0
2024_09_LAC_CLE,2024-11-03,9,2024,LAC,CLE,J.Herbert,00-0036355,21,3,0.571429,12.0,-0.666667,-2.0,-0.8,-4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.238095,278.0,6.666667,20.0,17.6,88.0,2.0,0.142857,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.190476,4.0,0.333333,1.0,0.4,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,1.0,0.0,0.0,0.0,0.0,0.0
2024_09_LAC_CLE,2024-11-03,9,2024,LAC,CLE,J.Dobbins,00-0036158,23,3,6.315652,145.26,6.746667,20.24,8.052,40.26,8.84,1.304348,30.0,2.0,6.0,2.0,10.0,2.0,67.043478,1542.0,90.333333,271.0,85.2,426.0,96.0,0.608696,14.0,0.666667,2.0,0.6,3.0,1.0,9.086957,209.0,5.333333,16.0,11.8,59.0,6.0,0.130435,3.0,0.0,0.0,0.0,0.0,0.0,9.086957,209.0,5.333333,16.0,11.8,59.0,6.0,0.043478,1.0,0.0,0.0,0.2,1.0,0.0,0.086957,2.0,0.0,0.0,0.0,0.0,0.0


# Opponent Last Scored

In [20]:
schedules_df_copy = schedules_df[schedules_df['game_id'].isin(schedules_df['game_id'].unique()) & (schedules_df['gameday'] >= '2001-09-09')]
schedules_df_copy.rename(columns = {'gameday':'game_date'}, inplace = True)

home_teams = schedules_df_copy[['game_id', 'game_date','season','home_team','away_score','week']].copy()

away_teams = schedules_df_copy[['game_id', 'game_date','season','away_team','home_score','week']].copy()

home_teams.rename(columns = {'home_team':'team','away_score':'points_allowed'}, inplace = True)
away_teams.rename(columns = {'away_team':'team','home_score':'points_allowed'}, inplace = True)

points_allowed_df = pd.concat([home_teams,away_teams])

points_allowed_df = points_allowed_df.groupby(['game_id', 'game_date','season','week','team']).agg({'points_allowed':'sum'})

group_sorted = points_allowed_df.sort_values('week')

# pa_df = group_sorted.groupby(['team']).apply(calc_agg_stats, fields=['points_allowed']).reset_index(0).drop(columns = 'team').reset_index()[['game_id','game_date','season','week','team','points_allowed_mean_season','points_allowed_mean_last5']]
pa_df = group_sorted.groupby(['team']).apply(calc_agg_stats, fields=['points_allowed'])
pa_df = pa_df.reset_index()[['game_id','game_date','season','week','team','points_allowed_mean_season','points_allowed_mean_last5']]


pa_df.rename(columns = {'team':'opponent_team'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  schedules_df_copy.rename(columns = {'gameday':'game_date'}, inplace = True)


In [21]:
group_sorted.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,points_allowed
game_id,game_date,season,week,team,Unnamed: 5_level_1
2001_01_ATL_SF,2001-09-09,2001,1,ATL,16.0
2003_01_STL_NYG,2003-09-07,2003,1,STL,23.0
2003_01_TB_PHI,2003-09-08,2003,1,PHI,17.0
2003_01_TB_PHI,2003-09-08,2003,1,TB,0.0
2015_01_PIT_NE,2015-09-10,2015,1,PIT,28.0


### Features from pa_df are the oppositions points allowed until a certain measure of time

In [22]:

rusher_receiver_features = rusher_receiver_df.merge(df_rusher_receiver_game_level, how = 'inner' ,on = ['game_id','game_date','week','season','posteam','opponent_team','player_name','player_id'])
# rusher_receiver_features['opponent_team'] = np.where(rusher_receiver_features['team'] == rusher_receiver_features['home_team'],rusher_receiver_features['away_team'],rusher_receiver_features['home_team'])
rusher_receiver_features = rusher_receiver_features.merge(pa_df , how = 'inner',on = ['game_date','season','week','opponent_team','game_id'])


rusher_receiver_features = rusher_receiver_features.fillna(0)

# Last Game Stats

In [23]:
rusher_receiver_features.shape

(78841, 104)

In [24]:
df_combined = rusher_receiver_features.copy()

In [25]:
# Calculate the percentage of null values in each column
null_percentages = rusher_receiver_features.isnull().mean() * 100

# Sort the percentages in descending order for better readability
null_percentages = null_percentages.sort_values(ascending=False)

# Format the output to display percentages with two decimal places
null_percentages_formatted = null_percentages.apply(lambda x: f"{x:.2f}%")

# Print the results
print("Percentage of Null Values in Each Column:")
print(null_percentages_formatted)





Percentage of Null Values in Each Column:
game_id                      0.00%
game_date                    0.00%
fumble_mean_season           0.00%
fumble_total_career          0.00%
fumble_mean_career           0.00%
                             ...  
receiving_yards              0.00%
lateral_rush                 0.00%
rush_touchdown               0.00%
rushing_yards                0.00%
points_allowed_mean_last5    0.00%
Length: 104, dtype: object


In [26]:
# Ensure 'temp' and 'wind' are numeric
df_combined['temp'] = pd.to_numeric(df_combined['temp'], errors='coerce')
df_combined['wind'] = pd.to_numeric(df_combined['wind'], errors='coerce')

# Calculate mean 'temp' and 'wind' by stadium
temp_wind_means = (
    df_combined.groupby('stadium')[['temp', 'wind']]
    .mean()
    .reset_index()
)

# Merge the mean values back to the original DataFrame
df_combined = pd.merge(
    df_combined,
    temp_wind_means,
    on='stadium',
    how='left',
    suffixes=('', '_mean')
)

# Impute missing 'temp' and 'wind' with the group mean values
df_combined['temp'].fillna(df_combined['temp_mean'], inplace=True)
df_combined['wind'].fillna(df_combined['wind_mean'], inplace=True)

# If any missing 'temp' or 'wind' values remain, fill them with the overall mean
df_combined['temp'].fillna(df_combined['temp'].mean(), inplace=True)
df_combined['wind'].fillna(df_combined['wind'].mean(), inplace=True)

# Drop the temporary mean columns
df_combined.drop(columns=['temp_mean', 'wind_mean'], inplace=True)

# For the rest of the columns, fill missing values with 0
# Exclude 'temp' and 'wind' as they've already been imputed
columns_to_fill = df_combined.columns.difference(['temp', 'wind'])
df_combined[columns_to_fill] = df_combined[columns_to_fill].fillna(0)

# Check if any missing values remain
remaining_nulls = df_combined.isnull().sum()
if remaining_nulls.sum() > 0:
    print("Remaining null values after imputation:")
    print(remaining_nulls[remaining_nulls > 0])
else:
    print("All missing values have been imputed.")

All missing values have been imputed.


In [27]:
x_vars = ['home_team','away_team','spread_line',
 'n_games_career',
 'n_games_season',
 'fantasy_points_mean_career',
 'fantasy_points_total_career',
 'fantasy_points_mean_season',
 'fantasy_points_total_season',
 'fantasy_points_mean_last5',
 'fantasy_points_total_last5',
 'fantasy_points_last',
 'reception_mean_career',
 'reception_total_career',
 'reception_mean_season',
 'reception_total_season',
 'reception_mean_last5',
 'reception_total_last5',
 'reception_last',
 'rushing_yards_mean_career',
 'rushing_yards_total_career',
 'rushing_yards_mean_season',
 'rushing_yards_total_season',
 'rushing_yards_mean_last5',
 'rushing_yards_total_last5',
 'rushing_yards_last',
 'touchdown_mean_career',
 'touchdown_total_career',
 'touchdown_mean_season',
 'touchdown_total_season',
 'touchdown_mean_last5',
 'touchdown_total_last5',
 'touchdown_last',
 'receiving_yards_mean_career',
 'receiving_yards_total_career',
 'receiving_yards_mean_season',
 'receiving_yards_total_season',
 'receiving_yards_mean_last5',
 'receiving_yards_total_last5',
 'receiving_yards_last',
 'fumble_mean_career',
 'fumble_total_career',
 'fumble_mean_season',
 'fumble_total_season',
 'fumble_mean_last5',
 'fumble_total_last5',
 'fumble_last',
 'passing_yards_mean_career',
 'passing_yards_total_career',
 'passing_yards_mean_season',
 'passing_yards_total_season',
 'passing_yards_mean_last5',
 'passing_yards_total_last5',
 'passing_yards_last',
 'pass_touchdown_mean_career',
 'pass_touchdown_total_career',
 'pass_touchdown_mean_season',
 'pass_touchdown_total_season',
 'pass_touchdown_mean_last5',
 'pass_touchdown_total_last5',
 'pass_touchdown_last',
 'two_points_mean_career',
 'two_points_total_career',
 'two_points_mean_season',
 'two_points_total_season',
 'two_points_mean_last5',
 'two_points_total_last5',
 'two_points_last',
 'opponent_team',
 'points_allowed_mean_season',
 'points_allowed_mean_last5']

# Feature Selection

In [32]:
from nfl_model import NFLModel


In [33]:
def get_dummy_variables(df, drop_first=True, dummy_na=False):
    """
    Converts non-numerical columns in a DataFrame to dummy variables.

    Parameters:
    - df: pandas DataFrame
        The input DataFrame containing the data.
    - drop_first: bool, default=False
        Whether to drop the first level of categorical variables to avoid the dummy variable trap.
    - dummy_na: bool, default=False
        Add a column to indicate NaNs, if False NaNs are ignored.

    Returns:
    - df_dummies: pandas DataFrame
        The DataFrame with non-numeric columns converted to dummy variables.
    """
    # Identify non-numeric columns
    non_numeric_cols = df.select_dtypes(exclude=['number', 'bool']).columns.tolist()

    # If there are no non-numeric columns, return the original DataFrame
    if not non_numeric_cols:
        print("No non-numerical columns to convert.")
        return df.copy()

    # Convert categorical variables to dummy variables
    df_dummies = pd.get_dummies(df, columns=non_numeric_cols, drop_first=drop_first, dummy_na=dummy_na)

    return df_dummies

In [34]:
columns_to_include = df_combined.columns.difference(['game_id', 'game_date', 'player_name'])
final_df = df_combined[x_vars + ['fantasy_points']].copy()
# final_df['player_id'] = final_df['player_id'].astype('category')
final_df = get_dummy_variables(final_df)

y_var = 'fantasy_points'

In [31]:
# Initialize the model
model = NFLModel(data=final_df, target_variable=y_var)

# Preprocess data
model.preprocess_data()

# Perform feature selection
model.feature_selection()

# Evaluate models
model.evaluate_models()

# Get and print the results
results_df = model.get_results()
print(results_df)

Data preprocessing completed.
Lasso selected features: ['n_games_career', 'n_games_season', 'fantasy_points_mean_career', 'fantasy_points_total_season', 'fantasy_points_total_last5', 'reception_mean_season', 'reception_mean_last5', 'reception_total_last5', 'reception_last', 'rushing_yards_mean_season', 'rushing_yards_last', 'touchdown_total_season', 'receiving_yards_mean_career', 'receiving_yards_total_last5', 'fumble_mean_career', 'passing_yards_mean_career', 'pass_touchdown_total_career', 'two_points_total_career', 'points_allowed_mean_last5', 'away_team_IND', 'opponent_team_DET', 'opponent_team_PIT', 'opponent_team_TEN']
Elastic Net selected features: ['n_games_career', 'n_games_season', 'fantasy_points_mean_career', 'fantasy_points_total_career', 'fantasy_points_mean_season', 'fantasy_points_total_season', 'fantasy_points_total_last5', 'reception_mean_career', 'reception_mean_season', 'reception_total_season', 'reception_mean_last5', 'reception_total_last5', 'reception_last', 'rush

In [35]:
model.evaluate_models_train_data()

Model evaluation completed.


In [36]:
model.results

{'Method': ['Lasso', 'Lasso', 'ElasticNet', 'ElasticNet'],
 'Model': ['Linear Regression',
  'Random Forest',
  'Linear Regression',
  'Random Forest'],
 'MAE': [5.343331544187606,
  2.136150635070646,
  5.334698351023667,
  2.065068551975472],
 'MSE': [56.11856016071055,
  9.582633558519158,
  55.917493199436365,
  8.78914995069062],
 'R2': [0.30674250994042007,
  0.8816214730043854,
  0.30922637938588815,
  0.8914237283151295]}

# Scoreout Data

In [313]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

def preprocess_data(data,target_variable):
        """
        Preprocesses the data by splitting into training and testing sets,
        converting categorical variables to dummy variables, and scaling the features.
        """
        # Separate features and target
        X = data.drop(columns=[target_variable])
        y = data[target_variable]

        # Split data into training and testing sets
        X_train_raw, X_test_raw, y_train, y_test = train_test_split(
            X, y, test_size=.2, random_state= 42
        )

        # Align the training and testing data
        X_train, X_test = X_train_raw.align(X_test_raw, join='left', axis=1, fill_value=0)

        # Convert y_train and y_test to 1D arrays if necessary
        y_train = y_train.values.ravel()
        y_test = y_test.values.ravel()

        # Standardize data
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)


        return X_train_scaled,X_test_scaled,y_train,y_test



In [314]:
final_features = model.elastic_net_features

In [317]:
x_train,x_test,y_train,y_test = preprocess_data(final_df, 'fantasy_points')


model = LinearRegression()

model.fit(x_train,y_train)


In [331]:
# Predict train and test
y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)


final_df['predictions'] = np.hstack([y_pred_train, y_pred_test])

In [339]:
 
 final_df[['game_id',
 'game_date',
 'week',
 'div_game',
 'posteam',
 'opponent_team',
 'home_team',
 'away_team',
 'weather',
 'stadium',
 'spread_line',
 'total_line',
 'roof',
 'surface',
 'temp',
 'wind',
 'home_coach',
 'away_coach',
 'player_id',
 'player_name',
 'season']] = df_combined[['game_id',
 'game_date',
 'week',
 'div_game',
 'posteam',
 'opponent_team',
 'home_team',
 'away_team',
 'weather',
 'stadium',
 'spread_line',
 'total_line',
 'roof',
 'surface',
 'temp',
 'wind',
 'home_coach',
 'away_coach',
 'player_id',
 'player_name',
 'season']]

In [341]:
final_df.to_csv('./rb_wr_predictions.csv')

## Performance

In [215]:
from sklearn.metrics import mean_absolute_error,mean_squared_error



In [343]:
# preds = ebm.predict(X_test_raw)

# preds_df['actual'] = y_test['fantasy_points']



print("MAE: " + str(mean_absolute_error(y_pred_test, y_test)))

print("MSE: " + str(mean_squared_error(y_pred_test, y_test)))




MAE: 5.321056013474444
MSE: 55.95881634529496
