In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import pytz
import requests
import warnings
import json
# import matplotlib.pyplot as plt
#import seaborn as sns

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',None)

# Read in Excel Data

In [2]:
excel_path = '/'.join(['.','Data',
                      'NBA Stats_2021-2023_01292024.xlsx'])

print('Reading in games...')
games = pd.read_excel(excel_path,
                     sheet_name = 'Games',
                     header = 0,
                     index_col = 'game_id',
                     parse_dates = ['game_date'],
                     engine='openpyxl')

print('Reading in players...')
players = pd.read_excel(excel_path,
                     sheet_name = 'Players',
                     header = 0,
                       index_col = 'player_id',
                       engine='openpyxl')

print('Reading in teams...')
teams = pd.read_excel(excel_path,
                     sheet_name = 'Teams',
                     header = 0,
                     index_col = 'team_id',
                     engine='openpyxl')

print('Reading in stats...')
stats = pd.read_excel(excel_path,
                     sheet_name = 'Stats',
                     header = 0,
                     engine='openpyxl')

Reading in games...
Reading in players...
Reading in teams...
Reading in stats...


In [3]:
games.head()

Unnamed: 0_level_0,game_date,game_season,game_status,game_period,game_time,game_home_team_id,game_home_team_score,game_visitor_team_id,game_visitor_team_score,game_preseason,game_postseason
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
473410,2021-10-19,2021,Final,4,,17,127,3,104,N,False
473409,2021-10-19,2021,Final,4,,14,114,10,121,N,False
473413,2021-10-20,2021,Final,4,,28,83,30,98,N,False
473415,2021-10-20,2021,Final,4,,15,132,6,121,N,False
473414,2021-10-20,2021,Final,6,,20,138,2,134,N,False


In [4]:
players.head()

Unnamed: 0_level_0,player_team_id,player_position,player_full_name,player_first_name,player_last_name,player_height_feet,player_height_inches,player_weight_pounds
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
17896049,3,G,David Duke Jr.,David,Duke Jr.,,,
17553995,14,G,Austin Reaves,Austin,Reaves,,,
666543,14,F,Sekou Doumbouya,Sekou,Doumbouya,,,
59,14,G,Avery Bradley,Avery,Bradley,6.0,2.0,180.0
220,14,C,Dwight Howard,Dwight,Howard,6.0,11.0,265.0


In [5]:
stats.head()

Unnamed: 0,id,game_id,player_id,team_id,opponent_team_id,min,fga,fgm,fg_pct,fta,ftm,ft_pct,fg3a,fg3m,fg3_pct,oreb,dreb,reb,pts,ast,stl,blk,pf,turnover
0,7315088,473410,17896049,3,17,0:00,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0
1,7315614,473409,17553995,14,10,0:00,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0
2,7315611,473409,666543,14,10,0:00,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0
3,7315610,473409,59,14,10,8,3,2,0.667,0,0,,3,2,0.667,1,0,1,6,1,0,0,2,1
4,7315612,473409,220,14,10,13,2,1,0.5,4,3,0.75,0,0,,0,6,6,5,0,0,0,2,2


In [6]:
# Convert 'min' to numeric and fill NaN with 0
stats['min'] = pd.to_numeric(stats['min'],
                             errors = 'coerce')\
                .fillna(0)

In [7]:
# Fantasy points (PrizePicks)
## Points = 1
## Rebound = 1.2
## Assists = 1.5
## Block = 3
## Steals = 3
## Turnover = -1
stats['fpts'] = stats['pts']\
                + (1.2 * stats['reb'])\
                + (1.5 * stats['ast'])\
                + (3 * stats['blk'])\
                + (3 * stats['stl'])\
                + (-1 * stats['turnover'])

In [8]:
# Merge date of games
stats_dates = pd.merge(stats,
                      games[['game_date']],
                      how = 'left',
                      left_on = 'game_id',
                      right_index = True)\
                .sort_values(['game_date','player_id'])

In [9]:
# Merge player positions
stats_dates_pos = pd.merge(stats_dates,
                           players[['player_position']],
                           how = 'left',
                           left_on = 'player_id',
                           right_index = True)

In [10]:
# Calculate pts per 36 min by game
#stats_dates_pos['pts_per36'] = 36 * (stats_dates_pos['pts']/stats_dates_pos['min'])

In [11]:
# Determine rolling_period on rolling metrics for entire analysis
rolling_period = 62    # Based on number of observations/games, not number of days

# rolling_period_days = '120D'    # Based on number of days, not number of observations/games

# Team Stats

In [12]:
stats_dates_pos.head()

Unnamed: 0,id,game_id,player_id,team_id,opponent_team_id,min,fga,fgm,fg_pct,fta,ftm,ft_pct,fg3a,fg3m,fg3_pct,oreb,dreb,reb,pts,ast,stl,blk,pf,turnover,fpts,game_date,player_position
26,7315084,473410,6,3,17,16.0,2,0,0.0,4,1,0.25,0,0,,0,3,3,1,1,0,1,2,1,8.1,2021-10-19,F
35,7315097,473410,8,17,3,28.0,10,3,0.3,1,1,1.0,10,3,0.3,0,4,4,10,6,2,0,0,0,29.8,2021-10-19,G
48,7315095,473410,15,17,3,31.0,25,12,0.48,9,7,0.778,4,1,0.25,5,9,14,32,7,1,2,4,4,64.3,2021-10-19,F
6,7315609,473409,17,14,10,26.0,9,3,0.333,2,1,0.5,4,2,0.5,0,4,4,9,2,0,0,2,1,15.8,2021-10-19,F
9,7315607,473409,36,14,10,31.0,9,3,0.333,0,0,,8,2,0.25,0,2,2,8,0,0,1,4,1,12.4,2021-10-19,G


In [13]:
def team_pts_rolling_mean(group):
    group_indexed = group.set_index('game_date')
    
    rolling_group = group_indexed\
                    [['pts']]\
                    .rolling(rolling_period,
                             min_periods = 1)\
                    .mean()\
                    .rename(columns = {'pts':'pgm'})
    
    return rolling_group

## Defensive Efficiency by Position

In [14]:
team_def_eff_pos = stats_dates_pos\
                        .groupby(['opponent_team_id','game_date','player_position'])\
                        [['pts']]\
                        .sum()\
                        .reset_index()

In [15]:
team_stats_rolling_def_eff = team_def_eff_pos.groupby(['opponent_team_id','player_position'])\
                            .apply(team_pts_rolling_mean)\
                            .reset_index()\
                            .rename(columns = {'opponent_team_id':'team_id',
                                              'pgm':'def_pgm'})

## Offensive Efficiency by Position

In [16]:
team_off_eff_pos = stats_dates_pos\
                        .groupby(['team_id','game_date','player_position'])\
                        [['pts']]\
                        .sum()\
                        .reset_index()

In [17]:
team_stats_rolling_off_eff = team_off_eff_pos.groupby(['team_id','player_position'])\
                            .apply(team_pts_rolling_mean)\
                            .reset_index()\
                            .rename(columns = {'pgm':'off_pgm'})\

SyntaxError: unexpected EOF while parsing (1578752808.py, line 4)

## Merge Team Defensive and Offensive Efficiency by Position

In [None]:
team_eff_pos = pd.merge(team_stats_rolling_off_eff,
                        team_stats_rolling_def_eff,
                        how = 'outer',
                        left_on = ['team_id','game_date','player_position'],
                        right_on = ['team_id','game_date','player_position'])

In [None]:
team_eff_pos[(team_eff_pos['game_date'].isin(['2021-10-19']))
            & (team_eff_pos['player_position'] == 'C')]

## League Team Averages

In [None]:
def league_team_pts_stats(group):
    '''
    Grouping function to get standard deviation of pts_per36 for each player on a given date
    Need to account for offseason/days where there are no games
    '''
    # Sort by date and player prior to resampling
    group_sorted = group.sort_values(['game_date','team_id'])\
                        .set_index(['game_date'])
    
    # Resample data to daily by each player
    ## Forward fill blank values
    group_resampled = group_sorted.groupby('team_id')\
                        .resample('1D')\
                        [['pts_off','pts_def']]\
                        .last()
    
    # Were games played on date
    ## Due to resample, dates in the off season were added
    ## Need to remove; will cause calculations over at the beginning of each season
    date_no_minutes = group_resampled.groupby(level = 1)\
                        .apply(lambda x: x.isna()\
                                           .all()
                              )
    
    # Drop dates with no games
    ## Includes in-season breaks
    date_no_games = date_no_minutes[(date_no_minutes['pts_off'] == True)
                                   & (date_no_minutes['pts_def'] == True)].index
    
    group_resampled.drop(index = date_no_games,
                        level = 1,
                        inplace = True)
    
    # Forward fill values by player
    final_group_resampled = group_resampled.groupby(level = [0])[['pts_off','pts_def']].ffill()
    
    # Calculate the standard deviation of pts_per36 for all players by date
    final_rolling_stats = final_group_resampled.groupby(level = 1).agg(['mean','std'])
    
    return final_rolling_stats

In [None]:
# Calculate teams offensive pts production by position per game
team_pos_off_pts_total = stats_dates_pos.groupby(['game_date',
                                                  'player_position',
                                                  'team_id'])\
                            [['pts']]\
                            .sum()\
                            .reset_index()\
                            .rename(columns = {'pts':'pts_off'})

# Calculate teams defensive pts production by position per game
team_pos_def_pts_total = stats_dates_pos.groupby(['game_date',
                                                  'player_position',
                                                  'opponent_team_id'])\
                            [['pts']]\
                            .sum()\
                            .reset_index()\
                            .rename(columns = {'opponent_team_id':'team_id',
                                               'pts':'pts_def'})

# Merge teams offensive and defensive production by position
team_pos_eff_total = pd.merge(team_pos_off_pts_total,
                               team_pos_def_pts_total,
                               how = 'outer',
                               left_on = ['game_date','team_id','player_position'],
                               right_on = ['game_date','team_id','player_position'])

In [None]:
team_pos_rolling_eff = team_pos_eff_total\
                        .groupby('player_position')\
                        .apply(league_team_pts_stats)

In [None]:
team_pos_rolling_eff.columns = ['_'.join(i) for i in team_pos_rolling_eff.columns]

In [None]:
team_pos_rolling_eff.reset_index(inplace = True)

In [None]:
team_pos_rolling_eff

# Player Stats

## Individual Player Efficiency

In [18]:
def player_pts_rolling_sum(group):
    group_indexed = group.sort_values('game_date')\
                        .set_index('game_date')

    group_index['prev_game_date'] = group_index['game_date'].shift(1)

    group_index['days_since_last_game'] = (group_index['game_date'] - group_index['prev_game_date']).days
    
    rolling_group = group_indexed\
                    [['min','pts']]\
                    .rolling(rolling_period,
                             min_periods = 1)\
                    .sum()
    
    return rolling_group

In [71]:
stats_dates[stats_dates['player_id'] == 417]['player_id'].cumsum()

634        417
1012       834
31943     1251
32373     1668
32994     2085
         ...  
94723    52542
95172    52959
95692    53376
96219    53793
96621    54210
Name: player_id, Length: 130, dtype: int64

In [19]:
player_stats_rolling_eff = stats_dates.groupby(stats_dates['player_id'])\
                            .apply(player_pts_rolling_sum)

# player_stats_rolling_eff = stats_dates.set_index('game_date')\
#                             .groupby(stats_dates['player_id'])\
#                             [['min','pts']]\
#                             .rolling(rolling_period,
#                                     min_periods = 1)\
#                             .sum()\
#                             .shift(1)\
#                             .dropna(how = 'all')

In [20]:
stat_categories = player_stats_rolling_eff.columns.drop('min')

In [21]:
for cat in stat_categories:
    player_stats_rolling_eff[cat+'_per36'] = player_stats_rolling_eff[cat] * (36/player_stats_rolling_eff['min'])

In [22]:
actual_eff_merged = pd.merge(stats_dates_pos[['game_date','player_id','player_position',
                                              'min','pts']],
                             player_stats_rolling_eff[['pts_per36']],
                             how = 'left',
                             left_on = ['player_id','game_date'],
                             right_index = True)#\
                    #.sort_values(['player_id','game_date'])

In [23]:
per36_stat_cols = [i for i in actual_eff_merged.columns if ('_per36' in i)]

In [24]:
player_stats_rolling_eff

Unnamed: 0_level_0,Unnamed: 1_level_0,min,pts,pts_per36
player_id,game_date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3,2021-10-20,32.0,8.0,9.000000
3,2021-10-23,59.0,25.0,15.254237
3,2021-10-24,93.0,39.0,15.096774
3,2021-10-27,111.0,43.0,13.945946
3,2021-10-28,132.0,55.0,15.000000
...,...,...,...,...
59421201,2023-12-13,0.0,0.0,
59421201,2023-12-15,0.0,0.0,
59421201,2023-12-16,0.0,0.0,
59421201,2023-12-18,0.0,0.0,


In [25]:
actual_eff_merged

Unnamed: 0,game_date,player_id,player_position,min,pts,pts_per36
26,2021-10-19,6,F,16.0,1,2.250000
35,2021-10-19,8,G,28.0,10,12.857143
48,2021-10-19,15,F,31.0,32,37.161290
6,2021-10-19,17,F,26.0,9,12.461538
9,2021-10-19,36,G,31.0,8,9.290323
...,...,...,...,...,...,...
96762,2024-01-29,56677866,F,12.0,0,9.818182
96659,2024-01-29,56677871,F,0.0,0,2.769231
96906,2024-01-29,56677872,G,0.0,0,16.095032
96514,2024-01-29,56783340,C,4.0,0,18.293160


## League Position Average

In [26]:
def league_player_pts_stats(group):
    '''
    Grouping function to get standard deviation of pts_per36 for each player on a given date
    Need to account for offseason/days where there are no games
    '''
    # Sort by date and player prior to resampling
    group_sorted = group.sort_values(['game_date','player_id'])\
                        .set_index(['game_date'])
    
    # Resample data to daily by each player
    ## Forward fill blank values
    group_resampled = group_sorted.groupby('player_id')\
                        .resample('1D')\
                        [['pts_per36']]\
                        .last()
    
    # Were games played on date
    ## Due to resample, dates in the off season were added
    ## Need to remove; will cause calculations over at the beginning of each season
    date_no_minutes = group_resampled.groupby(level = 1)\
                        .apply(lambda x: x.isna()\
                                           .all()
                              )
    
    # Drop dates with no games
    ## Includes in-season breaks
    date_no_games = date_no_minutes[(date_no_minutes['pts_per36'] == True)].index
    
    group_resampled.drop(index = date_no_games,
                        level = 1,
                        inplace = True)
    
    # Forward fill values by player
    final_group_resampled = group_resampled.groupby(level = [0])[['pts_per36']].ffill()
    
    # Calculate the standard deviation of pts_per36 for all players by date
    final_rolling_stats = final_group_resampled.groupby(level = 1).agg(['mean','std'])
    
    return final_rolling_stats

In [28]:
league_player_rolling_stats = actual_eff_merged.groupby('player_position')\
                                .apply(league_player_pts_stats)\
                                .reset_index()

In [60]:
players[players['player_full_name'] == 'Ben Simmons']

Unnamed: 0_level_0,player_team_id,player_position,player_full_name,player_first_name,player_last_name,player_height_feet,player_height_inches,player_weight_pounds
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
417,3,G-F,Ben Simmons,Ben,Simmons,6.0,10.0,230.0


In [48]:
actual_eff_merged

Unnamed: 0,game_date,player_id,player_position,min,pts,pts_per36
26,2021-10-19,6,F,16.0,1,2.250000
35,2021-10-19,8,G,28.0,10,12.857143
48,2021-10-19,15,F,31.0,32,37.161290
6,2021-10-19,17,F,26.0,9,12.461538
9,2021-10-19,36,G,31.0,8,9.290323
...,...,...,...,...,...,...
96762,2024-01-29,56677866,F,12.0,0,9.818182
96659,2024-01-29,56677871,F,0.0,0,2.769231
96906,2024-01-29,56677872,G,0.0,0,16.095032
96514,2024-01-29,56783340,C,4.0,0,18.293160


In [64]:
player_played = actual_eff_merged['min'].ne(0)
player_played1 = player_played.ne(player_played.groupby([#actual_eff_merged['game_date'],
                                                        actual_eff_merged['player_id']]
                                                       ).shift())
s = player_played1.cumsum()
actual_eff_merged['num_dnp_before'] = np.where(player_played & player_played1,
                                          s.map(s.value_counts()).shift(),
                                          0).astype(int)

In [66]:
actual_eff_merged[actual_eff_merged['player_id'] == 417].head(50)

Unnamed: 0,game_date,player_id,player_position,min,pts,pts_per36,num_dnp_before
634,2021-10-22,417,G-F,0.0,0,,0
1012,2021-10-24,417,G-F,0.0,0,,0
31943,2022-10-19,417,G-F,23.0,4,6.26087,2
32373,2022-10-21,417,G-F,33.0,6,6.428571,0
32994,2022-10-24,417,G-F,28.0,7,7.285714,0
33630,2022-10-26,417,G-F,34.0,4,6.40678,0
33732,2022-10-27,417,G-F,37.0,7,6.503226,0
34343,2022-10-29,417,G-F,36.0,9,6.973822,0
34742,2022-10-31,417,G-F,0.0,0,6.973822,0
34944,2022-11-01,417,G-F,0.0,0,6.973822,0


# Shift Data
Need to shift team efficiency, player efficiency, and league rolling data down a row so they can be used as dependent variables on the actual predictor variable

In [223]:
# Need to shift data by 1 day
actual_eff_shifted = actual_eff_merged[['game_date','player_id','pts_per36']]\
                        .groupby(['player_id'])\
                        .apply(lambda x: x.set_index('game_date')[['pts_per36']].shift(1))\
                        .reset_index()

In [226]:
league_pos_rolling_avg_shifted = league_pos_rolling_sum[['game_date','player_position','pts_per36']]\
                                    .groupby(['player_position'])\
                                    .apply(lambda x: x.set_index('game_date')[['pts_per36']].shift(1))\
                                    .reset_index()

In [227]:
league_pos_rolling_std_shifted = league_pos_rolling_std[['game_date','player_position','pts_per36']]\
                                    .groupby(['player_position'])\
                                    .apply(lambda x: x.set_index('game_date')[['pts_per36']].shift(1))\
                                    .reset_index()

In [229]:
# Table of relevant fields from original data
rel_stats = stats_dates_pos[['game_date',
                             'player_id','player_position',
                             'team_id','opponent_team_id',
                             'min','pts']]

In [238]:
# Merge player efficiency onto original data
player_stats_eff = pd.merge(rel_stats,
                            actual_eff_shifted\
                                .rename(columns = {'pts_per36':'previous_rolling_pts_per36'}),
                            how = 'left',
                            left_on = ['game_date','player_id'],
                            right_on = ['game_date','player_id']
                           )

In [235]:
# Merge rolling statistics into one table
league_rolling_stats = pd.merge(league_pos_rolling_avg_shifted\
                                   .rename(columns = {'pts_per36':'pos_pts_per36_mean'}),
                                league_pos_rolling_std_shifted\
                                    .rename(columns = {'pts_per36':'pos_pts_per36_std'}),
                                how = 'outer',
                                left_on = ['player_position','game_date'],
                                right_on = ['player_position','game_date']
                               )

In [239]:
# Merge individual player efficiency stats to league rolling stats
player_league_rolling = pd.merge(player_stats_eff,
                                league_rolling_stats,
                                how = 'outer',
                                left_on = ['game_date','player_position'],
                                right_on = ['game_date','player_position'])

In [242]:
player_league_rolling[(player_league_rolling['game_date'] == '2021-10-31')]

Unnamed: 0,game_date,player_id,player_position,team_id,opponent_team_id,min,pts,previous_rolling_pts_per36,pos_pts_per36_mean,pos_pts_per36_std
2431,2021-10-31,6,F,3,9,21.0,16,21.386139,16.267085,5.890860
2432,2021-10-31,15,F,17,29,35.0,25,31.619048,16.267085,5.890860
2433,2021-10-31,17,F,14,11,25.0,23,20.509091,16.267085,5.890860
2434,2021-10-31,24,F,26,7,0.0,0,7.200000,16.267085,5.890860
2435,2021-10-31,30,F,26,7,38.0,15,24.064171,16.267085,5.890860
...,...,...,...,...,...,...,...,...,...,...
2559,2021-10-31,3547252,F-C,4,25,0.0,0,,17.544452,5.120224
2560,2021-10-31,3547267,F-C,9,3,18.0,5,14.033898,17.544452,5.120224
2561,2021-10-31,17896032,F-C,4,25,0.0,0,,17.544452,5.120224
2562,2021-10-31,17896062,F-C,11,14,20.0,11,16.468085,17.544452,5.120224


In [None]:
stats_dates_pos[(stats_dates_pos['game_date'] == '2021-10-19')
               & (stats_dates_pos['player_position'] == 'F')
               & (stats_dates_pos['min'].notna())]#['pts_per36'].mean()

In [None]:
team_stats_rolling_eff.head()

# Extra

In [None]:
def league_team_pts_rolling_mean(group):
    group_indexed = group.set_index('game_date')
    
    rolling_group = group_indexed.groupby(['opponent_team_id','player_position'])\
                    [['pts']]\
                    .rolling(rolling_period,
                             min_periods = 1)\
                    .mean()\
                    .shift(1)\
                    .rename(columns = {'pts':'pgm'})

    final_rolling = rolling_group.droplevel([0,1])
    
    return final_rolling

In [None]:
team_stats_rolling_eff[(team_stats_rolling_eff['opponent_team_id'] == 1)
                       & (team_stats_rolling_eff['player_position'] == 'C')]

In [None]:
team_stats_by_game[team_stats_by_game['team_id'] == 10]