In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
#import pytz
import scipy
import requests
import warnings
import json
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

# Suppress every warning for the rest of the session (deprecation notices included)
warnings.filterwarnings('ignore')
# Do not truncate columns when a DataFrame is displayed
pd.set_option('display.max_columns',None)

# Read in Excel Data

In [20]:
class NBAData:
    '''Reader for a single sheet of the NBA stats Excel workbook.

    Parameters
    ----------
    sheet_name : str
        Sheet to load, in any letter case: 'players', 'teams', 'stats'
        or 'games'.
    '''

    # Workbook location, relative to the notebook's working directory
    EXCEL_PATH = '/'.join(['.','Data',
                           'NBA Stats_2021-2023_01292024.xlsx'])

    # Column used as the DataFrame index for each sheet (None keeps the default index)
    SHEET_INDEX_COLS = {'Players':'player_id',
                        'Teams':'team_id',
                        'Stats':None,
                        'Games':'game_id'}

    def __init__(self, sheet_name):
        self.sheet_name = sheet_name

    def data(self):
        '''Read the sheet and return it as a DataFrame.

        The 'Stats' sheet is cleaned ('min' coerced to numeric with missing
        values set to 0, 'turnover' renamed to 'to') and gains an 'fpts'
        fantasy-points column. The 'Games' sheet has 'game_date' converted
        to datetime, with unparseable values becoming NaT.

        Raises
        ------
        KeyError
            If the sheet name is not one of the known sheets.
        '''
        excel_sheet_name = self.sheet_name.title()

        # Fail before any I/O (and before the progress message) on a bad name
        if excel_sheet_name not in self.SHEET_INDEX_COLS:
            raise KeyError('Unknown sheet name: %s' %excel_sheet_name)

        print('Reading in %s' %excel_sheet_name)
        raw_data = pd.read_excel(self.EXCEL_PATH,
                                 sheet_name = excel_sheet_name,
                                 header = 0,
                                 index_col = self.SHEET_INDEX_COLS[excel_sheet_name],
                                 engine = 'openpyxl')

        if excel_sheet_name == 'Stats':
            # Convert 'min' to numeric and fill NaN with 0
            raw_data['min'] = pd.to_numeric(raw_data['min'],
                                            errors = 'coerce')\
                                .fillna(0)

            # Shorthand turnover column
            raw_data = raw_data.rename(columns = {'turnover':'to'})

            # Fantasy points (PrizePicks)
            ## Points = 1
            ## Rebound = 1.2
            ## Assists = 1.5
            ## Block = 3
            ## Steals = 3
            ## Turnover = -1
            raw_data['fpts'] = raw_data['pts']\
                                + (1.2 * raw_data['reb'])\
                                + (1.5 * raw_data['ast'])\
                                + (3 * raw_data['blk'])\
                                + (3 * raw_data['stl'])\
                                + (-1 * raw_data['to'])
        elif excel_sheet_name == 'Games':
            # Assignment, not comparison: the converted dates must be stored back
            raw_data['game_date'] = pd.to_datetime(raw_data['game_date'],
                                                   errors = 'coerce')

        return raw_data

In [21]:
stats_data = NBAData('stats').data()
# Later cells refer to this frame as `stats`; bind that name so they run on a fresh kernel
stats = stats_data

Reading in Stats


In [22]:
games_data = NBAData('games').data()
# Later cells refer to this frame as `games`; bind that name so they run on a fresh kernel
games = games_data

Reading in Games


In [23]:
players_data = NBAData('players').data()
# Later cells refer to this frame as `players`; bind that name so they run on a fresh kernel
players = players_data

# The plotting cells look up team names in `teams`, which was never loaded
teams = NBAData('teams').data()

Reading in Players


## Merge Game and Position Data

In [None]:
# Attach each game's date to the box-score rows; `games` is matched on its index
stats_dates = stats.merge(games[['game_date']],
                          how = 'left',
                          left_on = 'game_id',
                          right_index = True)
stats_dates = stats_dates.sort_values(['game_date','player_id'])

In [None]:
# Attach each player's position; `players` is matched on its index
stats_dates_pos = stats_dates.merge(players[['player_position']],
                                    how = 'left',
                                    left_on = 'player_id',
                                    right_index = True)

# Parameters

In [None]:
class Parameters:
    '''Validated container for the notebook's analysis parameters.

    Parameters
    ----------
    rolling_period : int
        Window length for rolling metrics. Anything `int()` accepts is
        converted and stored as an integer.
    stat_cats : str or iterable of str
        Statistical categories to analyse, either as a comma-separated
        string ('pts, reb') or as a list (['pts', 'reb']).

    Raises
    ------
    TypeError
        If `rolling_period` cannot be converted to an integer.
    KeyError
        If any requested category is not in `valid_cats`.
    '''

    valid_cats = ['fga','fgm',
                  'fta','ftm',
                  'fg3a','fg3m',
                  'oreb','dreb','reb',
                  'pts','ast',
                  'stl','blk',
                  'pf','to']

    def __init__(self, rolling_period, stat_cats):
        # Check if specified rolling period is a valid integer
        try:
            self.rolling_period = int(rolling_period)
        except (TypeError, ValueError) as err:
            raise TypeError('Specified rolling period not of type integer') from err

        # Accept either a comma-separated string or an iterable of names
        if isinstance(stat_cats, str):
            stat_cats = stat_cats.split(',')
        specified_stats = [cat.strip() for cat in stat_cats if cat.strip()]

        invalid_cats = [cat
                        for cat in specified_stats
                        if cat not in self.valid_cats]

        if invalid_cats:
            raise KeyError('Invalid statistical category provided')

        self.stat_cats = specified_stats

    def per36stat_col_names(self):
        '''Return the per-36-minute column name for each selected category.'''
        return [cat+'_per36' for cat in self.stat_cats]

    def pergamestat_col_names(self):
        '''Return a mapping from every valid category to its per-game column name.'''
        return {cat:cat+'_pg'
                for cat in self.valid_cats}

In [None]:
# Window length applied to every rolling metric in this notebook
rolling_period = 62    # Counted in observations (games), not calendar days

# Unused alternative: a window measured in calendar days rather than games
# rolling_period_days = '120D'

In [None]:
# Statistical categories to analyse; each must appear in `valid_cats`
stat_cats = ['pts']

In [None]:
# Every box-score category this notebook knows how to analyse
valid_cats = ['fga','fgm','fta','ftm','fg3a','fg3m',
              'oreb','dreb','reb',
              'pts','ast','stl','blk','pf','to']

# Stop early if a requested category is not recognised
invalid_cats = list(filter(lambda cat: cat not in valid_cats, stat_cats))

if invalid_cats:
    raise KeyError('Invalid statistical category provided')

In [None]:
# Raw category name -> per-game column name, e.g. 'pts' -> 'pts_pg'
cats_pergame_name_map = dict((cat, cat+'_pg') for cat in valid_cats)

# Add/Impute Statistics

## Team Stats

In [None]:
# Preview the merged box-score frame before building team-level features
stats_dates_pos.head()

In [None]:
def team_rolling_mean(group):
    '''Rolling mean of the selected stat categories for one group of rows.

    Indexes `group` by 'game_date', averages the `stat_cats` columns over the
    last `rolling_period` rows (a single row is enough to produce a value) and
    renames the result columns with `cats_pergame_name_map`. Rows are used in
    the order they arrive; no sorting is done here.
    '''
    by_date = group.set_index('game_date')[stat_cats]
    window = by_date.rolling(rolling_period,
                             min_periods = 1)

    return window.mean().rename(columns = cats_pergame_name_map)

### Defensive Efficiency by Position

In [None]:
# Totals of the selected stats for each opposing team, game date and position
team_def_eff_pos = (
    stats_dates_pos
    .groupby(['opponent_team_id','game_date','player_position'])[stat_cats]
    .sum()
    .reset_index()
)

In [None]:
# Rolling per-game averages per opposing team and position; the opposing team
# becomes `team_id` and the per-game columns get an 'opp_' prefix
team_stats_rolling_def_eff = (
    team_def_eff_pos
    .groupby(['opponent_team_id','player_position'])
    .apply(team_rolling_mean)
    .reset_index()
    .rename(columns = {'opponent_team_id':'team_id'})
    .rename(columns = {pg_name:'opp_'+pg_name
                       for pg_name in cats_pergame_name_map.values()})
)

In [None]:
player_position = 'F'

# Filter once; the same rows feed all three line plots below
position_def_eff = team_stats_rolling_def_eff[team_stats_rolling_def_eff['player_position'] == player_position]
# Per-date grouping used for the league-wide mean and spread
league_by_date = position_def_eff.groupby('game_date')[['opp_pts_pg']]

fig, (ax_teams, ax_spread) = plt.subplots(2, 1, figsize = (20,20))

# Top panel: one line per team plus the league mean in red
sns.lineplot(data = position_def_eff,
             x = 'game_date',
             y = 'opp_pts_pg',
             hue = 'team_id',
             palette = sns.color_palette(),
             ci = None,
             ax = ax_teams)
sns.lineplot(data = league_by_date.mean().reset_index(),
             x = 'game_date',
             y = 'opp_pts_pg',
             color = 'red',
             dashes = True,
             ax = ax_teams)
ax_teams.legend(loc = 'upper right')
ax_teams.set_title('Rolling opp_pts_pg by team, position %s (red = mean across teams)' %player_position)

# Bottom panel: standard deviation across teams on each date
sns.lineplot(data = league_by_date.std().reset_index(),
             x = 'game_date',
             y = 'opp_pts_pg',
             color = 'red',
             dashes = True,
             ax = ax_spread)
ax_spread.set_title('Standard deviation of opp_pts_pg across teams, position %s' %player_position)

plt.show()

#### Ranking Team Defensive Capability by Position

In [None]:
# One row per calendar day for each team and position, carrying the last known
# value forward across days without a game
team_def_eff_resampled = (
    team_stats_rolling_def_eff
    .set_index('game_date')
    .groupby(['team_id','player_position'])
    .resample('1D')[['opp_pts_pg']]
    .ffill()
    .reset_index()
)

In [None]:
# Rank teams within each date and position (rank 1 = lowest opp_pts_pg);
# ties share the lowest rank and missing values are ranked last
team_def_eff_ranked = (
    team_def_eff_resampled
    .groupby(['game_date','player_position'])
    .apply(lambda day_rows: day_rows.set_index('team_id')[['opp_pts_pg']]
                                    .rank(method = 'min',
                                          na_option = 'bottom'))
    .reset_index()
)

In [None]:
# Average rank per team and position over all dates
team_def_eff_pos_mean = (
    team_def_eff_ranked
    .groupby(['team_id','player_position'])[['opp_pts_pg']]
    .mean()
    .reset_index()
)
# Positions as rows, teams as columns
team_def_eff_pos_mean_pivoted = team_def_eff_pos_mean.pivot_table(index = 'player_position',
                                                                  columns = 'team_id')

In [None]:
# Display the average rank table (positions x teams)
team_def_eff_pos_mean_pivoted

In [None]:
player_position = 'G-F'
ranked_team_id = 14    # Team whose daily rank is plotted

# Daily rank of the chosen team against the chosen position
team_rank_rows = team_def_eff_ranked[(team_def_eff_ranked['player_position'] == player_position)
                                     & (team_def_eff_ranked['team_id'] == ranked_team_id)]

fig, ax = plt.subplots(figsize = (20,10))
sns.lineplot(data = team_rank_rows,
             x = 'game_date',
             y = 'opp_pts_pg',
             hue = 'team_id',
             palette = sns.color_palette(),
             ci = None,
             ax = ax)
ax.legend(loc = 'upper right')
ax.set_title('Daily rank of team %s by opp_pts_pg, position %s' %(ranked_team_id, player_position))
ax.set_xlabel('Game Date')
ax.set_ylabel('Rank')
plt.show()

### Offensive Efficiency by Position

In [None]:
# Totals of the selected stats for each team, game date and position
team_off_eff_pos = (
    stats_dates_pos
    .groupby(['team_id','game_date','player_position'])[stat_cats]
    .sum()
    .reset_index()
)

In [None]:
# Rolling per-game averages of each team's own production, by position
team_stats_rolling_off_eff = (
    team_off_eff_pos
    .groupby(['team_id','player_position'])
    .apply(team_rolling_mean)
    .reset_index()
)

### Merge Team Defensive and Offensive Efficiency by Position

In [None]:
# Offensive and defensive rolling averages side by side; rows present on only
# one side are kept
team_eff_pos = team_stats_rolling_off_eff.merge(team_stats_rolling_def_eff,
                                                how = 'outer',
                                                on = ['team_id','game_date','player_position'])

### League Team Averages

In [None]:
def league_team_stats(group):
    '''
    Grouping function returning, for each date, the mean and standard deviation
    across teams of the per-game ('<cat>_pg') and opponent per-game
    ('opp_<cat>_pg') columns derived from the global `stat_cats`.

    `group` must contain 'game_date', 'team_id' and those stat columns.
    Values are resampled to one row per team per day, dates on which no team
    has a value in any of the columns are dropped (offseason, in-season
    breaks), and remaining gaps are forward filled per team before aggregating.

    Returns a DataFrame indexed by date whose columns are a
    (stat column, 'mean'/'std') MultiIndex.
    '''
    # Column names to aggregate, e.g. 'pts_pg' and 'opp_pts_pg'
    team_stat_cats = ['_'.join([cat,'pg']) for cat in stat_cats]
    opp_stat_cats = ['_'.join(['opp',cat,'pg']) for cat in stat_cats]

    resample_stat_cats = team_stat_cats + opp_stat_cats
    
    # Sort by date and team prior to resampling
    group_sorted = group.sort_values(['game_date','team_id'])\
                        .set_index(['game_date'])
    
    # Resample data to daily for each team, keeping the last value of each day
    ## Days without a row come out as NaN here; they are filled further down
    ## NOTE(review): resample needs a datetime-like 'game_date' index - confirm upstream dtype
    group_resampled = group_sorted.groupby('team_id')\
                        .resample('1D')\
                        [resample_stat_cats]\
                        .last()
    
    # Were games played on date
    ## Resampling adds dates on which no team played (e.g. the offseason)
    ## For each date and column: True when every team's value is missing
    date_no_minutes = group_resampled.groupby(level = 1)\
                        .apply(lambda x: x.isna()\
                                           .all()
                              )
    
    # Drop dates with no games
    ## Includes in-season breaks
    ## A date qualifies only when all columns are missing for every team
    date_no_games = date_no_minutes[(date_no_minutes == True).all(axis = 1)].index
    
    group_resampled.drop(index = date_no_games,
                        level = 1,
                        inplace = True)
    
    # Forward fill values by team (index level 0)
    final_group_resampled = group_resampled.groupby(level = [0])[resample_stat_cats].ffill()
    
    # Mean and standard deviation of each column across teams, by date (index level 1)
    final_rolling_stats = final_group_resampled.groupby(level = 1).agg(['mean',
                                                                        'std'])
    
    return final_rolling_stats

In [None]:
# Each team's own production by position per game, with per-game column names
team_pos_off_total = (
    stats_dates_pos
    .groupby(['game_date','player_position','team_id'])[stat_cats]
    .sum()
    .reset_index()
    .rename(columns = cats_pergame_name_map)
)

# Production allowed by each team (it is the opponent in these rows),
# relabelled as that team's 'opp_<cat>_pg' columns
team_pos_def_total = (
    stats_dates_pos
    .groupby(['game_date','player_position','opponent_team_id'])[stat_cats]
    .sum()
    .reset_index()
    .rename(columns = {'opponent_team_id':'team_id'})
    .rename(columns = {raw_name:'opp_'+pg_name
                       for raw_name,pg_name in cats_pergame_name_map.items()})
)

# Offensive and defensive production side by side for each team, date and position
team_pos_eff_total = team_pos_off_total.merge(team_pos_def_total,
                                              how = 'outer',
                                              on = ['game_date','team_id','player_position'])

In [None]:
# Mean and standard deviation across teams, computed separately per position
team_pos_rolling_eff = (
    team_pos_eff_total
    .groupby('player_position')
    .apply(league_team_stats)
)

In [None]:
# Flatten the (stat, aggregate) column MultiIndex into names such as 'pts_pg_mean'.
# Guarded so that re-running this cell does not re-join the already flat names
# character by character.
if isinstance(team_pos_rolling_eff.columns, pd.MultiIndex):
    team_pos_rolling_eff.columns = ['_'.join(i) for i in team_pos_rolling_eff.columns]

In [None]:
# Move the group keys out of the index into columns. Guarded so that re-running
# this cell does not add a spurious 'index' column.
if isinstance(team_pos_rolling_eff.index, pd.MultiIndex):
    team_pos_rolling_eff = team_pos_rolling_eff.reset_index()

## Player Stats

### Individual Player Efficiency

In [None]:
# Player-level columns to accumulate: the selected stats plus minutes played
player_stat_cats = [*stat_cats, 'min']

In [None]:
def player_rolling_sum(group):
    '''Rolling totals and average minutes for one group of rows.

    Sorts `group` by 'game_date' and, over the last `rolling_period` rows
    (a single row is enough to produce a value), computes the sum of every
    column in `player_stat_cats` and the mean of 'min'. The two results are
    joined on the date index; the clashing 'min' columns come out as
    'min_mean' (average) and 'min_cumsum' (total).
    '''
    by_date = group.sort_values('game_date').set_index('game_date')

    avg_minutes = by_date[['min']].rolling(rolling_period,
                                           min_periods = 1).mean()
    totals = by_date[player_stat_cats].rolling(rolling_period,
                                               min_periods = 1).sum()

    return pd.merge(avg_minutes,
                    totals,
                    how = 'outer',
                    left_index = True,
                    right_index = True,
                    suffixes = ['_mean','_cumsum'])

In [None]:
# Rolling totals and average minutes for every player
player_stats_rolling_eff = (
    stats_dates
    .groupby('player_id')
    .apply(player_rolling_sum)
)

In [None]:
# Inspect the per-player rolling totals
player_stats_rolling_eff

In [None]:
# Zero cumulative minutes would divide by zero and produce inf (or NaN for a
# zero stat); treat those windows as missing so they cannot distort league averages
rolling_minutes = player_stats_rolling_eff['min_cumsum'].replace(0, np.nan)

# Scale each rolling total to a per-36-minute rate
for cat in stat_cats:
    player_stats_rolling_eff[cat+'_per36'] = player_stats_rolling_eff[cat] * (36/rolling_minutes)

In [None]:
# Actual game lines joined to the player's rolling rates; the rolling totals
# and cumulative minutes are dropped so only the rates and 'min_mean' are added
actual_eff_merged = (
    stats_dates_pos[['game_date','player_id','player_position','min'] + stat_cats]
    .merge(player_stats_rolling_eff.reset_index()
                                   .drop(columns = stat_cats + ['min_cumsum']),
           how = 'left',
           on = ['player_id','game_date'])
)

In [None]:
# Names of the per-36-minute rate columns present in the merged frame
per36_stat_cols = [col_name
                   for col_name in actual_eff_merged.columns
                   if '_per36' in col_name]

### League Position Average

In [None]:
def league_player_pts_stats(group):
    '''
    Grouping function returning, for each date, the mean and standard deviation
    across players of every column listed in the global `per36_stat_cols`.

    `group` must contain 'game_date', 'player_id' and those columns. Values are
    resampled to one row per player per day, dates on which no player has a
    value in any of the columns are dropped (offseason, in-season breaks), and
    remaining gaps are forward filled per player before aggregating.

    Returns a DataFrame indexed by date with flat column names of the form
    '<column>_mean' and '<column>_std'.
    '''
    # Sort by date and player prior to resampling
    group_sorted = group.sort_values(['game_date','player_id'])\
                        .set_index(['game_date'])
    
    # Resample data to daily for each player, keeping the last value of each day
    ## Days without a row come out as NaN here; they are filled further down
    ## NOTE(review): resample needs a datetime-like 'game_date' index - confirm upstream dtype
    group_resampled = group_sorted.groupby(['player_id',
                                           ])\
                        .resample('1D')\
                        [per36_stat_cols]\
                        .last()
    
    # Were games played on date
    ## Resampling adds dates on which no player played (e.g. the offseason)
    ## For each date and column: True when every player's value is missing
    date_no_minutes = group_resampled.groupby(level = 1)\
                        .apply(lambda x: x.isna()\
                                           .all()
                              )
    
    # Drop dates with no games
    ## Includes in-season breaks
    ## A date qualifies only when all per-36 columns are missing for every player
    date_no_games = date_no_minutes[(date_no_minutes[per36_stat_cols] == True).all(axis = 1)].index
    
    group_resampled.drop(index = date_no_games,
                        level = 1,
                        inplace = True)
    
    # Forward fill values by player (index level 0)
    final_group_resampled = group_resampled.groupby(level = [0])[per36_stat_cols].ffill()
    
    # Mean and standard deviation of each column across players, by date (index level 1)
    final_rolling_stats = final_group_resampled.groupby(level = [1]).agg(['mean',
                                                                          'std'])
    # Flatten the (column, aggregate) MultiIndex into '<column>_<aggregate>' names
    final_rolling_stats.columns = ['_'.join(i) for i in final_rolling_stats.columns]
    
    return final_rolling_stats

In [None]:
# League-wide mean and standard deviation per position; rows with a missing
# position form their own group instead of being dropped
league_player_rolling_stats = (
    actual_eff_merged
    .groupby('player_position', dropna = False)
    .apply(league_player_pts_stats)
)

In [None]:
# Inspect the league-wide per-position statistics
league_player_rolling_stats

In [None]:
# Move the group keys out of the index into columns. Guarded so that re-running
# this cell does not add a spurious 'index' column.
if isinstance(league_player_rolling_stats.index, pd.MultiIndex):
    league_player_rolling_stats = league_player_rolling_stats.reset_index()

## Shift Data
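The team efficiency, player efficiency, and league rolling data are each shifted down one row, so that every game is paired only with values computed from earlier games. The shifted values can then be used as independent (predictor) variables for the actual game result, which is the dependent variable.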

In [None]:
# Shift each team/position series down one row so a date only sees earlier games
team_eff_pos_shifted = (
    team_eff_pos
    .groupby(['team_id','player_position'])
    .apply(lambda grp: grp.sort_values('game_date')
                          .set_index('game_date')
                          [[col for col in team_eff_pos.columns if col.endswith('_pg')]]
                          .shift(1))
    .reset_index()
)

In [None]:
# Shift the league team averages down one row within each position
team_pos_rolling_eff_shifted = (
    team_pos_rolling_eff
    .groupby('player_position')
    .apply(lambda grp: grp.sort_values('game_date')
                          .set_index('game_date')
                          [[col for col in team_pos_rolling_eff.columns if '_pg_' in col]]
                          .shift(1))
    .reset_index()
)

In [None]:
# Shift each player's rolling rates and average minutes down one row
actual_eff_merged_shifted = (
    actual_eff_merged
    .groupby(['player_id','player_position'])
    .apply(lambda grp: grp.sort_values('game_date')
                          .set_index('game_date')
                          [per36_stat_cols + ['min_mean']]
                          .shift(1))
    .reset_index()
)

In [None]:
# Shift the league player averages down one row within each position
league_player_rolling_stats_shifted = (
    league_player_rolling_stats
    .groupby('player_position')
    .apply(lambda grp: grp.sort_values('game_date')
                          .set_index('game_date')
                          [[col for col in league_player_rolling_stats.columns if '_per36_' in col]]
                          .shift(1))
    .reset_index()
)

## Merge Data

### Merge Team Stats

In [None]:
# Team values get a 'team_' prefix and league values a 'league_' prefix, then
# both are joined on position and date
rolling_stats_teams = (
    team_eff_pos_shifted
    .rename(columns = {col:'team_'+col
                       for col in team_eff_pos_shifted.columns
                       if col.endswith('_pg')})
    .merge(team_pos_rolling_eff_shifted.rename(columns = {col:'league_'+col
                                                          for col in team_pos_rolling_eff_shifted.columns
                                                          if '_pg_' in col}),
           how = 'outer',
           on = ['player_position','game_date'])
    .sort_values(['game_date','player_position','team_id'])
)

### Merge Player Stats

In [None]:
# Player values get a 'player_' prefix ('min_mean' becomes 'player_avg_min') and
# league values a 'league_player_' prefix, then both are joined on position and date
rolling_stats_players = (
    actual_eff_merged_shifted
    .rename(columns = {'min_mean':'player_avg_min',
                       **{col:'player_'+col for col in per36_stat_cols}})
    .merge(league_player_rolling_stats_shifted.rename(columns = {col:'league_player_'+col
                                                                 for col in league_player_rolling_stats_shifted.columns
                                                                 if '_per36_' in col}),
           how = 'outer',
           on = ['player_position','game_date'])
    .sort_values(['game_date','player_position','player_id'])
)

### Merge Player and Team Stats to Relevant Stats

In [None]:
# Identifier columns, minutes and the selected stats from the original data
rel_stats = stats_dates_pos[['game_date',
                             'player_id','player_position',
                             'team_id','opponent_team_id',
                             'min',
                             *stat_cats]]

In [None]:
# Actual game lines joined to the shifted player and league-player rolling values
rel_rolling_players = rel_stats.merge(rolling_stats_players,
                                      how = 'outer',
                                      on = ['game_date','player_id','player_position'])

In [None]:
# Inspect the team-level rolling values before merging them onto the player rows
rolling_stats_teams

In [None]:
# Add the player's own team values: per-game columns that are not '_opp_' columns
rel_rolling_team_off = rel_rolling_players.merge(
    rolling_stats_teams[['team_id','player_position','game_date']
                        + [col for col in rolling_stats_teams.columns
                           if '_pg' in col and '_opp_' not in col]],
    how = 'left',
    on = ['game_date','team_id','player_position'])

In [None]:
# Add the opposing team's '_opp_' per-game values, matched on the player's
# opponent; 'team_opp_*' columns become 'opponent_team_opp_*' and the duplicate
# team id brought in from the right-hand frame is dropped
rel_rolling_team_opp = (
    rel_rolling_team_off
    .merge(rolling_stats_teams[['team_id','player_position','game_date']
                               + [col for col in rolling_stats_teams.columns
                                  if '_pg' in col and '_opp_' in col]]
               .rename(columns = {col:'opponent_'+col
                                  for col in rolling_stats_teams.columns
                                  if 'team_opp_' in col}),
           how = 'left',
           left_on = ['game_date','opponent_team_id','player_position'],
           right_on = ['game_date','team_id','player_position'],
           suffixes = ['','_y'])
    .drop(columns = 'team_id_y')
)

### Standardize Data

In [None]:
# Express each rolling value as (value - league mean) / league standard deviation
for cat in stat_cats:
    # Team Data Standardization
    rel_rolling_team_opp['team_'+cat+'_pg_stand'] = (
        rel_rolling_team_opp['team_'+cat+'_pg']
        .sub(rel_rolling_team_opp['league_'+cat+'_pg_mean'])
        .div(rel_rolling_team_opp['league_'+cat+'_pg_std'])
    )

    # Opponent Team Data Standardization
    rel_rolling_team_opp['opponent_team_'+cat+'_pg_stand'] = (
        rel_rolling_team_opp['opponent_team_opp_'+cat+'_pg']
        .sub(rel_rolling_team_opp['league_opp_'+cat+'_pg_mean'])
        .div(rel_rolling_team_opp['league_opp_'+cat+'_pg_std'])
    )

    # Player Data Standardization
    rel_rolling_team_opp['player_'+cat+'_per36_stand'] = (
        rel_rolling_team_opp['player_'+cat+'_per36']
        .sub(rel_rolling_team_opp['league_player_'+cat+'_per36_mean'])
        .div(rel_rolling_team_opp['league_player_'+cat+'_per36_std'])
    )

In [None]:
# Inspect the fully merged and standardized frame
rel_rolling_team_opp

## Plotting

### Team Data

In [None]:
# Largest standardized opponent value after 2021-12-01, excluding unknown positions.
# The standardization step names this column 'opponent_team_pts_pg_stand'.
rel_rolling_team_opp.loc[(rel_rolling_team_opp['game_date'] > '2021-12-01')
                         & (rel_rolling_team_opp['player_position'] != 'UNK'),
                         'opponent_team_pts_pg_stand'].max()

In [None]:
# Rows holding the largest standardized opponent value after 2021-12-01,
# excluding unknown positions. The standardization step names this column
# 'opponent_team_pts_pg_stand'.
rel_rolling_team_opp[rel_rolling_team_opp['opponent_team_pts_pg_stand']
                     == rel_rolling_team_opp.loc[(rel_rolling_team_opp['game_date'] > '2021-12-01')
                                                 & (rel_rolling_team_opp['player_position'] != 'UNK'),
                                                 'opponent_team_pts_pg_stand'].max()]

In [None]:
# Standardized opponent values over time for opponent team 2 against 'G-F' players.
# The standardization step names this column 'opponent_team_pts_pg_stand'.
rel_rolling_team_opp.loc[(rel_rolling_team_opp['opponent_team_id'] == 2)
                         & (rel_rolling_team_opp['player_position'] == 'G-F'),
                         ['game_date','opponent_team_pts_pg_stand']]\
    .drop_duplicates()

In [None]:
opponent_team_id = 6
opponent_team_name = teams.loc[opponent_team_id, 'team_name']
player_position = 'G-F'

# Column written by the standardization step for the opposing team
opponent_stand_col = 'opponent_team_pts_pg_stand'

# One row per game date for the chosen opponent and position
opponent_stand_rows = rel_rolling_team_opp.loc[(rel_rolling_team_opp['opponent_team_id'] == opponent_team_id)
                                               & (rel_rolling_team_opp['player_position'] == player_position),
                                               ['game_date', opponent_stand_col]]\
                        .drop_duplicates()

fig, ax = plt.subplots(figsize = (20,10))
# pointplot places each x value at a category position, so no date formatter is applied
sns.pointplot(data = opponent_stand_rows,
              x = 'game_date',
              y = opponent_stand_col,
              ci = None,
              ax = ax)
ax.set_title('%s vs. %s' %(opponent_team_name, player_position))
ax.set_xlabel('Game Date')
ax.set_ylabel('Standard Deviation')

plt.xticks(rotation=90)
plt.show()

In [None]:
team_id = 10
team_name = teams.loc[team_id, 'team_name']
player_position = 'G'

# Column written by the standardization step for the player's own team
team_stand_col = 'team_pts_pg_stand'

# One row per game date for the chosen team and position
team_stand_rows = rel_rolling_team_opp.loc[(rel_rolling_team_opp['team_id'] == team_id)
                                           & (rel_rolling_team_opp['player_position'] == player_position),
                                           ['game_date', team_stand_col]]\
                    .drop_duplicates()

fig, ax = plt.subplots(figsize = (20,10))
# pointplot places each x value at a category position, so no date formatter is applied
sns.pointplot(data = team_stand_rows,
              x = 'game_date',
              y = team_stand_col,
              ci = None,
              ax = ax)
ax.set_title('%s %s vs. League' %(team_name, player_position))
ax.set_xlabel('Game Date')
ax.set_ylabel('Standard Deviation')

plt.xticks(rotation=90)
plt.show()

### Player Data

In [None]:
# Spot-check all rows for a single player (id 175)
rel_rolling_team_opp[rel_rolling_team_opp['player_id'] == 175]

In [None]:
player_id = 37

# Rows for the chosen player in chronological order
player_rows = rel_rolling_team_opp[rel_rolling_team_opp['player_id'] == player_id]\
                .sort_values('game_date')

# pointplot places each x value at an integer category position, so a matplotlib
# DateFormatter would mislabel the ticks. Format the dates as text up front
# instead; the day is kept so each game stays its own category.
player_rows = player_rows.assign(game_label = pd.to_datetime(player_rows['game_date'])
                                                .dt.strftime('%y-%m-%d'))

fig, ax = plt.subplots(figsize = (30,10))
sns.pointplot(data = player_rows,
              x = 'game_label',
              y = 'player_pts_per36_stand',
              ci = None, # Confidence Interval
              ax = ax)
ax.set_title(players.loc[player_id, 'player_full_name'])
ax.set_xlabel('Game Date')
ax.set_ylabel('Player Points per 36min Standard Dev')

plt.xticks(rotation=90)
plt.show()

In [None]:
# Look up player rows by last name
players[players['player_last_name'] == 'Beal']

In [None]:
# Look up the team whose index value is 24
teams[teams.index == 24]

# Export Data

In [None]:
excel_export_path = '/'.join(['.','Data',
                              'NBA Stats_2021-2023.xlsx'])

# Identifier columns written to every sheet
export_id_cols = ['game_date','player_id','player_position',
                  'team_id','opponent_team_id',
                  'min']

# A single writer keeps every category's sheet in the same workbook; calling
# to_excel with a path inside the loop would recreate the file on each pass
# and leave only the last category behind.
if stat_cats:
    with pd.ExcelWriter(excel_export_path, engine = 'openpyxl') as writer:
        for cat in stat_cats:
            # Match the category as a whole '_'-separated token so that, for
            # example, 'reb' does not also pick up 'oreb' or 'dreb' columns
            cat_cols = [col for col in rel_rolling_team_opp.columns
                        if cat in col.split('_')]

            category_rolling_stats = rel_rolling_team_opp[export_id_cols + cat_cols]

            category_rolling_stats.to_excel(writer,
                                            sheet_name = cat,
                                            index = False)