In [1]:
import json
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytz
import requests
import seaborn as sns

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas deprecation and chained-assignment
# warnings, so problems in later cells will not be reported.
warnings.simplefilter('ignore')
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

# Read in data

In [None]:
# Load the per-season extracts that were saved locally (so the API does not
# have to be queried again), dropping exact duplicate rows within each file.
season_csv_paths = ['./Data/api_extract_all_data_2021.csv',
                    './Data/api_extract_all_data_2022.csv',
                    './Data/api_extract_all_data_2023.csv']

data_2021, data_2022, data_2023 = [
    pd.read_csv(csv_path, header=0).drop_duplicates()
    for csv_path in season_csv_paths
]

# Stack the three seasons into a single frame with a fresh 0..n-1 index.
all_data = pd.concat([data_2021, data_2022, data_2023],
                     ignore_index=True)

In [None]:
# Load the combined extract that was saved locally (avoids pulling from the
# API again) and drop exact duplicate rows.
# NOTE(review): this rebinds `all_data`, discarding the per-season concat
# built in the previous cell - confirm which of the two sources is intended.
all_data = (
    pd.read_csv('./Data/api_extract_all_data.csv', header=0)
    .drop_duplicates()
)

# Check Data

In [None]:
# Parse 'game_date' as datetimes, then strip any timezone so the column is
# timezone-naive.
parsed_game_date = pd.to_datetime(all_data['game_date'])
all_data['game_date'] = parsed_game_date.dt.tz_localize(None)

In [None]:
# Calendar-date windows (both ends inclusive) that are flagged 'N' in
# 'game_preseason'. An end of None leaves the window open-ended.
# NOTE(review): the last window has no end date, so every game on or after
# 2023-10-24 is flagged 'N' - add an end date once that season is complete.
REGULAR_SEASON_WINDOWS = [
    ('2021-10-19', '2022-04-10'),
    ('2022-10-18', '2023-04-09'),
    ('2023-10-24', None),
]

# Compare on the calendar day only (time of day dropped), using vectorized
# Timestamp comparisons instead of per-row Python date objects.
game_day = all_data['game_date'].dt.normalize()

in_regular_season = pd.Series(False, index=all_data.index)
for window_start, window_end in REGULAR_SEASON_WINDOWS:
    in_window = game_day >= pd.Timestamp(window_start)
    if window_end is not None:
        in_window &= game_day <= pd.Timestamp(window_end)
    in_regular_season |= in_window

# Rows outside every window are left untouched here (NaN if the column did
# not exist before this assignment).
all_data.loc[in_regular_season, 'game_preseason'] = 'N'

In [None]:
# Every row that did not receive a flag yet is marked 'Y'.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the in-place form is chained assignment, which does not
# update `all_data` when pandas Copy-on-Write is active.
all_data['game_preseason'] = all_data['game_preseason'].fillna('Y')

In [None]:
# Remove, in place, every row flagged 'Y' in 'game_preseason'.
preseason_index = all_data.index[all_data['game_preseason'] == 'Y']
all_data.drop(index=preseason_index, inplace=True)

## Ensure all data has been gathered

In [None]:
# Remove exact duplicate rows in place, keeping the first occurrence.
# Duplicates may have arisen from pulling too many pages from the API.
all_data.drop_duplicates(keep='first', inplace=True)

In [None]:
# Order the rows chronologically (ascending 'game_date'), in place.
all_data.sort_values(by='game_date', ascending=True, inplace=True)

In [None]:
# Number of distinct games per (season, team).
games_per_team = (
    all_data
    .groupby(['game_season', 'team_id'])[['game_id']]
    .nunique()
)

In [None]:
# Show team-seasons whose distinct game count is not 82, skipping season 2023
# (presumably still in progress when the data was pulled - confirm).
count_is_not_82 = games_per_team['game_id'].ne(82)
season_is_not_2023 = games_per_team.index.get_level_values(0) != 2023
games_per_team[count_is_not_82 & season_is_not_2023]

In [None]:
# Cast the game and team identifier columns to int.
for id_column in ('game_id', 'team_id'):
    all_data[id_column] = all_data[id_column].astype(int)

In [None]:
# Build "<first name> <last name>" for every row.
all_data['player_full_name'] = (
    all_data['player_first_name']
    .add(' ')
    .add(all_data['player_last_name'])
)

## Player Positions
### Investigate blank player positions

In [None]:
# Names of players that have at least one row with a missing (NaN) position.
all_data.loc[all_data['player_position'].isna(), 'player_full_name'].unique()

Do players with blank positions have a position recorded in other games?


In [None]:
# Build {player_full_name: player_position} for players that have a blank
# position on some rows but a real position on others.
#
# A blank position can show up as NaN (pandas' CSV reader converts empty
# fields to NaN by default) or as an empty string, so both are treated as
# blank. Testing only `== ''` finds nothing after a CSV load and would also
# let NaN through as a "known" position.
position_is_blank = (all_data['player_position'].isna()
                     | all_data['player_position'].eq(''))

players_with_blank = all_data.loc[position_is_blank,
                                  'player_full_name'].unique()

known_positions = all_data.loc[
    all_data['player_full_name'].isin(players_with_blank) & ~position_is_blank,
    ['player_full_name', 'player_position']
].drop_duplicates()

# If a player has more than one known position, the last one listed wins
# (made explicit here rather than relying on dict key overwriting).
blank_pos_player_map = (
    known_positions
    .drop_duplicates(subset='player_full_name', keep='last')
    .set_index('player_full_name')['player_position']
    .to_dict()
)

Only 3 of these players have a position recorded in other games. Use those values to fill their blank positions.

In [None]:
# Turn empty-string positions into NaN so they can be filled afterwards.
# The result is assigned back rather than using replace(inplace=True) on the
# selected column, which is chained assignment and does not update `all_data`
# when pandas Copy-on-Write is active.
all_data['player_position'] = all_data['player_position'].replace('', np.nan)

In [None]:
# Fill missing positions with the position found for the same player in
# `blank_pos_player_map`; players absent from the map stay NaN.
# Assigned back (no inplace=True on the selected column) so the update also
# takes effect when pandas Copy-on-Write is active.
mapped_position = all_data['player_full_name'].map(blank_pos_player_map)
all_data['player_position'] = all_data['player_position'].fillna(mapped_position)

Confirm mapping worked

In [None]:
# Distinct (name, position) pairs for the players present in the mapping.
all_data.loc[
    all_data['player_full_name'].isin(list(blank_pos_player_map)),
    ['player_full_name', 'player_position']
].drop_duplicates()

Which players remain without a position?

In [None]:
# Names of players whose position is still missing (NaN) on at least one row.
all_data.loc[all_data['player_position'].isna(), 'player_full_name'].unique()

### Reversible Positions

In [None]:
# Distinct position labels currently present in the data.
pd.unique(all_data['player_position'])

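Dual positions are order-independent: 'F-G' and 'G-F' describe the same position (likewise 'C-F' and 'F-C'), so each pair is mapped to a single label.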

In [None]:
# Collapse mirrored dual positions onto one label each ('F-G' -> 'G-F',
# 'C-F' -> 'F-C') and label missing positions 'UNK'.
# The result is assigned back rather than using replace(inplace=True) on the
# selected column, which is chained assignment and does not update `all_data`
# when pandas Copy-on-Write is active.
all_data['player_position'] = (
    all_data['player_position']
    .replace({'F-G': 'G-F',
              'C-F': 'F-C'})
    .fillna('UNK')
)

In [None]:
# Distinct position labels after normalization.
pd.unique(all_data['player_position'])

### Players with 2 positions

In [None]:
# Count the distinct position labels recorded for each player.
player_pos = (
    all_data
    .groupby('player_full_name')[['player_position']]
    .nunique()
)

# Players whose distinct-position count is anything other than exactly one.
player_pos.loc[player_pos['player_position'].ne(1)]

## Input Opponent fields

Use the 'game_home_team_id', 'game_visitor_team_id', and 'team_id' fields to determine each player's opponent in every game.

In [None]:
# Lookup table indexed by team_id: the first value seen per team for the
# abbreviation, city, conference and division columns.
team_by_id = (
    all_data
    .groupby('team_id')[['team_abbreviation', 'team_city',
                         'team_conference', 'team_division']]
    .first()
)

In [None]:
# Derive 'opponent_team_id': home id + visitor id - the row's own team id.
# Assuming team_id is always one of the two, what is left is the other team.
home_plus_visitor_id = all_data[['game_home_team_id',
                                 'game_visitor_team_id']].sum(axis=1)
all_data['opponent_team_id'] = home_plus_visitor_id - all_data['team_id']

In [None]:
# Sum the box-score stats per (season, game, team abbreviation, position)
# and return the grouping keys as ordinary columns.
team_stats_by_game = (
    all_data
    .groupby(['game_season', 'game_id',
              'team_abbreviation', 'player_position'])
    [['fga', 'fgm', 'fta', 'ftm', 'fg3a', 'fg3m',
      'dreb', 'oreb', 'reb', 'ast', 'stl', 'blk',
      'turnover', 'pf', 'pts']]
    .sum()
    .reset_index()
)

In [None]:
# Spot-check the aggregated rows of a single game (game_id 473409).
team_stats_by_game.loc[team_stats_by_game['game_id'].eq(473409)]

In [None]:
# Split the flat frame into four tables for export.

# Fact table: one row per box-score line in `all_data`.
stats = all_data[['id', 'game_id', 'player_id',
                  'team_id', 'opponent_team_id',
                  'min',
                  'fga', 'fgm', 'fg_pct',
                  'fta', 'ftm', 'ft_pct',
                  'fg3a', 'fg3m', 'fg3_pct',
                  'oreb', 'dreb', 'reb',
                  'pts', 'ast', 'stl', 'blk',
                  'pf', 'turnover']]

# Dimension tables. `all_data` repeats the game / player / team attributes on
# every box-score line, so identical rows are dropped to keep one row per
# distinct combination instead of exporting the repetition.
games = all_data[['game_id', 'game_date', 'game_season',
                  'game_status', 'game_period', 'game_time',
                  'game_home_team_id', 'game_home_team_score',
                  'game_visitor_team_id', 'game_visitor_team_score',
                  'game_preseason', 'game_postseason']].drop_duplicates()

players = all_data[['player_id', 'player_team_id', 'player_position',
                    'player_full_name', 'player_first_name', 'player_last_name',
                    'player_height_feet', 'player_height_inches',
                    'player_weight_pounds']].drop_duplicates()

teams = all_data[['team_id',
                  'team_full_name', 'team_name',
                  'team_abbreviation', 'team_city',
                  'team_conference', 'team_division']].drop_duplicates()

In [None]:
# Destination workbook, relative to the notebook's working directory.
excel_path = '/'.join(('.', 'Data', 'NBA Stats_2021-2023_01292024.xlsx'))

# One sheet per table, written in this order and without the index column.
sheet_frames = (('Stats', stats),
                ('Games', games),
                ('Players', players),
                ('Teams', teams))

with pd.ExcelWriter(excel_path) as writer:
    for sheet_label, sheet_frame in sheet_frames:
        sheet_frame.to_excel(writer,
                             sheet_name=sheet_label,
                             index=False)

# Analysis

Need:
- player rolling averages
- team defense by position
- positional averages

In [None]:
# One independent frame per season, taken from the cleaned `all_data`.
# `.copy()` detaches each frame from `all_data`, so the later in-place sort
# and column assignment on `data_2021` operate on a real frame rather than
# on a filtered slice.
data_2021 = all_data[all_data['game_season'] == 2021].copy()
data_2022 = all_data[all_data['game_season'] == 2022].copy()
data_2023 = all_data[all_data['game_season'] == 2023].copy()

## Player Rolling Averages

In [None]:
# Order the 2021 rows chronologically. Rebinding to the sorted result avoids
# mutating in place a frame that was obtained by filtering `all_data`.
data_2021 = data_2021.sort_values('game_date')

In [None]:
# (Re)build "<first name> <last name>" on the 2021 frame. `assign` returns a
# new frame, so no column is written onto a filtered slice of `all_data`.
data_2021 = data_2021.assign(
    player_full_name=data_2021['player_first_name'] + ' ' + data_2021['player_last_name']
)

In [None]:
# Keep only rows where 'min' is neither 0 nor NaN.
# NOTE(review): if 'min' is stored as text (e.g. '0'), those rows are not
# excluded by this test - confirm the column's dtype.
played_rows_2021 = data_2021.loc[~data_2021['min'].isin([0, np.nan])]

# Per player: mean of 'pts' over a trailing 300-day window ending at each
# game date.
player_avg = (
    played_rows_2021
    .set_index('game_date')
    .groupby(['player_full_name'])[['pts']]
    .rolling('300D')
    .mean()
)

In [None]:
# Latest rolling average for one player (first index level = player name).
is_kyrie_irving = player_avg.index.get_level_values(0) == 'Kyrie Irving'
player_avg.loc[is_kyrie_irving].tail(1)

In [None]:
# Trailing 300-day rolling window over 'pts', grouped by position, using only
# rows where 'min' is neither 0 nor NaN. The same window feeds both the mean
# and the standard deviation below.
pts_window_by_pos = (
    data_2021.loc[~data_2021['min'].isin([0, np.nan])]
    .set_index('game_date')
    .groupby(['player_position'])[['pts']]
    .rolling('300D')
)

# Several rows can share a (position, game_date) pair; keep the last rolling
# value of each pair so there is one row per position per date.
rolling_avg_pts_by_pos = (
    pts_window_by_pos
    .mean()
    .groupby(level=[0, 1])
    .last()
    .reset_index()
)

rolling_std_pts_by_pos = (
    pts_window_by_pos
    .std()
    .groupby(level=[0, 1])
    .last()
    .reset_index()
)

In [None]:
# Rolling average of points for position 'F'.
# (Add a condition on 'game_date', e.g. == '2021-10-19', to inspect one day.)
rolling_avg_pts_by_pos.loc[rolling_avg_pts_by_pos['player_position'].eq('F')]

In [None]:
# Rolling standard deviation of points for position 'F'.
# (Add a condition on 'game_date', e.g. == '2021-10-19', to inspect one day.)
rolling_std_pts_by_pos.loc[rolling_std_pts_by_pos['player_position'].eq('F')]

In [None]:
# Distribution of points for position 'F' in the 2021 frame, restricted to
# rows where 'min' is neither 0 nor NaN.
# Requires matplotlib (`plt`) and seaborn (`sns`) to be imported in the
# notebook's import cell.
forwards_played_2021 = data_2021[(~data_2021['min'].isin([0, np.nan]))
                                 & (data_2021['player_position'] == 'F')]

# Explicit figure/axes so the plot carries its own title and axis labels.
fig, ax = plt.subplots()
sns.histplot(data=forwards_played_2021,
             x='pts',
             ax=ax)
ax.set(title='Points per game - position F, 2021 season',
       xlabel='pts',
       ylabel='Count')

plt.show()

In [None]:
# Per (season, team, position, game): summed box-score stats of that team's
# own players, with the grouping keys returned as ordinary columns.
season_team_position_by_game = (
    all_data
    .groupby(['game_season', 'team_id', 'player_position', 'game_id'])
    [['fga', 'fgm', 'fta', 'ftm', 'fg3a', 'fg3m',
      'dreb', 'oreb', 'reb', 'ast', 'stl', 'blk',
      'turnover', 'pf', 'pts']]
    .sum()
    .reset_index()
)

In [None]:
# Per (season, opponent team, position, game): summed box-score stats of the
# players who faced that opponent, with the grouping keys as ordinary columns.
season_opp_position_by_game = (
    all_data
    .groupby(['game_season', 'opponent_team_id', 'player_position', 'game_id'])
    [['fga', 'fgm', 'fta', 'ftm', 'fg3a', 'fg3m',
      'dreb', 'oreb', 'reb', 'ast', 'stl', 'blk',
      'turnover', 'pf', 'pts']]
    .sum()
    .reset_index()
)

In [None]:
# Shared grouping: season x position x opponent team.
opp_position_groups = season_opp_position_by_game.groupby(['game_season',
                                                           'player_position',
                                                           'opponent_team_id'])

# Number of distinct games in which each opponent team faced each position.
season_opp_games_by_pos = opp_position_groups[['game_id']].nunique()

# Mean and standard deviation of the per-game totals, per position and
# opponent team.
season_opp_stats_by_pos = opp_position_groups[
    ['fga', 'fgm', 'fta', 'ftm', 'fg3a', 'fg3m',
     'dreb', 'oreb', 'reb', 'ast', 'stl', 'blk',
     'turnover', 'pf', 'pts']
].agg(['mean', 'std'])

# Flatten the (stat, aggregation) column pairs into names like 'pts_mean'.
season_opp_stats_by_pos.columns = ['_'.join(column_pair)
                                   for column_pair in season_opp_stats_by_pos.columns]

# Join game counts and stat summaries on their shared index, then turn the
# index levels back into columns.
season_opp_position_stats = (
    season_opp_games_by_pos
    .merge(season_opp_stats_by_pos,
           how='outer',
           left_index=True,
           right_index=True)
    .reset_index()
)

In [None]:
# Represents opponent_team_id defensive ability by position: for each season,
# position and opponent team, the number of games plus the mean/std of the
# per-game stat totals recorded against that opponent.
season_opp_position_stats

In [None]:
# Shared grouping: season x position x team.
team_position_groups = season_team_position_by_game.groupby(['game_season',
                                                             'player_position',
                                                             'team_id'])

# Number of distinct games in which each team has rows for each position.
season_team_games_by_pos = team_position_groups[['game_id']].nunique()

# Mean and standard deviation of the per-game totals, per position and team.
season_team_stats_by_pos = team_position_groups[
    ['fg3m', 'reb', 'ast', 'stl', 'blk',
     'turnover', 'pf', 'pts']
].agg(['mean', 'std'])

# Flatten the (stat, aggregation) column pairs into names like 'pts_mean'.
season_team_stats_by_pos.columns = ['_'.join(column_pair)
                                    for column_pair in season_team_stats_by_pos.columns]

# Join game counts and stat summaries on their shared index, then turn the
# index levels back into columns.
season_team_position_stats = (
    season_team_games_by_pos
    .merge(season_team_stats_by_pos,
           how='outer',
           left_index=True,
           right_index=True)
    .reset_index()
)

In [None]:
# Shared grouping: season x position across all teams.
league_position_groups = season_team_position_by_game.groupby(['game_season',
                                                               'player_position'])

# Number of distinct games in which each position appears, per season.
season_league_games_by_position = league_position_groups[['game_id']].nunique()

# Mean and standard deviation of the per-team, per-game totals by position.
season_league_counting_stats_by_position = league_position_groups[
    ['fg3m', 'reb', 'ast', 'stl', 'blk',
     'turnover', 'pf', 'pts']
].agg(['mean', 'std'])
# Flatten the (stat, aggregation) column pairs into names like 'pts_mean'.
season_league_counting_stats_by_position.columns = [
    '_'.join(column_pair)
    for column_pair in season_league_counting_stats_by_position.columns
]

# Join game counts and stat summaries on their shared (season, position) index.
season_league_stats_by_position = season_league_games_by_position.merge(
    season_league_counting_stats_by_position,
    how='outer',
    left_index=True,
    right_index=True,
)

In [None]:
# Average, across teams, of the per-team mean points for position 'G' in
# season 2021 (index level 0 = season, level 1 = position).
season_level = season_team_stats_by_pos.index.get_level_values(0)
position_level = season_team_stats_by_pos.index.get_level_values(1)
season_team_stats_by_pos.loc[(position_level == 'G') & (season_level == 2021),
                             'pts_mean'].mean()

In [None]:
# League-wide table indexed by (season, position): distinct game count plus
# the mean/std columns of the per-team, per-game stat totals.
season_league_stats_by_position

In [None]:
# Represents a team's own (offensive) output by position; shown for team_id 10.
season_team_position_stats.loc[season_team_position_stats['team_id'].eq(10)]

In [None]:
# Per (season, position): mean and standard deviation of the counting stats
# over a 1-day rolling window keyed on game_date.
(
    all_data
    .set_index('game_date')
    .groupby(['game_season', 'player_position'])
    [['fg3m', 'reb', 'ast', 'stl', 'blk', 'turnover', 'pf', 'pts']]
    .rolling('1D')
    .agg(['mean', 'std'])
)