In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import requests
import warnings

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',None)

# Pull from API

In [32]:
page = range(400)
per_page = '100'
postseason = 'false'

Extract all data for the 2021-2023/24 seasons to date
- done on 1/22/24

In [3]:
all_data = pd.DataFrame()

In [4]:
# split 'game','player', and 'team' columns into individual columns
# these fields contain meta-data as a dictionary
dict_cols = ['game',
             'player',
             'team'
            ]

In [33]:
for j in page:
    get_req = 'https://balldontlie.io/api/v1/stats?per_page=100&page=%d&seasons[]=2023&postseason=false&start_date=2023-10-24' %(j)

    req = requests\
        .get(get_req)

    req_status_code = req.status_code
    
    if req_status_code == 200:
        req_json = req.json()    

        data = pd.DataFrame().from_dict(req_json['data'])
        
        if len(data) == 0:
            print('No more data')
            break
        else:
            data.dropna(subset = ['game','player','team'],
                       inplace = True)

            data.reset_index(drop = True,
                            inplace = True)

            for col in dict_cols:
                sub_df = pd.DataFrame(data[col].to_list())

                sub_df.columns = ['_'.join([col,c])
                                   for c in sub_df.columns]

                data = pd.concat([data,
                                      sub_df],
                                     axis = 1)

            all_data = pd.concat([all_data,
                                 data])

    elif req_status_code == 429:
        print('Too many Requests. Stopped at: %d' %j)
        page = range(j,500)
        break
    else:
        print('Error: %d' %req_status_code)
        break

No more data


In [34]:
page

range(0, 400)

In [35]:
j

247

In [36]:
req_status_code

200

In [37]:
#31630
# 72859
len(all_data)

97409

In [38]:
all_data.tail()

Unnamed: 0,id,ast,blk,dreb,fg3_pct,fg3a,fg3m,fg_pct,fga,fgm,ft_pct,fta,ftm,game,min,oreb,pf,player,pts,reb,stl,team,turnover,game_id,game_date,game_home_team_id,game_home_team_score,game_period,game_postseason,game_season,game_status,game_time,game_visitor_team_id,game_visitor_team_score,player_id,player_first_name,player_height_feet,player_height_inches,player_last_name,player_position,player_team_id,player_weight_pounds,team_id,team_abbreviation,team_city,team_conference,team_division,team_full_name,team_name
62,14174040,0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,"{'id': 1037945, 'date': '2023-12-18T00:00:00.0...",0,0,0,"{'id': 38017700, 'first_name': 'Wendell', 'hei...",0,0,0,"{'id': 18, 'abbreviation': 'MIN', 'city': 'Min...",0,1037945,2023-12-18T00:00:00.000Z,16,108,4,False,2023,Final,Final,18,112,38017700,Wendell,,,Moore Jr.,G,18,,18,MIN,Minnesota,West,Northwest,Minnesota Timberwolves,Timberwolves
63,14174042,0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,"{'id': 1037945, 'date': '2023-12-18T00:00:00.0...",0,0,0,"{'id': 56677851, 'first_name': 'Jaylen', 'heig...",0,0,0,"{'id': 18, 'abbreviation': 'MIN', 'city': 'Min...",0,1037945,2023-12-18T00:00:00.000Z,16,108,4,False,2023,Final,Final,18,112,56677851,Jaylen,,,Clark,G,18,,18,MIN,Minnesota,West,Northwest,Minnesota Timberwolves,Timberwolves
64,14174043,0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,"{'id': 1037945, 'date': '2023-12-18T00:00:00.0...",0,0,0,"{'id': 17896056, 'first_name': 'Luka', 'height...",0,0,0,"{'id': 18, 'abbreviation': 'MIN', 'city': 'Min...",0,1037945,2023-12-18T00:00:00.000Z,16,108,4,False,2023,Final,Final,18,112,17896056,Luka,,,Garza,C,18,,18,MIN,Minnesota,West,Northwest,Minnesota Timberwolves,Timberwolves
65,14174044,0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,"{'id': 1037945, 'date': '2023-12-18T00:00:00.0...",0,0,0,"{'id': 56677782, 'first_name': 'Leonard', 'hei...",0,0,0,"{'id': 18, 'abbreviation': 'MIN', 'city': 'Min...",0,1037945,2023-12-18T00:00:00.000Z,16,108,4,False,2023,Final,Final,18,112,56677782,Leonard,,,Miller,F,18,,18,MIN,Minnesota,West,Northwest,Minnesota Timberwolves,Timberwolves
66,14174045,0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,"{'id': 1037945, 'date': '2023-12-18T00:00:00.0...",0,0,0,"{'id': 17895968, 'first_name': 'Daishen', 'hei...",0,0,0,"{'id': 18, 'abbreviation': 'MIN', 'city': 'Min...",0,1037945,2023-12-18T00:00:00.000Z,16,108,4,False,2023,Final,Final,18,112,17895968,Daishen,,,Nix,F,18,,18,MIN,Minnesota,West,Northwest,Minnesota Timberwolves,Timberwolves


In [None]:
data

In [None]:
test = pd.DataFrame().from_dict(req_json['data'])

In [None]:
data.dropna(subset=['player'],
           inplace = True)

In [None]:
test_df = pd.DataFrame(data['player'].to_list())

In [None]:
test_df[test_df['first_name'].isna()]

# Clean Data

In [None]:
final_data = pd.concat([all_data_local,
                       all_data],
                      ignore_index = True)

## Label Pre-season Games
It appears pre-season games are present in the data.
Need to flag and remove from data as these games are typically non-competitive

In [None]:
# convert 'game_date' into datetime
all_data['game_date'] = pd.to_datetime(all_data['game_date'])

In [None]:
all_data.loc[((all_data['game_date'].dt.date >= datetime.strptime('2021-10-19','%Y-%m-%d').date())
              & (all_data['game_date'].dt.date <= datetime.strptime('2022-04-10','%Y-%m-%d').date()))
            | ((all_data['game_date'].dt.date >= datetime.strptime('2022-10-18','%Y-%m-%d').date())
               & (all_data['game_date'].dt.date <= datetime.strptime('2023-04-09','%Y-%m-%d').date()))
            | ((all_data['game_date'].dt.date >= datetime.strptime('2023-10-24','%Y-%m-%d').date())),
            'game_preseason'] = 'N'

In [None]:
all_data['game_preseason'].fillna('Y',
                                 inplace = True)

In [None]:
all_data.drop(all_data[all_data['game_preseason'] == 'Y'].index,
             inplace = True)

## Ensure all data has been gathered

In [None]:
all_data['game_id'] = all_data['game_id'].astype(int)
all_data['team_id'] = all_data['team_id'].astype(int)

In [None]:
# Each game should have 2 teams
teams_per_game = all_data.groupby('game_id')[['team_abbreviation']].nunique()

missing_data_games = teams_per_game[teams_per_game['team_abbreviation'] != 2].index

In [None]:
missing_data_games

In [None]:
teams_per_game[teams_per_game['team_abbreviation'] != 2]

In [None]:
all_data[all_data['game_id'] == 473442]

In [None]:
all_data = pd.DataFrame()

for i in missing_data_games:
    get_req = 'https://balldontlie.io/api/v1/stats?per_page=%s&game_ids[]=%s' %(per_page,
                                                                                str(i))
    
    req = requests\
        .get(get_req)
    
    req_status_code = req.status_code
    
    if req_status_code == 200:
        req_json = req.json()    
        
        data = req_json['data']
        
        all_data = pd.concat([all_data,
                              pd.DataFrame().from_dict(data)
                             ]
                            )

In [None]:
# combine first and last name of players
all_data['player_full_name'] = all_data['player_first_name'] + ' ' + all_data['player_last_name']

In [None]:
# Drop duplicate rows
## May have arisen due to pulling too many pages
all_data.drop_duplicates(inplace = True)

## Player Positions
### Investigate blank player positions

In [None]:
all_data[all_data['player_position'].isna()]['player_full_name'].unique()

Do players with blank positions have positions in other games


In [None]:
blank_pos_player_map = all_data[(all_data['player_full_name'].isin(all_data[all_data['player_position'] == '']['player_full_name'].unique()))
                                & (all_data['player_position'] != '')]\
                        [['player_full_name','player_position']]\
                        .drop_duplicates()\
                        .set_index('player_full_name')\
                        .to_dict()\
                        ['player_position']

Only 3 players have position values. Use them to fill blank values

In [None]:
all_data['player_position'].replace('',np.nan,
                                   inplace = True)

In [None]:
all_data['player_position'].fillna(all_data['player_full_name'].map(blank_pos_player_map),
                                  inplace = True)

Confirm mapping worked

In [None]:
all_data[all_data['player_full_name'].isin(list(blank_pos_player_map.keys()))][['player_full_name','player_position']].drop_duplicates()

Which player remain without a position?

In [None]:
all_data[all_data['player_position'].isna()]['player_full_name'].unique()

### Reversible Positions

In [None]:
all_data['player_position'].unique()

Dual positions, like 'G-F', are reversible.

In [None]:
all_data['player_position'].replace({'F-G':'G-F',
                                    'C-F':'F-C',
                                    np.nan:'UNK'},
                                   inplace = True)

In [None]:
all_data['player_position'].unique()

## Input Opponent fields

Use 'game_home_team_id', 'game_visitor_team_id', and team_id fields to determine who the opponents per player was

In [None]:
team_by_id = all_data.groupby('team_id')\
                [['team_abbreviation','team_city',
                  'team_conference','team_division']]\
                .first()

In [None]:
# Create 'opponent_team_id' field by summing 'game_home_team_id' and 'game_visitor_team_id' and subtracting the team_id
all_data['opponent_team_id'] = all_data[['game_home_team_id','game_visitor_team_id']].sum(axis = 1) - all_data['team_id']

In [None]:
team_stats_by_game = all_data.groupby(['game_season',
                                       'game_id',
                                       'team_abbreviation',
                                       'player_position'])\
                        [['fga','fgm','fta','ftm','fg3a','fg3m','dreb','oreb','reb','ast','stl','blk','turnover','pf','pts']]\
                        .sum()\
                        .reset_index()

In [None]:
team_stats_by_game[team_stats_by_game['game_id'] == 473409]

In [39]:
# save data locally to avoid pulling from API
all_data.to_csv('./Data/api_extract_all_data.csv',
               index = False)

# Read from CSV

In [None]:
# save data locally to avoid pulling from API
all_data_local = pd.read_csv('./Data/api_extract_all_data.csv',
                       header = 0)\
                    .drop_duplicates()

In [None]:
combined = pd.concat([all_data,
                     all_data_local,
                     combined])

In [None]:
len(combined.drop_duplicates())

In [None]:
len(all_data_local)

In [None]:
combined

In [None]:
games_per_season_by_team = all_data_local.groupby(['game_season','team_abbreviation'])[['game_id']].nunique()

In [None]:
games_per_season_by_team[games_per_season_by_team['game_id'] != 82]

# Analysis

In [None]:
team_by_id = all_data_local.groupby('team_id')\
                [['team_abbreviation','team_city',
                  'team_conference','team_division']]\
                .first()

In [None]:
season_team_position_by_game = all_data_local.groupby(['game_season',
                                                         'team_id',
                                                         'player_position',
                                                        'game_id'])\
                                    [['fga','fgm','fta','ftm','fg3a','fg3m',
                                      'dreb','oreb','reb','ast','stl','blk',
                                      'turnover','pf','pts']]\
                                    .sum()\
                                    .reset_index()

In [None]:
season_opp_position_by_game = all_data_local.groupby(['game_season',
                                                         'opponent_team_id',
                                                         'player_position',
                                                        'game_id'])\
                                    [['fga','fgm','fta','ftm','fg3a','fg3m',
                                      'dreb','oreb','reb','ast','stl','blk',
                                      'turnover','pf','pts']]\
                                    .sum()\
                                    .reset_index()

In [None]:
# Get number of games each team faced a position
season_opp_games_by_pos = season_opp_position_by_game.groupby(['game_season',
                                                              'player_position',
                                                              'opponent_team_id'])\
                            [['game_id']]\
                            .nunique()

# Get avg stats by position per team
season_opp_stats_by_pos = season_opp_position_by_game.groupby(['game_season',
                                                               'player_position',
                                                               'opponent_team_id'])\
                                    [['fga','fgm','fta','ftm','fg3a','fg3m',
                                      'dreb','oreb','reb','ast','stl','blk',
                                      'turnover','pf','pts']]\
                                    .agg(['mean','std'])

season_opp_stats_by_pos.columns = ['_'.join(i) for i in season_opp_stats_by_pos.columns]

# Combine games per position per team and avg stats per position per team
season_opp_position_stats = pd.merge(season_opp_games_by_pos,
                                         season_opp_stats_by_pos,
                                         how = 'outer',
                                         left_index = True,
                                         right_index = True)\
                                .reset_index()

In [None]:
# Represents opponent_team_id defensive ability by position
season_opp_position_stats

In [None]:
# Get number of games each team faced a position
season_team_games_by_pos = season_team_position_by_game.groupby(['game_season',
                                                              'player_position',
                                                              'team_id'])\
                            [['game_id']]\
                            .nunique()

# Get avg stats by position per team
season_team_stats_by_pos = season_team_position_by_game.groupby(['game_season',
                                                               'player_position',
                                                               'team_id'])\
                                    [['fg3m','reb','ast','stl','blk',
                                      'turnover','pf','pts']]\
                                    .agg(['mean','std'])

season_team_stats_by_pos.columns = ['_'.join(i) for i in season_team_stats_by_pos.columns]

# Combine games per position per team and avg stats per position per team
season_team_position_stats = pd.merge(season_team_games_by_pos,
                                         season_team_stats_by_pos,
                                         how = 'outer',
                                         left_index = True,
                                         right_index = True)\
                                .reset_index()

In [None]:
season_league_games_by_position = season_team_position_by_game.groupby(['game_season',
                                                                        'player_position'])\
                                    [['game_id']]\
                                    .nunique()

season_league_counting_stats_by_position = season_team_position_by_game.groupby(['game_season',
                                                                        'player_position'])\
                                    [['fg3m','reb','ast','stl','blk',
                                      'turnover','pf','pts']]\
                                    .agg(['mean','std'])
season_league_counting_stats_by_position.columns = ['_'.join(i) for i in season_league_counting_stats_by_position.columns]

season_league_stats_by_position = pd.merge(season_league_games_by_position,
                                          season_league_counting_stats_by_position,
                                          how = 'outer',
                                          left_index = True,
                                          right_index = True)

In [None]:
season_team_stats_by_pos[(season_team_stats_by_pos.index.get_level_values(1) == 'G')
                        & (season_team_stats_by_pos.index.get_level_values(0) == 2021)]['pts_mean'].mean()

In [None]:
season_league_stats_by_position

In [None]:
# Represents opponent_team_id offensive ability by position
season_team_position_stats[season_team_position_stats['team_id'] == 10]