### Extract data from NBA API

Here we are using the NBA unofficial API to extract data. First we'll extract data from the games and then the plays.

In [1]:
import pandas as pd
import requests
import time
import os
from nba_api.stats.endpoints import leaguegamefinder
from nba_api.stats.endpoints import playbyplayv2

In [2]:
headers  = {"Connection": "keep-alive",
            "Accept": "application/json, text/plain, */*",
            "x-nba-stats-token": "true",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
            "x-nba-stats-origin": "stats",
            "Sec-Fetch-Site": "same-origin",
            "Sec-Fetch-Mode": "cors",
            "Referer": "https://stats.nba.com/",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "en-US,en;q=0.9"}

### Functions to execute the extraction of data.

In [3]:
def fix_df(df):
    
    """Function to extract only the columns that I want of the extracted dataframe."""
    
    columns = ["GAME_ID", "GAME_DATE", "SEASON_ID", "MATCHUP", 
               "TEAM_ID", "TEAM_ABBREVIATION", "TEAM_NAME", "PTS"]
    
    df = df[columns]
    
    dict_home = {'TEAM_ID': 'HOME_TEAM_ID', 'TEAM_ABBREVIATION': 'HOME_TEAM_ABBR', 'TEAM_NAME': 'HOME_TEAM', 'PTS': 'HOME_PTS'}
    dict_away = {'TEAM_ID': 'AWAY_TEAM_ID', 'TEAM_ABBREVIATION': 'AWAY_TEAM_ABBR', 'TEAM_NAME': 'AWAY_TEAM', 'PTS': 'AWAY_PTS'}

    returned_df = pd.concat([
        df\
        .query("MATCHUP.str.contains(pat = 'vs\\.', regex = True, na = False)")\
        .rename(dict_home, axis = 1)\
        .reset_index(drop = True)\
        .drop(['GAME_ID', 'MATCHUP'], axis = 1),
        df\
        .query("MATCHUP.str.contains(pat = '@', regex = True, na = False)")\
        .rename(dict_away, axis = 1)\
        .reset_index(drop = True)\
        .drop(['GAME_ID', 'GAME_DATE', 'SEASON_ID'], axis = 1)
    ], axis = 1)
    
    returned_columns = ["SEASON_ID", "GAME_DATE", "MATCHUP", "HOME_TEAM_ID", "AWAY_TEAM_ID", 
                        "HOME_TEAM_ABBR", "AWAY_TEAM_ABBR", "HOME_TEAM", "HOME_PTS", "AWAY_PTS", "AWAY_TEAM"]
    
    return returned_df[returned_columns]

In [4]:
def munge_dataframe_games(df, type_games = ['Regular Season', "Play-in", "Playoffs"]):
    
    df_return = df\
    .reset_index()\
    .assign(HOME_TEAM_ID = lambda x: [str(i).replace(".0", "") for i in x["HOME_TEAM_ID"]],
            AWAY_TEAM_ID = lambda x: [str(i).replace(".0", "") for i in x["AWAY_TEAM_ID"]],
            SEASON_TYPE = lambda x: [i[:3] for i in x["GAME_ID"]])\
    .assign(SEASON_TYPE = lambda x: ["Preseason" if i == "001" else i for i in x["SEASON_TYPE"]])\
    .assign(SEASON_TYPE = lambda x: ["Regular Season" if i == "002" else i for i in x["SEASON_TYPE"]])\
    .assign(SEASON_TYPE = lambda x: ["All star" if i == "003" else i for i in x["SEASON_TYPE"]])\
    .assign(SEASON_TYPE = lambda x: ["Playoffs" if i == "004" else i for i in x["SEASON_TYPE"]])\
    .assign(SEASON_TYPE = lambda x: ["Play-in" if i == "005" else i for i in x["SEASON_TYPE"]])\
    .query("SEASON_TYPE in " + str(type_games))\
    .astype({"HOME_PTS": "Int64", "AWAY_PTS": "Int64"})\
    [["SEASON_ID", "SEASON", "SEASON_TYPE", "GAME_ID", "GAME_DATE", "MATCHUP", 
      "HOME_TEAM", "HOME_PTS", "AWAY_PTS", "AWAY_TEAM", "HOME_TEAM_ID", 
      "AWAY_TEAM_ID", "HOME_TEAM_ABBR", "AWAY_TEAM_ABBR"]]\
    .sort_values(["SEASON", "GAME_DATE"])\
    .reset_index(drop = True)
    
    return df_return

In [5]:
def download_games(seasons = ["2008-09", "2009-10", "2010-11", "2011-12", "2012-13", "2013-14", "2014-15", "2015-16", 
                              "2016-17", "2017-18", "2018-19", "2019-20", "2020-21", "2021-22", "2022-23", "2023-24"], 
                   inform_steps = True, 
                   type_games = ['Regular Season', "Play-in", "Playoffs"]):    
    
    lst_data = []
    for s in seasons: 
        
        if inform_steps:
            print("Extracting games of season: " + s)
        
        gamefinder = leaguegamefinder.LeagueGameFinder(season_nullable = s, league_id_nullable = '00', headers = headers)
        
        games = gamefinder\
        .get_data_frames()[0]\
        .groupby('GAME_ID')\
        .apply(fix_df)
        
        games['SEASON'] = s
        
        lst_data.append(games)
        
    df = munge_dataframe_games(pd.concat(lst_data, axis = 0), type_games = type_games)
    return df

In [6]:
def download_play_by_play(df):
    
    lst_data = []
    i = 0
    
    for game_id in list(df['GAME_ID']):
        i += 1
        
        pbp = playbyplayv2.PlayByPlayV2(game_id, headers = headers).get_data_frames()[0]
        
        lst_data.append(pbp)
        
        print(game_id + ": " + str(len(pbp)))
        
        if((i % 10) == 0):
            time.sleep(5)
    
    return lst_data

In [17]:
def execute_download_data_nba_api(path_to_save_pbp, file_to_save_games_nba, 
                                  seasons = [], inform_steps = True, type_games = []):
    
    """Function to execute all of the process to extract the games and the play by play of NBA api."""
    
    if seasons == []:
        seasons = ["2008-09", "2009-10", "2010-11", "2011-12", "2012-13", "2013-14", "2014-15", "2015-16", 
                   "2016-17", "2017-18", "2018-19", "2019-20", "2020-21", "2021-22", "2022-23", "2023-24"]
    
    if type_games == []:
        type_games = ['Regular Season', "Play-in", "Playoffs"]
    
    # Doing the download of the games.
    games_nba = download_games(seasons = seasons,  inform_steps = inform_steps, type_games = type_games)
    # Saving the csv file of games nba details
    games_nba.to_csv(file_to_save_games_nba, index = False, sep = ';')
    
    # columns there will be in our pbp data file.
    pbp_data_columns = ['GAME_ID', 'EVENTNUM', 'EVENTMSGTYPE', 'EVENTMSGACTIONTYPE', 'PERIOD', 'WCTIMESTRING', 
                        'PCTIMESTRING', 'HOMEDESCRIPTION', 'NEUTRALDESCRIPTION', 'VISITORDESCRIPTION', 'SCORE', 
                        'SCOREMARGIN', 'PERSON1TYPE', 'PLAYER1_ID', 'PLAYER1_NAME', 'PLAYER1_TEAM_ID', 
                        'PLAYER1_TEAM_CITY', 'PLAYER1_TEAM_NICKNAME', 'PLAYER1_TEAM_ABBREVIATION', 'PERSON2TYPE', 
                        'PLAYER2_ID', 'PLAYER2_NAME', 'PLAYER2_TEAM_ID', 'PLAYER2_TEAM_CITY', 'PLAYER2_TEAM_NICKNAME', 
                        'PLAYER2_TEAM_ABBREVIATION', 'PERSON3TYPE', 'PLAYER3_ID', 'PLAYER3_NAME', 'PLAYER3_TEAM_ID', 
                        'PLAYER3_TEAM_CITY', 'PLAYER3_TEAM_NICKNAME', 'PLAYER3_TEAM_ABBREVIATION', 
                        'VIDEO_AVAILABLE_FLAG']
    
    # If there isn't any kind of downlaoded pbp data yet, so we download all the data from the games in games_nba.
    # If there are downloaded pbp data, so we will download only the games of games_nba the we haven't pbp data.
    if len(os.listdir(path = path_to_save_pbp)) == 0:
        
        for s in seasons:
            pbp_season = games_nba.query("SEASON == " + s)\
            .pipe(download_play_by_play)
            
            path = path_to_save_pbp + "pbpa_" + str.replace(s, "-", "_") + ".csv"
            
            pbp_data = pd.concat(pbp_season, axis = 0)\
            .reset_index(drop = True)\
            .assign(GAME_ID = lambda x: ['00' + str(i) for i in x['GAME_ID']])\
            .astype({"PERSON1TYPE": "Int64", 
                     "PLAYER1_TEAM_ID": "Int64", 
                     "PLAYER2_TEAM_ID": "Int64", 
                     "PLAYER3_TEAM_ID": "Int64"})\
            [pbp_data_columns]
            
            pbp_data.to_csv(path_or_buf = path, sep = ';', index = False)
    
    else: 
        games_playbyplay_downloaded = list()
        
        for file in os.listdir(path_to_save_pbp):
            df_downloaded_pbp = pd.read_csv(path_to_save_pbp + file, sep = ';')[['GAME_ID']].drop_duplicates()
            games_playbyplay_downloaded.append(df_downloaded_pbp)
            
        games_playbyplay_downloaded = pd.concat(games_playbyplay_downloaded, axis = 0)\
        .assign(GAME_ID = lambda x: ['00' + str(i) for i in x['GAME_ID']])\
        .reset_index(drop = True)
        
        xdf_pbp = games_nba\
        .merge(games_playbyplay_downloaded.assign(aux = 1), how = 'left')\
        .query("aux != 1")\
        .pipe(download_play_by_play)
        
        xdf_pbp = pd.concat(xdf_pbp, axis = 0).reset_index(drop = True)
        
        path = path_to_save_pbp + "pbpa_complement" + ".csv"
        
        xdf_pbp.to_csv(path_or_buf = path, sep = ';', index = False)
                
        pbp_data = list()
        
        for file in os.listdir(path_to_save_pbp):
            
            print("Reading file: " + file)
            temp = pd.read_csv(path_to_save_pbp + file, sep = ';')
            pbp_data.append(temp)
            
        pbp_data = pd.concat(pbp_data, axis = 0)
        
        pbp_data = pbp_data\
        .assign(GAME_ID = lambda x: ['00' + str(i) for i in x['GAME_ID']])\
        .astype({"PERSON1TYPE": "Int64", 
                 "PLAYER1_TEAM_ID": "Int64", 
                 "PLAYER2_TEAM_ID": "Int64", 
                 "PLAYER3_TEAM_ID": "Int64"})\
        [pbp_data_columns]
        
        for s in games_nba['SEASON'].drop_duplicates().to_list():
            
            print("Recording the data for the season " + s)
            
            games_nba\
            .query("SEASON == '" + s + "'")[['GAME_ID']]\
            .drop_duplicates()\
            .merge(pbp_data, how = 'inner')\
            .sort_values(['GAME_ID', 'PERIOD', 'EVENTNUM'], ascending = [True, True, True])\
            .reset_index(drop = True)\
            .to_csv(path_to_save_pbp + "pbpa_" + s.replace("-", "_") + ".csv", index = False, sep = ';')


In [18]:
execute_download_data_nba_api(path_to_save_pbp = "D:/Mestrado/NBA/data/api/", 
                              file_to_save_games_nba = "D:/Mestrado/NBA/jogos_nba.csv")

Extracting games of season: 2008-09
Extracting games of season: 2009-10
Extracting games of season: 2010-11
Extracting games of season: 2011-12
Extracting games of season: 2012-13
Extracting games of season: 2013-14
Extracting games of season: 2014-15
Extracting games of season: 2015-16
Extracting games of season: 2016-17
Extracting games of season: 2017-18
Extracting games of season: 2018-19
Extracting games of season: 2019-20
Extracting games of season: 2020-21
Extracting games of season: 2021-22
Extracting games of season: 2022-23
Extracting games of season: 2023-24
0022300044: 0
Reading file: pbpa_2008_09.csv
Reading file: pbpa_2009_10.csv
Reading file: pbpa_2010_11.csv
Reading file: pbpa_2011_12.csv
Reading file: pbpa_2012_13.csv
Reading file: pbpa_2013_14.csv
Reading file: pbpa_2014_15.csv
Reading file: pbpa_2015_16.csv
Reading file: pbpa_2016_17.csv
Reading file: pbpa_2017_18.csv
Reading file: pbpa_2018_19.csv
Reading file: pbpa_2019_20.csv
Reading file: pbpa_2020_21.csv
Reading

In [20]:
pd.read_csv("D:/Mestrado/NBA/jogos_nba.csv", sep = ';')

Unnamed: 0,SEASON_ID,SEASON,SEASON_TYPE,GAME_ID,GAME_DATE,MATCHUP,HOME_TEAM,HOME_PTS,AWAY_PTS,AWAY_TEAM,HOME_TEAM_ID,AWAY_TEAM_ID,HOME_TEAM_ABBR,AWAY_TEAM_ABBR
0,22008,2008-09,Regular Season,20800001,2008-10-28,CLE @ BOS,Boston Celtics,90,85,Cleveland Cavaliers,1610612738,1610612739,BOS,CLE
1,22008,2008-09,Regular Season,20800002,2008-10-28,MIL @ CHI,Chicago Bulls,108,95,Milwaukee Bucks,1610612741,1610612749,CHI,MIL
2,22008,2008-09,Regular Season,20800003,2008-10-28,POR @ LAL,Los Angeles Lakers,96,76,Portland Trail Blazers,1610612747,1610612757,LAL,POR
3,22008,2008-09,Regular Season,20800004,2008-10-29,ATL @ ORL,Orlando Magic,85,99,Atlanta Hawks,1610612753,1610612737,ORL,ATL
4,22008,2008-09,Regular Season,20800005,2008-10-29,TOR @ PHI,Philadelphia 76ers,84,95,Toronto Raptors,1610612755,1610612761,PHI,TOR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19379,22023,2023-24,Regular Season,22300236,2023-11-22,GSW @ PHX,Phoenix Suns,123,115,Golden State Warriors,1610612756,1610612744,PHX,GSW
19380,22023,2023-24,Regular Season,22300237,2023-11-22,UTA @ POR,Portland Trail Blazers,121,105,Utah Jazz,1610612757,1610612762,POR,UTA
19381,22023,2023-24,Regular Season,22300238,2023-11-22,DAL @ LAL,Los Angeles Lakers,101,104,Dallas Mavericks,1610612747,1610612742,LAL,DAL
19382,22023,2023-24,Regular Season,22300043,2023-11-24,BOS @ ORL,Orlando Magic,113,96,Boston Celtics,1610612753,1610612738,ORL,BOS
