# Imports

In [None]:
import pandas as pd
import time
import random
import os
from nba_api.stats.endpoints import leaguegamelog, boxscoretraditionalv2

# Show every column when a DataFrame is displayed (None removes the column limit)
pd.set_option('display.max_columns', None)

# Functions

In [2]:
def smart_sleep(min_sec=1.2, max_sec=3.5):
    """Pause for a random duration drawn uniformly between min_sec and max_sec seconds."""
    pause_seconds = random.uniform(min_sec, max_sec)
    time.sleep(pause_seconds)

In [3]:
def get_existing_game_ids(csv_path):
    """Return the set of GAME_ID strings already stored in the CSV at csv_path.

    An empty set is returned when the file does not exist. GAME_ID is read as
    a string so that values keep their exact text (e.g. leading zeros).
    """
    if not os.path.exists(csv_path):
        return set()

    saved_rows = pd.read_csv(csv_path, usecols=['GAME_ID'], dtype={'GAME_ID': str})
    saved_ids = saved_rows['GAME_ID'].unique()
    return set(saved_ids)

In [None]:
def get_all_game_ids_and_dates(season, season_type='Regular Season'):
    """Return one row per game of a season with its ID, date, season and season type.

    Parameters
    ----------
    season : str
        Season label forwarded to ``LeagueGameLog``, e.g. ``'2023-24'``.
    season_type : str, default 'Regular Season'
        Value forwarded as ``season_type_all_star``. The default keeps the
        original behaviour of fetching regular-season games only.

    Returns
    -------
    pandas.DataFrame
        Columns ``GAME_ID``, ``GAME_DATE`` (converted with ``pd.to_datetime``),
        ``SEASON`` and ``SEASON_TYPE``, with one row per unique ``GAME_ID``.
    """
    print(f"📅 Fetching games for {season} season")
    gamelog = leaguegamelog.LeagueGameLog(
        season=season,
        player_or_team_abbreviation='T',
        season_type_all_star=season_type,
    )
    raw_log = gamelog.get_data_frames()[0]

    # The log is requested per team, so the same game appears once for each
    # team; keep only the first row for every GAME_ID.
    # The explicit copy makes the column assignments below operate on an
    # independent frame instead of a slice of the API result.
    games = raw_log[['GAME_ID', 'GAME_DATE']].drop_duplicates(subset='GAME_ID').copy()

    games['SEASON'] = season
    # main() reads SEASON_TYPE from every row, so it has to be present here.
    games['SEASON_TYPE'] = season_type
    games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE'])

    return games

In [None]:
def fetch_box_score(game_id):
    """Build a player-level box score for one game with team and opponent totals attached.

    Every player row is joined with the totals of the player's own team
    (``TEAM_*`` columns, the team minutes being exposed as ``GAME_MIN``) and
    with the totals of the opposing team (``OPP_*`` columns). Boolean ``HOME``
    and ``WON`` columns flag the home side and the winning side.

    Returns ``None`` after printing a warning when the player rows do not
    reference exactly two distinct teams.
    """
    response = boxscoretraditionalv2.BoxScoreTraditionalV2(game_id=game_id)

    # Player rows come from the first result frame, team totals from team_stats
    players = response.get_data_frames()[0]
    team_totals = response.team_stats.get_data_frame()

    # Opponent view of the team totals: every remaining column is prefixed with OPP_
    opp_totals = team_totals.drop(columns=['GAME_ID', 'MIN']).add_prefix('OPP_')

    # Own-team view: prefix with TEAM_, then restore the join key and tidy names
    own_totals = (
        team_totals
        .drop(columns=['GAME_ID', 'TEAM_ABBREVIATION', 'TEAM_CITY'])
        .add_prefix('TEAM_')
        .rename(columns={
            'TEAM_TEAM_ID': 'TEAM_ID',
            'TEAM_TEAM_NAME': 'TEAM_NAME',
            'TEAM_MIN': 'GAME_MIN',
        })
    )

    # Attach each player's own team totals
    box = players.merge(own_totals, on='TEAM_ID', how='left')

    # Home/away is inferred from row order: the first team seen is treated as
    # the away team. NOTE(review): this relies on the API listing the away
    # team first - confirm.
    seen_team_ids = box['TEAM_ID'].drop_duplicates().tolist()
    if len(seen_team_ids) != 2:
        print(f"⚠️ Unexpected number of teams in GAME_ID {game_id}")
        return None
    away_team_id, home_team_id = seen_team_ids

    is_away = box['TEAM_ID'] == away_team_id
    is_home = box['TEAM_ID'] == home_team_id

    # Team points repeat on every player row, so the first value is enough
    away_score = box.loc[is_away, 'TEAM_PTS'].values[0]
    home_score = box.loc[is_home, 'TEAM_PTS'].values[0]

    box['HOME'] = is_home

    # The home team wins only with strictly more points; otherwise the away team
    if home_score > away_score:
        winning_team_id = home_team_id
    else:
        winning_team_id = away_team_id
    box['WON'] = box['TEAM_ID'] == winning_team_id

    # Add opponent team stats: away rows face the home team, all others the away team
    box['OPP_TEAM_ID'] = box['TEAM_ID'].map(
        lambda team_id: home_team_id if team_id == away_team_id else away_team_id
    )
    box = box.merge(opp_totals, on='OPP_TEAM_ID', how='left')

    return box

In [6]:
def main(season):
    """Fetch box scores for every game of a season that is not yet saved, appending to a CSV.

    Game IDs already present in ``data/nba_boxscores_{season}.csv`` are
    skipped, so an interrupted run can be resumed by calling this again.
    Each fetched game is appended to the CSV straight away.

    Parameters
    ----------
    season : str
        Season label, e.g. ``'2023-24'``; used for the API query and the
        output file name.
    """
    csv_path = f"data/nba_boxscores_{season}.csv"

    print("🔍 Checking existing data...")
    existing_game_ids = get_existing_game_ids(csv_path)

    print("📅 Getting game IDs + dates for seasons...")
    games_df = get_all_game_ids_and_dates(season)
    new_games = games_df[~games_df['GAME_ID'].isin(existing_game_ids)]
    new_games = new_games.reset_index(drop=True)

    total = len(new_games)
    print(f"🆕 Found {total} new games to fetch.")

    # Create the output directory up front; otherwise every write below fails
    # on a fresh checkout where data/ does not exist yet.
    out_dir = os.path.dirname(csv_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    for i, row in new_games.iterrows():
        game_id = row['GAME_ID']
        game_date = row['GAME_DATE']
        # The games frame only carries SEASON_TYPE when the loader adds it;
        # the loader queries regular-season games, so use that as the fallback
        # instead of raising KeyError.
        game_type = row.get('SEASON_TYPE', 'Regular Season')

        try:
            print(f"📦 Fetching box score for {game_id} ({game_date.date()}) [{i+1}/{total}]")
            df = fetch_box_score(game_id)

            if df is None:
                # fetch_box_score returns None when it cannot identify two teams
                print(f"⚠️ Skipping {game_id}: no usable box score returned\n")
            else:
                df['GAME_DATE'] = game_date
                df['SEASON_TYPE'] = game_type

                # Write the header only when the file is being created
                write_header = not os.path.exists(csv_path)
                df.to_csv(csv_path, mode='a', header=write_header, index=False)

                print(f"✅ Success fetching {game_id}\n")

        except Exception as e:
            # Best effort: report the failure and move on to the next game
            print(f"❌ Error fetching {game_id}: {e}\n")

        # Throttle after every attempt, including failed ones, so errors do
        # not turn into back-to-back requests against the API.
        smart_sleep()

# Main

In [7]:
# Fetch box scores for the 2023-24 season; games already saved in the CSV are skipped
main('2023-24')

🔍 Checking existing data...
📅 Getting game IDs + dates for seasons...
📅 Fetching Regular Season logs for 2023-24
📅 Fetching Playoffs logs for 2023-24
🆕 Found 1312 new games to fetch.
📦 Fetching box score for 0022300062 (2023-10-24) [1/1312]
✅ Success fetching 0022300062

📦 Fetching box score for 0022300061 (2023-10-24) [2/1312]
✅ Success fetching 0022300061

📦 Fetching box score for 0022300063 (2023-10-25) [3/1312]
✅ Success fetching 0022300063

📦 Fetching box score for 0022300064 (2023-10-25) [4/1312]
✅ Success fetching 0022300064

📦 Fetching box score for 0022300068 (2023-10-25) [5/1312]
✅ Success fetching 0022300068

📦 Fetching box score for 0022300065 (2023-10-25) [6/1312]
✅ Success fetching 0022300065

📦 Fetching box score for 0022300074 (2023-10-25) [7/1312]
✅ Success fetching 0022300074

📦 Fetching box score for 0022300070 (2023-10-25) [8/1312]
✅ Success fetching 0022300070

📦 Fetching box score for 0022300066 (2023-10-25) [9/1312]
✅ Success fetching 0022300066

📦 Fetching box s

KeyboardInterrupt: 