# Imports

In [1]:
import pandas as pd
import time
import random
import os
from nba_api.stats.endpoints import leaguegamelog, boxscoretraditionalv3

# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

# Functions

In [2]:
def smart_sleep(min_sec=1.2, max_sec=3.5):
    """Pause execution for a random duration.

    The pause length is drawn uniformly between ``min_sec`` and ``max_sec``
    (both in seconds), so consecutive calls do not wait a fixed interval.
    """
    pause_seconds = random.uniform(min_sec, max_sec)
    time.sleep(pause_seconds)

In [3]:
def get_existing_game_ids(csv_path):
    """Return the set of game IDs already stored in the CSV at ``csv_path``.

    Only the ``gameId`` column is loaded, and it is read as strings so the
    IDs keep their leading zeros. If the file does not exist, an empty set
    is returned.
    """
    if not os.path.exists(csv_path):
        return set()

    stored_ids = pd.read_csv(
        csv_path,
        usecols=['gameId'],
        dtype={'gameId': str},
    )['gameId']
    return set(stored_ids.unique())

In [4]:
# Returns a dataframe of all regular season games with GAME_ID, GAME_DATE, and SEASON (one row per game)
def get_all_game_ids_and_dates(season):
    """Return one row per regular-season game for ``season``.

    Parameters
    ----------
    season : str
        Season identifier passed straight to ``LeagueGameLog`` and stored in
        the ``SEASON`` column (the notebook calls this with ``'2024-25'``).

    Returns
    -------
    pandas.DataFrame
        Columns ``GAME_ID``, ``GAME_DATE`` (converted with ``pd.to_datetime``)
        and ``SEASON``, de-duplicated on ``GAME_ID``. The index is not reset.
    """
    # Get all games for regular season
    print(f"📅 Fetching games for {season} season")
    gamelog = leaguegamelog.LeagueGameLog(
        season=season,
        player_or_team_abbreviation='T',
        season_type_all_star='Regular Season'
    )
    team_games = gamelog.get_data_frames()[0]

    # Take an explicit copy of the two columns we keep: assigning new columns
    # onto a plain column-subset slice triggers pandas' SettingWithCopyWarning.
    games = team_games[['GAME_ID', 'GAME_DATE']].copy()
    games['SEASON'] = season
    games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE'])

    # The team-level log repeats each game for every team, so keep one row per game
    return games.drop_duplicates(subset='GAME_ID')

In [5]:
def minutes_to_float(min_str):
    """Convert a ``"MM:SS"`` string into minutes expressed as a float.

    Parameters
    ----------
    min_str : str
        Text made of two integer fields separated by a single colon,
        e.g. ``"36:30"``.

    Returns
    -------
    float or int
        ``mins + secs / 60`` when the value parses; ``0`` for anything that
        does not (``None``, NaN, empty string, wrong number of fields,
        non-integer fields).
    """
    try:
        mins, secs = map(int, min_str.split(':'))
    except (AttributeError, TypeError, ValueError):
        # AttributeError: value has no .split (None, NaN, numbers)
        # TypeError:      .split rejected the separator (e.g. bytes input)
        # ValueError:     wrong number of fields or a non-integer field
        # Only parse failures are swallowed; KeyboardInterrupt/SystemExit
        # now propagate instead of being silently turned into 0.
        return 0
    return mins + secs / 60

In [6]:
# Returns a dataframe with all relevant data for a game
def fetch_box_score(game_id):
    """Build a per-player box score table for one game.

    Each player row is enriched with that player's own team stats
    (``team_`` prefix), the opposing team's stats (``opp_`` prefix) and the
    boolean flags ``home`` and ``won``.

    Returns
    -------
    pandas.DataFrame or None
        The merged table, or ``None`` (after printing a warning) when the
        player rows do not reference exactly two distinct teams.
    """
    # Load boxscores using v3
    response = boxscoretraditionalv3.BoxScoreTraditionalV3(game_id=game_id)

    # Player rows, minus identifying columns that the team table also carries
    players = response.player_stats.get_data_frame().drop(columns=[
        'gameId',
        'teamCity',
        'teamName',
        'teamTricode',
        'teamSlug',
        'nameI',
        'jerseyNum',
    ])

    # Team rows, minus descriptive columns
    teams = response.team_stats.get_data_frame().drop(columns=[
        'teamCity',
        'teamTricode',
        'teamSlug',
    ])

    # Same team stats seen from the other side: every column becomes opp_*
    opponent_stats = teams.drop(columns=['gameId', 'minutes']).add_prefix('opp_')

    # Own-team view: team_* columns, except game id and game minutes which
    # get un-prefixed / renamed column names
    own_stats = teams.add_prefix('team_').rename(
        columns={'team_minutes': 'gameMin', 'team_gameId': 'gameId'}
    )

    # Attach each player's own team stats; the player-side key is then redundant
    combined = players.merge(
        own_stats, left_on='teamId', right_on='team_teamId', how='left'
    ).drop(columns=['teamId'])

    # Distinct team ids in order of first appearance among the player rows.
    # NOTE(review): the first id is treated as the away team and the second
    # as the home team -- confirm this matches the row order the API returns.
    distinct_team_ids = combined['team_teamId'].drop_duplicates().tolist()
    if len(distinct_team_ids) != 2:
        print(f"⚠️ Unexpected number of teams in GAME_ID {game_id}")
        return None
    away_team_id, home_team_id = distinct_team_ids

    plays_for_away = combined['team_teamId'] == away_team_id
    plays_for_home = combined['team_teamId'] == home_team_id

    # Team points are repeated on every player row; read them off the first one
    away_score = combined.loc[plays_for_away, 'team_points'].values[0]
    home_score = combined.loc[plays_for_home, 'team_points'].values[0]

    # Label home/away
    combined['home'] = plays_for_home

    # Label won/lost (the away side is the winner unless home scored strictly more)
    if home_score > away_score:
        winning_team_id = home_team_id
    else:
        winning_team_id = away_team_id
    combined['won'] = combined['team_teamId'] == winning_team_id

    # Opponent id: away players face the home team, everyone else faces the away team
    combined['opp_teamId'] = plays_for_away.map(
        {True: home_team_id, False: away_team_id}
    )

    # Attach the opponent's team stats
    return combined.merge(opponent_stats, on='opp_teamId', how='left')

In [7]:
def main(season):
    """Download box scores for every not-yet-saved game of ``season``.

    Results are appended game by game to ``data/nba_boxscores_{season}.csv``,
    so an interrupted run can be resumed: games whose ``gameId`` is already
    in the file are skipped on the next call.

    Parameters
    ----------
    season : str
        Season identifier, used both for the game lookup and the CSV name.

    Notes
    -----
    A failure on one game is printed and the loop moves on to the next game;
    ``KeyboardInterrupt`` is not caught and stops the run.
    """
    csv_path = f"data/nba_boxscores_{season}.csv"

    # Create the output directory up front. Without it every to_csv call
    # would fail, but only after a network request had been made per game.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)

    print("🔍 Checking existing data...")
    existing_game_ids = get_existing_game_ids(csv_path)

    print("📅 Getting game IDs + dates for seasons...")
    games_df = get_all_game_ids_and_dates(season)
    new_games = games_df[~games_df['GAME_ID'].isin(existing_game_ids)]
    new_games = new_games.reset_index(drop=True)

    total_new = len(new_games)
    print(f"🆕 Found {total_new} new games to fetch.")

    for i, row in new_games.iterrows():
        game_id = row['GAME_ID']
        game_date = row['GAME_DATE']

        try:
            print(f"📦 Fetching box score for {game_id} ({game_date.date()}) [{i+1}/{total_new}]")
            df = fetch_box_score(game_id)

            if df is None:
                # fetch_box_score already printed why; skip cleanly instead of
                # failing on the column assignment below with a TypeError.
                print(f"⏭️ Skipping {game_id}: no usable box score\n")
            else:
                df['GAME_DATE'] = game_date

                # Write the header only when this call creates the file
                write_header = not os.path.exists(csv_path)
                df.to_csv(csv_path, mode='a', header=write_header, index=False)

                print(f"✅ Success fetching {game_id}\n")

        except Exception as e:
            print(f"❌ Error fetching {game_id}: {e}\n")

        # Throttle after every attempt, not just successful ones, so a run of
        # failures does not send back-to-back requests to the API.
        smart_sleep()

# Main

In [8]:
# Fetch box scores for the 2024-25 season; games already present in the CSV are skipped.
main('2024-25')

🔍 Checking existing data...
📅 Getting game IDs + dates for seasons...
📅 Fetching games for 2024-25 season
🆕 Found 1230 new games to fetch.
📦 Fetching box score for 0022400062 (2024-10-22) [1/1230]
✅ Success fetching 0022400062

📦 Fetching box score for 0022400061 (2024-10-22) [2/1230]
✅ Success fetching 0022400061

📦 Fetching box score for 0022400071 (2024-10-23) [3/1230]
✅ Success fetching 0022400071

📦 Fetching box score for 0022400064 (2024-10-23) [4/1230]
✅ Success fetching 0022400064

📦 Fetching box score for 0022400068 (2024-10-23) [5/1230]
✅ Success fetching 0022400068

📦 Fetching box score for 0022400065 (2024-10-23) [6/1230]
✅ Success fetching 0022400065

📦 Fetching box score for 0022400067 (2024-10-23) [7/1230]
✅ Success fetching 0022400067

📦 Fetching box score for 0022400066 (2024-10-23) [8/1230]
✅ Success fetching 0022400066

📦 Fetching box score for 0022400063 (2024-10-23) [9/1230]
✅ Success fetching 0022400063

📦 Fetching box score for 0022400069 (2024-10-23) [10/1230]
✅

KeyboardInterrupt: 