In [1]:
import pandas as pd
import time
import random
import os
from nba_api.stats.endpoints import leaguegamelog, boxscoretraditionalv2

# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.set_option('display.max_columns', None)

In [2]:
# === Config ===
# Seasons of interest, in the 'YYYY-YY' form passed to main().
# NOTE(review): not referenced by the visible cells -- main() is invoked with a literal season string.
SEASONS = ['2024-25']
# NOTE(review): not referenced by the visible cells -- main() builds its own
# per-season path (data/nba_boxscores_<season>.csv). Confirm whether this is still needed.
CSV_PATH = 'nba_boxscores.csv'

def smart_sleep(min_sec=1.2, max_sec=3.5):
    """Block for a random duration drawn uniformly between min_sec and max_sec.

    Called after API requests in this notebook to space out successive calls.

    Args:
        min_sec: Lower bound of the pause, in seconds.
        max_sec: Upper bound of the pause, in seconds.
    """
    pause_seconds = random.uniform(min_sec, max_sec)
    time.sleep(pause_seconds)

In [3]:
def get_existing_game_ids(csv_path):
    """Return the set of GAME_ID values already stored in the CSV at csv_path.

    Only the GAME_ID column is read, and it is read as text so that leading
    zeros in the IDs are preserved.

    Args:
        csv_path: Path of the box-score CSV.

    Returns:
        A set of GAME_ID strings; an empty set when the file does not exist.
    """
    if not os.path.exists(csv_path):
        return set()

    stored_ids = pd.read_csv(
        csv_path,
        usecols=['GAME_ID'],
        dtype={'GAME_ID': str},
    )['GAME_ID']
    return set(stored_ids.unique())

def get_all_game_ids_and_dates(season):
    """Collect the game IDs and dates of a season from the league game log.

    The team-level game log is requested once for 'Regular Season' and once
    for 'Playoffs', with a randomized pause after each request.

    Args:
        season: Season string passed through to LeagueGameLog (e.g. '2024-25').

    Returns:
        DataFrame with columns GAME_ID, GAME_DATE (converted to datetime),
        SEASON and SEASON_TYPE, holding one row per distinct GAME_ID.
    """
    all_games = []
    season_types = ['Regular Season', 'Playoffs']

    for season_type in season_types:
        print(f"📅 Fetching {season_type} logs for {season}")
        gamelog = leaguegamelog.LeagueGameLog(
            season=season,
            player_or_team_abbreviation='T',
            season_type_all_star=season_type
        )
        df = gamelog.get_data_frames()[0]
        # Take an explicit copy of the column subset: assigning new columns to
        # a bare slice is the chained-assignment pattern that triggers
        # SettingWithCopyWarning.
        df = df[['GAME_ID', 'GAME_DATE']].copy()
        df['SEASON'] = season
        df['SEASON_TYPE'] = season_type
        df['GAME_DATE'] = pd.to_datetime(df['GAME_DATE'])
        all_games.append(df)
        smart_sleep()

    combined = pd.concat(all_games, ignore_index=True)
    # Keep only the first row seen for each GAME_ID.
    combined = combined.drop_duplicates(subset='GAME_ID')
    return combined

def fetch_box_score(game_id):
    """Build a player-level box score for one game with team and opponent totals attached.

    Each player row is extended with its own team's totals (TEAM_* columns),
    HOME / WON flags, a FINAL_SCORE pair ordered [own team, opponent], the
    opponent's id (OPP_TEAM_ID) and the opponent's totals (OPP_TEAM_* columns).

    NOTE(review): the first distinct TEAM_ID found in the player table is
    treated as the away team and the second as the home team. This relies on
    the row order returned by the endpoint -- confirm.

    Args:
        game_id: Game identifier passed to BoxScoreTraditionalV2.

    Returns:
        The merged DataFrame, or None (after printing a warning) when the
        player table does not contain exactly two distinct TEAM_ID values.
    """
    boxscore = boxscoretraditionalv2.BoxScoreTraditionalV2(game_id=game_id)
    players = boxscore.get_data_frames()[0]
    team_totals = boxscore.team_stats.get_data_frame()

    # Drop the identifying columns, then namespace the remaining totals under
    # TEAM_ while keeping the join key named TEAM_ID.
    dropped_columns = ['GAME_ID', 'TEAM_NAME', 'TEAM_ABBREVIATION', 'TEAM_CITY', 'MIN']
    team_totals = (
        team_totals
        .drop(columns=dropped_columns)
        .add_prefix('TEAM_')
        .rename(columns={'TEAM_TEAM_ID': 'TEAM_ID'})
    )

    # Same totals again under OPP_ (the key becomes OPP_TEAM_ID) for the
    # opponent join at the end.
    opponent_totals = team_totals.add_prefix('OPP_')

    result = players.merge(team_totals, on='TEAM_ID', how='left')

    distinct_teams = result['TEAM_ID'].drop_duplicates().tolist()
    if len(distinct_teams) != 2:
        print(f"⚠️ Unexpected number of teams in GAME_ID {game_id}")
        return None
    away_team_id, home_team_id = distinct_teams

    def points_for(team_id):
        # Team total taken from the first player row belonging to that team.
        return result.loc[result['TEAM_ID'] == team_id, 'TEAM_PTS'].iloc[0]

    away_score = points_for(away_team_id)
    home_score = points_for(home_team_id)

    result['HOME'] = result['TEAM_ID'] == home_team_id

    # The home team wins only on a strictly higher score; otherwise the away
    # team is recorded as the winner.
    if home_score > away_score:
        winning_team_id = home_team_id
    else:
        winning_team_id = away_team_id
    result['WON'] = result['TEAM_ID'] == winning_team_id

    # FINAL_SCORE is [own team's points, opponent's points] for every row.
    home_points, away_points = int(home_score), int(away_score)
    result['FINAL_SCORE'] = result['HOME'].map(
        lambda at_home: [home_points, away_points] if at_home else [away_points, home_points]
    )

    # Attach the opponent's id, then the opponent's totals through it.
    result['OPP_TEAM_ID'] = result['TEAM_ID'].map(
        lambda team_id: home_team_id if team_id == away_team_id else away_team_id
    )
    return result.merge(opponent_totals, on='OPP_TEAM_ID', how='left')


def main(season):
    """Fetch box scores for every game of a season not yet stored, appending each to the season CSV.

    Game IDs already present in data/nba_boxscores_<season>.csv are skipped,
    so the function can be re-run to resume an interrupted download. Each
    game is written to disk as soon as it is fetched.

    Args:
        season: Season string (e.g. '2024-25'); used for the API query and
            for the output file name.
    """
    csv_path = f"data/nba_boxscores_{season}.csv"

    # Make sure the output directory exists up front. Without it every
    # to_csv call below would fail and the fetched data would be discarded
    # by the per-game exception handler.
    output_dir = os.path.dirname(csv_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    print("🔍 Checking existing data...")
    existing_game_ids = get_existing_game_ids(csv_path)

    print("📅 Getting game IDs + dates for seasons...")
    games_df = get_all_game_ids_and_dates(season)
    new_games = games_df[~games_df['GAME_ID'].isin(existing_game_ids)]
    new_games = new_games.reset_index(drop=True)

    total_games = len(new_games)
    print(f"🆕 Found {total_games} new games to fetch.")

    for i, row in new_games.iterrows():
        game_id = row['GAME_ID']
        game_date = row['GAME_DATE']
        game_type = row['SEASON_TYPE']

        try:
            print(f"📦 Fetching box score for {game_id} ({game_date.date()}) [{i+1}/{total_games}]")
            df = fetch_box_score(game_id)

            if df is None:
                # fetch_box_score returns None when the game does not have
                # exactly two teams; skip it explicitly instead of failing
                # with a TypeError on the assignments below.
                print(f"⏭️ Skipping {game_id}: no usable box score returned")
            else:
                df['GAME_DATE'] = game_date
                df['SEASON_TYPE'] = game_type

                # Write the header only when the file is being created.
                write_header = not os.path.exists(csv_path)
                df.to_csv(csv_path, mode='a', header=write_header, index=False)
        except Exception as e:
            # Best effort: report the failure and move on to the next game.
            print(f"❌ Error fetching {game_id}: {e}")

        # Pause after every attempt, including failed ones, so an API error
        # does not lead to an immediate follow-up request.
        smart_sleep()

In [4]:
# Download the 2024-25 season; games whose GAME_ID is already in the season CSV are skipped.
main('2024-25')

🔍 Checking existing data...
📅 Getting game IDs + dates for seasons...
📅 Fetching Regular Season logs for 2024-25
📅 Fetching Playoffs logs for 2024-25
🆕 Found 455 new games to fetch.
📦 Fetching box score for 0022400761 (2025-02-10) [1/455]
📦 Fetching box score for 0022400760 (2025-02-10) [2/455]
📦 Fetching box score for 0022400762 (2025-02-10) [3/455]
📦 Fetching box score for 0022400763 (2025-02-10) [4/455]
📦 Fetching box score for 0022400768 (2025-02-10) [5/455]
📦 Fetching box score for 0022400764 (2025-02-11) [6/455]
📦 Fetching box score for 0022400765 (2025-02-11) [7/455]
📦 Fetching box score for 0022400766 (2025-02-11) [8/455]
📦 Fetching box score for 0022400767 (2025-02-11) [9/455]
📦 Fetching box score for 0022400783 (2025-02-12) [10/455]
📦 Fetching box score for 0022400770 (2025-02-12) [11/455]
📦 Fetching box score for 0022400769 (2025-02-12) [12/455]
📦 Fetching box score for 0022400771 (2025-02-12) [13/455]
📦 Fetching box score for 0022400772 (2025-02-12) [14/455]
📦 Fetching box 