# NBA Playoffs Simulator — Data Collection

Notebook 01 del proyecto. Acá recolecto todo lo que necesito de la API de la NBA:
- Stats avanzadas de la temporada actual (2025-26)
- Game logs para sacar consistencia, momentum y clutch
- Resultados de playoffs de las últimas 10 temporadas (esto es lo que entrena el modelo)
- Stats históricas por temporada regular + standings

In [None]:
!pip install nba_api --quiet

In [None]:
import pandas as pd
import numpy as np
import time
import warnings
import os

from nba_api.stats.endpoints import (
    leaguedashteamstats,
    leaguestandingsv3,
    leaguegamelog
)

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Show all columns when displaying DataFrames; width=None leaves the line
# width to pandas' own detection.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

# Local output folder (the CSVs are written under 'data/'); no error if it exists.
os.makedirs('data', exist_ok=True)

# stats.nba.com blocks clients that send many requests back to back.
# API_DELAY is the pause, in seconds, handed to time.sleep() after each call.
API_DELAY = 0.8

## Stats avanzadas — temporada actual

Lo que realmente mide qué tan bueno es un equipo no es el récord sino las stats avanzadas: Net Rating, eFG%, Pace, etc. Acá traigo todo eso de la temporada actual.

In [None]:
# Season label sent with every current-season request in this notebook.
CURRENT_SEASON = '2025-26'

# Three requests to the same endpoint, differing only in the measure type.
# Each one is followed by time.sleep(API_DELAY) to space the calls out, and
# only the first DataFrame of each response is kept.

# Advanced: OFF_RATING, DEF_RATING, NET_RATING, PACE, TS%, eFG%, etc.
advanced = leaguedashteamstats.LeagueDashTeamStats(
    season=CURRENT_SEASON,
    measure_type_detailed_defense='Advanced',
    season_type_all_star='Regular Season'
)
df_advanced = advanced.get_data_frames()[0]
time.sleep(API_DELAY)

# Four Factors: eFG%, TOV%, OREB%, FT Rate
four_factors = leaguedashteamstats.LeagueDashTeamStats(
    season=CURRENT_SEASON,
    measure_type_detailed_defense='Four Factors',
    season_type_all_star='Regular Season'
)
df_four = four_factors.get_data_frames()[0]
time.sleep(API_DELAY)

# Base: W, L, PTS, REB, AST, etc.
base = leaguedashteamstats.LeagueDashTeamStats(
    season=CURRENT_SEASON,
    measure_type_detailed_defense='Base',
    season_type_all_star='Regular Season'
)
df_base = base.get_data_frames()[0]
time.sleep(API_DELAY)

# Shapes are printed as a quick check that each request returned data.
print(f'Advanced: {df_advanced.shape}')
print(f'Four Factors: {df_four.shape}')
print(f'Base: {df_base.shape}')

In [None]:
# Current standings (conference, seed, record)
standings = leaguestandingsv3.LeagueStandingsV3(
    season=CURRENT_SEASON,
    season_type='Regular Season'
)
# Only the first DataFrame of the response is used.
df_standings = standings.get_data_frames()[0]
# Pause before the next API call.
time.sleep(API_DELAY)

print(f'Standings: {df_standings.shape}')

In [None]:
# Combine the four downloads into a single per-team table keyed on TEAM_ID.
# Every merge is a left join, so the rows of the advanced-stats frame drive
# the result.

# Advanced stats: keep the columns of interest that the response contains
adv_cols = [
    col
    for col in (
        'TEAM_ID', 'TEAM_NAME',
        'W', 'L', 'W_PCT', 'GP',
        'OFF_RATING', 'DEF_RATING', 'NET_RATING',
        'PACE', 'TS_PCT', 'EFG_PCT',
        'AST_PCT', 'AST_TO', 'AST_RATIO',
        'OREB_PCT', 'DREB_PCT', 'REB_PCT',
        'TM_TOV_PCT', 'PIE',
    )
    if col in df_advanced.columns
]
df_current = df_advanced.loc[:, adv_cols].copy()

# Four factors: only the opponent columns (defensive context)
four_opp_cols = list(filter(lambda name: 'OPP' in name, df_four.columns))
four_merge = df_four.loc[:, ['TEAM_ID', *four_opp_cols]].copy()
df_current = pd.merge(df_current, four_merge, how='left', on='TEAM_ID')

# Base stats: PTS, REB, AST, STL, BLK, TOV, PLUS_MINUS
base_extra = [
    col
    for col in ('TEAM_ID', 'PTS', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PLUS_MINUS')
    if col in df_base.columns
]
df_current = pd.merge(df_current, df_base.loc[:, base_extra], how='left', on='TEAM_ID')

# Standings: conference and seed; the id column is renamed to match the join key
stand_cols = [
    col
    for col in ('TeamID', 'Conference', 'PlayoffRank', 'ConferenceRecord')
    if col in df_standings.columns
]
df_stand_merge = df_standings.loc[:, stand_cols].rename({'TeamID': 'TEAM_ID'}, axis='columns')
df_current = pd.merge(df_current, df_stand_merge, how='left', on='TEAM_ID')

df_current = df_current.assign(SEASON=CURRENT_SEASON)

print(f'Tabla unificada: {df_current.shape[0]} equipos x {df_current.shape[1]} columnas')
# Cell output: the ten teams with the highest NET_RATING
df_current.sort_values(by='NET_RATING', ascending=False).iloc[:10]

## Game logs — temporada actual

Con los resultados juego a juego puedo calcular cosas que las stats agregadas no capturan: qué tan consistente es un equipo, si llega caliente a playoffs (momentum), y si gana los juegos apretados (clutch).

In [None]:
# League-wide regular-season game log for the current season.
gamelogs = leaguegamelog.LeagueGameLog(
    season=CURRENT_SEASON,
    season_type_all_star='Regular Season'
)
# Only the first DataFrame of the response is used.
# presumably one row per team per game (the feature code groups it by TEAM_ID) — confirm against the API
df_gamelogs = gamelogs.get_data_frames()[0]
# Pause before the next API call.
time.sleep(API_DELAY)

# Row count and number of distinct teams, then a preview of the first rows.
print(f'{df_gamelogs.shape[0]} registros, {df_gamelogs["TEAM_ID"].nunique()} equipos')
df_gamelogs.head(3)

In [None]:
def compute_gamelog_features(df_logs):
    """Compute per-team consistency, clutch and momentum features from game logs.

    Parameters
    ----------
    df_logs : pandas.DataFrame
        Game-log rows. The columns read are TEAM_ID, TEAM_ABBREVIATION,
        GAME_DATE, PLUS_MINUS and WL (the value 'W' marks a win).

    Returns
    -------
    pandas.DataFrame
        One row per TEAM_ID with the columns TEAM_ID, TEAM_ABBR,
        consistency_score, clutch_win_pct, clutch_games_played,
        last15_win_pct, last15_avg_plus_minus, season_avg_plus_minus and
        momentum_delta. An input without rows yields an empty frame that
        still carries these columns, so later column lookups, sorts and
        merges do not fail with KeyError.
    """
    feature_columns = [
        'TEAM_ID',
        'TEAM_ABBR',
        'consistency_score',
        'clutch_win_pct',
        'clutch_games_played',
        'last15_win_pct',
        'last15_avg_plus_minus',
        'season_avg_plus_minus',
        'momentum_delta',
    ]
    results = []

    for team_id, team_df in df_logs.groupby('TEAM_ID'):
        # Chronological order matters: tail(15) below must be the latest games
        team_df = team_df.sort_values('GAME_DATE', ascending=True).copy()
        team_name = team_df['TEAM_ABBREVIATION'].iloc[0]

        # Consistency: inverse of the standard deviation of plus/minus.
        # A zero or undefined (NaN, e.g. a single game) deviation maps to 0.
        std_pm = team_df['PLUS_MINUS'].std()
        consistency = round(1 / std_pm, 4) if std_pm > 0 else 0

        # Clutch: win% in games decided by 5 points or fewer;
        # 0.5 (neutral) when the team has no such game
        clutch_games = team_df[team_df['PLUS_MINUS'].abs() <= 5]
        if len(clutch_games) > 0:
            clutch_wins = (clutch_games['WL'] == 'W').sum()
            clutch_pct = round(clutch_wins / len(clutch_games), 4)
        else:
            clutch_pct = 0.5

        # Momentum: last 15 games (or fewer, if the team has played fewer)
        # compared with the season average
        last15 = team_df.tail(15)
        l15_win_pct = round((last15['WL'] == 'W').sum() / len(last15), 4)
        l15_plus_minus = round(last15['PLUS_MINUS'].mean(), 2)
        season_plus_minus = round(team_df['PLUS_MINUS'].mean(), 2)

        results.append({
            'TEAM_ID': team_id,
            'TEAM_ABBR': team_name,
            'consistency_score': consistency,
            'clutch_win_pct': clutch_pct,
            'clutch_games_played': len(clutch_games),
            'last15_win_pct': l15_win_pct,
            'last15_avg_plus_minus': l15_plus_minus,
            'season_avg_plus_minus': season_plus_minus,
            'momentum_delta': round(l15_plus_minus - season_plus_minus, 2)
        })

    # Passing the column list fixes the schema even when `results` is empty
    return pd.DataFrame(results, columns=feature_columns)


# Derive the per-team features from the current-season game logs.
df_gamelog_features = compute_gamelog_features(df_gamelogs)

print(f'Features: {df_gamelog_features.shape}')
print('\nEquipos con mayor momentum:')
# Cell output: the five teams with the largest momentum_delta, showing only
# the selected feature columns.
df_gamelog_features.sort_values('momentum_delta', ascending=False).head(5)[
    ['TEAM_ABBR', 'consistency_score', 'clutch_win_pct', 'last15_win_pct', 'momentum_delta']
]

In [None]:
# Merge the game-log features into the main table.
# TEAM_ABBR is dropped before the join; the left join on TEAM_ID keeps every
# row already present in df_current.
df_current = df_current.merge(
    df_gamelog_features.drop(columns=['TEAM_ABBR']),
    on='TEAM_ID',
    how='left'
)

print(f'Tabla completa: {df_current.shape[0]} equipos x {df_current.shape[1]} columnas')

## Playoffs históricos (2015-2025)

Esto es lo que va a entrenar el modelo. Necesito reconstruir cada serie de playoffs desde los game logs: quién jugó contra quién, quién ganó, en cuántos juegos y en qué ronda. Con 10 temporadas me da ~150 series.

In [None]:
def get_playoff_series(season):
    """Rebuild the playoff series of one season from the league game log.

    Games are paired by GAME_ID, grouped into series by the unordered pair of
    team ids, and each series gets a round number inferred from the
    chronological order of its first game.

    Parameters
    ----------
    season : str
        Season label as used in this notebook, e.g. '2020-21'.

    Returns
    -------
    pandas.DataFrame
        One row per series, sorted by first_game_date, with the columns
        season, winner_id, winner_abbr, winner_wins, loser_id, loser_abbr,
        loser_wins, total_games, first_game_date and round. An empty frame
        (no columns) is returned when the request fails, the log has no rows,
        or no game can be used.

    Notes
    -----
    * A game is used only if the log has exactly two rows for it and exactly
      one of them has WL == 'W'; games without a decided result are skipped
      instead of being credited to one of the teams.
    * If both teams have the same number of wins, the team with the lower id
      is reported as the winner (first key of the sorted pair).
    * Rounds assume the standard bracket of 8 + 4 + 2 + 1 = 15 series. Series
      beyond the 15th get NaN as round and a warning is printed.
    """
    try:
        games = leaguegamelog.LeagueGameLog(
            season=season,
            season_type_all_star='Playoffs'
        )
        df = games.get_data_frames()[0]
    except Exception as e:
        # Notebook boundary: report the failure and let the caller skip the season
        print(f'Error en {season}: {e}')
        return pd.DataFrame()

    if df.empty:
        return pd.DataFrame()

    # One record per game: both teams plus the winner.
    # groupby(sort=False) visits each GAME_ID once, in order of appearance.
    game_info = []
    for game_id, game_rows in df.groupby('GAME_ID', sort=False):
        if len(game_rows) != 2:
            continue

        # Sorting by TEAM_ID makes team_a always the lower id
        game_rows = game_rows.sort_values('TEAM_ID')
        team_a, team_b = game_rows.iloc[0], game_rows.iloc[1]

        a_won = team_a['WL'] == 'W'
        b_won = team_b['WL'] == 'W'
        if a_won == b_won:
            # No single winner recorded (missing or inconsistent WL)
            continue
        winner = team_a if a_won else team_b

        game_info.append({
            'game_id': game_id,
            'game_date': game_rows['GAME_DATE'].values[0],
            'team_a_id': int(team_a['TEAM_ID']),
            'team_a_abbr': team_a['TEAM_ABBREVIATION'],
            'team_b_id': int(team_b['TEAM_ID']),
            'team_b_abbr': team_b['TEAM_ABBREVIATION'],
            'winner_id': int(winner['TEAM_ID'])
        })

    if not game_info:
        return pd.DataFrame()

    games_df = pd.DataFrame(game_info)

    # A series is identified by the unordered pair of team ids
    games_df['series_key'] = [
        tuple(sorted(pair))
        for pair in zip(games_df['team_a_id'], games_df['team_b_id'])
    ]

    series_list = []
    for key, group in games_df.groupby('series_key'):
        team_ids = list(key)
        group = group.sort_values('game_date')

        wins = {tid: (group['winner_id'] == tid).sum() for tid in team_ids}

        abbrs = dict(zip(group['team_a_id'], group['team_a_abbr']))
        abbrs.update(zip(group['team_b_id'], group['team_b_abbr']))

        winner_id = max(wins, key=wins.get)
        loser_id = [t for t in team_ids if t != winner_id][0]

        series_list.append({
            'season': season,
            'winner_id': winner_id,
            'winner_abbr': abbrs.get(winner_id, ''),
            'winner_wins': wins[winner_id],
            'loser_id': loser_id,
            'loser_abbr': abbrs.get(loser_id, ''),
            'loser_wins': wins[loser_id],
            'total_games': len(group),
            'first_game_date': group['game_date'].values[0]
        })

    series_df = pd.DataFrame(series_list).sort_values('first_game_date')

    # Assign the round by chronological order.
    # Standard format: 8 R1 + 4 R2 + 2 CF + 1 Finals = 15 series
    round_sizes = [8, 4, 2, 1]
    rounds = []
    for label, size in enumerate(round_sizes, start=1):
        rounds.extend([label] * size)

    n_series = len(series_df)
    if n_series > len(rounds):
        # More pairs than the bracket allows: leave the extras unlabelled
        # rather than failing on a length mismatch
        print(f'Aviso en {season}: {n_series} series, se esperaban {len(rounds)} o menos')
        rounds.extend([float('nan')] * (n_series - len(rounds)))
    series_df['round'] = rounds[:n_series]

    return series_df

In [None]:
# The ten completed seasons whose playoffs are collected.
HISTORICAL_SEASONS = [
    '2015-16', '2016-17', '2017-18', '2018-19', '2019-20',
    '2020-21', '2021-22', '2022-23', '2023-24', '2024-25'
]

# One series DataFrame per season that returned data.
all_series = []

for season in HISTORICAL_SEASONS:
    # Progress line: the season label first, the outcome appended on the same line.
    print(f'{season}...', end=' ')
    series = get_playoff_series(season)
    if not series.empty:
        all_series.append(series)
        print(f'{len(series)} series')
    else:
        # get_playoff_series returns an empty frame on failure or missing data.
        print('sin datos')
    # Pause between seasons to space out the API calls.
    time.sleep(API_DELAY)

# NOTE(review): pd.concat raises ValueError if all_series is empty (every season failed).
df_historical_playoffs = pd.concat(all_series, ignore_index=True)

print(f'\nTotal: {len(df_historical_playoffs)} series historicas')
print(f'Por ronda: {df_historical_playoffs["round"].value_counts().sort_index().to_dict()}')

In [None]:
# Quick check: the Finals of each season (the series labelled round 4).
finals = df_historical_playoffs[df_historical_playoffs['round'] == 4].copy()
# Readable result string: "<winner> <winner wins>-<loser wins> <loser>".
finals['result'] = finals.apply(
    lambda x: f"{x['winner_abbr']} {int(x['winner_wins'])}-{int(x['loser_wins'])} {x['loser_abbr']}",
    axis=1
)
print(finals[['season', 'result']].to_string(index=False))

## Stats históricas por temporada regular

Para armar los features del modelo necesito las stats de temporada regular de cada equipo que jugó playoffs. Así calculo diferenciales (equipo A vs equipo B) y el XGBoost aprende qué combinaciones predicen victorias en series.

In [None]:
def get_season_team_stats(season):
    """Fetch regular-season advanced + base team stats for one season.

    Two requests are made (measure types 'Advanced' and 'Base'), each followed
    by a pause of API_DELAY seconds. A subset of the advanced columns is
    left-joined on TEAM_ID with PTS and PLUS_MINUS from the base frame, and a
    SEASON column holding `season` is added.

    Any exception is printed and turned into an empty DataFrame, so the
    caller can skip the season.
    """
    wanted_advanced = (
        'TEAM_ID', 'TEAM_NAME',
        'GP', 'W', 'L', 'W_PCT',
        'OFF_RATING', 'DEF_RATING', 'NET_RATING',
        'PACE', 'TS_PCT', 'EFG_PCT',
        'AST_PCT', 'AST_TO', 'AST_RATIO',
        'OREB_PCT', 'DREB_PCT', 'REB_PCT',
        'TM_TOV_PCT', 'PIE',
    )
    wanted_base = ('TEAM_ID', 'PTS', 'PLUS_MINUS')

    try:
        # Same endpoint twice, in this order, pausing after each response
        frames = {}
        for measure in ('Advanced', 'Base'):
            endpoint = leaguedashteamstats.LeagueDashTeamStats(
                season=season,
                measure_type_detailed_defense=measure,
                season_type_all_star='Regular Season',
            )
            frames[measure] = endpoint.get_data_frames()[0]
            time.sleep(API_DELAY)

        advanced_frame = frames['Advanced']
        base_frame = frames['Base']

        # Keep only the requested columns that the responses contain
        keep_advanced = [name for name in wanted_advanced if name in advanced_frame.columns]
        keep_base = [name for name in wanted_base if name in base_frame.columns]

        merged = pd.merge(
            advanced_frame[keep_advanced].copy(),
            base_frame[keep_base],
            how='left',
            on='TEAM_ID',
        )
        merged['SEASON'] = season
        return merged

    except Exception as e:
        print(f'Error en {season}: {e}')
        return pd.DataFrame()

In [None]:
# One stats DataFrame per season that returned data.
all_stats = []

# No sleep in this loop: get_season_team_stats pauses after each of its own requests.
for season in HISTORICAL_SEASONS:
    print(f'{season}...', end=' ')
    stats = get_season_team_stats(season)
    if not stats.empty:
        all_stats.append(stats)
        print(f'{len(stats)} equipos')
    else:
        # An empty frame means the download for that season failed.
        print('sin datos')

# NOTE(review): pd.concat raises ValueError if all_stats is empty (every season failed).
df_historical_stats = pd.concat(all_stats, ignore_index=True)

print(f'\nTotal: {len(df_historical_stats)} registros equipo-temporada')

In [None]:
# Best team by Net Rating each year (sanity check)
for season in HISTORICAL_SEASONS:
    season_data = df_historical_stats[df_historical_stats['SEASON'] == season]
    # Seasons whose download failed have no rows; skip them.
    if season_data.empty:
        continue
    # First row after sorting by NET_RATING in descending order.
    best = season_data.sort_values('NET_RATING', ascending=False).iloc[0]
    print(f"{season}: {best['TEAM_NAME']:<28} (Net Rating: {best['NET_RATING']:+.1f})")

## Standings históricos

Necesito el seed de cada equipo para saber quién tenía home court en cada serie.

In [None]:
# One standings DataFrame per season that downloaded successfully.
all_standings = []

for season in HISTORICAL_SEASONS:
    print(f'{season}...', end=' ')
    try:
        stand = leaguestandingsv3.LeagueStandingsV3(
            season=season,
            season_type='Regular Season'
        )
        df_s = stand.get_data_frames()[0]
        # Tag each row with its season so the frames can be concatenated.
        df_s['SEASON'] = season
        all_standings.append(df_s)
        print(f'{len(df_s)} equipos')
    except Exception as e:
        # A failed season is reported and skipped; the loop continues.
        print(f'Error: {e}')
    # Pause after every attempt, successful or not.
    time.sleep(API_DELAY)

# NOTE(review): pd.concat raises ValueError if all_standings is empty (every season failed).
df_historical_standings = pd.concat(all_standings, ignore_index=True)
print(f'\nTotal: {len(df_historical_standings)} registros')

## Guardar todo en Drive

Monto Drive y guardo los CSVs para que los notebooks siguientes puedan leerlos sin tener que volver a correr todo esto.

In [None]:
# Colab-only: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Project folder on Drive and its data subfolder; created if missing.
PROJECT_DIR = '/content/drive/MyDrive/nba-playoffs-simulator'
DATA_DIR = f'{PROJECT_DIR}/data'
os.makedirs(DATA_DIR, exist_ok=True)

In [None]:
import shutil

# Output file name -> DataFrame to export.
datasets = {
    'team_stats_2026.csv': df_current,
    'team_gamelogs_2026.csv': df_gamelogs,
    'historical_playoffs.csv': df_historical_playoffs,
    'historical_team_stats.csv': df_historical_stats,
    'historical_standings.csv': df_historical_standings
}

for filename, df in datasets.items():
    # Write to the local 'data' folder first, then copy the file to Drive.
    local_path = f'data/{filename}'
    df.to_csv(local_path, index=False)
    shutil.copy(local_path, f'{DATA_DIR}/{filename}')
    # File name padded to 32 characters, followed by the frame's shape.
    print(f'{filename:<32} {df.shape}')

print(f'\nGuardado en: {DATA_DIR}')

---

Listo, con esto tengo todo lo que necesito para empezar a armar los features. Los archivos quedan en Drive y el notebook 02 los lee directo de ahí.

**Archivos generados:**
- `team_stats_2026.csv` — 30 equipos, stats actuales + consistency/clutch/momentum
- `team_gamelogs_2026.csv` — game logs juego a juego
- `historical_playoffs.csv` — ~150 series de playoffs (2015-2025)
- `historical_team_stats.csv` — stats por temporada
- `historical_standings.csv` — seeding por temporada