# NBA Playoffs Simulator — Feature Engineering

Notebook 02. Acá transformo los datos crudos del notebook anterior en features que tengan sentido deportivo. La idea es simple: al modelo no le importa si un equipo tiene Net Rating de +8, le importa si es **mejor que su rival**. Por eso todo se calcula como diferenciales (equipo A - equipo B).

También armo los perfiles de los 16 equipos de playoffs para la simulación.

In [None]:
import pandas as pd
import numpy as np
import warnings
import os

# Silence library warnings and let pandas print every column at full width.
warnings.filterwarnings('ignore')
for option_name in ('display.max_columns', 'display.width'):
    pd.set_option(option_name, None)

# Mount Google Drive so the project folder is reachable from Colab.
from google.colab import drive
drive.mount('/content/drive')

PROJECT_DIR = '/content/drive/MyDrive/nba-playoffs-simulator'
DATA_DIR = f'{PROJECT_DIR}/data'

# List every file in the data folder together with its size in KB.
for entry in sorted(os.listdir(DATA_DIR)):
    kilobytes = os.path.getsize(f'{DATA_DIR}/{entry}') / 1024
    print(f'{entry:<36} ({kilobytes:.1f} KB)')

In [None]:
def _read_data(filename):
    """Read one CSV from DATA_DIR into a DataFrame."""
    return pd.read_csv(f'{DATA_DIR}/{filename}')


# The four tables read from the project data folder.
df_current = _read_data('team_stats_2026.csv')
df_playoffs = _read_data('historical_playoffs.csv')
df_hist_stats = _read_data('historical_team_stats.csv')
df_hist_standings = _read_data('historical_standings.csv')

# Report the shape of each table (labels are padded by hand to line up).
for label, frame in (
    ('Stats actuales:      ', df_current),
    ('Playoffs historicos: ', df_playoffs),
    ('Stats historicas:    ', df_hist_stats),
    ('Standings historicos: ', df_hist_standings),
):
    print(f'{label}{frame.shape}')

## Exploración rápida

Antes de armar features, un vistazo a qué tenemos y si los datos están completos.

In [None]:
# Non-null count for every column of the historical stats
total_rows = len(df_hist_stats)
for name, filled in df_hist_stats.notna().sum().items():
    print(f'{name:<24} {filled}/{total_rows}')

seasons = sorted(df_hist_stats["SEASON"].unique())
print(f'\nTemporadas: {seasons}')

In [None]:
# What the historical series look like
preview = df_playoffs.head(10)
print(preview.to_string(index=False))

round_names = {1: 'First Round', 2: 'Conf. Semis', 3: 'Conf. Finals', 4: 'NBA Finals'}
print('\nSeries por ronda:')
for round_no in sorted(df_playoffs['round'].unique()):
    # Count the series played in this round.
    n_series = (df_playoffs['round'] == round_no).sum()
    label = round_names.get(round_no, "?")
    print(f'  Ronda {round_no} ({label}): {n_series} series')

In [None]:
# Historical standings: every column name, then a preview of the key ones
print(df_hist_standings.columns.tolist())
wanted_cols = ('TeamID', 'TeamCity', 'TeamName', 'Conference',
               'PlayoffRank', 'Record', 'SEASON')
# Keep only the wanted columns that actually exist in the file.
key_stand_cols = [name for name in wanted_cols
                  if name in df_hist_standings.columns]
df_hist_standings[key_stand_cols].head(10)

## Dataset de entrenamiento

El XGBoost aprende de matchups históricos: dados los perfiles de dos equipos que se enfrentaron en playoffs, ¿quién ganó la serie?

Cada fila va a ser una serie, con features como diferenciales entre ambos equipos. Team A siempre es el mejor seed (el favorito), y el target es si ganó o no.

In [None]:
# Asigno el seed a cada equipo en cada serie historica

def get_team_seed(team_id, season, standings_df):
    """Return ``(seed, conference)`` for a team in a given season.

    The seed is read from the ``PlayoffRank`` column and the conference from
    ``Conference``; either is ``None`` when its column is absent.
    ``(None, None)`` is returned when ``standings_df`` has no ``TeamID``
    column or no row matches the team and season. If several rows match,
    the first one is used.
    """
    if 'TeamID' not in standings_df.columns:
        return None, None

    is_team = standings_df['TeamID'] == team_id
    is_season = standings_df['SEASON'] == season
    hits = standings_df[is_team & is_season]
    if hits.empty:
        return None, None

    first = hits.iloc[0]
    return first.get('PlayoffRank', None), first.get('Conference', None)


# Look up (seed, conference) once for each side of every historical series.
winner_info = [
    get_team_seed(team_id, season, df_hist_standings)
    for team_id, season in zip(df_playoffs['winner_id'], df_playoffs['season'])
]
loser_info = [
    get_team_seed(team_id, season, df_hist_standings)
    for team_id, season in zip(df_playoffs['loser_id'], df_playoffs['season'])
]

seeds_winner = [seed for seed, _ in winner_info]
seeds_loser = [seed for seed, _ in loser_info]
# Only the winner's conference is stored for the series.
confs_winner = [conference for _, conference in winner_info]

df_playoffs['winner_seed'] = seeds_winner
df_playoffs['loser_seed'] = seeds_loser
df_playoffs['conference'] = confs_winner

has_seeds = df_playoffs['winner_seed'].notna().sum()
print(f'Seeds asignados: {has_seeds}/{len(df_playoffs)} series')
preview_cols = ['season', 'winner_abbr', 'winner_seed', 'loser_abbr', 'loser_seed',
                'round', 'total_games']
df_playoffs[preview_cols].head(10)

In [None]:
# Team A = the better seed (the favorite).
# The model then learns: "given that A is the favorite, did A win or not?"


def _regular_season_win_pct(team_id, season):
    """Return the team's W_PCT for that season from df_hist_stats, or None.

    None means the needed columns are absent, no row matches, or the value
    is missing.
    """
    if not {'TEAM_ID', 'SEASON', 'W_PCT'}.issubset(df_hist_stats.columns):
        return None
    is_team = df_hist_stats['TEAM_ID'] == team_id
    is_season = df_hist_stats['SEASON'] == season
    hits = df_hist_stats[is_team & is_season]
    if hits.empty:
        return None
    win_pct = hits.iloc[0]['W_PCT']
    return None if pd.isna(win_pct) else win_pct


def _winner_was_favorite(series):
    """Decide whether the series winner should be Team A (the favorite).

    A lower seed number is the better seed. When both seeds are equal
    (presumably only possible across conferences, e.g. the Finals -- confirm
    against the data), the better regular-season W_PCT decides. Without this
    tie-break the winner would always become Team A on ties, which makes
    ``team_a_won`` trivially 1 for those series and leaks the label.
    If W_PCT is unavailable or also tied, the winner is kept as Team A.
    """
    w_seed, l_seed = series['winner_seed'], series['loser_seed']
    if w_seed != l_seed:
        return bool(w_seed < l_seed)

    w_pct = _regular_season_win_pct(series['winner_id'], series['season'])
    l_pct = _regular_season_win_pct(series['loser_id'], series['season'])
    if w_pct is None or l_pct is None:
        return True
    return bool(w_pct >= l_pct)


rows = []

for _, s in df_playoffs.iterrows():
    # Series without a seed on either side cannot be oriented; skip them.
    if pd.isna(s['winner_seed']) or pd.isna(s['loser_seed']):
        continue

    favorite_won = _winner_was_favorite(s)
    # Column prefixes for the favorite (Team A) and the underdog (Team B).
    fav, dog = ('winner', 'loser') if favorite_won else ('loser', 'winner')

    rows.append({
        'season': s['season'],
        'round': s['round'],
        'team_a_id': s[f'{fav}_id'],
        'team_a_abbr': s[f'{fav}_abbr'],
        'team_a_seed': int(s[f'{fav}_seed']),
        'team_b_id': s[f'{dog}_id'],
        'team_b_abbr': s[f'{dog}_abbr'],
        'team_b_seed': int(s[f'{dog}_seed']),
        # 1 = the favorite won, 0 = upset
        'team_a_won': int(favorite_won),
        'series_games': s['total_games'],
    })

df_matchups = pd.DataFrame(rows)

print(f'Matchups: {len(df_matchups)} series')
print(f'Tasa de victoria del favorito: {df_matchups["team_a_won"].mean():.1%}')

print('\nPor ronda:')
for r in sorted(df_matchups['round'].unique()):
    subset = df_matchups[df_matchups['round'] == r]
    rate = subset['team_a_won'].mean()
    print(f'  Ronda {r}: {rate:.1%} ({len(subset)} series)')

## Feature engineering

Cada feature es un diferencial: cuánto mejor es el equipo A que el B en esa métrica. Si es positivo, A es mejor. Invierto el signo para DEF_RATING y turnovers (TM_TOV_PCT) porque en esas métricas menor es mejor.

In [None]:
# Team stats that are turned into (Team A - Team B) differentials.
# DEF_RATING and TM_TOV_PCT are lower-is-better, so build_matchup_features
# flips the sign of their differential (positive still means "A is better").
DIFF_FEATURES = [
    'NET_RATING',
    'OFF_RATING',
    'DEF_RATING',
    'W_PCT',
    'PACE',
    'EFG_PCT',
    'TM_TOV_PCT',
    'REB_PCT',
    'TS_PCT',
    'PIE',
    'AST_RATIO',
    'OREB_PCT',
    'DREB_PCT'
]


def build_matchup_features(matchups_df, stats_df, features=None,
                           lower_is_better=('DEF_RATING', 'TM_TOV_PCT')):
    """Compute Team A vs Team B stat differentials for every series.

    Args:
        matchups_df: one row per series with the columns ``season``,
            ``round``, ``team_a_id``, ``team_b_id``, ``team_a_abbr``,
            ``team_b_abbr``, ``team_a_seed``, ``team_b_seed`` and
            ``team_a_won``.
        stats_df: team stats with ``SEASON`` and ``TEAM_ID`` columns plus the
            stat columns. If a (season, team) pair appears more than once,
            the first row is used.
        features: stat columns to turn into ``<name>_diff`` columns. Defaults
            to the module-level ``DIFF_FEATURES``.
        lower_is_better: stats whose differential is computed as B - A so
            that a positive value still means "A is better".

    Returns:
        DataFrame with the identifying columns, one ``<feature>_diff`` column
        per feature and ``seed_diff`` (positive = A has the better seed).
        Series where either team has no stats row for that season are
        dropped. A differential is 0 when the stat column is absent or either
        value is missing.
    """
    if features is None:
        features = DIFF_FEATURES
    lower_is_better = frozenset(lower_is_better)

    # Index the stats once instead of re-filtering the frame per matchup.
    # setdefault keeps the first row seen for a duplicated (season, team).
    stats_lookup = {}
    for _, stat_row in stats_df.iterrows():
        stats_lookup.setdefault((stat_row['SEASON'], stat_row['TEAM_ID']), stat_row)

    feature_rows = []

    for _, matchup in matchups_df.iterrows():
        season = matchup['season']
        team_a_stats = stats_lookup.get((season, matchup['team_a_id']))
        team_b_stats = stats_lookup.get((season, matchup['team_b_id']))

        if team_a_stats is None or team_b_stats is None:
            continue

        row = {
            'season': season,
            'round': matchup['round'],
            'team_a_abbr': matchup['team_a_abbr'],
            'team_b_abbr': matchup['team_b_abbr'],
            'team_a_seed': matchup['team_a_seed'],
            'team_b_seed': matchup['team_b_seed'],
            'team_a_won': matchup['team_a_won'],
        }

        for feat in features:
            diff = 0  # fallback when the stat is absent or missing
            if feat in team_a_stats.index and feat in team_b_stats.index:
                val_a = team_a_stats[feat]
                val_b = team_b_stats[feat]
                if pd.notna(val_a) and pd.notna(val_b):
                    if feat in lower_is_better:
                        diff = round(val_b - val_a, 4)
                    else:
                        diff = round(val_a - val_b, 4)
            row[f'{feat}_diff'] = diff

        # Seed diff (positive = A has the better, i.e. numerically lower, seed)
        row['seed_diff'] = matchup['team_b_seed'] - matchup['team_a_seed']

        feature_rows.append(row)

    return pd.DataFrame(feature_rows)

In [None]:
# Build the training table and summarise its size and class balance.
df_training = build_matchup_features(df_matchups, df_hist_stats)

diff_cols = [name for name in df_training.columns if name.endswith('_diff')]

n_series, n_columns = df_training.shape
print(f'Training set: {n_series} series x {n_columns} columnas')
print(f'Features: {len(diff_cols)}')
favorite_rate = df_training["team_a_won"].mean()
print(f'Balance: {favorite_rate:.1%} favorito gana')

In [None]:
# Feature distributions: series the favorite won vs. series it lost
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('dark_background')

fig, axes = plt.subplots(2, 3, figsize=(16, 10))
fig.suptitle('Diferenciales: favorito gana vs pierde',
             fontsize=16, fontweight='bold', y=1.02)

key_features = ['NET_RATING_diff', 'OFF_RATING_diff', 'DEF_RATING_diff',
                'W_PCT_diff', 'EFG_PCT_diff', 'seed_diff']
titles = ['Net Rating', 'Ataque', 'Defensa', 'Win %', 'eFG%', 'Seed']

# Row masks computed once and reused for every panel.
favorite_won = df_training['team_a_won'] == 1
favorite_lost = df_training['team_a_won'] == 0

for panel, (column, heading) in zip(axes.flat, zip(key_features, titles)):
    if column in df_training.columns:
        panel.hist(df_training.loc[favorite_won, column], bins=15, alpha=0.7,
                   label='Favorito gano', color='#00E676')
        panel.hist(df_training.loc[favorite_lost, column], bins=15, alpha=0.7,
                   label='Favorito perdio', color='#FF5252')
        panel.set_title(heading, fontsize=12, fontweight='bold')
        panel.legend(fontsize=9)
        panel.set_xlabel(column)
    else:
        # Hide the panel when the feature was not built.
        panel.set_visible(False)

plt.tight_layout()
plt.savefig('feature_distributions.png', dpi=150, bbox_inches='tight',
            facecolor='black', edgecolor='none')
plt.show()

# If the two distributions separate well, the feature has predictive power

In [None]:
# Correlation of each differential with the favorite winning the series
correlations = {}
for col in diff_cols:
    if col in df_training.columns:
        correlations[col] = df_training[col].corr(df_training['team_a_won'])

# Series.corr returns NaN when the correlation is undefined, e.g. for a
# constant column (a feature that fell back to 0 in every row). NaN cannot be
# converted with int() for the bar and does not sort meaningfully, so only
# the defined correlations are ranked; the rest are reported separately.
corr_sorted = sorted(
    ((feat, corr) for feat, corr in correlations.items() if pd.notna(corr)),
    key=lambda item: abs(item[1]),
    reverse=True,
)

for feat, corr in corr_sorted:
    bar = '█' * int(abs(corr) * 40)
    sign = '+' if corr > 0 else '-'
    print(f'{feat:<24} {sign}{abs(corr):.3f}  {bar}')

for feat, corr in correlations.items():
    if pd.isna(corr):
        print(f'{feat:<24} n/a     (correlacion indefinida)')

## Perfiles de equipos actuales (2025-26)

Armo los perfiles de los 16 equipos que van a playoffs esta temporada. Estos son los que después alimentan la simulación Monte Carlo.

In [None]:
# Top 8 of each conference by PlayoffRank
def _top_eight(conference):
    """Return the 8 rows of df_current with the smallest PlayoffRank in a conference."""
    in_conference = df_current['Conference'] == conference
    return df_current[in_conference].nsmallest(8, 'PlayoffRank')


east_teams = _top_eight('East')
west_teams = _top_eight('West')
df_playoff_teams = pd.concat([east_teams, west_teams])

print(f'{len(df_playoff_teams)} equipos clasificados\n')

# West is printed first, then East after a blank line.
for header, teams in (('WEST:', west_teams), ('\nEAST:', east_teams)):
    print(header)
    for _, team in teams.iterrows():
        rank = int(team['PlayoffRank'])
        won, lost = int(team["W"]), int(team["L"])
        print(f'  ({rank}) {team["TEAM_NAME"]:<28} {won}-{lost}  '
              f'Net: {team["NET_RATING"]:+.1f}')

In [None]:
# Profile table with everything the simulation needs.
# DIFF_FEATURES already contains 'W_PCT', which is also listed below, so the
# combined list is de-duplicated (dict.fromkeys keeps first-seen order).
# Selecting the same label twice would give df_profiles two 'W_PCT' columns,
# and the duplicate would also be written to the CSV.
feature_cols = list(dict.fromkeys(DIFF_FEATURES + [
    'TEAM_ID', 'TEAM_NAME', 'W', 'L', 'W_PCT', 'GP',
    'Conference', 'PlayoffRank',
    'consistency_score', 'clutch_win_pct', 'momentum_delta',
    'last15_win_pct', 'last15_avg_plus_minus',
    'season_avg_plus_minus', 'SEASON'
]))

# Keep only the columns that exist in the current-season table.
available_cols = [c for c in feature_cols if c in df_playoff_teams.columns]
df_profiles = df_playoff_teams[available_cols].copy()
df_profiles['SEED'] = df_profiles['PlayoffRank'].astype(int)

print(f'Perfiles: {df_profiles.shape[0]} equipos x {df_profiles.shape[1]} columnas')

In [None]:
# Rankings of the playoff teams by different metrics

def _print_ranking(heading, column, value_format, top=None):
    """Print df_profiles ranked by ``column`` in descending order.

    ``value_format`` is the format spec applied to the metric; ``top`` limits
    the listing to the first N teams (all teams when None).
    """
    print(heading)
    ranked = df_profiles.sort_values(column, ascending=False)
    if top is not None:
        ranked = ranked.head(top)
    for position, (_, team) in enumerate(ranked.iterrows(), 1):
        tag = 'W' if team['Conference'] == 'West' else 'E'
        print(f'  {position:>2}. [{tag}{int(team["SEED"])}] '
              f'{team["TEAM_NAME"]:<28} {team[column]:{value_format}}')


_print_ranking('Por Net Rating:', 'NET_RATING', '+.2f')

# The next two metrics are optional columns, so each is printed only if present.
if 'momentum_delta' in df_profiles.columns:
    _print_ranking('\nPor Momentum (quien llega mas caliente):',
                   'momentum_delta', '+.2f', top=8)

if 'clutch_win_pct' in df_profiles.columns:
    _print_ranking('\nPor Clutch (quien gana juegos cerrados):',
                   'clutch_win_pct', '.1%', top=8)

## Guardar

In [None]:
import shutil

# Local working copy of the outputs; created if it does not exist yet.
os.makedirs('data', exist_ok=True)

datasets = {
    'training_matchups.csv': df_training,
    'team_profiles_2026.csv': df_profiles,
    'historical_matchups_raw.csv': df_matchups
}

# Write each table locally, then mirror the file into the Drive data folder.
for filename, frame in datasets.items():
    local_path = f'data/{filename}'
    frame.to_csv(local_path, index=False)
    shutil.copy(local_path, f'{DATA_DIR}/{filename}')
    print(f'{filename:<32} {frame.shape}')

# Save the feature list so that NB03 uses exactly the same columns
feature_list = [name for name in df_training.columns if name.endswith('_diff')]
with open(f'{DATA_DIR}/feature_columns.txt', 'w') as handle:
    handle.write('\n'.join(feature_list))
print(f'\nfeature_columns.txt: {len(feature_list)} features')
print(f'Guardado en: {DATA_DIR}')

---

Listo. El training set tiene ~150 series con 14 features cada una (13 diferenciales de stats + `seed_diff`), y los perfiles de los 16 equipos quedan listos para la simulación.

**Archivos generados:**
- `training_matchups.csv` — series históricas con features para entrenar XGBoost
- `team_profiles_2026.csv` — perfiles de los 16 equipos de playoffs
- `historical_matchups_raw.csv` — matchups sin features (referencia)
- `feature_columns.txt` — lista de features para consistencia entre notebooks