### DAILY GAME IDS

- Grabs the games listed for the date set in `DATE`
- Output columns: game id, date, time, home, away, neutral-site flag, home/away conference

In [50]:
import glob
import pandas as pd
import numpy as np

# Slate date (YYYYMMDD); used further down to pick the daily-box-score-ids/<DATE>/ folder.
DATE = "20251103"

# Stack every per-game info CSV of the 2026 season into one frame.
csv_files = glob.glob("data/boxscores/game-info-2026/*.csv")
combined_df = pd.concat((pd.read_csv(f) for f in csv_files), ignore_index=True)

# Lookup from the `espn` spelling used in the game-info files to the `team` name.
team_map = pd.read_csv("data/teams/map.csv")[["team", "espn"]]

# Two left merges against the same lookup: pandas suffixes the clashing
# columns, so `team_x` is the home match and `team_y` the away match.
combined_df = (
    combined_df
    .merge(team_map, left_on="home_team", right_on="espn", how="left")
    .merge(team_map, left_on="away_team", right_on="espn", how="left")
)
combined_df['home'] = combined_df['team_x']
combined_df['away'] = combined_df['team_y']

# `date` is the YYYYMMDD string, `date_key` the same value as a nullable integer.
# (The original computed both columns twice; once is enough.)
combined_df['date'] = pd.to_datetime(combined_df['date_utc']).dt.strftime('%Y%m%d')
combined_df['date_key'] = pd.to_numeric(combined_df['date'], errors='coerce').astype('Int64')

# Team -> conference lookup built from three season files, oldest first.
conference_files = ["barttorvik_2024_all.csv", "barttorvik_2025_all.csv", "barttorvik_2026_all.csv"]
conferences = pd.concat([pd.read_csv(path)[["Team", "Conf"]] for path in conference_files], axis=0)
# Drop repeated header rows; .copy() so the assignment below writes to an
# independent frame instead of a filtered view.
conferences = conferences[conferences['Team'] != "Team"].copy()
# Keep only the leading name part (letters, whitespace, '.', '&'); anything
# from the first other character on is discarded.
conferences['Team'] = conferences['Team'].str.extract(r'^([A-Za-z\s.&]+)')[0].str.strip()
# keep="last": files are concatenated oldest -> newest, so the most recent
# season's conference wins for teams that appear in several files.
conferences = conferences.drop_duplicates(subset="Team", keep="last")

# --- normalize team names (strip seeds/suffixes) ---
name_pat = r'^([A-Za-z\s.&\'-]+)'
def clean_team(s):
    """Return the leading name part of a team label with whitespace normalized.

    The label is cut at the first character that is not a letter, whitespace,
    '.', '&', an apostrophe or '-', then runs of whitespace are collapsed to a
    single space and the ends are stripped.  Missing values (anything
    `pd.isna` accepts) are returned unchanged.  If the label does not start
    with an allowed character, the whole string is kept.
    """
    # Imported here because this cell never imports `re` at the top; without
    # it a fresh top-to-bottom run fails with NameError.
    import re

    if pd.isna(s):
        return s
    m = re.match(name_pat, str(s))
    base = m.group(1) if m else str(s)
    return re.sub(r'\s+', ' ', base).strip()

# Normalized join keys on both sides of the conference lookup.
combined_df['home_key'] = combined_df['home'].map(clean_team)
combined_df['away_key'] = combined_df['away'].map(clean_team)
conferences['team_key'] = conferences['Team'].map(clean_team)

# One row per team_key so the many_to_one validation below can hold.
right = conferences.drop_duplicates(['team_key']).copy()

# --- Build HOME version of the right table ---
# Every column except the join key gets a `_home` suffix (e.g. Conf -> Conf_home).
home_cols = [c for c in right.columns if c not in ['date_key', 'team_key']]
torvik_home = right.rename(columns={'team_key': 'home_key', **{c: f'{c}_home' for c in home_cols}})

# --- Merge HOME ---
# Left merge keeps every game; games whose home team has no match get NaN.
combined_df = combined_df.merge(
    torvik_home,
    on='home_key',
    how='left',
    validate='many_to_one'
)

# --- Build AWAY version of the right table ---
# Same renaming with an `_away` suffix.
away_cols = [c for c in right.columns if c not in ['date_key', 'team_key']]
torvik_away = right.rename(columns={'team_key': 'away_key', **{c: f'{c}_away' for c in away_cols}})

# --- Merge AWAY ---
combined_df = combined_df.merge(
    torvik_away,
    on='away_key',
    how='left',
    validate='many_to_one'
)

# NOTE: `season` is set here but is not part of the column selection below,
# so it does not reach the exported file.
combined_df['season'] = 2026
# Encode the neutral-site flag as 1/0; anything that is not True becomes 0.
combined_df['neutral_site'] = np.where(combined_df['neutral_site'] == True, 1, 0)

# Keep the export columns, then lower-case the two conference column names.
combined_df = combined_df[["game_id", "date", "date_key", "date_utc", "time_utc", "neutral_site", "home", "away", "Conf_home", "Conf_away"]]
combined_df.columns = ["game_id", "date", "date_key", "date_utc", "time_utc", "neutral_site", "home", "away", "conf_home", "conf_away"]

# Game ids listed for DATE, stacked from every CSV in that day's folder.
csv_files = glob.glob(f"daily-box-score-ids/{DATE}/*.csv")
game_id_df = pd.concat((pd.read_csv(f) for f in csv_files), ignore_index=True)

# Export only the listed games. to_csv is called without index=False, so the
# frame's index is written as an extra leading column.
game_ids = list(game_id_df['game_id'])
combined_df[combined_df['game_id'].isin(game_ids)].to_csv("daily-games/daily.csv")

### SEASON GAME INFORMATION AND TEAM STATS

In [34]:
import glob
import pandas as pd

# Stack every per-game info CSV of the 2026 season into one frame.
csv_files = glob.glob("data/boxscores/game-info-2026/*.csv")
combined_df = pd.concat((pd.read_csv(f) for f in csv_files), ignore_index=True)

# Lookup from the `espn` spelling used in the game-info files to the `team` name.
team_map = pd.read_csv("data/teams/map.csv")[["team", "espn"]]

# Two left merges against the same lookup: `team_x` is the home match,
# `team_y` the away match.
combined_df = (
    combined_df
    .merge(team_map, left_on="home_team", right_on="espn", how="left")
    .merge(team_map, left_on="away_team", right_on="espn", how="left")
)
combined_df['home'] = combined_df['team_x']
combined_df['away'] = combined_df['team_y']

# Drop games with an unmapped team or no first-half home score.
combined_df = combined_df.dropna(subset=["home", "away", "home_1h"])
combined_df = combined_df[['game_id', 'date_utc', 'time_utc', 'neutral_site', 'home',
       'away', 'home_1h', 'away_1h', 'home_2h', 'away_2h', 'home_score',
       'away_score']]

# Per-team box-score statistics, stacked from every CSV.
csv_files = glob.glob("data/boxscores/team-stats-2026/*.csv")
team_combined_df = pd.concat((pd.read_csv(f) for f in csv_files), ignore_index=True)

# The original called sort_values() without keeping the result, which left the
# frame unsorted; assign it so the games really are in date order.
combined_df = combined_df.sort_values("date_utc", kind="stable")

# Order the per-team stat rows by their displayOrder value.
df = team_combined_df.sort_values(['displayOrder'])

# Pair each game's home row with its away row: split the rows by side,
# suffix the columns, then join the two halves on game_id.
# Split into home and away
home_df = df[df['homeAway'] == 'home'].copy()
away_df = df[df['homeAway'] == 'away'].copy()

# Columns dropped before the join; they are not carried into the wide table.
cols_to_remove = ['homeAway', 'displayOrder', 'abbreviation', 'team_id']

# Rename columns to indicate home/away (this also turns game_id into
# game_id_home / game_id_away).
home_df = home_df.drop(columns=cols_to_remove).add_suffix('_home')
away_df = away_df.drop(columns=cols_to_remove).add_suffix('_away')

# Merge back together on the shared game_id.
# Inner join: a game needs both a home row and an away row to survive.
# The column names are already disjoint after add_suffix, so the empty
# merge suffixes never have to be applied.
final_df = pd.merge(
    home_df,
    away_df,
    left_on='game_id_home',
    right_on='game_id_away',
    suffixes=('', ''),
    how='inner'
)

# Keep just one copy of game_id
final_df['game_id'] = final_df['game_id_home']
final_df = final_df.drop(columns=['game_id_home', 'game_id_away'])

# Optional: reorder columns to have game_id first
cols = ['game_id'] + [c for c in final_df.columns if c != 'game_id']
final_df = final_df[cols]

# Box-score statistics kept for the home side, in output order.
home_stat_cols = [
    'assists_home', 'defensiveRebounds_home', 'freeThrowPct_home',
    'threePointFieldGoalsMade-threePointFieldGoalsAttempted_home',
    'fouls_home', 'totalRebounds_home', 'threePointFieldGoalPct_home',
    'teamTurnovers_home', 'pointsInPaint_home', 'technicalFouls_home',
    'totalTechnicalFouls_home', 'largestLead_home',
    'offensiveRebounds_home', 'fieldGoalPct_home',
    'totalTurnovers_home', 'turnoverPoints_home', 'flagrantFouls_home',
    'freeThrowsMade-freeThrowsAttempted_home', 'steals_home',
    'fieldGoalsMade-fieldGoalsAttempted_home', 'blocks_home',
    'fastBreakPoints_home', 'turnovers_home',
]

# The same statistics for the away side.
away_stat_cols = [
    'assists_away', 'defensiveRebounds_away', 'freeThrowPct_away',
    'threePointFieldGoalsMade-threePointFieldGoalsAttempted_away',
    'fouls_away', 'totalRebounds_away', 'threePointFieldGoalPct_away',
    'teamTurnovers_away', 'pointsInPaint_away', 'technicalFouls_away',
    'totalTechnicalFouls_away', 'largestLead_away',
    'offensiveRebounds_away', 'fieldGoalPct_away',
    'totalTurnovers_away', 'turnoverPoints_away', 'flagrantFouls_away',
    'freeThrowsMade-freeThrowsAttempted_away', 'steals_away',
    'fieldGoalsMade-fieldGoalsAttempted_away', 'blocks_away',
    'fastBreakPoints_away', 'turnovers_away',
]

# game_id first, then the home block, then the away block.
final_df = final_df[['game_id'] + home_stat_cols + away_stat_cols]

# Attach the wide team stats to the game rows (default inner join on game_id).
combined_df = combined_df.merge(final_df, on="game_id")

# Stack every officials CSV; the code below reads `game_id` and `official_name`.
csv_files = glob.glob("data/boxscores/officials-2026/*.csv")
officials_df = pd.concat((pd.read_csv(f) for f in csv_files), ignore_index=True)

# Number each game's officials 1..n in the order their rows appear.
officials_df['official_number'] = officials_df.groupby('game_id').cumcount() + 1

# Pivot to wide format: one row per game, one official_<n> column per slot.
flat_officials = (
    officials_df.pivot(index='game_id', columns='official_number', values='official_name')
    .rename(columns=lambda x: f'official_{x}')
    .reset_index()
)

# reindex instead of [[...]] selection: if no game in the data has a second or
# third official the column is created as NaN rather than raising KeyError.
# Any official beyond the third is dropped, as before.
flat_officials = flat_officials.reindex(
    columns=["game_id", "official_1", "official_2", "official_3"]
)

# Default inner join: games without any officials row are dropped.
combined_df = combined_df.merge(flat_officials, on="game_id")

In [21]:
import pandas as pd
import numpy as np
import glob
import warnings

# NOTE: this silences every warning raised after this point, not only pandas ones.
warnings.filterwarnings("ignore")

# Lookup from the `espn` spelling to the `team` name.
team_map = pd.read_csv("data/teams/map.csv")[["team", "espn"]]

# Season game table; the first CSV column is used as the index.
game_df = pd.read_csv("data/train/game-info-2026.csv", index_col=0)
game_df['date'] = pd.to_datetime(game_df['date_utc']).dt.strftime('%Y%m%d')
game_df['home_margin'] = game_df['home_score'] - game_df['away_score']
game_df['away_margin'] = game_df['away_score'] - game_df['home_score']

# Daily rating snapshots, stacked from one CSV per day.
csv_files_2026 = glob.glob("daily_csvs_2026/*.csv")
daily_torvik_2026_df = pd.concat((pd.read_csv(f) for f in csv_files_2026), ignore_index=True)
# Drop repeated header rows; .copy() so the column assignments below write to
# an independent frame instead of a filtered view.
daily_torvik_2026_df = daily_torvik_2026_df[daily_torvik_2026_df['Team'] != "Team"].copy()
# Keep only the leading name part (letters, whitespace, '.', '&').
daily_torvik_2026_df['Team'] = daily_torvik_2026_df['Team'].str.extract(r'^([A-Za-z\s.&]+)')[0].str.strip()
# WAB arrives as text with an explicit '+' sign on some values; strip it and parse.
daily_torvik_2026_df['WAB'] = daily_torvik_2026_df['WAB'].str.replace("+","", regex=False).astype("float")
daily_torvik_2026_df['season'] = 2026
daily_torvik_2026_df = daily_torvik_2026_df[['season','Date', 'Team', 'Rk', 'Conf', 'G', 'AdjOE', 'AdjDE', 'Barthag',
       'EFG%', 'EFGD%', 'TOR', 'TORD', 'ORB', 'DRB', 'FTR', 'FTRD', '2P%',
       '2P%D', '3P%', '3P%D', '3PR', '3PRD', 'Adj T.', 'WAB']].sort_values(["Date","Team"], ascending=True)
# Positional rename to snake_case; the order must match the selection above.
daily_torvik_2026_df.columns = ['season', 'date', 'team', 'rank', 'conf', 'games', 'adj_off_eff', 'adj_def_eff', 'barthag',
       'efg_pct', 'efgd_pct', 'tor', 'tord', 'orb', 'drb', 'ftr', 'ftrd', 'two_pt_pct',
       'two_pt_def_pct', 'three_pt_pct', 'three_pt_def_pct', 'three_pt_rt', 'three_pt_def_rt', 'adj_tempo', 'wab']

# Every rated team must exist in the team map; on failure, name the offenders
# instead of raising a bare AssertionError.
unmapped_teams = set(daily_torvik_2026_df['team']) - set(team_map['team'])
assert not unmapped_teams, (
    f"Teams missing from data/teams/map.csv: {sorted(map(str, unmapped_teams))}"
)

# Alias only (same object): columns added to one name appear on the other.
daily_torvik_df = daily_torvik_2026_df


import re
# Nullable-integer YYYYMMDD join key on both frames; values that cannot be
# parsed as a number become <NA>.
for _frame in (game_df, daily_torvik_df):
    _frame['date_key'] = pd.to_numeric(_frame['date'], errors='coerce').astype('Int64')

# --- normalize team names (strip seeds/suffixes) ---
name_pat = r'^([A-Za-z\s.&\'-]+)'
def clean_team(s):
    """Return the leading name part of a team label with whitespace normalized.

    Missing values are passed through untouched.  Otherwise the label is cut
    at the first character outside letters / whitespace / '.', '&', "'", '-'
    (the whole string is kept when nothing matches), whitespace runs are
    collapsed to one space and the ends are stripped.
    """
    if pd.isna(s):
        return s

    text = str(s)
    found = re.match(name_pat, text)
    if found:
        text = found.group(1)

    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

# Normalized team keys on both sides of the join.
game_df['home_key'] = game_df['home'].map(clean_team)
game_df['away_key'] = game_df['away'].map(clean_team)
daily_torvik_df['team_key'] = daily_torvik_df['team'].map(clean_team)

# One rating row per (date_key, team_key) so the many_to_one validation can hold;
# the first occurrence wins.
right = daily_torvik_df.drop_duplicates(['date_key', 'team_key']).copy()

# --- Build HOME version of the right table ---
# Every column except the two join keys gets a `_home` suffix.
home_cols = [c for c in right.columns if c not in ['date_key', 'team_key']]
torvik_home = right.rename(columns={'team_key': 'home_key', **{c: f'{c}_home' for c in home_cols}})

# --- Merge HOME ---
# Left merge on date + team: a game keeps its row even when no rating row
# exists for that exact date_key (the rating columns are then NaN).
merged_df = game_df.merge(
    torvik_home,
    on=['date_key', 'home_key'],
    how='left',
    validate='many_to_one'
)

# --- Build AWAY version of the right table ---
# Same renaming with an `_away` suffix.
away_cols = [c for c in right.columns if c not in ['date_key', 'team_key']]
torvik_away = right.rename(columns={'team_key': 'away_key', **{c: f'{c}_away' for c in away_cols}})

# --- Merge AWAY ---
merged_df = merged_df.merge(
    torvik_away,
    on=['date_key', 'away_key'],
    how='left',
    validate='many_to_one'
)

merged_df['season'] = 2026
# Encode the neutral-site flag as 1/0; anything that is not True becomes 0.
merged_df['neutral_site'] = np.where(merged_df['neutral_site'] == True, 1, 0)

# Game identity, schedule and scoring columns.
game_info_cols = [
    'game_id', 'season', 'date', 'date_utc', 'time_utc', 'neutral_site',
    'home', 'away',
    'home_1h', 'away_1h', 'home_2h', 'away_2h',
    'home_score', 'away_score', 'home_margin', 'away_margin',
]

# Box-score statistics, home side then away side.
box_home_cols = [
    'assists_home', 'fouls_home', 'technicalFouls_home', 'flagrantFouls_home',
    'totalRebounds_home', 'offensiveRebounds_home', 'defensiveRebounds_home',
    'pointsInPaint_home', 'turnovers_home', 'turnoverPoints_home',
    'steals_home', 'blocks_home', 'fastBreakPoints_home',
]
box_away_cols = [
    'assists_away', 'fouls_away', 'technicalFouls_away', 'flagrantFouls_away',
    'totalRebounds_away', 'offensiveRebounds_away', 'defensiveRebounds_away',
    'pointsInPaint_away', 'turnovers_away', 'turnoverPoints_away',
    'steals_away', 'blocks_away', 'fastBreakPoints_away',
]

# Officials assigned to the game.
official_name_cols = ['official_1', 'official_2', 'official_3']

# Rating columns merged in above, home side then away side.
rating_home_cols = [
    'rank_home', 'conf_home', 'games_home',
    'adj_off_eff_home', 'adj_def_eff_home', 'barthag_home',
    'efg_pct_home', 'efgd_pct_home', 'tor_home', 'tord_home',
    'orb_home', 'drb_home', 'ftr_home', 'ftrd_home',
    'two_pt_pct_home', 'two_pt_def_pct_home',
    'three_pt_pct_home', 'three_pt_def_pct_home',
    'three_pt_rt_home', 'three_pt_def_rt_home',
    'adj_tempo_home', 'wab_home',
]
rating_away_cols = [
    'rank_away', 'conf_away', 'games_away',
    'adj_off_eff_away', 'adj_def_eff_away', 'barthag_away',
    'efg_pct_away', 'efgd_pct_away', 'tor_away', 'tord_away',
    'orb_away', 'drb_away', 'ftr_away', 'ftrd_away',
    'two_pt_pct_away', 'two_pt_def_pct_away',
    'three_pt_pct_away', 'three_pt_def_pct_away',
    'three_pt_rt_away', 'three_pt_def_rt_away',
    'adj_tempo_away', 'wab_away',
]

# Final column order of the merged table; a missing column raises KeyError.
merged_df = merged_df[
    game_info_cols
    + box_home_cols
    + box_away_cols
    + official_name_cols
    + rating_home_cols
    + rating_away_cols
]

### FEATURE SET 1 USING DAILY.CSV

In [39]:
# inference_time_officials_conference.py
from __future__ import annotations
import json
from typing import Dict, Optional, Iterable, Tuple
import numpy as np
import pandas as pd

# ---- Configuration ----
# Columns holding the officials' names; the officials transform below emits
# one `<column>_code` feature per entry.
OFFICIAL_COLS = ['official_1', 'official_2', 'official_3']
# Timezone the UTC tip-off is converted to for the local ("et_*") features.
LOCAL_TZ = 'America/New_York'  # Eastern time

# ---------- Helpers ----------
def _ensure_date_key_str(s: pd.Series) -> pd.Series:
    """Return the first run of 8 digits in each value as a string (NaN if none).

    Values are stringified first, so integer or float YYYYMMDD inputs work too.
    """
    return s.astype(str).str.extract(r'(\d{8})')[0]

def _build_tipoff_utc(df: pd.DataFrame) -> pd.Series:
    """
    Build timezone-aware UTC tip-off timestamps from a YYYYMMDD date and a UTC time.

    The date is read from 'date_key' when that column exists and from 'date'
    otherwise; the time comes from 'time_utc' ('HH:MMZ' or 'HH:MM').

    Returns a datetime64[ns, UTC] Series aligned to `df.index`; rows whose
    date or time cannot be parsed are NaT.

    Raises KeyError when neither date column, or 'time_utc', is present.
    """
    # The original guarded on 'date_utc', reported 'date_key' and then read
    # 'date'; use the column the error message and docstring actually name.
    date_col = next((c for c in ('date_key', 'date') if c in df.columns), None)
    if date_col is None:
        raise KeyError("Expected 'date_key' (YYYYMMDD).")
    if 'time_utc' not in df.columns:
        raise KeyError("Expected 'time_utc' like '21:00Z' or '21:00'.")

    # Invalid dates are coerced to NaT and stay missing through the string step.
    date_key = _ensure_date_key_str(df[date_col])
    iso_date = pd.to_datetime(date_key, format='%Y%m%d', errors='coerce').dt.strftime('%Y-%m-%d')

    # Ensure a trailing 'Z'; Series.where keeps the index, unlike np.where.
    t = df['time_utc'].astype(str).str.strip()
    t = t.where(t.str.endswith('Z'), t + 'Z')

    # infer_datetime_format is deprecated in pandas 2.x and no longer needed.
    return pd.to_datetime(iso_date + ' ' + t, utc=True, errors='coerce')

def _time_features_from_dt(dt: pd.Series, prefix: str) -> pd.DataFrame:
    """
    From a datetime series, produce a frame (same index) with:
      - {prefix}_hour, {prefix}_minute, {prefix}_second  (int16)
      - {prefix}_seconds_since_midnight                   (int32)
      - {prefix}_hour_sin, {prefix}_hour_cos (hour on a 24h circle)

    Missing timestamps (NaT) are treated as 00:00:00.
    """
    out = pd.DataFrame(index=dt.index)
    hour = dt.dt.hour.fillna(0).astype('int16')
    minute = dt.dt.minute.fillna(0).astype('int16')
    second = dt.dt.second.fillna(0).astype('int16')
    out[f'{prefix}_hour'] = hour
    out[f'{prefix}_minute'] = minute
    out[f'{prefix}_second'] = second

    # Widen BEFORE multiplying: in int16 arithmetic hour * 3600 wraps around
    # for any hour >= 10 (max int16 is 32767), which corrupted this feature.
    # NOTE(review): if training features were built with the old int16
    # arithmetic, regenerate them so training and inference agree.
    out[f'{prefix}_seconds_since_midnight'] = (
        hour.astype('int32') * 3600 + minute.astype('int32') * 60 + second.astype('int32')
    ).astype('int32')

    # Cyclical encoding so 23:00 and 00:00 end up close to each other.
    two_pi = 2 * np.pi
    out[f'{prefix}_hour_sin'] = np.sin(two_pi * out[f'{prefix}_hour'] / 24.0)
    out[f'{prefix}_hour_cos'] = np.cos(two_pi * out[f'{prefix}_hour'] / 24.0)
    return out

def _add_day_flags(local_dt: pd.Series, base_df: pd.DataFrame, prefix: str) -> pd.DataFrame:
    """
    Return a copy of `base_df` with day/time flags derived from LOCAL time:
      - {prefix}_is_weekend (Sat/Sun)
      - {prefix}_is_primetime (18:00–22:59)
      - {prefix}_daypart_* one-hots: morning(5–11), afternoon(12–16), evening(17–21), late(other)

    All four daypart columns are always emitted (in alphabetical order), even
    when a daypart does not occur in the data, so downstream column selections
    do not fail on small slates.  Missing timestamps (NaT) count as hour 0,
    i.e. 'late', and as not-weekend.
    """
    out = base_df.copy()
    dow = local_dt.dt.dayofweek  # Mon=0..Sun=6
    out[f'{prefix}_is_weekend'] = dow.isin([5, 6]).fillna(False).astype('int8')

    hour = local_dt.dt.hour.fillna(0).astype(int)
    out[f'{prefix}_is_primetime'] = ((hour >= 18) & (hour <= 22)).astype('int8')

    def _daypart(h):
        if 5 <= h <= 11:  return 'morning'
        if 12 <= h <= 16: return 'afternoon'
        if 17 <= h <= 21: return 'evening'
        return 'late'

    # A fixed category list makes get_dummies produce every daypart column;
    # plain astype('category') only knows the values present in this batch.
    daypart_dtype = pd.CategoricalDtype(categories=['afternoon', 'evening', 'late', 'morning'])
    dp = hour.map(_daypart).astype(daypart_dtype)
    dummies = pd.get_dummies(dp, prefix=f'{prefix}_daypart', dtype='int8')
    out = pd.concat([out, dummies], axis=1)
    return out

# ---------- Transform (apply saved maps) ----------
def transform_officials_with_map(df: pd.DataFrame, mapping: Dict[str, int],
                                 official_cols: Iterable[str] = OFFICIAL_COLS) -> pd.DataFrame:
    """Return a copy of `df` with a `<col>_code` column for every official column.

    Names are looked up in `mapping`; names that are missing or not in the
    mapping receive the code stored under 'UNK' (0 when there is no such key).
    A column absent from `df` still gets its `<col>_code`, filled entirely
    with the unknown code.
    """
    result = df.copy()
    unknown_code = mapping.get('UNK', 0)

    for col in official_cols:
        code_col = f'{col}_code'
        if col not in result.columns:
            result[code_col] = unknown_code
            continue
        names = result[col].astype('string')
        codes = names.map(mapping).fillna(unknown_code)
        result[code_col] = codes.astype('int32')

    return result

def transform_with_map(series: pd.Series, mapping: Dict[str, int], fill_value: str = 'UNK') -> pd.Series:
    """Encode `series` with a prefit mapping and return int32 codes.

    Missing values are replaced by `fill_value` before the lookup; labels not
    found in `mapping` receive the code of `fill_value` (0 if it has none).
    """
    fallback_code = mapping.get(fill_value, 0)
    labels = series.astype('string').fillna(fill_value)
    codes = labels.map(mapping).fillna(fallback_code)
    return codes.astype('int32')

# ---------- Public Inference Entry ----------
def load_enc_maps(path: str) -> Dict[str, Dict[str, int]]:
    """Load the prefit encoder maps from the JSON file at `path`.

    The file is read as UTF-8 explicitly, so names with non-ASCII characters
    decode the same way on every platform instead of depending on the locale
    default encoding.

    Raises whatever `open` / `json.load` raise for a missing or malformed file.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def build_time_officials_conference_features_inference(
    df: pd.DataFrame,
    enc_maps: Dict[str, Dict[str, int]],
    *,
    add_et_features: bool = True,
    make_conference_dummies: bool = False
) -> Tuple[pd.DataFrame, Dict[str, Dict[str, int]]]:
    """
    Build inference-time features on a copy of `df` using prefit encoders only.

    Steps, in order:
      - make sure a 'date_key' column exists (derived from 'date' if needed),
      - add 'tipoff_utc' and the 'utc_*' time features,
      - optionally add the 'et_*' time features and day flags,
      - encode the officials columns with enc_maps['official_map'],
      - encode 'conf_home' / 'conf_away' (when present) with their maps and,
        if requested, append one-hot columns for them.

    No encoder is refit.  Returns (features_df, enc_maps); `enc_maps` is the
    object passed in.  Raises KeyError when neither 'date' nor 'date_key' exists.
    """
    features = df.copy()

    # Time parsing needs a date_key column; derive it when only 'date' is given.
    if 'date_key' not in features.columns:
        if 'date' not in features.columns:
            raise KeyError("Expected 'date' or 'date_key' in inference dataframe.")
        features['date_key'] = _ensure_date_key_str(features['date'])

    # UTC tip-off and its clock features.
    features['tipoff_utc'] = _build_tipoff_utc(features)
    utc_block = _time_features_from_dt(features['tipoff_utc'], prefix='utc')
    features = pd.concat([features, utc_block], axis=1)

    # Same clock features in local time, plus weekend/primetime/daypart flags.
    if add_et_features:
        tipoff_local = features['tipoff_utc'].dt.tz_convert(LOCAL_TZ)
        local_block = _time_features_from_dt(tipoff_local, prefix='et')
        features = pd.concat([features, local_block], axis=1)
        features = _add_day_flags(tipoff_local, features, prefix='et')

    # All officials columns share one mapping.
    shared_official_map = enc_maps.get('official_map', {'UNK': 0})
    features = transform_officials_with_map(features, shared_official_map, OFFICIAL_COLS)

    # Each conference column has its own mapping; home is handled before away.
    conference_specs = (
        ('conf_home', 'conf_home_map', 'conf_home_code'),
        ('conf_away', 'conf_away_map', 'conf_away_code'),
    )
    for conf_col, map_key, code_col in conference_specs:
        if conf_col not in features.columns:
            continue
        conf_map = enc_maps.get(map_key, {'UNK': 0})
        features[code_col] = transform_with_map(features[conf_col], conf_map)
        if make_conference_dummies:
            labels = features[conf_col].astype('string').fillna('UNK')
            one_hot = pd.get_dummies(labels, prefix=conf_col, dtype='int8')
            features = pd.concat([features, one_hot], axis=1)

    return features, enc_maps

# ---------- Optional: align to training feature set ----------
def align_to_training_features(df_features: pd.DataFrame, train_feature_cols: list) -> pd.DataFrame:
    """
    Reindex to the exact training feature set:
      - add any missing columns (filled with 0),
      - drop any extra columns,
      - keep the same ordering as training.

    Columns that are not numeric are converted with `pd.to_numeric`; values
    that cannot be converted become 0.  Numeric columns are left untouched
    (existing NaN values are NOT filled).  Returns a new frame.
    """
    X = df_features.reindex(columns=train_feature_cols, fill_value=0)
    for c in X.columns:
        # is_numeric_dtype understands pandas extension dtypes (Int64, string,
        # ...); np.issubdtype raises TypeError on them.
        if not pd.api.types.is_numeric_dtype(X[c]):
            X[c] = pd.to_numeric(X[c], errors='coerce').fillna(0)
    return X

# Prefit encoder maps (officials / conferences).
enc_maps = load_enc_maps("data/train/enc_maps.json")

# Build the time, officials and conference features for the daily slate.
daily_games = pd.read_csv("daily-games/daily.csv")
features_1, _ = build_time_officials_conference_features_inference(
    daily_games,
    enc_maps,
    add_et_features=True,
    make_conference_dummies=True,
)

# Columns kept for feature set 1; every one must exist or the selection
# raises KeyError.
feature_set_1_cols = [
    'game_id',
    'utc_seconds_since_midnight', 'utc_hour_sin', 'utc_hour_cos',
    'et_hour', 'et_minute', 'et_second', 'et_seconds_since_midnight',
    'et_hour_sin', 'et_hour_cos',
    'et_is_weekend', 'et_is_primetime',
    'et_daypart_afternoon', 'et_daypart_evening', 'et_daypart_late',
    'et_daypart_morning',
    'official_1_code', 'official_2_code', 'official_3_code',
    'conf_home_code', 'conf_away_code',
]
features_1 = features_1[feature_set_1_cols]

### FEATURE SET 2/3 USING 2026 GAME INFORMATION AND TORVIK RATINGS

In [None]:
# --- working frame `df`: game columns plus the home/away rating columns ---
fs_game_cols = [
    'game_id', 'season', 'date', 'date_utc', 'time_utc',
    'neutral_site', 'home', 'away',
    'home_1h', 'away_1h', 'home_2h', 'away_2h',
    'home_score', 'away_score', 'home_margin', 'away_margin',
]
fs_home_rating_cols = [
    'rank_home', 'games_home',
    'adj_off_eff_home', 'adj_def_eff_home', 'barthag_home',
    'efg_pct_home', 'efgd_pct_home', 'tor_home', 'tord_home',
    'orb_home', 'drb_home', 'ftr_home', 'ftrd_home',
    'two_pt_pct_home', 'two_pt_def_pct_home',
    'three_pt_pct_home', 'three_pt_def_pct_home',
    'three_pt_rt_home', 'three_pt_def_rt_home',
    'adj_tempo_home', 'wab_home',
]
fs_away_rating_cols = [
    'rank_away', 'games_away',
    'adj_off_eff_away', 'adj_def_eff_away', 'barthag_away',
    'efg_pct_away', 'efgd_pct_away', 'tor_away', 'tord_away',
    'orb_away', 'drb_away', 'ftr_away', 'ftrd_away',
    'two_pt_pct_away', 'two_pt_def_pct_away',
    'three_pt_pct_away', 'three_pt_def_pct_away',
    'three_pt_rt_away', 'three_pt_def_rt_away',
    'adj_tempo_away', 'wab_away',
]

# Independent copy so the columns added below never touch merged_df.
df = merged_df[fs_game_cols + fs_home_rating_cols + fs_away_rating_cols].copy()

# --- 1. Create a datetime for chronological sorting ---
# Preference order: date_utc + time_utc, then date_utc alone, then date.
# Unparseable values become NaT in every branch.
if 'date_utc' not in df.columns:
    df['game_dt'] = pd.to_datetime(df['date'], errors='coerce')
elif 'time_utc' not in df.columns:
    df['game_dt'] = pd.to_datetime(df['date_utc'], errors='coerce')
else:
    date_text = df['date_utc'].astype(str).str.strip()
    # A missing time is treated as midnight.
    time_text = df['time_utc'].fillna('00:00:00').astype(str).str.strip()
    df['game_dt'] = pd.to_datetime(date_text + ' ' + time_text, errors='coerce')

# --- 2. Identify feature bases (shared between home/away) ---
# A "base" is a column name with its trailing _home/_away removed.
suffix_cols = [c for c in df.columns if c.endswith('_home') or c.endswith('_away')]
bases = sorted({c.rsplit('_', 1)[0] for c in suffix_cols})

# --- 3. Map column names for home/away to team/opponent versions ---
# home_* dicts are used for the home team's rows, away_* for the away team's:
# the row's own side maps to the bare base name, the other side to opp_<base>.
home_to_team = {f'{b}_home': b for b in bases}
away_to_team = {f'{b}_away': b for b in bases}
home_to_opp  = {f'{b}_away': f'opp_{b}' for b in bases}
away_to_opp  = {f'{b}_home': f'opp_{b}' for b in bases}

# --- 4. Create long-format team-game table (home + away) ---
# Only the id columns that actually exist in df are carried along.
id_cols = ['game_id', 'season', 'game_dt', 'neutral_site']
id_cols = [c for c in id_cols if c in df.columns]

# home perspective: home becomes `team`, away becomes `opponent`
home_view = df[id_cols + ['home', 'away'] + suffix_cols].copy()
home_view = home_view.rename(columns={'home': 'team', 'away': 'opponent'})
home_view = home_view.rename(columns={**home_to_team, **home_to_opp})
home_view['is_home'] = True

# away perspective: the same games with the roles swapped
away_view = df[id_cols + ['home', 'away'] + suffix_cols].copy()
away_view = away_view.rename(columns={'away': 'team', 'home': 'opponent'})
away_view = away_view.rename(columns={**away_to_team, **away_to_opp})
away_view['is_home'] = False

# combine both: every game now contributes two rows, one per team
team_games = pd.concat([home_view, away_view], ignore_index=True, sort=False)

# --- 5. Sort games chronologically per team/season ---
# game_id is the last sort key, so rows with equal game_dt get a fixed order.
team_games = team_games.sort_values(['team', 'season', 'game_dt', 'game_id'], ignore_index=True)

# Team-perspective and opponent-perspective feature columns present in team_games.
team_feature_cols = [c for c in bases if c in team_games.columns]
opp_feature_cols = [f'opp_{b}' for b in bases if f'opp_{b}' in team_games.columns]

# Store the CURRENT values (no shift) under the training-time `lag1_*` names:
# for the *next* game these are the lag-1 values.
# The original routed this through groupby(...).transform(lambda s: s), an
# identity that returns the same values row for row; a direct prefixed copy
# gives the same columns without the per-group overhead.
if team_feature_cols:
    current_team_vals = team_games[team_feature_cols].add_prefix('lag1_')
    team_games = pd.concat([team_games, current_team_vals], axis=1)

if opp_feature_cols:
    current_opp_vals = team_games[opp_feature_cols].add_prefix('lag1_')
    team_games = pd.concat([team_games, current_opp_vals], axis=1)

# --- B) CURRENT (no-shift) versions of your rolling/EWMA features from build_cbb_features ---

def _roll_no_leak(s, w):
    """Trailing mean over the last `w` rows, current row included (no shift).

    A value is produced from the first row on (min_periods=1).
    """
    trailing = s.rolling(window=w, min_periods=1)
    return trailing.mean()

def _rstd_no_leak(s, w=5):
    """Trailing standard deviation over the last `w` rows, current row included.

    Needs at least two observations, so the first row is NaN.
    """
    trailing = s.rolling(window=w, min_periods=2)
    return trailing.std()

def _ewm_no_leak(s, hl):
    """Return the exponentially weighted mean of `s` with half-life `hl`.

    The current row is included (no shift is applied). The recursive form is
    used (``adjust=False``), and ``min_periods=1`` yields a value from the
    first observation onward.
    """
    weighted = s.ewm(halflife=hl, adjust=False, min_periods=1)
    return weighted.mean()

# full_bases: the entries of `bases` that exist as columns of team_games.
full_bases = [b for b in bases if b in team_games.columns]  # reuse your 'bases' from earlier

# Rolling window sizes and EWMA half-life (per the original note, chosen to
# match the earlier build_cbb_features function).
windows = (1, 3, 5, 10)
ewm_halflife = 5

# Build the team/season grouping once and reuse it for every feature.
by_team_season = team_games.groupby(['team', 'season'])

# CURRENT versions (NO shift) under the SAME column names as training
# (ra_*, ra_allowed_*, rstd_*, ewm_*). Every new column is collected here and
# attached with a single concat at the end: inserting a few hundred columns
# one at a time fragments the frame and triggers pandas' PerformanceWarning.
# All features below read only pre-existing source columns, so deferring the
# insertion does not change any computed value.
current_feats = {}

for b in full_bases:
    # (source column, name stem): the opponent column, when present, feeds
    # the "allowed" variants of the same features.
    stat_sources = [(b, b)]
    opp_b = f'opp_{b}'
    if opp_b in team_games.columns:
        stat_sources.append((opp_b, f'allowed_{b}'))

    for src, stem in stat_sources:
        grouped = by_team_season[src]
        # w=w binds the window size at definition time.
        for w in windows:
            current_feats[f'ra_{stem}_w{w}'] = grouped.transform(lambda s, w=w: _roll_no_leak(s, w))
        current_feats[f'rstd_{stem}_w5'] = grouped.transform(lambda s: _rstd_no_leak(s, 5))
        current_feats[f'ewm_{stem}_hl{ewm_halflife}'] = grouped.transform(lambda s: _ewm_no_leak(s, ewm_halflife))

# 1H/2H points: rolling means only (no rstd/ewm variants).
for b in ('points_1h', 'points_2h'):
    if b not in team_games.columns:
        continue
    half_sources = [(b, b)]
    opp_b = f'opp_{b}'
    if opp_b in team_games.columns:
        half_sources.append((opp_b, f'allowed_{b}'))
    for src, stem in half_sources:
        grouped = by_team_season[src]
        for w in windows:
            current_feats[f'ra_{stem}_w{w}'] = grouped.transform(lambda s, w=w: _roll_no_leak(s, w))

# Margin: once split by venue (grouped additionally on is_home), once overall.
if 'team_margin' in team_games.columns:
    margin_by_venue = team_games.groupby(['team', 'season', 'is_home'])['team_margin']
    margin_overall = by_team_season['team_margin']
    for w in windows:
        current_feats[f'ra_margin_homeonly_w{w}'] = margin_by_venue.transform(lambda s, w=w: _roll_no_leak(s, w))
    for w in windows:
        current_feats[f'ra_margin_w{w}'] = margin_overall.transform(lambda s, w=w: _roll_no_leak(s, w))

# Recent scoring (points for / against / differential). Both score columns
# are required; previously only team_score was checked, so a frame without
# opp_score raised KeyError here.
if {'team_score', 'opp_score'}.issubset(team_games.columns):
    score_for = by_team_season['team_score']
    score_against = by_team_season['opp_score']
    for w in windows:
        pf = score_for.transform(lambda s, w=w: _roll_no_leak(s, w))
        pa = score_against.transform(lambda s, w=w: _roll_no_leak(s, w))
        current_feats[f'ra_points_for_w{w}'] = pf
        current_feats[f'ra_points_against_w{w}'] = pa
        current_feats[f'ra_point_diff_w{w}'] = pf - pa

# Rest days (rest_days is expected to exist already; a missing column still
# raises KeyError, as before).
rest_grouped = by_team_season['rest_days']
for w in windows:
    current_feats[f'ra_rest_days_w{w}'] = rest_grouped.transform(lambda s, w=w: _roll_no_leak(s, w))

# Attach everything at once. Columns of the same name already present in
# team_games are replaced (they move to the end of the frame).
team_games = pd.concat(
    [
        team_games.drop(columns=list(current_feats), errors='ignore'),
        pd.concat(current_feats, axis=1),
    ],
    axis=1,
)

# --- C) Opponent rank cumulative – for NEXT game you want history INCLUDING last opponent
#     So use the "inclusive" version at the last row.
if 'opp_rank' in team_games.columns:
    group_keys = ['team', 'season']
    g = team_games.groupby(group_keys, dropna=False)['opp_rank']
    cum_sum = g.cumsum()
    cnt_prev = g.cumcount()  # number of earlier rows in the same team/season

    # Inclusive mean: running total divided by games played so far.
    # NOTE(review): cumcount numbers every row, while cumsum does not add
    # NaN opp_rank values, so missing ranks would still enlarge the
    # denominator; confirm whether opp_rank can be NaN here.
    team_games['opp_rank_cummean_incl'] = cum_sum / (cnt_prev + 1)

    # Leakage-safe version (history EXCLUDING the current row). The shift is
    # done per team/season: a frame-wide shift(1) would hand each group's
    # first row the previous group's total and is only correct while rows of
    # a group are contiguous.
    cum_sum_prev = cum_sum.groupby(
        [team_games[k] for k in group_keys], dropna=False
    ).shift(1)
    # A zero count (first game of the group) becomes NaN instead of dividing by 0.
    team_games['opp_rank_cummean_pre'] = cum_sum_prev / cnt_prev.replace(0, np.nan)

# --- D) Take the most recent row for every (team, season) pair ---
# Order rows so that, within each team/season, the last row is the latest
# game (ties on game_dt are broken by game_id).
chronological = team_games.sort_values(['team', 'season', 'game_dt', 'game_id'])

# tail(1) keeps the final row of each group.
last_rows = chronological.groupby(
    ['team', 'season'], as_index=False, sort=False
).tail(1)

latest_snapshot = last_rows.reset_index(drop=True)

# --- E) Select the feature sets used downstream ---

# features_2-style current snapshot. The lag1_* columns hold current
# (unshifted) values stored under the lag1_* names.
f2_id_cols = ['game_id', 'date', 'team', 'opponent']

# Team-side lag1_* columns.
f2_team_cols = [
    'lag1_adj_def_eff','lag1_adj_off_eff','lag1_adj_tempo','lag1_barthag','lag1_drb','lag1_efg_pct',
    'lag1_efgd_pct','lag1_ftr','lag1_ftrd','lag1_games','lag1_orb','lag1_rank',
    'lag1_three_pt_def_pct','lag1_three_pt_def_rt','lag1_three_pt_pct','lag1_three_pt_rt',
    'lag1_tor','lag1_tord','lag1_two_pt_def_pct','lag1_two_pt_pct','lag1_wab',
]

# Opponent-side lag1_* columns.
f2_opp_cols = [
    'lag1_opp_adj_def_eff','lag1_opp_adj_off_eff','lag1_opp_adj_tempo','lag1_opp_barthag','lag1_opp_drb',
    'lag1_opp_efg_pct','lag1_opp_efgd_pct','lag1_opp_ftr','lag1_opp_ftrd','lag1_opp_games','lag1_opp_orb',
    'lag1_opp_rank','lag1_opp_three_pt_def_pct','lag1_opp_three_pt_def_rt','lag1_opp_three_pt_pct',
    'lag1_opp_three_pt_rt','lag1_opp_tor','lag1_opp_tord','lag1_opp_two_pt_def_pct','lag1_opp_two_pt_pct',
    'lag1_opp_wab',
]

# Cumulative opponent-rank means.
f2_cum_cols = ['opp_rank_cummean_incl','opp_rank_cummean_pre']

# Every listed column must exist in latest_snapshot; a missing one raises KeyError.
features_2_current = latest_snapshot[
    f2_id_cols + f2_team_cols + f2_opp_cols + f2_cum_cols
].copy()

# features_3-style current snapshot: rolling ra_* / rstd_* / ewm_* columns
# computed with NO shift. The list is grouped one statistic per paragraph;
# the order of names is unchanged.
keep_cols_f3 = [
    # identifiers and rest days
    'game_id','date','team','rest_days',
    'ra_rest_days_w1','ra_rest_days_w3','ra_rest_days_w5','ra_rest_days_w10',
    # assists
    'ra_assists_w1','ra_allowed_assists_w1','ra_assists_w3','ra_allowed_assists_w3',
    'ra_assists_w5','ra_allowed_assists_w5','ra_assists_w10','ra_allowed_assists_w10',
    'rstd_assists_w5','ewm_assists_hl5','rstd_allowed_assists_w5','ewm_allowed_assists_hl5',
    # blocks
    'ra_blocks_w1','ra_allowed_blocks_w1','ra_blocks_w3','ra_allowed_blocks_w3',
    'ra_blocks_w5','ra_allowed_blocks_w5','ra_blocks_w10','ra_allowed_blocks_w10',
    'rstd_blocks_w5','ewm_blocks_hl5','rstd_allowed_blocks_w5','ewm_allowed_blocks_hl5',
    # defensive rebounds
    'ra_defensiveRebounds_w1','ra_allowed_defensiveRebounds_w1',
    'ra_defensiveRebounds_w3','ra_allowed_defensiveRebounds_w3',
    'ra_defensiveRebounds_w5','ra_allowed_defensiveRebounds_w5',
    'ra_defensiveRebounds_w10','ra_allowed_defensiveRebounds_w10',
    'rstd_defensiveRebounds_w5','ewm_defensiveRebounds_hl5',
    'rstd_allowed_defensiveRebounds_w5','ewm_allowed_defensiveRebounds_hl5',
    # fast-break points
    'ra_fastBreakPoints_w1','ra_allowed_fastBreakPoints_w1',
    'ra_fastBreakPoints_w3','ra_allowed_fastBreakPoints_w3',
    'ra_fastBreakPoints_w5','ra_allowed_fastBreakPoints_w5',
    'ra_fastBreakPoints_w10','ra_allowed_fastBreakPoints_w10',
    'rstd_fastBreakPoints_w5','ewm_fastBreakPoints_hl5',
    'rstd_allowed_fastBreakPoints_w5','ewm_allowed_fastBreakPoints_hl5',
    # flagrant fouls
    'ra_flagrantFouls_w1','ra_allowed_flagrantFouls_w1','ra_flagrantFouls_w3','ra_allowed_flagrantFouls_w3',
    'ra_flagrantFouls_w5','ra_allowed_flagrantFouls_w5','ra_flagrantFouls_w10','ra_allowed_flagrantFouls_w10',
    'rstd_flagrantFouls_w5','ewm_flagrantFouls_hl5','rstd_allowed_flagrantFouls_w5','ewm_allowed_flagrantFouls_hl5',
    # fouls
    'ra_fouls_w1','ra_allowed_fouls_w1','ra_fouls_w3','ra_allowed_fouls_w3',
    'ra_fouls_w5','ra_allowed_fouls_w5','ra_fouls_w10','ra_allowed_fouls_w10',
    'rstd_fouls_w5','ewm_fouls_hl5','rstd_allowed_fouls_w5','ewm_allowed_fouls_hl5',
    # offensive rebounds
    'ra_offensiveRebounds_w1','ra_allowed_offensiveRebounds_w1',
    'ra_offensiveRebounds_w3','ra_allowed_offensiveRebounds_w3',
    'ra_offensiveRebounds_w5','ra_allowed_offensiveRebounds_w5',
    'ra_offensiveRebounds_w10','ra_allowed_offensiveRebounds_w10',
    'rstd_offensiveRebounds_w5','ewm_offensiveRebounds_hl5',
    'rstd_allowed_offensiveRebounds_w5','ewm_allowed_offensiveRebounds_hl5',
    # points in the paint
    'ra_pointsInPaint_w1','ra_allowed_pointsInPaint_w1','ra_pointsInPaint_w3','ra_allowed_pointsInPaint_w3',
    'ra_pointsInPaint_w5','ra_allowed_pointsInPaint_w5','ra_pointsInPaint_w10','ra_allowed_pointsInPaint_w10',
    'rstd_pointsInPaint_w5','ewm_pointsInPaint_hl5','rstd_allowed_pointsInPaint_w5','ewm_allowed_pointsInPaint_hl5',
    # steals
    'ra_steals_w1','ra_allowed_steals_w1','ra_steals_w3','ra_allowed_steals_w3',
    'ra_steals_w5','ra_allowed_steals_w5','ra_steals_w10','ra_allowed_steals_w10',
    'rstd_steals_w5','ewm_steals_hl5','rstd_allowed_steals_w5','ewm_allowed_steals_hl5',
    # technical fouls
    'ra_technicalFouls_w1','ra_allowed_technicalFouls_w1','ra_technicalFouls_w3','ra_allowed_technicalFouls_w3',
    'ra_technicalFouls_w5','ra_allowed_technicalFouls_w5','ra_technicalFouls_w10','ra_allowed_technicalFouls_w10',
    'rstd_technicalFouls_w5','ewm_technicalFouls_hl5','rstd_allowed_technicalFouls_w5','ewm_allowed_technicalFouls_hl5',
    # total rebounds
    'ra_totalRebounds_w1','ra_allowed_totalRebounds_w1','ra_totalRebounds_w3','ra_allowed_totalRebounds_w3',
    'ra_totalRebounds_w5','ra_allowed_totalRebounds_w5','ra_totalRebounds_w10','ra_allowed_totalRebounds_w10',
    'rstd_totalRebounds_w5','ewm_totalRebounds_hl5','rstd_allowed_totalRebounds_w5','ewm_allowed_totalRebounds_hl5',
    # turnover points
    'ra_turnoverPoints_w1','ra_allowed_turnoverPoints_w1','ra_turnoverPoints_w3','ra_allowed_turnoverPoints_w3',
    'ra_turnoverPoints_w5','ra_allowed_turnoverPoints_w5','ra_turnoverPoints_w10','ra_allowed_turnoverPoints_w10',
    'rstd_turnoverPoints_w5','ewm_turnoverPoints_hl5','rstd_allowed_turnoverPoints_w5','ewm_allowed_turnoverPoints_hl5',
    # turnovers
    'ra_turnovers_w1','ra_allowed_turnovers_w1','ra_turnovers_w3','ra_allowed_turnovers_w3',
    'ra_turnovers_w5','ra_allowed_turnovers_w5','ra_turnovers_w10','ra_allowed_turnovers_w10',
    'rstd_turnovers_w5','ewm_turnovers_hl5','rstd_allowed_turnovers_w5','ewm_allowed_turnovers_hl5',
    # first-half / second-half points
    'ra_points_1h_w1','ra_points_1h_w3','ra_points_1h_w5','ra_points_1h_w10',
    'ra_allowed_points_1h_w1','ra_allowed_points_1h_w3','ra_allowed_points_1h_w5','ra_allowed_points_1h_w10',
    'ra_points_2h_w1','ra_points_2h_w3','ra_points_2h_w5','ra_points_2h_w10',
    'ra_allowed_points_2h_w1','ra_allowed_points_2h_w3','ra_allowed_points_2h_w5','ra_allowed_points_2h_w10',
    # margin split by venue
    'ra_margin_homeonly_w1','ra_margin_homeonly_w3','ra_margin_homeonly_w5','ra_margin_homeonly_w10',
    # points for / against / differential
    'ra_points_for_w1','ra_points_against_w1','ra_point_diff_w1',
    'ra_points_for_w3','ra_points_against_w3','ra_point_diff_w3',
    'ra_points_for_w5','ra_points_against_w5','ra_point_diff_w5',
    'ra_points_for_w10','ra_points_against_w10','ra_point_diff_w10',
    # overall margin
    'ra_margin_w1','ra_margin_w3','ra_margin_w5','ra_margin_w10',
]

# Names missing from latest_snapshot are dropped silently rather than
# raising; the relative order of the remaining names is preserved.
snapshot_cols = latest_snapshot.columns
keep_cols_f3 = [name for name in keep_cols_f3 if name in snapshot_cols]
features_3_current = latest_snapshot.loc[:, keep_cols_f3].copy()

# --- F) Reduce to exactly ONE row per team (across seasons) ---
# After sorting by date (then team), keep="last" retains each team's row
# with the greatest `date` value.
features_2_current, features_3_current = (
    frame.sort_values(["date", "team"]).drop_duplicates(subset="team", keep="last")
    for frame in (features_2_current, features_3_current)
)


### INFERENCE DATAFRAME

In [None]:
def _suffix_columns(frame, suffix, keep):
    """Return a copy of `frame` with `suffix` appended to every column not in `keep`."""
    return frame.copy().rename(columns=lambda c: c if c in keep else f"{c}{suffix}")


def _join_side(games, feats, side_col, drop_cols):
    """Left-join `feats` onto `games` for one side of the matchup.

    Rows match on game_id and on games[side_col] == feats['team'].
    validate="1:1" makes pandas raise if either side has duplicate keys.
    The helper key columns in `drop_cols` are removed afterwards.
    """
    joined = games.merge(
        feats,
        left_on=["game_id", side_col],
        right_on=["game_id", "team"],
        how="left",
        validate="1:1",
    )
    return joined.drop(columns=drop_cols)


# NOTE(review): this cell reads features_1 / features_2 / features_3, which
# are not defined in the visible part of the file, while the cell above
# produces features_2_current / features_3_current. Confirm which is intended.

# Today's games plus the per-game features (features_1), matched on game_id.
game_info = pd.read_csv("daily-games/daily.csv").merge(features_1, on="game_id", how="left")

# --- features_2: attach once for the home team, once for the away team ---
f2_keys = ["game_id", "team", "opponent"]

home_merge = _suffix_columns(features_2, "_home", f2_keys)
merged_home = _join_side(game_info, home_merge, "home", ["team", "opponent"])

away_merge = _suffix_columns(features_2, "_away", f2_keys)
game_info = _join_side(merged_home, away_merge, "away", ["team", "opponent"])

# --- features_3: same pattern, keyed on game_id + team only ---
key_cols = ['game_id', 'team']

home_feats = _suffix_columns(features_3, "_home", key_cols)
out = _join_side(game_info, home_feats, "home", ["team"])

away_feats = _suffix_columns(features_3, "_away", key_cols)
game_info = _join_side(out, away_feats, "away", ["team"])