In [1]:
import os
import joblib
import numpy as np
import pandas as pd
from typing import List, Optional
from pandas.api.types import is_numeric_dtype, is_datetime64_any_dtype

# --------------- Config ---------------
# Slate identifier (looks like YYYYMMDD); it selects the input/output folders
# below and is also written into the "date" column of the saved predictions.
DATE = "20251109"
PRED_DATA_PATH = f"data/inference/{DATE}/inference_data.csv"  # input dataframe (one row per game)
MODEL_DIR      = "./models/cbb_models_fast"                   # where model_{target}.joblib live
OUT_CSV        = f"data/predictions/{DATE}/preds.csv"         # predictions file written by run_inference()
# The output folder is created as soon as this cell runs.
os.makedirs(os.path.dirname(OUT_CSV), exist_ok=True)

# One model file (model_{target}.joblib) is looked up per entry; each entry
# becomes a "pred_{target}" column in the output.
TARGETS = [
    "home_1h", "away_1h", "home_2h", "away_2h",
    "home_score", "away_score", "home_margin", "away_margin"
]

# Identifier / metadata columns that may be present in the input next to the
# model features.
PASSTHRU_COLS = ["game_id","game_id_y","date","date_utc","time_utc","home","away"]
# Predictions for these targets are clipped at zero; the margin targets are
# left out because they can legitimately be negative.
NONNEG_TARGETS = {"home_1h","away_1h","home_2h","away_2h","home_score","away_score"}

def choose_date_column(df: pd.DataFrame) -> Optional[str]:
    """
    Pick the column that carries the game date.

    "date" takes priority over "date_utc"; the result is None when the
    frame has neither column.
    """
    available = df.columns
    for candidate in ("date", "date_utc"):
        if candidate in available:
            return candidate
    return None

def _align_X_for_model(raw: pd.DataFrame, mdl) -> pd.DataFrame:
    """
    Build the float32 feature matrix a fitted model expects, leaving ``raw`` untouched.

    Steps:
      • read the expected feature names from ``mdl._Booster.feature_name()``,
        falling back to ``mdl.feature_name_``
      • create features missing from ``raw`` as all-NaN columns
      • convert datetime columns to seconds since 1970-01-01 UTC (NaT -> NaN)
      • return exactly those features, in model order, cast to float32

    Parameters
    ----------
    raw : pd.DataFrame
        Input rows. Never modified: the result is always a new frame.
    mdl : object
        Fitted estimator. Only ``_Booster.feature_name()`` and/or
        ``feature_name_`` are read here.

    Returns
    -------
    pd.DataFrame
        float32 frame with the expected columns. If the model exposes no
        feature names at all, every numeric column of ``raw`` is returned
        instead (as float32).

    Raises
    ------
    ValueError / TypeError
        Propagated from ``astype`` when an expected feature column holds
        values that cannot be cast to float (e.g. free text).
    """
    # Prefer the Booster's own names when available.
    feat_list = None
    booster = getattr(mdl, "_Booster", None)
    if booster is not None:
        try:
            feat_list = list(booster.feature_name())
        except Exception:
            # Best effort only: fall through to the estimator attribute below.
            feat_list = None
    if feat_list is None:
        feat_list = getattr(mdl, "feature_name_", None)
    # Normalise to a plain list so the emptiness check below is well defined
    # even for array-like attributes.
    feat_list = list(feat_list) if feat_list is not None else []

    if not feat_list:  # rare fallback: no names exposed by the model
        numeric_cols = [c for c in raw.columns if is_numeric_dtype(raw[c])]
        return raw[numeric_cols].astype(np.float32)

    # reindex returns a NEW frame holding exactly the expected columns in
    # model order; features absent from `raw` come back as all-NaN columns.
    # The caller's frame is therefore never written to.
    X = raw.reindex(columns=feat_list)

    # Convert datetime features to epoch seconds. Subtracting the epoch and
    # dividing by a one-second Timedelta is independent of the column's
    # storage unit (ns/us/ms/s) and maps NaT to NaN without extra masking.
    epoch = pd.Timestamp(0)
    one_second = pd.Timedelta(seconds=1)
    for c in feat_list:
        col = X[c]
        if is_datetime64_any_dtype(col):
            if col.dt.tz is not None:
                # Express tz-aware values as naive UTC so they can be
                # compared with the naive epoch.
                col = col.dt.tz_convert("UTC").dt.tz_localize(None)
            X[c] = (col - epoch) / one_second

    return X.astype(np.float32)

def run_inference(default_season: int = 2026):
    """
    Score every row of PRED_DATA_PATH with each per-target model and save a CSV.

    For each name in TARGETS the file ``model_{target}.joblib`` is loaded from
    MODEL_DIR and its predictions are stored as ``pred_{target}``. Targets in
    NONNEG_TARGETS are clipped at zero. A target whose model file does not
    exist gets an all-NaN column, and a single warning lists the missing files.

    Parameters
    ----------
    default_season : int
        Value used for the "season" column when the input has none
        (use the season your data belongs to; a constant is fine for
        future games).

    Returns
    -------
    pd.DataFrame
        The frame that was written to OUT_CSV: the id / date / home / away
        columns that exist in the input, the ``pred_*`` columns in sorted
        order, and a "date" column set to DATE (this replaces the parsed
        "date" column when the input had one).
    """
    import warnings  # local import so the function does not depend on another cell

    raw = pd.read_csv(PRED_DATA_PATH)

    # Parse date-like columns; _align_X_for_model turns datetime feature
    # columns into epoch seconds if a model asks for them.
    for c in ("date", "date_utc"):
        if c in raw.columns:
            raw[c] = pd.to_datetime(raw[c], errors="coerce")

    if "season" not in raw.columns:
        raw["season"] = default_season

    # Identify the passthrough columns that will lead the output.
    keep_cols = [c for c in PASSTHRU_COLS if c in raw.columns]
    id_col = next((c for c in ("game_id", "game_id_y") if c in keep_cols), None)
    date_col = choose_date_column(raw)

    base_out_cols = [c for c in (id_col, date_col) if c]
    base_out_cols += [c for c in ("home", "away") if c in raw.columns]
    out = raw[base_out_cols].copy()

    # Predictions per target.
    missing_models = []
    for tgt in TARGETS:
        path = os.path.join(MODEL_DIR, f"model_{tgt}.joblib")
        if not os.path.exists(path):
            out[f"pred_{tgt}"] = np.nan
            missing_models.append(path)
            continue

        # NOTE: joblib.load unpickles the file, so only load model files you trust.
        mdl = joblib.load(path)
        X = _align_X_for_model(raw, mdl)  # numeric float32 features in model order
        preds = mdl.predict(X)
        if tgt in NONNEG_TARGETS:
            preds = np.maximum(preds, 0.0)
        out[f"pred_{tgt}"] = preds.astype(np.float32)

    if missing_models:
        warnings.warn(
            "No model file found for: " + ", ".join(missing_models)
            + "; the matching pred_* columns are NaN.",
            stacklevel=2,
        )

    # Tidy columns: normalise the id column name, then order as
    # id / date / home / away followed by the sorted prediction columns.
    if "game_id_y" in out.columns and "game_id" not in out.columns:
        out = out.rename(columns={"game_id_y": "game_id"})

    pred_cols = sorted(c for c in out.columns if c.startswith("pred_"))
    lead_cols = [c for c in ("game_id", date_col, "home", "away") if c in out.columns]
    # assign() overwrites an existing "date" column in place, or appends it
    # as the last column when the input only had "date_utc".
    out = out[lead_cols + pred_cols].assign(date=DATE)

    out_dir = os.path.dirname(OUT_CSV)
    if out_dir:  # os.makedirs("") would raise for a bare file name
        os.makedirs(out_dir, exist_ok=True)
    out.to_csv(OUT_CSV, index=False)
    print(f"Saved predictions → {OUT_CSV}")
    return out

In [2]:
# Run the pipeline once and keep the returned frame under a name, so later
# cells can reuse the predictions without re-running inference (which would
# also rewrite the CSV). The bare last expression renders the frame.
preds_df = run_inference()
preds_df

Saved predictions → data/predictions/20251109/preds.csv


Unnamed: 0,game_id,date,home,away,pred_away_1h,pred_away_2h,pred_away_margin,pred_away_score,pred_home_1h,pred_home_2h,pred_home_margin,pred_home_score
0,401828275,20251109,Loyola Chicago,North Texas,26.666357,34.672138,-1.740406,65.047501,29.9016,36.272564,1.740406,66.316307
1,401817528,20251109,Georgia,Morehead St.,29.58209,29.952282,-24.423098,58.717934,46.566063,46.145061,24.423098,86.975853
2,401824023,20251109,Arizona St.,Utah Tech,30.509418,31.553225,-18.979048,65.01606,40.192181,44.782463,18.979048,82.679131
3,401830318,20251109,Hawaii,East Texas A&M,33.330387,30.703096,-16.692621,58.976173,40.355034,37.755444,16.692621,75.803062
4,401823514,20251109,Missouri,VMI,35.677647,38.794434,-5.826262,77.785561,37.756989,39.025505,5.826262,81.670845
5,401829052,20251109,Campbell,Western Michigan,33.493343,38.630901,-1.55599,74.609253,31.587376,40.030197,1.55599,72.070824
6,401826738,20251109,Baylor,Washington,34.669079,37.567245,-3.431311,74.991577,36.048763,40.861118,3.431311,74.031189
7,401823554,20251109,San Diego St.,Idaho St.,31.584257,31.607227,-6.110785,61.059406,34.17234,33.607368,6.110785,67.217331
8,401826738,20251109,Montana St.,Denver,32.840107,33.296062,-9.858198,64.945076,35.273357,43.164532,9.858198,79.089409
9,401820541,20251109,North Dakota,Cal St. Northridge,29.011581,29.961418,-11.439844,64.940147,31.903051,37.166569,11.439844,73.328018
