In [1]:
# Main
# Multi-Output Targets with Per-Target Ensembling
# ---------------------------------------------------------------------
# Imports
import os, re, json, sys
from pathlib import Path
from collections import defaultdict

import numpy as np
import pandas as pd
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.multioutput import MultiOutputRegressor

import joblib
import tabulate

# Optional plotting
try:
    import matplotlib.pyplot as plt
    HAS_MPL = True
except Exception:
    HAS_MPL = False

# 3rd-party regressors
import lightgbm as lgb
try:
    import xgboost as xgb
except ImportError as e:
    raise RuntimeError("xgboost is not installed. Run: pip install xgboost") from e
try:
    from catboost import CatBoostRegressor
except ImportError as e:
    raise RuntimeError("catboost is not installed. Run: pip install catboost") from e

# ---------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------
DATA_PATH   = "games_2020_to_2023_6.csv"
ARTIFACT_DIR = "./artifacts_ensemble"
os.makedirs(ARTIFACT_DIR, exist_ok=True)

TARGET_COLS = ['owners', 'players', 'copiesSold', 'revenue']
POST_RELEASE = ['wishlists', 'avgPlaytime', 'followers', 'reviews', 'reviewScore']
RANDOM_STATE = 42

# ---------------------------------------------------------------------
# Utilities
# ---------------------------------------------------------------------
def sanitize_column_names(columns):
    return [re.sub(r'[^A-Za-z0-9_]+', '_', str(c)) for c in columns]

def build_days_since_release(df):
    needed = {'release_year','release_month','release_day','extract_year','extract_month','extract_day'}
    if not needed.issubset(df.columns):
        raise ValueError(f"Missing date part columns: {needed - set(df.columns)}")
    rel = pd.to_datetime(dict(
        year=df['release_year'].astype(int),
        month=df['release_month'].astype(int),
        day=df['release_day'].astype(int)
    ), errors='coerce')
    ext = pd.to_datetime(dict(
        year=df['extract_year'].astype(int),
        month=df['extract_month'].astype(int),
        day=df['extract_day'].astype(int)
    ), errors='coerce')
    df['days_since_release'] = (ext - rel).dt.days
    return df

def evaluate_predictions(y_true, y_pred, targets):
    rows = []
    for i, t in enumerate(targets):
        rmse = float(np.sqrt(mean_squared_error(y_true[:, i], y_pred[:, i])))
        mae  = float(mean_absolute_error(y_true[:, i], y_pred[:, i]))
        r2   = float(r2_score(y_true[:, i], y_pred[:, i]))
        rows.append({"target": t, "RMSE": rmse, "MAE": mae, "R2": r2})
    return pd.DataFrame(rows)

def rmse(true, pred):
    return float(np.sqrt(np.mean((true - pred) ** 2)))

# ---------------------------------------------------------------------
# Load & Clean
# ---------------------------------------------------------------------
df = pd.read_csv(DATA_PATH)
print("Raw shape:", df.shape)

# Drop obvious non-numeric id/text if present
for c in ["steamid", "name"]:
    if c in df.columns:
        df.drop(columns=[c], inplace=True, errors="ignore")

df = build_days_since_release(df)

if "required_age" in df.columns:
    df["required_age"] = df["required_age"].astype(float)

# Convert booleans to ints
for col in df.columns:
    if df[col].dtype == bool:
        df[col] = df[col].astype(int)

df.columns = sanitize_column_names(df.columns)

# Keep rows with all targets
df = df.dropna(subset=TARGET_COLS).copy()
# Drop rows where all targets are 0
df = df[~((df[TARGET_COLS] == 0).all(axis=1))].copy()
# Drop remaining NaNs
df = df.dropna().copy()

print("Clean shape:", df.shape)
print("Columns:", list(df.columns)[:20], "...")

# ---------------------------------------------------------------------
# Train/Test Split with Leakage-Safe X
# ---------------------------------------------------------------------
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
X = df[numeric_cols].drop(
    columns=[c for c in TARGET_COLS if c in numeric_cols] +
            [c for c in POST_RELEASE if c in numeric_cols],
    errors="ignore",
)
y = df[TARGET_COLS].copy()

X = X.replace([np.inf, -np.inf], np.nan).dropna()
y = y.loc[X.index]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Log-transform targets (helps heavy tails)
y_train_log = np.log1p(y_train)
y_test_log  = np.log1p(y_test)

print(f"Train size: {X_train.shape}, Test size: {X_test.shape}")

# Save feature order for inference
features_used = X_train.columns.tolist()
joblib.dump(np.array(features_used), Path(ARTIFACT_DIR) / "features_used.pkl")

# ---------------------------------------------------------------------
# Outlier handling: Feature winsorization (train-fit, apply to both)
# ---------------------------------------------------------------------
def is_binary(col: pd.Series) -> bool:
    u = pd.unique(col.dropna())
    if len(u) <= 2:
        s = set(u.tolist())
        return s.issubset({0, 1})
    return False

def compute_caps(df: pd.DataFrame, lower_q=0.01, upper_q=0.99):
    """Return per-column (lo, hi) quantile caps for non-binary numeric columns."""
    caps = {}
    for c in df.columns:
        if np.issubdtype(df[c].dtype, np.number) and not is_binary(df[c]):
            lo = df[c].quantile(lower_q)
            hi = df[c].quantile(upper_q)
            if pd.notnull(lo) and pd.notnull(hi) and lo < hi:
                caps[c] = (float(lo), float(hi))
    return caps

def apply_caps(df: pd.DataFrame, caps: dict) -> pd.DataFrame:
    df2 = df.copy()
    for c, (lo, hi) in caps.items():
        if c in df2.columns:
            df2[c] = df2[c].clip(lower=lo, upper=hi)
    return df2

# Keep originals for reporting
_Xtr_before = X_train.copy()
_Xte_before = X_test.copy()

# Fit caps on TRAIN only
feature_caps = compute_caps(_Xtr_before, lower_q=0.01, upper_q=0.99)

# Count how many would be clipped BEFORE applying
clip_report = {}
for c, (lo, hi) in feature_caps.items():
    if c in _Xtr_before:
        tr_low  = (_Xtr_before[c] < lo).sum()
        tr_high = (_Xtr_before[c] > hi).sum()
    else:
        tr_low = tr_high = 0
    if c in _Xte_before:
        te_low  = (_Xte_before[c] < lo).sum()
        te_high = (_Xte_before[c] > hi).sum()
    else:
        te_low = te_high = 0
    clip_report[c] = (lo, hi, int(tr_low), int(tr_high), int(te_low), int(te_high))

# Apply caps
X_train = apply_caps(X_train, feature_caps)
X_test  = apply_caps(X_test,  feature_caps)

# Logically bounded safety constraint
neg_train_dsr = neg_test_dsr = 0
if 'days_since_release' in X_train.columns:
    neg_train_dsr = (X_train['days_since_release'] < 0).sum()
    neg_test_dsr  = (X_test['days_since_release']  < 0).sum()
    X_train['days_since_release'] = X_train['days_since_release'].clip(lower=0)
    X_test['days_since_release']  = X_test['days_since_release'].clip(lower=0)

# ===== Inspect outlier handling results =====
print("\n[Outlier Handling] Winsorization applied to non-binary numeric features:")
for c, (lo, hi, tr_low, tr_high, te_low, te_high) in clip_report.items():
    print(f" - {c:>24s}: cap=({lo:.6g}, {hi:.6g}), "
          f"train clipped=({tr_low}+{tr_high}), test clipped=({te_low}+{te_high})")

if 'days_since_release' in X_train.columns:
    print("\n[Outlier Handling] Safety constraint:")
    print(f" - days_since_release clipped to >= 0 "
          f"(post-clip negatives still in train={int(neg_train_dsr)}, test={int(neg_test_dsr)})")


# ---------------------------------------------------------------------
# Baseline MultiOutput Models (kept for reference/compat)
# ---------------------------------------------------------------------
# LightGBM MultiOutput
lgb_params = dict(
    random_state=RANDOM_STATE, n_jobs=-1,
    n_estimators=600, learning_rate=0.02,
    num_leaves=63, max_depth=-1, subsample=0.8, colsample_bytree=0.8,
)
lgb_base = lgb.LGBMRegressor(**lgb_params)
lgb_model = MultiOutputRegressor(lgb_base)
print("Training MultiOutput LightGBM...")
lgb_model.fit(X_train.values, y_train_log.values)

y_pred_log = lgb_model.predict(X_test.values)
y_pred     = np.expm1(y_pred_log)
y_true     = np.expm1(y_test_log.values)
lgb_metrics = evaluate_predictions(y_true, y_pred, TARGET_COLS)
print("\n[LGBM MultiOutput] Per-target metrics:\n", lgb_metrics.to_string(index=False))
print("Average R²:", lgb_metrics["R2"].mean())
joblib.dump(lgb_model, os.path.join(ARTIFACT_DIR, "lgb_model.pkl"))
joblib.dump({"feature_order": features_used, "lgb_params": lgb_params},
            os.path.join(ARTIFACT_DIR, "lgb_model_meta.pkl"))

# XGBoost MultiOutput
xgb_params = dict(
    objective="reg:squarederror", random_state=RANDOM_STATE,
    n_estimators=800, learning_rate=0.03, max_depth=6,
    subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0, reg_alpha=0.0,
    tree_method="hist", nthread=-1,
)
xgb_base = xgb.XGBRegressor(**xgb_params)
xgb_mo_model = MultiOutputRegressor(xgb_base)
print("\nTraining MultiOutput XGBoost...")
xgb_mo_model.fit(X_train.values, y_train_log.values)
xgb_pred_log = xgb_mo_model.predict(X_test.values)
xgb_pred     = np.expm1(xgb_pred_log)
xgb_true     = np.expm1(y_test_log.values)
xgb_mo_metrics = evaluate_predictions(xgb_true, xgb_pred, TARGET_COLS)
print("\n[XGB MultiOutput] Per-target metrics:\n", xgb_mo_metrics.to_string(index=False))
print("Average R²:", xgb_mo_metrics["R2"].mean())
joblib.dump(xgb_mo_model, os.path.join(ARTIFACT_DIR, "xgb_model.pkl"))
joblib.dump({"xgb_params": xgb_params}, os.path.join(ARTIFACT_DIR, "xgb_params_base.pkl"))

# CatBoost MultiOutput
cb_params = dict(
    loss_function="RMSE", random_seed=RANDOM_STATE,
    n_estimators=1200, learning_rate=0.03, depth=8, l2_leaf_reg=3.0,
    subsample=0.8, verbose=0, thread_count=-1
)
cb_base = CatBoostRegressor(**cb_params)
cb_mo_model = MultiOutputRegressor(cb_base)
print("\nTraining MultiOutput CatBoost...")
cb_mo_model.fit(X_train.values, y_train_log.values)
cb_pred_log = cb_mo_model.predict(X_test.values)
cb_pred     = np.expm1(cb_pred_log)
cb_true     = np.expm1(y_test_log.values)
cb_mo_metrics = evaluate_predictions(cb_true, cb_pred, TARGET_COLS)
print("\n[CatBoost MultiOutput] Per-target metrics:\n", cb_mo_metrics.to_string(index=False))
print("Average R²:", cb_mo_metrics["R2"].mean())
joblib.dump(cb_mo_model, os.path.join(ARTIFACT_DIR, "cb_model.pkl"))
joblib.dump({"cb_params": cb_params}, os.path.join(ARTIFACT_DIR, "catboost_params_base.pkl"))

# ---------------------------------------------------------------------
# Per-Target Models + Weighted Ensembling
# ---------------------------------------------------------------------
def make_lgb():
    return lgb.LGBMRegressor(
        random_state=RANDOM_STATE, n_estimators=800, learning_rate=0.03,
        num_leaves=63, subsample=0.8, colsample_bytree=0.8, n_jobs=-1
    )

def make_xgb():
    return xgb.XGBRegressor(
        objective="reg:squarederror", random_state=RANDOM_STATE,
        n_estimators=900, learning_rate=0.03, max_depth=7,
        subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0, reg_alpha=0.0,
        tree_method="hist", nthread=-1
    )

def make_cb():
    return CatBoostRegressor(
        loss_function="RMSE", random_seed=RANDOM_STATE,
        n_estimators=1200, learning_rate=0.03, depth=8, l2_leaf_reg=3.0,
        subsample=0.8, rsm=0.8, verbose=0, thread_count=-1
    )

ALGORITHMS = {"lgb": make_lgb, "xgb": make_xgb, "cb": make_cb}

# Train per-target estimators and collect validation predictions
per_target_models   = {alg: {} for alg in ALGORITHMS}     # per_target_models["cb"]["revenue"] = est
per_target_valpreds = {alg: {} for alg in ALGORITHMS}     # store log-space preds on X_test for each target

print("\n[Per-Target] Training per algorithm, per target...")
for alg_name, factory in ALGORITHMS.items():
    for target in TARGET_COLS:
        est = factory()
        est.fit(X_train, y_train_log[target].values)
        per_target_models[alg_name][target] = est
        per_target_valpreds[alg_name][target] = est.predict(X_test)  # log-space

# Compute target-wise ensemble weights from inverse RMSE (normal space)
ensemble_weights = {}  # {target: {'lgb': w, 'xgb': w, 'cb': w}}
for target in TARGET_COLS:
    y_true_normal = np.expm1(y_test_log[target].values)
    rmses = {}
    for alg_name in ALGORITHMS.keys():
        yhat_normal = np.expm1(per_target_valpreds[alg_name][target])
        rmses[alg_name] = rmse(y_true_normal, yhat_normal)

    inv = {alg: (1.0 / rmses[alg]) if rmses[alg] > 0 else 0.0 for alg in rmses}
    s = sum(inv.values()) or 1.0
    ensemble_weights[target] = {alg: inv[alg] / s for alg in inv}

print("\nEnsemble weights per target:")
for t, w in ensemble_weights.items():
    print(f"  {t}: {w}")

# Evaluate ensemble on holdout
y_pred_ens = np.zeros((X_test.shape[0], len(TARGET_COLS)), dtype=float)
for t_idx, target in enumerate(TARGET_COLS):
    weights = ensemble_weights[target]
    blend = np.zeros(X_test.shape[0], dtype=float)
    for alg_name in ALGORITHMS.keys():
        yhat = np.expm1(per_target_valpreds[alg_name][target])  # normal space
        blend += weights[alg_name] * yhat
    y_pred_ens[:, t_idx] = blend

y_true_ens = np.column_stack([np.expm1(y_test_log[t].values) for t in TARGET_COLS])
ens_metrics = evaluate_predictions(y_true_ens, y_pred_ens, TARGET_COLS)
print("\n[Ensemble] Per-target metrics:\n", ens_metrics.to_string(index=False))
print("[Ensemble] Average R²:", ens_metrics["R2"].mean())

# Also evaluate each per-target algorithm as a “set” for comparison
def eval_per_target_family(per_target_models_family):
    preds = np.column_stack([
        np.expm1(per_target_models_family[t].predict(X_test)) for t in TARGET_COLS
    ])
    true  = y_true_ens
    return evaluate_predictions(true, preds, TARGET_COLS)

for alg in ALGORITHMS.keys():
    fam_metrics = eval_per_target_family(per_target_models[alg])
    print(f"\n[{alg.upper()} Per-Target] metrics:\n", fam_metrics.to_string(index=False))
    print(f"[{alg.upper()} Per-Target] Average R²:", fam_metrics["R2"].mean())

# Save artifacts
joblib.dump(per_target_models,   os.path.join(ARTIFACT_DIR, "per_target_models.pkl"))
joblib.dump(ensemble_weights,    os.path.join(ARTIFACT_DIR, "ensemble_weights.pkl"))
joblib.dump(TARGET_COLS,         os.path.join(ARTIFACT_DIR, "targets.pkl"))
joblib.dump(features_used,       os.path.join(ARTIFACT_DIR, "feature_order.pkl"))

print("\nSaved ensemble artifacts to:", ARTIFACT_DIR)

# ---------------------------------------------------------------------
# Inference helpers (for API use)
# ---------------------------------------------------------------------
def _build_input_df(user_input: dict, features_used: list) -> pd.DataFrame:
    """Build 1-row DF in the exact training feature order (zeros by default)."""
    input_data = {f: 0 for f in features_used}

    # Minimal example mapping
    # Basics
    input_data['price']          = user_input.get('price', 0)
    input_data['is_free']        = int(user_input.get('is_free', False))
    input_data['required_age']   = user_input.get('required_age', 0)
    input_data['achievements']   = user_input.get('achievements', 0)
    input_data['english']        = int(user_input.get('english', True))

    # Platforms / tags / genres
    platform_flags = ['windows', 'mac', 'linux']
    tag_flags = [
        'Single-player','Family Sharing','Steam Achievements','Steam Cloud',
        'Full controller support','Multi-player','Partial Controller Support',
        'Steam Trading Cards','PvP','Co-op','Steam Leaderboards','Remote Play Together',
        'Online PvP','Shared/Split Screen','Tracked Controller Support','VR Only',
        'Shared/Split Screen PvP','Online Co-op','Stats','Shared/Split Screen Co-op'
    ]
    genre_flags = ['Indie','Casual','Adventure','Action','Simulation',
                   'Strategy','RPG','Free To Play','Sports','Racing']

    for flag in platform_flags + tag_flags + genre_flags:
        if flag in input_data:
            input_data[flag] = int(user_input.get(flag, False))

    # Encoded publisher
    if 'publisherClass_encoded' in input_data:
        input_data['publisherClass_encoded'] = user_input.get('publisherClass_encoded', 0)

    # Days since release from provided dates
    if 'days_since_release' in input_data and 'release_date' in user_input and 'extract_date' in user_input:
        release_date = pd.to_datetime(user_input['release_date'])
        extract_date = pd.to_datetime(user_input['extract_date'])
        input_data['days_since_release'] = (extract_date - release_date).days

    return pd.DataFrame([input_data])[features_used]

def predict_game_success_single(user_input: dict, per_target_family: dict, features_used: list):
    """
    Predict using a single algorithm's per-target models dict.
    per_target_family: dict like {'owners': est, 'players': est, ...}
    """
    X1 = _build_input_df(user_input, features_used)
    out_vals = []
    for t in TARGET_COLS:
        yhat_log = float(per_target_family[t].predict(X1)[0])
        out_vals.append(float(np.expm1(yhat_log)))
    return {
        'owners': int(round(out_vals[0])),
        'players': int(round(out_vals[1])),
        'copiesSold': int(round(out_vals[2])),
        'revenue': float(out_vals[3]),
    }

def predict_game_success_ensemble(user_input: dict, per_target_models: dict, ensemble_weights: dict, features_used: list):
    """
    Weighted ensemble in NORMAL space per target using inverse-RMSE weights.
    per_target_models: {'lgb': {target: est}, 'xgb': {...}, 'cb': {...}}
    ensemble_weights: {target: {'lgb': w, 'xgb': w, 'cb': w}}
    """
    X1 = _build_input_df(user_input, features_used)
    out = {}
    for t in TARGET_COLS:
        blend = 0.0
        w = ensemble_weights[t]
        for alg_name, fam in per_target_models.items():
            est = fam[t]
            yhat_log = float(est.predict(X1)[0])
            yhat = float(np.expm1(yhat_log))
            blend += w[alg_name] * yhat
        if t == "revenue":
            out[t] = float(blend)
        else:
            out[t] = int(round(blend))
    return out

Raw shape: (39194, 56)
Clean shape: (39189, 55)
Columns: ['price', 'is_free', 'release_year', 'release_month', 'release_day', 'extract_year', 'extract_month', 'extract_day', 'publisherClass_encoded', 'required_age', 'achievements', 'english', 'windows', 'mac', 'linux', 'Single_player', 'Family_Sharing', 'Steam_Achievements', 'Steam_Cloud', 'Full_controller_support'] ...
Train size: (31351, 46), Test size: (7838, 46)

[Outlier Handling] Winsorization applied to non-binary numeric features:
 -                    price: cap=(0, 49.99), train clipped=(0+288), test clipped=(0+72)
 -             release_year: cap=(2020, 2023), train clipped=(0+0), test clipped=(0+0)
 -            release_month: cap=(1, 12), train clipped=(0+0), test clipped=(0+0)
 -              release_day: cap=(1, 31), train clipped=(0+0), test clipped=(0+0)
 -   publisherClass_encoded: cap=(0, 2), train clipped=(0+185), test clipped=(0+56)
 -             achievements: cap=(0, 100), train clipped=(0+161), test clipped=(0+4




[LGBM MultiOutput] Per-target metrics:
     target         RMSE           MAE       R2
    owners 6.308407e+05  63396.039463 0.208309
   players 5.177119e+05  46641.951505 0.159307
copiesSold 4.483062e+05  38804.663454 0.203197
   revenue 5.330994e+06 400082.532757 0.122953
Average R²: 0.17344145709763836

Training MultiOutput XGBoost...

[XGB MultiOutput] Per-target metrics:
     target         RMSE           MAE        R2
    owners 7.280718e+05  68686.819886 -0.054545
   players 6.501451e+05  51417.816363 -0.325811
copiesSold 4.855512e+05  42526.455183  0.065301
   revenue 1.319714e+07 540021.658753 -4.374846
Average R²: -1.172474886861489

Training MultiOutput CatBoost...

[CatBoost MultiOutput] Per-target metrics:
     target         RMSE           MAE       R2
    owners 6.441577e+05  65410.169095 0.174531
   players 5.298309e+05  47916.877716 0.119487
copiesSold 4.685685e+05  40185.384975 0.129543
   revenue 5.055254e+06 393965.284621 0.211335
Average R²: 0.15872391124976656

[

In [34]:
# ---------------------------------------------------------------------
# Demo on a specific game row
# ---------------------------------------------------------------------
def get_game_input_format(df, steamid=None, row_num=None):
    if steamid is not None:
        game = df[df["steamid"] == steamid]
        if game.empty:
            raise ValueError(f"No game found with steamid {steamid}")
        game = game.iloc[0]
    elif row_num is not None:
        game = df.iloc[row_num]
    else:
        raise ValueError("You must provide either a steamid or a row_num.")

    release_date = f"{int(game['release_year'])}-{int(game['release_month']):02d}-{int(game['release_day']):02d}"
    extract_date = f"{int(game['extract_year'])}-{int(game['extract_month']):02d}-{int(game['extract_day']):02d}"

    input_data = {
        "price": float(game.get("price", 0)),
        "is_free": bool(game.get("is_free", False)),
        "required_age": int(game.get("required_age", 0)),
        "achievements": int(game.get("achievements", 0)),
        "english": bool(game.get("english", True)),
        "windows": bool(game.get("windows", True)),
        "mac": bool(game.get("mac", False)),
        "linux": bool(game.get("linux", False)),
        "release_date": release_date,
        "extract_date": extract_date,
        "publisherClass_encoded": int(game.get("publisherClass_encoded", 0)),
    }

    # Add boolean flags if present
    boolean_cols = game.index[
        game.index.isin([
            "Single-player","Family Sharing","Steam Achievements","Steam Cloud",
            "Full controller support","Multi-player","Partial Controller Support",
            "Steam Trading Cards","PvP","Co-op","Steam Leaderboards","Remote Play Together",
            "Online PvP","Shared/Split Screen","Tracked Controller Support","VR Only",
            "Shared/Split Screen PvP","Online Co-op","Stats","Shared/Split Screen Co-op",
            "Indie","Casual","Adventure","Action","Simulation","Strategy","RPG",
            "Free To Play","Sports","Racing"
        ])
    ]
    for col in boolean_cols:
        input_data[col] = bool(game[col])

    # Extract targets for comparison
    players = game.get("players", np.nan)
    owners = game.get("owners", np.nan)
    copies_sold = game.get("copiesSold", np.nan)
    revenue = game.get("revenue", np.nan)

    return (input_data, players, owners, copies_sold, revenue)

# Demo (uncomment to run a quick check)
df_demo = pd.read_csv("steam_dataset.csv")
steamid = 315210
#315210
#2124490
#235520
#1302990

#340020
#290100
#24880
#333640
#282880
#251570


(
    game_dict, players, owners, copies_sold, revenue
) = get_game_input_format(df_demo, steamid=steamid)

# Single families
preds_lgb = predict_game_success_single(game_dict, per_target_models["lgb"], features_used)
preds_xgb = predict_game_success_single(game_dict, per_target_models["xgb"], features_used)
preds_cb  = predict_game_success_single(game_dict, per_target_models["cb"],  features_used)
# Ensemble
preds_ens = predict_game_success_ensemble(game_dict, per_target_models, ensemble_weights, features_used)

print("\nLGB per-target:", preds_lgb)
print("XGB per-target:", preds_xgb)
print("CB  per-target:", preds_cb)
print("Ensemble      :", preds_ens)

# Optional: compare one set to actuals
actual = {"owners": owners, "players": players, "copiesSold": copies_sold, "revenue": revenue}
headers = ["Metric", "Predicted", "Actual", "Absolute Error", "Percentage Error", "Accuracy"]
rows = []
def append_rows(preds):
    for key in actual:
        pred_val = preds[key]
        actual_val = actual[key]
        abs_error = abs(actual_val - pred_val) if pd.notna(actual_val) else np.nan
        pct_error = (abs_error / actual_val * 100) if (pd.notna(actual_val) and actual_val != 0) else np.nan
        accuracy = 100 - pct_error if pd.notna(pct_error) else np.nan
        if key == "revenue":
            row = [key, f"${pred_val:,.2f}", f"${actual_val:,.2f}", f"${abs_error:,.2f}" if pd.notna(abs_error) else "N/A",
                   f"{pct_error:.2f}%" if pd.notna(pct_error) else "N/A", f"{accuracy:.2f}%" if pd.notna(accuracy) else "N/A"]
        else:
            row = [key, f"{pred_val:,}", f"{actual_val:,}", f"{int(abs_error):,}" if pd.notna(abs_error) else "N/A",
                   f"{pct_error:.2f}%" if pd.notna(pct_error) else "N/A", f"{accuracy:.2f}%" if pd.notna(accuracy) else "N/A"]
        rows.append(row)

append_rows(preds_ens)
print("\n[Ensemble] Example vs Actuals")
print(tabulate.tabulate(rows, headers=headers, tablefmt="grid"))


  df_demo = pd.read_csv("steam_dataset.csv")



LGB per-target: {'owners': 442705, 'players': 407731, 'copiesSold': 366512, 'revenue': 12116192.385623004}
XGB per-target: {'owners': 244657, 'players': 209119, 'copiesSold': 82370, 'revenue': 3825492.177672367}
CB  per-target: {'owners': 673210, 'players': 422103, 'copiesSold': 592582, 'revenue': 9122949.247156745}
Ensemble      : {'owners': 454109, 'players': 349639, 'copiesSold': 354191, 'revenue': 8297518.15234535}

[Ensemble] Example vs Actuals
+------------+---------------+---------------+------------------+--------------------+------------+
| Metric     | Predicted     | Actual        | Absolute Error   | Percentage Error   | Accuracy   |
| owners     | 454,109       | 436,490.0     | 17,619           | 4.04%              | 95.96%     |
+------------+---------------+---------------+------------------+--------------------+------------+
| players    | 349,639       | 436,490.0     | 86,851           | 19.90%             | 80.10%     |
+------------+---------------+---------------