# Main

## Model

In [1]:
# %% [markdown]
# # Build the Model — Multi-Output (owners, players, copiesSold, revenue)

# %% [markdown]
# ## Import Libraries

import os
import sys
import re
import json
import numpy as np
import pandas as pd

# Matplotlib optional (avoid Python 3.13 crash)
try:
    import matplotlib.pyplot as plt
    HAS_MPL = True
except Exception as e:
    HAS_MPL = False
    print("Matplotlib unavailable; skipping plots.\n", e)

from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import lightgbm as lgb
from pathlib import Path
import joblib
import tabulate
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# ---------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------
DATA_PATH = "games_2020_to_2023_6.csv"
ARTIFACT_DIR = "./artifacts"
os.makedirs(ARTIFACT_DIR, exist_ok=True)

TARGET_COLS = ['owners', 'players', 'copiesSold', 'revenue']

# Post-release columns to EXCLUDE from features (leakage)
POST_RELEASE = ['wishlists', 'avgPlaytime', 'followers', 'reviews', 'reviewScore']

In [3]:
# ---------------------------------------------------------------------
# Utils
# ---------------------------------------------------------------------
def sanitize_column_names(columns):
    """Keep only letters, numbers, and underscores in column names."""
    sanitized = []
    for col in columns:
        clean_col = re.sub(r'[^A-Za-z0-9_]+', '_', str(col))
        sanitized.append(clean_col)
    return sanitized

def build_days_since_release(df):
    """
    Construct days_since_release from the separate date parts:
    release_year, release_month, release_day, extract_year, extract_month, extract_day
    Does NOT rely on release_date or extract_date columns.
    """
    needed = {'release_year','release_month','release_day','extract_year','extract_month','extract_day'}
    if not needed.issubset(df.columns):
        raise ValueError(f"Missing one or more required date part columns: {needed - set(df.columns)}")

    # Build datetime safely
    rel = pd.to_datetime(dict(
        year=df['release_year'].astype(int),
        month=df['release_month'].astype(int),
        day=df['release_day'].astype(int)
    ), errors='coerce')

    ext = pd.to_datetime(dict(
        year=df['extract_year'].astype(int),
        month=df['extract_month'].astype(int),
        day=df['extract_day'].astype(int)
    ), errors='coerce')

    df['days_since_release'] = (ext - rel).dt.days
    return df

def basic_feature_engineering(df):
    """Add a few robust ratios/interactions (safe even if some columns absent)."""
    if 'price' in df.columns and 'achievements' in df.columns:
        df['price_per_achievement'] = df['price'] / (df['achievements'] + 1)
    if 'wishlists' in df.columns and 'days_since_release' in df.columns:
        df['wishlists_per_day'] = df['wishlists'] / (df['days_since_release'] + 1)
    if 'price' in df.columns and 'days_since_release' in df.columns:
        df['price_x_age'] = df['price'] * np.log1p(df['days_since_release'])
    return df

def evaluate_predictions(y_true, y_pred, targets):
    rows = []
    for i, t in enumerate(targets):
        rmse = float(np.sqrt(mean_squared_error(y_true[:, i], y_pred[:, i])))
        mae  = float(mean_absolute_error(y_true[:, i], y_pred[:, i]))
        r2   = float(r2_score(y_true[:, i], y_pred[:, i]))
        rows.append({"target": t, "RMSE": rmse, "MAE": mae, "R2": r2})
    return pd.DataFrame(rows)


In [4]:
# ---------------------------------------------------------------------
# Load & Clean
# ---------------------------------------------------------------------
df = pd.read_csv(DATA_PATH)
print("Raw shape:", df.shape)

# Drop obvious non-numeric id/text if present
for c in ["steamid", "name"]:
    if c in df.columns:
        df.drop(columns=[c], inplace=True)

# Build days_since_release from parts (no release_date/extract_date columns needed)
df = build_days_since_release(df)

# Optional: ensure required_age numeric if present
if "required_age" in df.columns:
    df["required_age"] = df["required_age"].astype(float)

# Convert booleans to ints (for safety)
for col in df.columns:
    if df[col].dtype == bool:
        df[col] = df[col].astype(int)

# Sanitize column names for safety in sklearn
df.columns = sanitize_column_names(df.columns)

# Keep rows with all targets available (cannot train otherwise)
df = df.dropna(subset=TARGET_COLS).copy()

# Optional: drop rows where all targets are 0 (no signal)
df = df[~((df[TARGET_COLS] == 0).all(axis=1))].copy()

# Any remaining NaNs -> drop (you can impute if you prefer)
df = df.dropna().copy()

print("Clean shape:", df.shape)
print("Columns:", list(df.columns)[:20], "...")

Raw shape: (39194, 56)
Clean shape: (39189, 55)
Columns: ['price', 'is_free', 'release_year', 'release_month', 'release_day', 'extract_year', 'extract_month', 'extract_day', 'publisherClass_encoded', 'required_age', 'achievements', 'english', 'windows', 'mac', 'linux', 'Single_player', 'Family_Sharing', 'Steam_Achievements', 'Steam_Cloud', 'Full_controller_support'] ...


In [5]:
# ---------------------------------------------------------------------
# Train/Test Split with Leakage-Safe X
# ---------------------------------------------------------------------
# Select numeric columns only for features
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# X: numeric minus targets and minus post-release leakage columns
X = df[numeric_cols].drop(
    columns=[c for c in TARGET_COLS if c in numeric_cols]
    + [c for c in POST_RELEASE if c in numeric_cols],
    errors="ignore",
)
y = df[TARGET_COLS].copy()

# Final cleanup in case of infs
X = X.replace([np.inf, -np.inf], np.nan).dropna()
y = y.loc[X.index]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Log-transform targets (helps on heavy-tailed counts/revenue)
y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test)

print(f"Train size: {X_train.shape}, Test size: {X_test.shape}")

Train size: (31351, 46), Test size: (7838, 46)


In [6]:
# # ===== Outlier handling: Feature winsorization (train-fit, apply to both) =====
# def is_binary(col: pd.Series) -> bool:
#     u = pd.unique(col.dropna())
#     if len(u) <= 2:
#         s = set(u.tolist())
#         return s.issubset({0, 1})
#     return False

# def compute_caps(df: pd.DataFrame, lower_q=0.01, upper_q=0.99):
#     """Return per-column (lo, hi) quantile caps for non-binary numeric columns."""
#     caps = {}
#     for c in df.columns:
#         if np.issubdtype(df[c].dtype, np.number) and not is_binary(df[c]):
#             lo = df[c].quantile(lower_q)
#             hi = df[c].quantile(upper_q)
#             if pd.notnull(lo) and pd.notnull(hi) and lo < hi:
#                 caps[c] = (float(lo), float(hi))
#     return caps

# def apply_caps(df: pd.DataFrame, caps: dict) -> pd.DataFrame:
#     df2 = df.copy()
#     for c, (lo, hi) in caps.items():
#         if c in df2.columns:
#             df2[c] = df2[c].clip(lower=lo, upper=hi)
#     return df2

# # Keep originals for reporting
# _Xtr_before = X_train.copy()
# _Xte_before = X_test.copy()

# # Fit caps on TRAIN only
# feature_caps = compute_caps(_Xtr_before, lower_q=0.01, upper_q=0.99)

# # Count how many would be clipped BEFORE applying
# clip_report = {}
# for c, (lo, hi) in feature_caps.items():
#     if c in _Xtr_before:
#         tr_low  = (_Xtr_before[c] < lo).sum()
#         tr_high = (_Xtr_before[c] > hi).sum()
#     else:
#         tr_low = tr_high = 0
#     if c in _Xte_before:
#         te_low  = (_Xte_before[c] < lo).sum()
#         te_high = (_Xte_before[c] > hi).sum()
#     else:
#         te_low = te_high = 0
#     clip_report[c] = (lo, hi, int(tr_low), int(tr_high), int(te_low), int(te_high))

# # Apply caps
# X_train = apply_caps(X_train, feature_caps)
# X_test  = apply_caps(X_test,  feature_caps)

# # (Optional) logically bounded safety constraint
# neg_train_dsr = neg_test_dsr = 0
# if 'days_since_release' in X_train.columns:
#     neg_train_dsr = (X_train['days_since_release'] < 0).sum()
#     neg_test_dsr  = (X_test['days_since_release']  < 0).sum()
#     X_train['days_since_release'] = X_train['days_since_release'].clip(lower=0)
#     X_test['days_since_release']  = X_test['days_since_release'].clip(lower=0)

# # ===== Inspect outlier handling results =====
# print("\n[Outlier Handling] Winsorization applied to non-binary numeric features:")
# for c, (lo, hi, tr_low, tr_high, te_low, te_high) in clip_report.items():
#     print(f" - {c:>24s}: cap=({lo:.6g}, {hi:.6g}), "
#           f"train clipped=({tr_low}+{tr_high}), test clipped=({te_low}+{te_high})")

# if 'days_since_release' in X_train.columns:
#     print("\n[Outlier Handling] Safety constraint:")
#     print(f" - days_since_release clipped to >= 0 "
#           f"(post-clip negatives still in train={int(neg_train_dsr)}, test={int(neg_test_dsr)})")



In [7]:
# ---------------------------------------------------------------------
# Model: LightGBM + MultiOutput
# ---------------------------------------------------------------------
lgb_params = dict(
    random_state=42,
    n_jobs=-1,
    n_estimators=600,
    learning_rate=0.02,
    num_leaves=63,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
)

base_lgb = lgb.LGBMRegressor(**lgb_params)
lgb_model = MultiOutputRegressor(base_lgb)

print("Training MultiOutput LightGBM...")
lgb_model.fit(X_train.values, y_train_log.values)

# Predict & invert log
y_pred_log = lgb_model.predict(X_test.values)
y_pred = np.expm1(y_pred_log)
y_true = np.expm1(y_test_log.values)

# Metrics
metrics_df = evaluate_predictions(y_true, y_pred, TARGET_COLS)
print("\nPer-target metrics:")
print(metrics_df.to_string(index=False))
print("\nAverage R²:", metrics_df["R2"].mean())

# R² in %
metrics_df_pct = metrics_df.copy()
metrics_df_pct["R2_%"] = (metrics_df_pct["R2"] * 100).round(2)
print("\nPer-target metrics (R² in %):")
print(metrics_df_pct[["target", "RMSE", "MAE", "R2_%"]].to_string(index=False))
print("\nAverage R² (%):", round(metrics_df_pct["R2_%"].mean(), 2))

# ---------------------------------------------------------------------
# Feature Importance (averaged across targets)
# ---------------------------------------------------------------------
try:
    importances = []
    for est in lgb_model.estimators_:
        importances.append(est.feature_importances_)
    importances = np.vstack(importances)
    avg_importance = importances.mean(axis=0)
    fi = pd.Series(avg_importance, index=X_train.columns).sort_values(ascending=False)
    print("\nTop 20 features by average importance:")
    print(fi.head(20).to_string())
except Exception as e:
    print("Feature importance not available:", e)

# Save
joblib.dump(lgb_model, os.path.join(ARTIFACT_DIR, "lgb_model.pkl"))
joblib.dump(
    {"feature_order": X_train.columns.tolist(), "lgb_params": lgb_params},
    os.path.join(ARTIFACT_DIR, "lgb_model_meta.pkl"),
)
print("\nSaved model + meta to:", ARTIFACT_DIR)

Training MultiOutput LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005563 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 615
[LightGBM] [Info] Number of data points in the train set: 31351, number of used features: 42
[LightGBM] [Info] Start training from score 6.493952
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004948 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 615
[LightGBM] [Info] Number of data points in the train set: 31351, number of used features: 42
[LightGBM] [Info] Start training from score 6.406776
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003829 seconds.
You can set `force_row_wise=true` to remove the overh




Per-target metrics:
    target         RMSE           MAE       R2
    owners 6.336046e+05  63056.830430 0.201356
   players 5.222761e+05  46552.737653 0.144419
copiesSold 4.532626e+05  38698.174255 0.185481
   revenue 5.136580e+06 393680.157435 0.185756

Average R²: 0.1792528519085004

Per-target metrics (R² in %):
    target         RMSE           MAE  R2_%
    owners 6.336046e+05  63056.830430 20.14
   players 5.222761e+05  46552.737653 14.44
copiesSold 4.532626e+05  38698.174255 18.55
   revenue 5.136580e+06 393680.157435 18.58

Average R² (%): 17.93

Top 20 features by average importance:
days_since_release         6107.50
achievements               4533.75
price                      4490.00
release_day                3474.50
release_month              1902.50
publisherClass_encoded     1078.00
Casual                     1000.50
Action                      911.00
Adventure                   903.75
Indie                       881.25
Simulation                  865.00
Steam_Cloud  

In [8]:
# ---------------------------------------------------------------------
# Model: XGBoost + MultiOutput (baseline)
# ---------------------------------------------------------------------
try:
    import xgboost as xgb
except ImportError as e:
    raise RuntimeError("xgboost is not installed. Run: pip install xgboost") from e

xgb_params = dict(
    objective="reg:squarederror",
    random_state=42,
    n_estimators=800,
    learning_rate=0.03,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    reg_alpha=0.0,
    tree_method="hist",
    nthread=-1,
)

xgb_base = xgb.XGBRegressor(**xgb_params)
xgb_model = MultiOutputRegressor(xgb_base)

print("\nTraining MultiOutput XGBoost (baseline)...")
xgb_model.fit(X_train.values, y_train_log.values)

# Predict & invert log
xgb_pred_log = xgb_model.predict(X_test.values)
xgb_pred = np.expm1(xgb_pred_log)
xgb_true = np.expm1(y_test_log.values)

# Metrics
xgb_metrics = evaluate_predictions(xgb_true, xgb_pred, TARGET_COLS)
print("\n[XGBoost] Per-target metrics:")
print(xgb_metrics.to_string(index=False))
print("\n[XGBoost] Average R²:", xgb_metrics["R2"].mean())

# R² in %
xgb_metrics_pct = xgb_metrics.copy()
xgb_metrics_pct["R2_%"] = (xgb_metrics_pct["R2"] * 100).round(2)
print("\n[XGBoost] Per-target metrics (R² in %):")
print(xgb_metrics_pct[["target", "RMSE", "MAE", "R2_%"]].to_string(index=False))
print("\n[XGBoost] Average R² (%):", round(xgb_metrics_pct["R2_%"].mean(), 2))

# Optional: feature importance (avg across outputs)
try:
    imps = np.vstack([est.feature_importances_ for est in xgb_model.estimators_])
    fi_avg = imps.mean(axis=0)
    fi_xgb = pd.Series(fi_avg, index=X_train.columns).sort_values(ascending=False)
    print("\n[XGBoost] Top 20 features by average importance:")
    print(fi_xgb.head(20).to_string())
except Exception as e:
    print("XGBoost feature importance not available:", e)

# Save
joblib.dump(xgb_model, os.path.join(ARTIFACT_DIR, "xgb_model.pkl"))
joblib.dump({"xgb_params": xgb_params}, os.path.join(ARTIFACT_DIR, "xgb_params_base.pkl"))


Training MultiOutput XGBoost (baseline)...

[XGBoost] Per-target metrics:
    target         RMSE           MAE        R2
    owners 6.888263e+05  66610.942076  0.056078
   players 5.633629e+05  49094.654984  0.004509
copiesSold 5.036946e+05  42459.389821 -0.005856
   revenue 7.624402e+06 451296.845311 -0.793983

[XGBoost] Average R²: -0.18481299772229737

[XGBoost] Per-target metrics (R² in %):
    target         RMSE           MAE   R2_%
    owners 6.888263e+05  66610.942076   5.61
   players 5.633629e+05  49094.654984   0.45
copiesSold 5.036946e+05  42459.389821  -0.59
   revenue 7.624402e+06 451296.845311 -79.40

[XGBoost] Average R² (%): -18.48

[XGBoost] Top 20 features by average importance:
is_free                   0.309149
publisherClass_encoded    0.219722
Family_Sharing            0.120056
Steam_Trading_Cards       0.091796
price                     0.032425
Online_Co_op              0.018815
achievements              0.015663
Free_To_Play              0.015631
Online_PvP 

['./artifacts\\xgb_params_base.pkl']

In [9]:
# ---------------------------------------------------------------------
# Model: CatBoost + MultiOutput (baseline)
# ---------------------------------------------------------------------
try:
    from catboost import CatBoostRegressor
except ImportError as e:
    raise RuntimeError("catboost is not installed. Run: pip install catboost") from e

cb_params = dict(
    loss_function="RMSE",
    random_seed=42,
    n_estimators=1200,
    learning_rate=0.03,
    depth=8,
    l2_leaf_reg=3.0,
    subsample=0.8,
    verbose=0,
    thread_count=-1
)

cb_base = CatBoostRegressor(**cb_params)
cb_model = MultiOutputRegressor(cb_base)

print("\nTraining MultiOutput CatBoost (baseline)...")
cb_model.fit(X_train.values, y_train_log.values)

# Predict & invert log
cb_pred_log = cb_model.predict(X_test.values)
cb_pred = np.expm1(cb_pred_log)
cb_true = np.expm1(y_test_log.values)

# Metrics
cb_metrics = evaluate_predictions(cb_true, cb_pred, TARGET_COLS)
print("\n[CatBoost] Per-target metrics:")
print(cb_metrics.to_string(index=False))
print("\n[CatBoost] Average R²:", cb_metrics["R2"].mean())

# R² in %
cb_metrics_pct = cb_metrics.copy()
cb_metrics_pct["R2_%"] = (cb_metrics_pct["R2"] * 100).round(2)
print("\n[CatBoost] Per-target metrics (R² in %):")
print(cb_metrics_pct[["target", "RMSE", "MAE", "R2_%"]].to_string(index=False))
print("\n[CatBoost] Average R² (%):", round(cb_metrics_pct["R2_%"].mean(), 2))

# Optional: feature importance (avg across outputs)
try:
    imps = np.vstack([est.get_feature_importance() for est in cb_model.estimators_])
    fi_avg = imps.mean(axis=0)
    fi_cb = pd.Series(fi_avg, index=X_train.columns).sort_values(ascending=False)
    print("\n[CatBoost] Top 20 features by average importance:")
    print(fi_cb.head(20).to_string())
except Exception as e:
    print("CatBoost feature importance not available:", e)

# Save
joblib.dump(cb_model, os.path.join(ARTIFACT_DIR, "cb_model.pkl"))
joblib.dump({"cb_params": cb_params}, os.path.join(ARTIFACT_DIR, "catboost_params_base.pkl"))



Training MultiOutput CatBoost (baseline)...

[CatBoost] Per-target metrics:
    target         RMSE           MAE       R2
    owners 6.493415e+05  65354.760083 0.161191
   players 5.400553e+05  47822.017634 0.085176
copiesSold 4.685344e+05  39927.846956 0.129669
   revenue 5.018579e+06 389178.421713 0.222737

[CatBoost] Average R²: 0.14969343070685762

[CatBoost] Per-target metrics (R² in %):
    target         RMSE           MAE  R2_%
    owners 6.493415e+05  65354.760083 16.12
   players 5.400553e+05  47822.017634  8.52
copiesSold 4.685344e+05  39927.846956 12.97
   revenue 5.018579e+06 389178.421713 22.27

[CatBoost] Average R² (%): 14.97

[CatBoost] Top 20 features by average importance:
publisherClass_encoded    26.036092
price                     14.841283
Family_Sharing             9.737297
achievements               8.328099
days_since_release         4.367805
is_free                    4.075334
Steam_Trading_Cards        2.969327
release_day                2.495467
Indie    

['./artifacts\\catboost_params_base.pkl']

In [10]:
# ---------------------------------------------------------------------
# Loss Plot
# ---------------------------------------------------------------------

# X_tr, X_val, y_tr, y_val = train_test_split(
#     X_train, y_train_log, test_size=0.2, random_state=42
# )

# cb_base_params = dict(
#     loss_function="RMSE",
#     random_seed=42,
#     n_estimators=1200,
#     learning_rate=0.03,
#     depth=8,
#     l2_leaf_reg=3.0,
#     subsample=0.8,
#     rsm=0.8,
#     verbose=0,
#     od_type="Iter",   # enable overfitting detector
#     od_wait=100,
#     use_best_model=True
# )

# models = {}
# evals_results = {}

# for t in TARGET_COLS:
#     m = CatBoostRegressor(**cb_base_params)
#     m.fit(
#         X_tr, y_tr[t],
#         eval_set=(X_val, y_val[t]),
#         verbose=100  # prints progress every 100 iters
#     )
#     models[t] = m
#     evals_results[t] = m.get_evals_result()  # {'learn':{'RMSE':[...]}, 'validation':{'RMSE':[...]}}

# # Plot curves
# for t in TARGET_COLS:
#     learn_rmse = evals_results[t]['learn']['RMSE']
#     val_rmse   = evals_results[t]['validation']['RMSE']

#     plt.figure()
#     plt.plot(learn_rmse, label='train RMSE')
#     plt.plot(val_rmse, label='val RMSE')
#     plt.title(f'CatBoost learning curves — {t} (log1p scale)')
#     plt.xlabel('Iteration')
#     plt.ylabel('RMSE')
#     plt.legend()
#     plt.show()

In [11]:
# # ---------------------------------------------------------------------
# # Per-target CatBoost (tune REVENUE only, train all targets separately)
# # ---------------------------------------------------------------------
# try:
#     from catboost import CatBoostRegressor
# except ImportError as e:
#     raise RuntimeError("catboost is not installed. Run: pip install catboost") from e

# from sklearn.model_selection import RandomizedSearchCV

# # Baseline params for non-revenue targets (same spirit as your baseline)
# cb_base_params = dict(
#     loss_function="RMSE",
#     random_seed=42,
#     n_estimators=1200,
#     learning_rate=0.03,
#     depth=8,
#     l2_leaf_reg=3.0,
#     subsample=0.8,
#     rsm=0.8,          
#     verbose=0,
#     thread_count=-1,
#     od_type="Iter",
#     od_wait=100
# )

# # Search space for revenue tuning (log-space target)
# rev_param_dist = {
#     "n_estimators":     [1200, 1600, 2000, 3000],
#     "learning_rate":    [0.01, 0.02, 0.03, 0.06],
#     "depth":            [6, 8, 10],
#     "l2_leaf_reg":      [1.0, 3.0, 5.0, 7.0, 9.0],
#     "subsample":        [0.7, 0.8, 1.0],
#     "rsm":              [0.7, 0.8, 1.0],
#     "random_strength":  [0.0, 0.5, 1.0, 2.0],
#     "min_data_in_leaf": [1, 5, 20, 50],
# }

# print("\n[Tune] CatBoost for 'revenue' (log-space) ...")
# cb_rev_base = CatBoostRegressor(**cb_base_params)

# rev_tuner = RandomizedSearchCV(
#     estimator=cb_rev_base,
#     param_distributions=rev_param_dist,
#     n_iter=30,                          
#     scoring="neg_root_mean_squared_error",
#     cv=3,
#     verbose=1,
#     random_state=42,
#     n_jobs=-1,
# )

# rev_tuner.fit(X_train, y_train_log["revenue"])
# cb_rev_best = rev_tuner.best_params_
# print("Best params for revenue:", cb_rev_best)

# # Build per-target estimators (revenue uses tuned params)
# print("\n[Train] Per-target CatBoost models ...")
# cb_models_by_target = {}
# for t in TARGET_COLS:
#     if t == "revenue":
#         est = CatBoostRegressor(**{**cb_base_params, **cb_rev_best})
#     else:
#         est = CatBoostRegressor(**cb_base_params)
#     est.fit(X_train, y_train_log[t])
#     cb_models_by_target[t] = est

# # Predict all targets (stack per-target preds) & invert log
# cb_pt_pred_log = np.column_stack([cb_models_by_target[t].predict(X_test) for t in TARGET_COLS])
# cb_pt_pred = np.expm1(cb_pt_pred_log)
# cb_pt_true = np.expm1(y_test_log.values)

# # Metrics
# cb_pt_metrics = evaluate_predictions(cb_pt_true, cb_pt_pred, TARGET_COLS)
# print("\n[Per-target CatBoost] Per-target metrics (raw):")
# print(cb_pt_metrics.to_string(index=False))
# print("\n[Per-target CatBoost] Average R²:", cb_pt_metrics["R2"].mean())

# cb_pt_metrics_pct = cb_pt_metrics.copy()
# cb_pt_metrics_pct["R2_%"] = (cb_pt_metrics_pct["R2"] * 100).round(2)
# print("\n[Per-target CatBoost] Per-target metrics (R² in %):")
# print(cb_pt_metrics_pct[["target", "RMSE", "MAE", "R2_%"]].to_string(index=False))
# print("\n[Per-target CatBoost] Average R² (%):", round(cb_pt_metrics_pct["R2_%"].mean(), 2))

# # Save artifacts
# joblib.dump(cb_models_by_target, os.path.join(ARTIFACT_DIR, "catboost_per_target_models.pkl"))
# joblib.dump(cb_rev_best, os.path.join(ARTIFACT_DIR, "catboost_revenue_best_params.pkl"))


In [12]:
# # ---------------------------------------------------------------------
# # Save artifact
# # ---------------------------------------------------------------------
# joblib.dump(model, os.path.join(ARTIFACT_DIR, "lgb_multioutput.pkl"))
# joblib.dump({"feature_order": X_train.columns.tolist(), "lgb_params": lgb_params}, os.path.join(ARTIFACT_DIR, "model_meta.pkl"))
# print("\nSaved model + meta to:", ARTIFACT_DIR)

# # ---------------------------------------------------------------------
# # Helper: Predict up to a planned EXTRACT date (for dev UI)
# # ---------------------------------------------------------------------
# def predict_until_deadline(input_row: dict, feature_order=None):
#     """
#     input_row: dict containing the same pre-release fields used in X (numeric only),
#                plus the required date parts:
#                'release_year','release_month','release_day','extract_year','extract_month','extract_day'
#     NOTE: Do NOT include POST_RELEASE fields; this simulates pre-release planning.

#     Returns: dict with predictions for the 4 targets.
#     """
#     # Build a 1-row DataFrame
#     xr = pd.DataFrame([input_row]).copy()

#     # Build days_since_release from parts
#     if not {'release_year','release_month','release_day','extract_year','extract_month','extract_day'}.issubset(xr.columns):
#         raise ValueError("Missing release_* or extract_* date parts in input_row.")
#     rel = pd.to_datetime(dict(
#         year=xr['release_year'].astype(int),
#         month=xr['release_month'].astype(int),
#         day=xr['release_day'].astype(int)
#     ), errors='coerce')
#     ext = pd.to_datetime(dict(
#         year=xr['extract_year'].astype(int),
#         month=xr['extract_month'].astype(int),
#         day=xr['extract_day'].astype(int)
#     ), errors='coerce')
#     xr['days_since_release'] = (ext - rel).dt.days

#     # Basic engineered features (same as training)
#     if 'price' in xr.columns and 'achievements' in xr.columns:
#         xr['price_per_achievement'] = xr['price'] / (xr['achievements'] + 1)
#     if 'wishlists' in xr.columns and 'days_since_release' in xr.columns:
#         xr['wishlists_per_day'] = xr['wishlists'] / (xr['days_since_release'] + 1)
#     if 'price' in xr.columns and 'days_since_release' in xr.columns:
#         xr['price_x_age'] = xr['price'] * np.log1p(xr['days_since_release'])

#     # Convert bools to ints
#     for c in xr.columns:
#         if xr[c].dtype == bool:
#             xr[c] = xr[c].astype(int)

#     # Keep numeric only & drop post-release leakage
#     numeric_cols = xr.select_dtypes(include=[np.number]).columns.tolist()
#     xr_num = xr[numeric_cols].drop(columns=[c for c in POST_RELEASE if c in numeric_cols], errors='ignore')

#     # Reindex to training feature order (missing cols -> 0, extras dropped)
#     if feature_order is None:
#         feature_order = X_train.columns.tolist()
#     xr_num = xr_num.reindex(columns=feature_order, fill_value=0.0)

#     pred_log = model.predict(xr_num.values)
#     pred = np.expm1(pred_log)[0]
#     return dict(zip(TARGET_COLS, pred.astype(float)))

## Prediction

In [13]:
features_used = X_train.columns.tolist()

print("Features used for prediction:")
for feature in features_used:
    print(f" - {feature}")

features_array = np.array(features_used)
joblib.dump(np.array(features_used), Path(ARTIFACT_DIR) / "features_used.pkl")

Features used for prediction:
 - price
 - is_free
 - release_year
 - release_month
 - release_day
 - extract_year
 - extract_month
 - extract_day
 - publisherClass_encoded
 - required_age
 - achievements
 - english
 - windows
 - mac
 - linux
 - Single_player
 - Family_Sharing
 - Steam_Achievements
 - Steam_Cloud
 - Full_controller_support
 - Multi_player
 - Partial_Controller_Support
 - Steam_Trading_Cards
 - PvP
 - Co_op
 - Steam_Leaderboards
 - Remote_Play_Together
 - Online_PvP
 - Shared_Split_Screen
 - Tracked_Controller_Support
 - VR_Only
 - Shared_Split_Screen_PvP
 - Online_Co_op
 - Stats
 - Shared_Split_Screen_Co_op
 - Indie
 - Casual
 - Adventure
 - Action
 - Simulation
 - Strategy
 - RPG
 - Free_To_Play
 - Sports
 - Racing
 - days_since_release


['artifacts\\features_used.pkl']

In [14]:
def predict_game_success(user_input: dict, model, features_used: list):
    """
    Predict success metrics for a new game based on user input.

    Parameters:
        user_input (dict): Raw user inputs.
        model: Trained MultiOutputRegressor model.
        features_used (list): List of feature column names used in training.
    """
    # 1. Base dictionary with all 0s
    input_data = {feature: 0 for feature in features_used}

    # 2. Basic direct inputs
    input_data['price'] = user_input.get('price', 0)
    input_data['is_free'] = int(user_input.get('is_free', False))
    input_data['required_age'] = user_input.get('required_age', 0)
    input_data['achievements'] = user_input.get('achievements', 0)
    input_data['english'] = int(user_input.get('english', True))

    # 3. Platform & features flags
    platform_flags = ['windows', 'mac', 'linux']
    tag_flags = [
        'Single-player', 'Family Sharing', 'Steam Achievements', 'Steam Cloud',
        'Full controller support', 'Multi-player', 'Partial Controller Support',
        'Steam Trading Cards', 'PvP', 'Co-op', 'Steam Leaderboards', 'Remote Play Together',
        'Online PvP', 'Shared/Split Screen', 'Tracked Controller Support', 'VR Only',
        'Shared/Split Screen PvP', 'Online Co-op', 'Stats', 'Shared/Split Screen Co-op'
    ]
    genre_flags = ['Indie', 'Casual', 'Adventure', 'Action', 'Simulation',
                   'Strategy', 'RPG', 'Free To Play', 'Sports', 'Racing']

    for flag in platform_flags + tag_flags + genre_flags:
        input_data[flag] = int(user_input.get(flag, False))

    # 4. Publisher Class (encoded) — still numeric
    input_data['publisherClass_encoded'] = user_input.get('publisherClass_encoded', 0)

    # 5. Days Since Release
    release_date = pd.to_datetime(user_input['release_date'])
    extract_date = pd.to_datetime(user_input['extract_date'])
    input_data['days_since_release'] = (extract_date - release_date).days

    # Convert to DataFrame with correct feature order
    input_df = pd.DataFrame([input_data])[features_used]

    # Predict (trained on log-transformed targets)
    y_pred_log = model.predict(input_df)
    y_pred = np.expm1(y_pred_log)

    return {
        'owners': int(y_pred[0][0]),
        'players': int(y_pred[0][1]),
        'copiesSold': int(y_pred[0][2]),
        'revenue': float(y_pred[0][3])
    }

In [15]:
def get_game_input_format(df, steamid=None, row_num=None):
    if steamid is not None:
        game = df[df["steamid"] == steamid]
        if game.empty:
            raise ValueError(f"No game found with steamid {steamid}")
        game = game.iloc[0]
    elif row_num is not None:
        game = df.iloc[row_num]
    else:
        raise ValueError("You must provide either a steamid or a row_num.")

    release_date = f"{int(game['release_year'])}-{int(game['release_month']):02d}-{int(game['release_day']):02d}"
    extract_date = f"{int(game['extract_year'])}-{int(game['extract_month']):02d}-{int(game['extract_day']):02d}"

    input_data = {
        "price": float(game["price"]),
        "is_free": bool(game["is_free"]),
        "required_age": int(game["required_age"]),
        "achievements": int(game["achievements"]),
        "english": bool(game["english"]),
        "windows": bool(game["windows"]),
        "mac": bool(game["mac"]),
        "linux": bool(game["linux"]),
        "release_date": release_date,
        "extract_date": extract_date,
        "publisherClass_encoded": int(game["publisherClass_encoded"]),
    }

    # Add boolean flags
    boolean_cols = game.index[
        game.index.isin([
            "Single-player","Family Sharing","Steam Achievements","Steam Cloud",
            "Full controller support","Multi-player","Partial Controller Support",
            "Steam Trading Cards","PvP","Co-op","Steam Leaderboards","Remote Play Together",
            "Online PvP","Shared/Split Screen","Tracked Controller Support","VR Only",
            "Shared/Split Screen PvP","Online Co-op","Stats","Shared/Split Screen Co-op",
            "Indie","Casual","Adventure","Action","Simulation","Strategy","RPG",
            "Free To Play","Sports","Racing"
        ])
    ]
    for col in boolean_cols:
        input_data[col] = bool(game[col])

    # Extract targets for comparison
    players = game["players"]
    owners = game["owners"]
    copies_sold = game["copiesSold"]
    revenue = game["revenue"]
    wishlists = game["wishlists"]
    avg_playtime = game["avgPlaytime"]
    followers = game["followers"]
    reviews = game["reviews"]
    review_score = game["reviewScore"]

    return (
        input_data,
        players,
        owners,
        copies_sold,
        revenue,
        wishlists,
        avg_playtime,
        followers,
        reviews,
        review_score,
    )

In [16]:
# # --- universal input builder (keeps your behavior) ---
# def _build_input_df(user_input: dict, features_used: list) -> pd.DataFrame:
#     # start with zeros for every training feature
#     input_data = {feature: 0 for feature in features_used}

#     # basics
#     input_data['price'] = user_input.get('price', 0)
#     input_data['is_free'] = int(user_input.get('is_free', False))
#     input_data['required_age'] = user_input.get('required_age', 0)
#     input_data['achievements'] = user_input.get('achievements', 0)
#     input_data['english'] = int(user_input.get('english', True))

#     # platform/tags/genres
#     platform_flags = ['windows', 'mac', 'linux']
#     tag_flags = [
#         'Single-player','Family Sharing','Steam Achievements','Steam Cloud',
#         'Full controller support','Multi-player','Partial Controller Support',
#         'Steam Trading Cards','PvP','Co-op','Steam Leaderboards','Remote Play Together',
#         'Online PvP','Shared/Split Screen','Tracked Controller Support','VR Only',
#         'Shared/Split Screen PvP','Online Co-op','Stats','Shared/Split Screen Co-op'
#     ]
#     genre_flags = ['Indie','Casual','Adventure','Action','Simulation',
#                    'Strategy','RPG','Free To Play','Sports','Racing']
#     for flag in platform_flags + tag_flags + genre_flags:
#         input_data[flag] = int(user_input.get(flag, False))

#     # numeric class encoding (if present)
#     input_data['publisherClass_encoded'] = user_input.get('publisherClass_encoded', 0)

#     # days_since_release from user-provided dates
#     release_date = pd.to_datetime(user_input['release_date'])
#     extract_date = pd.to_datetime(user_input['extract_date'])
#     input_data['days_since_release'] = (extract_date - release_date).days

#     # build 1-row DF in training feature order
#     return pd.DataFrame([input_data])[features_used]


# # --- predict with MultiOutputRegressor OR per-target dict ---
# def predict_game_success(user_input: dict, estimator, features_used: list,
#                          target_cols=('owners','players','copiesSold','revenue')):
#     """
#     Works with:
#       - MultiOutputRegressor models (e.g., LightGBM, XGBoost, CatBoost baseline)
#       - Per-target models dict, e.g. {'owners': est, 'players': est, ...} or {0: est, 1: est, ...}
#     All models expected to be trained on log1p(target).
#     """
#     X = _build_input_df(user_input, features_used)

#     # Case A: dict of per-target estimators
#     if isinstance(estimator, dict):
#         preds = {}
#         for idx, t in enumerate(target_cols):
#             # allow either name or index keys
#             est = estimator.get(t, estimator.get(idx))
#             if est is None:
#                 raise ValueError(f"No estimator found for target '{t}' (or index {idx}) in models dict.")
#             log_pred = est.predict(X)[0]
#             preds[t] = float(np.expm1(log_pred))

#         # cast to desired types
#         return {
#             'owners': int(round(preds['owners'])),
#             'players': int(round(preds['players'])),
#             'copiesSold': int(round(preds['copiesSold'])),
#             'revenue': float(preds['revenue']),
#         }

#     # Case B: MultiOutputRegressor (or any estimator returning shape (n,4))
#     else:
#         y_pred_log = estimator.predict(X)
#         # Ensure 2D
#         y_pred_log = np.atleast_2d(y_pred_log)
#         y_pred = np.expm1(y_pred_log)[0]

#         return {
#             'owners': int(round(y_pred[0])),
#             'players': int(round(y_pred[1])),
#             'copiesSold': int(round(y_pred[2])),
#             'revenue': float(y_pred[3]),
#         }


In [17]:
# Load your dataset
df = pd.read_csv("steam_dataset.csv")

steamid = 235520
#24880
#2124490
#315210
#340020
#290100


# --- Step 1: Get game input and actual values from dataset
(
    game_dict,
    players,
    owners,
    copies_sold,
    revenue,
    wishlists,
    avg_playtime,
    followers,
    reviews,
    review_score,
) = get_game_input_format(df, steamid=steamid)

preds = predict_game_success(
    game_dict, lgb_model, features_used
)

preds2 = predict_game_success(
    game_dict, xgb_model, features_used
)

preds3 = predict_game_success(
    game_dict, cb_model, features_used
)

# print(preds)
for k, v in preds.items():
    print(f"{k}: {v:,.0f}" if k != "revenue" else f"{k}: ${v:,.2f}\n")
    
for k, v in preds2.items():
    print(f"{k}: {v:,.0f}" if k != "revenue" else f"{k}: ${v:,.2f}\n")

for k, v in preds3.items():
    print(f"{k}: {v:,.0f}" if k != "revenue" else f"{k}: ${v:,.2f}\n")



  df = pd.read_csv("steam_dataset.csv")


owners: 14,733
players: 11,845
copiesSold: 9,017
revenue: $38,362.87

owners: 36,930
players: 19,844
copiesSold: 12,910
revenue: $47,276.65

owners: 23,092
players: 13,996
copiesSold: 10,671
revenue: $28,754.22



In [18]:
actual = {
    "owners": owners,
    "players": players,
    "copiesSold": copies_sold,
    "revenue": revenue,
}

# Define headers and rows
headers = ["Metric", "Predicted", "Actual", "Absolute Error", "Percentage Error", "Accuracy"]

# Build table rows
rows = []

def print_preds(preds):
    for key in preds:
        pred_val = preds[key]
        actual_val = actual[key]
        abs_error = abs(actual_val - pred_val)
        pct_error = abs_error / actual_val * 100 if actual_val != 0 else 0
        accuracy = 100 - pct_error 

        if key == "revenue":
            row = [
                key,
                f"${pred_val:,.2f}",
                f"${actual_val:,.2f}",
                f"${abs_error:,.2f}",
                f"{pct_error:.2f}%",
                f"{accuracy:.2f}%",
            ]
        else:
            row = [
                key,
                f"{pred_val:,}",
                f"{actual_val:,}",
                f"{abs_error:,}",
                f"{pct_error:.2f}%",
                f"{accuracy:.2f}%",
            ]
        rows.append(row)


print_preds(preds)
print_preds(preds2)
print_preds(preds3)
print(tabulate.tabulate(rows, headers=headers, tablefmt="grid"))

+------------+-------------+------------+------------------+--------------------+------------+
| Metric     | Predicted   | Actual     | Absolute Error   | Percentage Error   | Accuracy   |
| owners     | 14,733      | 27,609.0   | 12,876.0         | 46.64%             | 53.36%     |
+------------+-------------+------------+------------------+--------------------+------------+
| players    | 11,845      | 15,060.0   | 3,215.0          | 21.35%             | 78.65%     |
+------------+-------------+------------+------------------+--------------------+------------+
| copiesSold | 9,017       | 11,412.0   | 2,395.0          | 20.99%             | 79.01%     |
+------------+-------------+------------+------------------+--------------------+------------+
| revenue    | $38,362.87  | $65,992.00 | $27,629.13       | 41.87%             | 58.13%     |
+------------+-------------+------------+------------------+--------------------+------------+
| owners     | 36,930      | 27,609.0   | 9,321.0 

In [19]:
# Save the target game's data to a json file
game_row = df[df["steamid"] == steamid].to_dict(orient="records")[0]

output = {
    "steamid": steamid,
    "dataset_entry": game_row
}

with open(f"./jsons/game_{steamid}.json", "w", encoding="utf-8") as f:
    json.dump(output, f, ensure_ascii=False, indent=2)

print(f"Saved game data game_{steamid}.json")

Saved game data game_235520.json
