In [23]:
import json
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

import re

In [2]:
# Load the modelling dataset from CSV into `df`.
# NOTE(review): this is an absolute, machine-specific path, so the cell only runs where that
# exact folder exists; consider a configurable, project-relative data directory.
df = pd.read_csv('/Users/farazahmed/Documents/GitHub/steam-discount-forecast/model_data/steam_games_final.csv')
# Show the first rows as a quick sanity check of the load.
df.head()

Unnamed: 0,itad_id,appid,title,release_date,early_access,mature,launch_price,days_to_first_sale,Windows,Mac,...,release_year,release_month,release_day,release_quarter,release_weekday,is_holiday_season,is_summer_sale_window,time_since_steam_launch,is_series,publisher_game_count
0,018d937e-e9ba-71b7-b901-de864dd7397c,1040510.0,Princess of Zeven,2023-10-28,0,1,12.99,54,1,0,...,2023,10,28,4,5,0,0,7351,0,95
1,018d937e-e9c0-7185-a3a5-de3b8bff7956,832360.0,Hotel Magnate,2021-10-05,1,0,24.99,50,1,1,...,2021,10,5,4,1,0,0,6598,0,1
2,018d937e-e9c1-71fc-9f4a-472d6505c3cb,2361080.0,Step by Step,2023-10-06,0,0,9.99,35,1,0,...,2023,10,6,4,4,0,0,7329,0,1
3,018d937e-e9cb-728b-8309-979905bf3e82,965990.0,Destiny's Sword,2022-10-17,1,0,8.99,66,1,0,...,2022,10,17,4,0,0,0,6975,0,12
4,018d937e-e9ce-718b-9715-111f51df7457,896520.0,Stay Out of the House,2022-10-14,0,0,14.99,39,1,1,...,2022,10,14,4,4,0,0,6972,0,2


In [3]:
# Inspect column dtypes and non-null counts before any feature engineering.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30947 entries, 0 to 30946
Data columns (total 26 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   itad_id                  30947 non-null  object 
 1   appid                    30947 non-null  float64
 2   title                    30947 non-null  object 
 3   release_date             30947 non-null  object 
 4   early_access             30947 non-null  int64  
 5   mature                   30947 non-null  int64  
 6   launch_price             30947 non-null  float64
 7   days_to_first_sale       30947 non-null  int64  
 8   Windows                  30947 non-null  int64  
 9   Mac                      30947 non-null  int64  
 10  Linux                    30947 non-null  int64  
 11  Achievements             30947 non-null  int64  
 12  Developers               30947 non-null  object 
 13  Publishers               30947 non-null  object 
 14  Categories            

## Encoding categorical features

### Encoding Categories and Genres

The **Categories** and **Genres** columns are stored as semicolon-separated strings.  
Since each game can belong to multiple categories/genres, I am applying **multi-hot encoding** to represent them as binary features.

- **Categories kept (3 total):**
  - Single-player → `cat_singleplayer`
  - Family Sharing → `cat_familysharing`
  - Other → `cat_other`

- **Genres kept (10 total):**
  - Indie → `gen_indie`
  - RPG → `gen_rpg`
  - Casual → `gen_casual`
  - Simulation → `gen_simulation`
  - Strategy → `gen_strategy`
  - Action → `gen_action`
  - Adventure → `gen_adventure`
  - Sports → `gen_sports`
  - Racing → `gen_racing`
  - Massively Multiplayer → `gen_massivelymultiplayer`

> Note: I am excluding **Early Access** from genres, since there is already a dedicated `early_access` column.

**Outcome**:  
This step creates **13 new binary columns** (3 for categories, 10 for genres). Each column is `1` if the game belongs to that category/genre, `0` otherwise.  
This preserves multi-label information while keeping the feature space compact and interpretable.

In [4]:
# Preview the raw "Categories" and "Genres" columns (semicolon-separated strings);
# nothing is encoded in this cell, the multi-hot encoding happens in the next one.
df[['Categories', 'Genres']]

Unnamed: 0,Categories,Genres
0,Family Sharing; Other; Single-player,Indie; RPG
1,Family Sharing; Other; Single-player,Casual; Early Access; Indie; Simulation; Strategy
2,Family Sharing; Multi-player; Other; Single-pl...,Action; Adventure; Casual; Indie; Strategy
3,Family Sharing; Single-player,Adventure; Early Access; Indie; RPG; Simulatio...
4,Family Sharing; Other; Single-player,Action; Adventure; Indie; Simulation
...,...,...
30942,Family Sharing; Other; Single-player,Adventure
30943,Family Sharing; Single-player,Adventure; Casual; Indie
30944,Family Sharing; Other; Single-player,Action; Adventure; Casual; Indie
30945,Family Sharing; Single-player,Casual; Indie; Simulation; Strategy


In [6]:

# Tokens that each get their own binary flag column.
# NOTE: "Early Access" is intentionally left out of GENRES because there is already
# a dedicated boolean column for it.
CATEGORIES = ["Single-player", "Family Sharing", "Other"]
GENRES = [
    "Indie", "RPG", "Casual", "Simulation", "Strategy",
    "Action", "Adventure", "Sports", "Racing", "Massively Multiplayer",
]


def _squash(token):
    """Lower-case a token and remove its spaces and hyphens (used to build column names)."""
    return token.lower().replace(' ', '').replace('-', '')


# token -> output column name
CAT_COLS = {token: f"cat_{_squash(token)}" for token in CATEGORIES}
GEN_COLS = {token: f"gen_{_squash(token)}" for token in GENRES}

def multi_hot_exact(df: pd.DataFrame, col: str, keep_tokens: list[str], outmap: dict[str, str]):
    """Multi-hot encode semicolon-separated exact tokens from df[col] into columns in outmap.

    Parameters
    ----------
    df : DataFrame containing ``col``. It is modified in place (flag columns are
        added or updated) and the same object is returned.
    col : name of the column holding ';'-separated tokens. Missing values are
        treated as "no tokens"; non-string cells are converted with ``str()``.
    keep_tokens : tokens to flag. Matching is exact and case-sensitive, after
        stripping surrounding whitespace from each token in the cell.
    outmap : mapping token -> output column name. A token listed in
        ``keep_tokens`` without an entry here is ignored.

    Returns
    -------
    The input ``df`` with one 0/1 column per value of ``outmap``.
    """
    # Create missing flag columns; existing ones are left as they are so that
    # re-running the cell keeps flags that were already set.
    for out in outmap.values():
        if out not in df.columns:
            df[out] = 0

    # One set of stripped, non-empty tokens per row.
    token_sets = df[col].fillna("").map(
        lambda cell: {t.strip() for t in str(cell).split(";") if t.strip()}
    )

    for tok in dict.fromkeys(keep_tokens):       # de-duplicated, order preserved
        out = outmap.get(tok)
        if out is None:
            continue
        # Positional boolean mask: one vectorised assignment per token, and it
        # stays correct when df has duplicate index labels (label-based
        # df.at[idx, ...] would touch every row sharing that label).
        hits = [tok in toks for toks in token_sets]
        df.loc[hits, out] = 1
    return df

# Add the category flags first, then the genre flags.
for source_col, tokens, colmap in (
    ("Categories", CATEGORIES, CAT_COLS),
    ("Genres", GENRES, GEN_COLS),
):
    df = multi_hot_exact(df, source_col, tokens, colmap)

# Force a plain integer (0/1) dtype on every flag column.
cat_flag_cols = list(CAT_COLS.values())
gen_flag_cols = list(GEN_COLS.values())
for flag_col in cat_flag_cols + gen_flag_cols:
    df[flag_col] = df[flag_col].astype(int)

# Quick sanity check: how many rows carry each flag, most frequent first.
print("Category flags:", df[cat_flag_cols].sum().sort_values(ascending=False))
print("Genre flags:",    df[gen_flag_cols].sum().sort_values(ascending=False))

Category flags: cat_familysharing    30444
cat_singleplayer     30274
cat_other            22505
dtype: int64
Genre flags: gen_indie                   22081
gen_casual                  14426
gen_adventure               13107
gen_action                  12694
gen_simulation               6631
gen_strategy                 5917
gen_rpg                      5745
gen_sports                   1312
gen_racing                   1155
gen_massivelymultiplayer      375
dtype: int64


In [10]:
# Re-check the schema after multi-hot encoding: the frame should now also list the cat_* / gen_* flag columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30947 entries, 0 to 30946
Data columns (total 39 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   itad_id                   30947 non-null  object 
 1   appid                     30947 non-null  float64
 2   title                     30947 non-null  object 
 3   release_date              30947 non-null  object 
 4   early_access              30947 non-null  int64  
 5   mature                    30947 non-null  int64  
 6   launch_price              30947 non-null  float64
 7   days_to_first_sale        30947 non-null  int64  
 8   Windows                   30947 non-null  int64  
 9   Mac                       30947 non-null  int64  
 10  Linux                     30947 non-null  int64  
 11  Achievements              30947 non-null  int64  
 12  Developers                30947 non-null  object 
 13  Publishers                30947 non-null  object 
 14  Catego

### Target Encoding for Developers and Publishers

For high-cardinality features like **Developers** and **Publishers**, I am using **target encoding**.  
This transforms each entity into a numerical value based on the average target (`days_to_first_sale`), with smoothing toward the global mean to avoid extreme values for rare entities.

#### Training Phase
- I apply **K-Fold cross-validation target encoding** to avoid leakage.  
- Each row’s encoding is computed from other folds, ensuring it never "sees" its own target.  
- For games with multiple developers/publishers, I take the **average** of their encodings.  
- Rare entities are smoothed toward the global mean using:  

$$
\hat{\mu}_e = \frac{n_e \cdot \bar{y}_e + m \cdot \bar{y}}{n_e + m}
$$

where:  
- $\bar{y}_e$ = mean target for entity $e$  
- $n_e$ = number of samples for entity $e$  
- $\bar{y}$ = global mean of the target  
- $m$ = smoothing factor

#### Inference / Production Phase
- After CV encoding, I fit **final maps on the entire dataset** (every row of `df`, with no fold held out):  
  - `dev_map_full` / `dev_global` → Developers  
  - `pub_map_full` / `pub_global` → Publishers  
- These maps are **saved as JSON artifacts** alongside the model.  
- When making predictions:
  - Split the semicolon-separated names.
  - Map each entity to its encoding (if present).
  - Take the **average** if multiple are found.
  - If none are found (unseen entity), use the **global mean**.

#### Outcome
This approach:
- Avoids target leakage during training,
- Provides stable values for rare or unseen entities,
- Handles multi-valued cells gracefully,
- Produces two new features: `dev_te` and `pub_te`, ready for modeling.

In [13]:
# Configuration for the target encoders defined below.
TARGET_COL = "days_to_first_sale"  # continuous target
N_SPLITS = 5  # number of K-Fold splits used for the out-of-fold encoding
RANDOM_STATE = 42  # seed for the shuffled K-Fold split
SMOOTH_M = 50  # smoothing strength; larger → more pull toward global mean

def split_multi(s: str, sep: str = ";"):
    """Split a delimited cell into a list of stripped, non-empty tokens.

    Parameters
    ----------
    s : the cell value. Missing values (None / NaN) give an empty list; any
        other non-string value is converted with ``str()`` first.
    sep : delimiter between tokens. Defaults to ";", so existing callers are
        unaffected.

    Returns
    -------
    list[str] of tokens with surrounding whitespace removed; empty pieces are dropped.

    NOTE(review): the sample output of this notebook shows a Developers cell
    ("Tripp Rainey,Brice DiPiazza") whose names are comma-separated. With the
    default ";" that cell is treated as one single entity. Only pass ``sep=","``
    after checking that individual names never contain commas themselves.
    """
    if pd.isna(s):
        return []
    return [t.strip() for t in str(s).split(sep) if t.strip()]

def _build_entity_stats(train_df, col_entities, target_col, m_smooth):
    """
    Compute a smoothed mean of the target for every entity found in train_df.

    Each cell of ``col_entities`` is split with ``split_multi``; a row that lists
    several entities contributes its target value to each of them.

    Returns: (dict {entity: smoothed_mean}, global_mean as float).
    """
    global_mean = train_df[target_col].mean()

    # Long format: one row per (game, entity) pair. Rows whose cell produced an
    # empty list explode to NaN and are discarded.
    entity_lists = train_df[col_entities].apply(split_multi)
    long_df = train_df[[target_col]].assign(_list=entity_lists).explode("_list")
    long_df = long_df.dropna(subset=["_list"])

    # Per-entity sample count and raw mean of the target.
    per_entity = long_df.groupby("_list")[target_col].agg(["count", "mean"])
    n = per_entity["count"]
    raw_mean = per_entity["mean"]

    # Shrink toward the global mean: (n*mean + m*global) / (n + m)
    smoothed = (n * raw_mean + m_smooth * global_mean) / (n + m_smooth)
    return smoothed.to_dict(), float(global_mean)

def _apply_entity_encoding(val_df, col_entities, entity_to_enc, global_mean):
    """
    Encode every row of val_df[col_entities] as one float.

    A cell is split with ``split_multi``; the row value is the mean of the
    encodings of the entities present in ``entity_to_enc``. Rows with no
    entities, or with only unknown entities, get ``global_mean``.

    Returns a float numpy array with one value per row, in row order.
    """
    def encode_cell(cell):
        known = [entity_to_enc[name] for name in split_multi(cell) if name in entity_to_enc]
        if not known:
            return global_mean
        return float(np.mean(known))

    cells = val_df[col_entities].fillna("")
    return np.array([encode_cell(cell) for cell in cells], dtype=float)

def target_encode_multivalued_cv(df, col_entities, target_col=TARGET_COL, n_splits=N_SPLITS, random_state=RANDOM_STATE, m_smooth=SMOOTH_M):
    """
    Out-of-fold target encoding for a multi-valued categorical column.

    The rows are divided with a shuffled K-Fold; the rows of each fold are
    encoded with entity statistics built from the remaining folds only.

    Returns a float numpy array with one encoded value per row, in the row order of df.
    """
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    out_of_fold = np.zeros(len(df), dtype=float)

    for fit_pos, holdout_pos in folds.split(df):
        # Statistics come from the other folds ...
        fold_map, fold_mean = _build_entity_stats(
            df.iloc[fit_pos], col_entities, target_col, m_smooth
        )
        # ... and are applied to the held-out rows (positional indices).
        out_of_fold[holdout_pos] = _apply_entity_encoding(
            df.iloc[holdout_pos], col_entities, fold_map, fold_mean
        )

    return out_of_fold

# ---- Run encoders for Developers and Publishers ----
# Out-of-fold encodings over ALL rows of df (default target, folds, seed and smoothing).
# NOTE(review): the folds here include rows that the later chronological split assigns to the
# test set; the split cells further down recompute dev_te / pub_te on the train/test copies.
df["dev_te"] = target_encode_multivalued_cv(df, "Developers")
df["pub_te"] = target_encode_multivalued_cv(df, "Publishers")

# Full-fit maps (built from every row of df, no fold held out) for inference on new data:
dev_map_full, dev_global = _build_entity_stats(df, "Developers", TARGET_COL, SMOOTH_M)
pub_map_full, pub_global = _build_entity_stats(df, "Publishers", TARGET_COL, SMOOTH_M)

# Example of applying to a new/inference dataframe "df_new":
# df_new["dev_te"] = _apply_entity_encoding(df_new, "Developers", dev_map_full, dev_global)
# df_new["pub_te"] = _apply_entity_encoding(df_new, "Publishers", pub_map_full, pub_global)

# Quick sanity check: raw entity columns next to their encodings and the target
df[["Developers","dev_te","Publishers","pub_te","days_to_first_sale"]].head()

Unnamed: 0,Developers,dev_te,Publishers,pub_te,days_to_first_sale
0,Lovely Pretty Ultra Loving You,98.847362,Kagura Games,73.159126,54
1,Arcade Oven,97.595929,Arcade Oven,97.595929,50
2,"Tripp Rainey,Brice DiPiazza",97.595929,Trippster Studios,97.595929,35
3,2Dogs Games Ltd.,98.112332,Bonus Stage Publishing,89.460277,66
4,Puppet Combo,97.7131,Puppet Combo,97.7131,39


In [16]:
# Saving the full-data mappings (entity map, global mean, smoothing factor) for inference later.
FULL_FIT_ARTIFACT_DIR = "/Users/farazahmed/Documents/GitHub/steam-discount-forecast/artifacts"

# Create the folder first: open(..., "w") raises FileNotFoundError when the directory is missing.
os.makedirs(FULL_FIT_ARTIFACT_DIR, exist_ok=True)

with open(f"{FULL_FIT_ARTIFACT_DIR}/dev_te_map.json", "w") as f:
    json.dump({"map": dev_map_full, "global": dev_global, "m_smooth": SMOOTH_M}, f)

with open(f"{FULL_FIT_ARTIFACT_DIR}/pub_te_map.json", "w") as f:
    json.dump({"map": pub_map_full, "global": pub_global, "m_smooth": SMOOTH_M}, f)

In [17]:
# Re-check the schema after adding the dev_te / pub_te target-encoded columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30947 entries, 0 to 30946
Data columns (total 41 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   itad_id                   30947 non-null  object 
 1   appid                     30947 non-null  float64
 2   title                     30947 non-null  object 
 3   release_date              30947 non-null  object 
 4   early_access              30947 non-null  int64  
 5   mature                    30947 non-null  int64  
 6   launch_price              30947 non-null  float64
 7   days_to_first_sale        30947 non-null  int64  
 8   Windows                   30947 non-null  int64  
 9   Mac                       30947 non-null  int64  
 10  Linux                     30947 non-null  int64  
 11  Achievements              30947 non-null  int64  
 12  Developers                30947 non-null  object 
 13  Publishers                30947 non-null  object 
 14  Catego

In [19]:
# Take a copy for modelling so that changes made to df_model do not alter df.
df_model = df.copy()
# Confirm the copy carries the same columns and dtypes.
df_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30947 entries, 0 to 30946
Data columns (total 41 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   itad_id                   30947 non-null  object 
 1   appid                     30947 non-null  float64
 2   title                     30947 non-null  object 
 3   release_date              30947 non-null  object 
 4   early_access              30947 non-null  int64  
 5   mature                    30947 non-null  int64  
 6   launch_price              30947 non-null  float64
 7   days_to_first_sale        30947 non-null  int64  
 8   Windows                   30947 non-null  int64  
 9   Mac                       30947 non-null  int64  
 10  Linux                     30947 non-null  int64  
 11  Achievements              30947 non-null  int64  
 12  Developers                30947 non-null  object 
 13  Publishers                30947 non-null  object 
 14  Catego

### Train–Test Split Strategy

I am splitting the dataset **chronologically by release date** instead of random sampling.  
This avoids data leakage from the future into the past and better reflects how the model will be used in practice (predicting sales timing for newly released games).

Two approaches are possible:
- **Fixed year split**: train on games from 2021–2023, test on games released in 2024.  
- **Proportional split (used here)**: sort by release date and cut at the 80th percentile, giving ~80% older games for training and ~20% newer games for testing.

#### Handling Target Encodings
- Target encodings for `Developers` and `Publishers` are recomputed **using only the training data**.  
- The same maps are then applied to both train and test sets.  
- For unseen entities in the test set, the **global mean** from the training set is used as a fallback.

#### Final Setup
- Features: all engineered and encoded columns (after dropping IDs, raw text, and `release_date`).  
- Target: `days_to_first_sale`.  
- Train/test split ensures a realistic evaluation and prepares the encoding artifacts for production use.

In [20]:
# Parse release_date as datetimes and put the games in chronological order.
df_model = (
    df_model
    .assign(release_date=pd.to_datetime(df_model["release_date"]))
    .sort_values("release_date")
    .reset_index(drop=True)
)

# The 80% quantile of the release dates is the train/test boundary:
# releases on or before it go to train, later releases go to test.
cutoff_date = df_model["release_date"].quantile(0.80)
train = df_model.loc[df_model["release_date"].le(cutoff_date)].copy()
test = df_model.loc[df_model["release_date"].gt(cutoff_date)].copy()

print("Cutoff:", cutoff_date.date())
print("Train:", train.shape, "Test:", test.shape)

Cutoff: 2024-02-16
Train: (24775, 41) Test: (6172, 41)


In [21]:
# Building maps on TRAIN ONLY. These maps encode the test set here and are the ones to use
# for any data that was not part of training.
dev_map_tr, dev_global_tr = _build_entity_stats(train, "Developers", "days_to_first_sale", SMOOTH_M)
pub_map_tr, pub_global_tr = _build_entity_stats(train, "Publishers", "days_to_first_sale", SMOOTH_M)

# TRAIN rows: out-of-fold encoding computed inside the training set only.
# Applying dev_map_tr / pub_map_tr to the rows they were fitted on would let every training
# row's feature include its own target (target leakage) and would make the train features
# systematically more informative than the test features.
train["dev_te"] = target_encode_multivalued_cv(train, "Developers", target_col="days_to_first_sale")
train["pub_te"] = target_encode_multivalued_cv(train, "Publishers", target_col="days_to_first_sale")

# TEST rows: train-fitted maps; entities unseen in train fall back to the train global mean.
test["dev_te"]  = _apply_entity_encoding(test,  "Developers", dev_map_tr, dev_global_tr)
test["pub_te"]  = _apply_entity_encoding(test,  "Publishers", pub_map_tr, pub_global_tr)

In [22]:
# Identifier and raw-text columns that are never model inputs.
drop_cols = ["itad_id","appid","title","Developers","Publishers","Categories","Genres"]

# The target and release_date are removed from the feature matrices as well;
# release_date stays available on the train/test frames for reporting.
non_feature_cols = drop_cols + ["days_to_first_sale"] + ["release_date"]

X_train = train.drop(columns=non_feature_cols)
X_test  = test.drop(columns=non_feature_cols)

y_train = train["days_to_first_sale"].values
y_test  = test["days_to_first_sale"].values

In [24]:
# Folder for the train-only encoding maps and the split metadata.
# NOTE(review): absolute, machine-specific path; consider making the artifact root configurable.
ARTIFACT_DIR = "/Users/farazahmed/Documents/GitHub/steam-discount-forecast/artifacts/map_at_inference"
os.makedirs(ARTIFACT_DIR, exist_ok=True)

# Each payload bundles the train-only entity map, the train global mean (fallback for
# unknown entities) and the smoothing factor that was used to build the map.
payload_dev = {"map": dev_map_tr, "global": dev_global_tr, "m_smooth": SMOOTH_M}
payload_pub = {"map": pub_map_tr, "global": pub_global_tr, "m_smooth": SMOOTH_M}

with open(f"{ARTIFACT_DIR}/dev_te_map.json", "w") as f:
    json.dump(payload_dev, f)

with open(f"{ARTIFACT_DIR}/pub_te_map.json", "w") as f:
    json.dump(payload_pub, f)

# Saving split metadata & the ordered feature list for reproducibility:
# cutoff date (as a string), train/test row counts, X_train column names, target name.
split_meta = {
    "cutoff_date": str(cutoff_date),
    "n_train": len(train), "n_test": len(test),
    "features": [c for c in X_train.columns],
    "target": "days_to_first_sale",
}
with open(f"{ARTIFACT_DIR}/split_meta.json", "w") as f:
    json.dump(split_meta, f, indent=2)