# Model Training

This notebook focuses on training the machine learning model using the prepared data. It includes steps for selecting features, training the model, and evaluating its performance.

**Local model for quick testing**

In [None]:
import os
import json
from pathlib import Path
import polars as pl
import pandas as pd
from catboost import CatBoostClassifier


def _print(msg: str):
    """Print ``msg`` to stdout behind the local-training log tag."""
    tag = "[train CatBoostClassifier local]"
    print(f"{tag} {msg}")


def load_cat_features_list(root: Path) -> list[str]:
    """Return the categorical feature list stored in ``root / "cat_features.json"``.

    Loading is best-effort: a missing, unreadable, or malformed file yields an
    empty list instead of raising. A missing file is silent; any other problem
    is reported on stdout so a broken file does not go unnoticed.

    Args:
        root: Directory expected to contain ``cat_features.json``.

    Returns:
        The list parsed from the file, or ``[]`` when no usable list is found.
    """
    p = root / "cat_features.json"
    try:
        loaded = json.loads(p.read_text(encoding="utf-8"))
    except FileNotFoundError:
        # The file is optional; its absence is the normal "no list" case.
        return []
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"[load_cat_features_list] ignoring {p}: {exc}")
        return []

    if not isinstance(loaded, list):
        # Valid JSON but not a list (e.g. an object): honour the list contract.
        print(
            f"[load_cat_features_list] ignoring {p}: "
            f"expected a JSON list, got {type(loaded).__name__}"
        )
        return []
    return loaded


def _is_categorical_dtype(dtype) -> bool:
    """Return True when a pandas dtype should be passed to CatBoost as categorical."""
    name = str(dtype)
    # "str" is the pandas 3 default string dtype; "string..." covers
    # "string", "string[python]" and "string[pyarrow]".
    return name in ("object", "str", "category") or name.startswith("string")


def _train_with_validation(
    target: str,
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_val: pd.DataFrame,
    y_val: pd.Series,
    base_params: dict,
    cat_features: list[str],
    model_path: Path,
    importance_path: Path,
) -> None:
    """Fit one binary CatBoost model against a validation set and persist it.

    Args:
        target: Name of the target column (used only for log messages).
        X_train: Training feature frame.
        y_train: Training labels for ``target``.
        X_val: Validation feature frame with the same columns as ``X_train``.
        y_val: Validation labels for ``target``.
        base_params: CatBoost constructor parameters shared by all targets.
        cat_features: Names of the categorical columns in ``X_train``.
        model_path: Destination of the saved ``.cbm`` model.
        importance_path: Destination of the JSON-lines feature importance file.
    """
    _print(f"Training CatBoost for '{target}'…")

    # Weight positives by the negative/positive ratio; max(1, …) avoids a
    # division by zero when a class is absent.
    n_pos = max(1, int(y_train.sum()))
    n_neg = max(1, int((y_train == 0).sum()))
    params = base_params | {"scale_pos_weight": n_neg / n_pos}

    model = CatBoostClassifier(**params, cat_features=cat_features)
    model.fit(X_train, y_train, eval_set=(X_val, y_val), use_best_model=True)
    model.save_model(str(model_path))
    _print(f"Saved: {model_path}")

    # Importances come back in the column order of X_train.
    importance = model.get_feature_importance(prettified=False)
    df_fi = (
        pd.DataFrame({"feature": list(X_train.columns), "importance": importance})
        .sort_values("importance", ascending=False)
        .reset_index(drop=True)
    )
    # One JSON record per line, most important feature first.
    df_fi.to_json(str(importance_path), orient="records", lines=True)


def main():
    """Train the quick-test 'ordered' and 'clicked' models with a time-based holdout.

    Reads ``train_data_v5.parquet``, sorts it by ``ts_hour``, trains on the
    first 85% of rows and validates on the remaining 15%. For each target a
    CatBoost model is saved under ``models/local`` and its feature importance
    under ``models/local/features_importance``.

    Raises:
        ValueError: If the training data has no ``ts_hour`` column, which the
            time-based split depends on.
    """
    root = Path("C:/Projects/trendyol/data")
    train_path = root / "train_data_v5.parquet"

    features_importance_dir = root / "models/local/features_importance"
    features_importance_dir.mkdir(parents=True, exist_ok=True)

    out_dir = root / "models/local"
    out_dir.mkdir(parents=True, exist_ok=True)

    _print(f"Loading train parquet… {train_path}")
    df_pl = pl.read_parquet(str(train_path))
    _print(f"Loaded {len(df_pl)} rows.")

    # Time-based split: the sort below always needs ts_hour, so fail early
    # with a clear message instead of deep inside polars.
    if "ts_hour" not in df_pl.columns:
        raise ValueError(
            f"'ts_hour' column is required for the time-based split "
            f"but is missing from {train_path}"
        )
    if df_pl["ts_hour"].dtype != pl.Datetime:
        df_pl = df_pl.with_columns(pl.col("ts_hour").str.to_datetime())
    df_pl = df_pl.sort("ts_hour")
    split_idx = int(len(df_pl) * 0.85)
    train_pd = df_pl[:split_idx].to_pandas()
    val_pd = df_pl[split_idx:].to_pandas()

    # Targets and features: everything that is not a target or an excluded
    # column is used as a model input.
    targets = ["ordered", "clicked"]
    exclude_cols = set(targets + [
        "ts_hour",
        "session_id",
        "content_creation_date",
        "update_date",
        "added_to_cart",
        "added_to_fav",
    ])
    features = [c for c in train_pd.columns if c not in exclude_cols]

    # Categorical features from file if available, otherwise inferred by dtype.
    cat_features_list = load_cat_features_list(root)
    if not cat_features_list:
        cat_features_list = [
            c for c in features if _is_categorical_dtype(train_pd[c].dtype)
        ]

    _print(f"n_features={len(features)}, n_cats={len(cat_features_list)}")

    X_train = train_pd[features]
    X_val = val_pd[features]

    # Parameters shared by both targets.
    base_params = dict(
        iterations=5000,
        eval_metric="AUC",
        loss_function="Logloss",
        task_type="GPU",
        random_seed=42,
        verbose=100,
        learning_rate=0.025183,
        early_stopping_rounds=200,
    )

    for position, target in enumerate(targets):
        if position:
            # Visual separator between the training logs of the two models.
            print("_" * 80)
        _train_with_validation(
            target,
            X_train,
            train_pd[target],
            X_val,
            val_pd[target],
            base_params,
            cat_features_list,
            out_dir / f"model_{target}_local.cbm",
            features_importance_dir / f"importance_{target}.json",
        )
    _print("Done.")


if __name__ == "__main__":
    main()


**Real model for the competition**

In [None]:
import os
import json
from pathlib import Path
import polars as pl
import pandas as pd
from catboost import CatBoostClassifier


def _print(msg: str):
    """Print ``msg`` to stdout behind the real-training log tag."""
    tag = "[train CatBoostClassifier real]"
    print(f"{tag} {msg}")


def load_cat_features_list(root: Path) -> list[str]:
    """Return the categorical feature list stored in ``root / "cat_features.json"``.

    Loading is best-effort: a missing, unreadable, or malformed file yields an
    empty list instead of raising. A missing file is silent; any other problem
    is reported on stdout so a broken file does not go unnoticed.

    Args:
        root: Directory expected to contain ``cat_features.json``.

    Returns:
        The list parsed from the file, or ``[]`` when no usable list is found.
    """
    p = root / "cat_features.json"
    try:
        loaded = json.loads(p.read_text(encoding="utf-8"))
    except FileNotFoundError:
        # The file is optional; its absence is the normal "no list" case.
        return []
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"[load_cat_features_list] ignoring {p}: {exc}")
        return []

    if not isinstance(loaded, list):
        # Valid JSON but not a list (e.g. an object): honour the list contract.
        print(
            f"[load_cat_features_list] ignoring {p}: "
            f"expected a JSON list, got {type(loaded).__name__}"
        )
        return []
    return loaded


def _is_categorical_dtype(dtype) -> bool:
    """Return True when a pandas dtype should be passed to CatBoost as categorical."""
    name = str(dtype)
    # "str" is the pandas 3 default string dtype; "string..." covers
    # "string", "string[python]" and "string[pyarrow]".
    return name in ("object", "str", "category") or name.startswith("string")


def _train_on_full_data(
    target: str,
    X_train: pd.DataFrame,
    y_train: pd.Series,
    params: dict,
    cat_features: list[str],
    model_path: Path,
    importance_path: Path,
) -> None:
    """Fit one binary CatBoost model on all rows (no validation set) and persist it.

    Args:
        target: Name of the target column (used only for log messages).
        X_train: Feature frame.
        y_train: Labels for ``target``.
        params: CatBoost constructor parameters, without ``scale_pos_weight``.
        cat_features: Names of the categorical columns in ``X_train``.
        model_path: Destination of the saved ``.cbm`` model.
        importance_path: Destination of the JSON-lines feature importance file.
    """
    _print(f"Training CatBoost for '{target}'…")

    # Weight positives by the negative/positive ratio; max(1, …) avoids a
    # division by zero when a class is absent.
    n_pos = max(1, int(y_train.sum()))
    n_neg = max(1, int((y_train == 0).sum()))
    full_params = params | {"scale_pos_weight": n_neg / n_pos}

    model = CatBoostClassifier(**full_params, cat_features=cat_features)
    model.fit(X_train, y_train)
    model.save_model(str(model_path))
    _print(f"Saved: {model_path}")

    # Importances come back in the column order of X_train.
    importance = model.get_feature_importance(prettified=False)
    df_fi = (
        pd.DataFrame({"feature": list(X_train.columns), "importance": importance})
        .sort_values("importance", ascending=False)
        .reset_index(drop=True)
    )
    # One JSON record per line, most important feature first.
    df_fi.to_json(str(importance_path), orient="records", lines=True)


def main():
    """Train the competition 'ordered' and 'clicked' models on the full training set.

    Reads ``data/train_data_v5.parquet`` and fits each target on every row
    with a fixed number of iterations (no holdout, no early stopping). Models
    are saved under ``models/real`` and feature importances under
    ``models/real/features_importance``.
    """
    root = Path("C:/Projects/trendyol")
    train_path = root / "data/train_data_v5.parquet"

    features_importance_dir = root / "models/real/features_importance"
    features_importance_dir.mkdir(parents=True, exist_ok=True)

    out_dir = root / "models/real"
    out_dir.mkdir(parents=True, exist_ok=True)

    _print(f"Loading train parquet… {train_path}")
    df_pl = pl.read_parquet(str(train_path))
    _print(f"Loaded {len(df_pl)} rows.")

    train_pd = df_pl.to_pandas()

    # Targets and features: everything that is not a target or an excluded
    # column is used as a model input.
    targets = ["ordered", "clicked"]
    exclude_cols = set(targets + [
        "ts_hour",
        "session_id",
        "content_creation_date",
        "update_date",
        "added_to_cart",
        "added_to_fav",
    ])
    features = [c for c in train_pd.columns if c not in exclude_cols]

    # Categorical features from file if available. The project root is tried
    # first; the data directory (where the local quick-test cell looks for the
    # same file) is the fallback, so both cells can share one list.
    cat_features_list = (
        load_cat_features_list(root) or load_cat_features_list(root / "data")
    )
    # Last resort: infer by dtype.
    if not cat_features_list:
        cat_features_list = [
            c for c in features if _is_categorical_dtype(train_pd[c].dtype)
        ]

    _print(f"n_features={len(features)}, n_cats={len(cat_features_list)}")

    X_train = train_pd[features]

    # Parameters shared by both targets.
    base_params = dict(
        loss_function="Logloss",
        task_type="GPU",
        random_seed=42,
        verbose=100,
        learning_rate=0.025183,
    )

    # Fixed tree counts per target.
    # NOTE(review): presumably the best iterations found by early stopping in
    # the local quick-test run — confirm and refresh when data or params change.
    iterations_by_target = {"ordered": 486, "clicked": 1421}

    for position, target in enumerate(targets):
        if position:
            # Visual separator between the training logs of the two models.
            print("_" * 80)
        _train_on_full_data(
            target,
            X_train,
            train_pd[target],
            base_params | {"iterations": iterations_by_target[target]},
            cat_features_list,
            out_dir / f"model_{target}_real.cbm",
            features_importance_dir / f"importance_{target}.json",
        )
    _print("Done.")


if __name__ == "__main__":
    main()
