# 00 QuickStart：CTR 预测（DeepFM）

- **目标**：用最小示例跑通 CTR 训练流程：数据预处理（Dense/Sparse）→ 特征构造 → `CTRTrainer` 训练/验证/测试 **AUC**。
- **数据**：Criteo sample（仓库内置）。
- **默认**：不启用实验跟踪（`model_logger=None`），不导出 ONNX。

## 环境
- 必需：`torch-rechub`、`torch`、`numpy`、`pandas`、`scikit-learn`、`tqdm`
- 可选（实验跟踪）：`wandb` / `swanlab` / `tensorboardX`
- 可选（ONNX 导出/推理）：`onnx>=1.20.0`、`onnxruntime`


In [1]:
import os
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tqdm import tqdm

from torch_rechub.basic.features import DenseFeature, SparseFeature
from torch_rechub.models.ranking import DeepFM
from torch_rechub.trainers import CTRTrainer
from torch_rechub.utils.data import DataGenerator

# 可选：实验跟踪（默认关闭）
# from torch_rechub.basic.tracking import WandbLogger, SwanLabLogger, TensorBoardXLogger

SEED = 2022  # passed to torch.manual_seed below
DEVICE = "cpu"  # can be changed to "cuda:0"

# Dataset path (relative to the tutorials/ directory that contains this notebook)
DATASET_PATH = "../examples/ranking/data/criteo/criteo_sample.csv"

# Training configuration: kept small so the run finishes within roughly 5-10 minutes
EPOCH = 2
BATCH_SIZE = 2048
LR = 1e-3
WEIGHT_DECAY = 1e-3
EARLYSTOP_PATIENCE = 4

# Optional switches
USE_TRACKING = False
LOGGER_TYPE = None  # "wandb" | "swanlab" | "tensorboard" | None
PROJECT_NAME = "criteo-ctr"
EXPORT_ONNX = False
ONNX_PATH = "deepfm.onnx"

# Seed torch's RNG, then echo the absolute dataset path that DATASET_PATH resolves to.
torch.manual_seed(SEED)
print("DATASET_PATH:", os.path.abspath(DATASET_PATH))


DATASET_PATH: e:\RecommendSystemProject\torch-rechub\examples\ranking\data\criteo\criteo_sample.csv


In [2]:
def convert_numeric_feature(val: float | int) -> int:
    """与 examples/ranking/run_criteo.py 保持一致：把 dense 值离散化成一个新 sparse 特征。"""
    v = int(val)
    if v > 2:
        return int(np.log(v) ** 2)
    else:
        return v - 2


def get_criteo_data_dict(data_path: str):
    """Load a Criteo CSV and preprocess it in the repository's existing Criteo style.

    Columns whose names start with "I" are treated as dense features and
    columns starting with "C" as sparse features. Each dense column also
    yields a discretized "<name>_cat" column that is handled as sparse.

    Args:
        data_path: Path to the CSV file. A path ending in ".gz" is read with
            gzip compression.

    Returns:
        A tuple ``(dense_feas, sparse_feas, x, y)``: the ``DenseFeature`` and
        ``SparseFeature`` definitions, the frame without the "label" column,
        and the "label" column.
    """
    if data_path.endswith(".gz"):
        data = pd.read_csv(data_path, compression="gzip")
    else:
        data = pd.read_csv(data_path)
    print("data load finished, shape=", data.shape)

    column_names = data.columns.tolist()
    dense_features = [col for col in column_names if col.startswith("I")]
    sparse_features = [col for col in column_names if col.startswith("C")]

    # Missing values: the string "0" for sparse columns, numeric 0 for dense columns.
    data[sparse_features] = data[sparse_features].fillna("0")
    data[dense_features] = data[dense_features].fillna(0)

    # Derive the "_cat" columns from the raw dense values, before scaling rewrites them.
    for name in tqdm(dense_features, desc="discretize dense"):
        bucket_name = name + "_cat"
        sparse_features.append(bucket_name)
        data[bucket_name] = data[name].apply(convert_numeric_feature)

    # Min-max scale the dense columns in place.
    scaler = MinMaxScaler()
    data[dense_features] = scaler.fit_transform(data[dense_features])

    # Encode every sparse column (original and derived) with its own LabelEncoder.
    for name in tqdm(sparse_features, desc="label encode sparse"):
        data[name] = LabelEncoder().fit_transform(data[name])

    dense_feas = [DenseFeature(name) for name in dense_features]
    sparse_feas = []
    for name in sparse_features:
        # vocab_size is the number of distinct encoded ids in the column.
        vocab = data[name].nunique()
        sparse_feas.append(SparseFeature(name, vocab_size=vocab, embed_dim=16))

    y = data["label"]
    x = data.drop(columns=["label"])
    return dense_feas, sparse_feas, x, y


In [3]:
# Preprocess the dataset into feature definitions, model inputs x and labels y.
dense_feas, sparse_feas, x, y = get_criteo_data_dict(DATASET_PATH)

# Build the train / validation / test dataloaders.
# split_ratio=[0.7, 0.1] gives the train and validation fractions; the remainder
# presumably forms the test split (recorded output: 80 : 11 : 24 samples).
dg = DataGenerator(x, y)
train_dl, val_dl, test_dl = dg.generate_dataloader(split_ratio=[0.7, 0.1], batch_size=BATCH_SIZE)

# DeepFM: dense features are passed as deep_features, sparse features as fm_features.
model = DeepFM(
    deep_features=dense_feas,
    fm_features=sparse_feas,
    mlp_params={"dims": [256, 128], "dropout": 0.2, "activation": "relu"},
)

# Logger is disabled by default
model_logger = None

# To enable tracking: set USE_TRACKING=True, choose LOGGER_TYPE, and uncomment
# both the logger import in the first cell and the block below.
# if USE_TRACKING:
#     loggers = []
#     if LOGGER_TYPE == "wandb":
#         loggers.append(WandbLogger(project=PROJECT_NAME, name=f"deepfm-{SEED}", config={"lr": LR, "batch_size": BATCH_SIZE, "seed": SEED}, tags=["criteo", "ctr", "deepfm"]))
#     elif LOGGER_TYPE == "swanlab":
#         loggers.append(SwanLabLogger(project=PROJECT_NAME, experiment_name=f"deepfm-{SEED}", config={"lr": LR, "batch_size": BATCH_SIZE, "seed": SEED}))
#     elif LOGGER_TYPE == "tensorboard":
#         loggers.append(TensorBoardXLogger(log_dir=f"./runs/deepfm-{SEED}"))
#     model_logger = loggers if loggers else None

# Configure the trainer from the constants defined in the first cell.
ctr_trainer = CTRTrainer(
    model,
    optimizer_params={"lr": LR, "weight_decay": WEIGHT_DECAY},
    n_epoch=EPOCH,
    earlystop_patience=EARLYSTOP_PATIENCE,
    device=DEVICE,
    model_path="./",
    model_logger=model_logger,
)

# Train with validation, then report AUC on the held-out test dataloader.
ctr_trainer.fit(train_dl, val_dl)
auc = ctr_trainer.evaluate(ctr_trainer.model, test_dl)
print(f"test auc: {auc}")


data load finished, shape= (115, 40)


discretize dense: 100%|██████████| 13/13 [00:00<00:00, 4045.55it/s]
label encode sparse: 100%|██████████| 39/39 [00:00<00:00, 9743.16it/s]

the samples of train : val : test are  80 : 11 : 24





epoch: 0


train: 100%|██████████| 1/1 [00:00<00:00, 11.15it/s]
validation: 100%|██████████| 1/1 [00:00<00:00, 285.11it/s]


epoch: 0 validation: auc: 0.4666666666666667
epoch: 1


train: 100%|██████████| 1/1 [00:00<00:00, 84.24it/s]
validation: 100%|██████████| 1/1 [00:00<00:00, 500.04it/s]


epoch: 1 validation: auc: 0.4


validation: 100%|██████████| 1/1 [00:00<00:00, 322.89it/s]

test auc: 0.3375





In [4]:
# Optional: export the trained model to ONNX (disabled by default).
# Note: requires onnx>=1.20.0; some environments additionally need onnxruntime
# to validate inference.

if EXPORT_ONNX:
    try:
        ctr_trainer.export_onnx(ONNX_PATH, verbose=False, device=DEVICE)
    except Exception as e:
        # Best-effort step: report the failure instead of aborting the notebook.
        print("ONNX export failed:", repr(e))
    else:
        # Kept outside the try body so only errors raised by the export call
        # itself are reported as export failures.
        print("exported:", ONNX_PATH)
