# 01 序列兴趣建模：DIN（Amazon-Electronics）

- **目标**：演示序列/历史特征（`history_features`）与目标物品特征（`target_features`）的构造方式，并用 `CTRTrainer` 跑通 DIN 的训练/验证/测试 **AUC**。
- **数据**：Amazon-Electronics sample（仓库内置）。
- **对齐要求**：数据处理沿用 `tutorials/DIN.ipynb` 的风格：`create_seq_features` + `SequenceFeature`。
- **默认**：不启用实验跟踪（`model_logger=None`），不导出 ONNX。


In [5]:
import os
import numpy as np
import pandas as pd
import torch

from torch_rechub.basic.features import SparseFeature, SequenceFeature
from torch_rechub.models.ranking import DIN
from torch_rechub.trainers import CTRTrainer
from torch_rechub.utils.data import DataGenerator, create_seq_features, df_to_dict

# Optional: experiment tracking (disabled by default).
# from torch_rechub.basic.tracking import WandbLogger, SwanLabLogger, TensorBoardXLogger


def seed_everything(seed: int) -> None:
    """Seed the Python, NumPy and PyTorch random number generators.

    ``torch.manual_seed`` only covers PyTorch. Seeding the other two
    generators as well keeps any shuffling or sampling that draws from them
    repeatable between runs.

    Args:
        seed: Value handed to each generator's seeding function.
    """
    # Local import: `random` is not part of this cell's import list.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)


SEED = 2022
DEVICE = "cpu"  # can be switched to "cuda:0"

DATASET_PATH = "../examples/ranking/data/amazon-electronics/amazon_electronics_sample.csv"

SEQ_MAX_LEN = 50
DROP_SHORT = 0  # the sample dataset is small, so short sequences are normally kept

EPOCH = 2
BATCH_SIZE = 4096
LR = 1e-3
WEIGHT_DECAY = 1e-3
EARLYSTOP_PATIENCE = 4

USE_TRACKING = False
LOGGER_TYPE = None  # "wandb" | "swanlab" | "tensorboard" | None
PROJECT_NAME = "amazon-electronics-din"

EXPORT_ONNX = False
ONNX_PATH = "din.onnx"

# NOTE(review): the preprocessing below is called with shuffle=True and
# presumably draws from Python's `random` -- confirm against the library
# source. Seeding every generator keeps the run repeatable either way.
seed_everything(SEED)
print("DATASET_PATH:", os.path.abspath(DATASET_PATH))


DATASET_PATH: e:\RecommendSystemProject\torch-rechub\examples\ranking\data\amazon-electronics\amazon_electronics_sample.csv


In [6]:
# Load the raw interaction log and show its size and first rows.
data = pd.read_csv(DATASET_PATH)
print("raw shape:", data.shape)
print(data.head())

# Same preprocessing as tutorials/DIN.ipynb: create_seq_features builds the
# sliding-window samples together with their history sequence columns.
seq_feature_kwargs = {
    "data": data,
    "seq_feature_col": ["item_id", "cate_id"],
    "max_len": SEQ_MAX_LEN,
    "drop_short": DROP_SHORT,
    "shuffle": True,
}
split_frames = create_seq_features(**seq_feature_kwargs)
train_df, val_df, test_df = split_frames

print("train/val/test:", train_df.shape, val_df.shape, test_df.shape)

# Vocabulary sizes are derived from the largest id found in each column
# (create_seq_features has already label-encoded the ids and shifted them by
# +1, leaving 0 for padding).
# The history_* columns hold one list per row, so the maximum has to be taken
# across a list of lists.

def max_from_list_col(df: pd.DataFrame, col: str) -> int:
    """Return the largest value stored in a column whose cells are sequences.

    Each cell is flattened on its own before the values are combined, so rows
    of different lengths are supported; stacking the rows into one rectangular
    array would fail for ragged input.

    Args:
        df: Frame that contains the column.
        col: Name of the column whose cells are lists (or arrays) of ids.

    Returns:
        The maximum value over all cells as a Python ``int``. ``0`` (the
        padding id) is returned when the column has no rows or every cell is
        empty.
    """
    chunks = [np.asarray(seq).ravel() for seq in df[col]]
    if not chunks:
        return 0
    flat = np.concatenate(chunks)
    if flat.size == 0:
        return 0
    return int(flat.max())

# Largest encoded id per field, taken over all three splits. Item and category
# ids are searched in the scalar target column first, then in the history
# column that stores one list per row.
all_splits = (train_df, val_df, test_df)

n_users = int(max(part["user_id"].max() for part in all_splits))
n_items = int(max(
    [part["target_item"].max() for part in all_splits]
    + [max_from_list_col(part, "history_item") for part in all_splits]
))
n_cates = int(max(
    [part["target_cate"].max() for part in all_splits]
    + [max_from_list_col(part, "history_cate") for part in all_splits]
))

print({"n_users": n_users, "n_items": n_items, "n_cates": n_cates})

# Convert each frame into the dict format used as model input.
train = df_to_dict(train_df)
val = df_to_dict(val_df)
test = df_to_dict(test_df)

# Detach the labels so the remaining dict entries are the model inputs.
train_y, val_y, test_y = (split.pop("label") for split in (train, val, test))

train_x, val_x, test_x = train, val, test


raw shape: (100, 4)
   user_id  item_id        time  cate_id
0        0    13179  1400457600      584
1        0    29247  1400457600      339
2        0    28326  1400457600      587
3        0    17993  1400457600      513
4        0    62275  1400457600      115
train/val/test: (134, 6) (32, 6) (32, 6)
{'n_users': 16, 'n_items': 99, 'n_cates': 67}


In [7]:
# Feature definitions: target item/category, the user id, and the history
# sequences (each history sequence shares its embedding with its target).

# (target feature name, history feature name, vocabulary size)
shared_fields = (
    ("target_item", "history_item", n_items + 1),
    ("target_cate", "history_cate", n_cates + 1),
)

target_features = [
    SparseFeature(target_name, vocab_size=size, embed_dim=64)
    for target_name, _, size in shared_fields
]

user_feature = SparseFeature("user_id", vocab_size=n_users + 1, embed_dim=64)
features = [*target_features, user_feature]

history_features = [
    SequenceFeature(
        history_name,
        vocab_size=size,
        embed_dim=64,
        pooling="concat",
        shared_with=target_name,
    )
    for target_name, history_name, size in shared_fields
]

# Data loaders: the generator wraps the training split; the validation and
# test splits are passed in when the loaders are created.
dg = DataGenerator(train_x, train_y)
loaders = dg.generate_dataloader(
    x_val=val_x,
    y_val=val_y,
    x_test=test_x,
    y_test=test_y,
    batch_size=BATCH_SIZE,
)
train_dl, val_dl, test_dl = loaders

# DIN model built on the features defined earlier in the notebook.
din_kwargs = {
    "features": features,
    "history_features": history_features,
    "target_features": target_features,
    "mlp_params": {"dims": [256, 128]},
    "attention_mlp_params": {"dims": [256, 128]},
}
model = DIN(**din_kwargs)

# Experiment tracking stays off unless a logger object is supplied here.
model_logger = None
# if USE_TRACKING:
#     ... see 04_Experiment_Tracking_Light.ipynb for the shared demo

optimizer_params = {"lr": LR, "weight_decay": WEIGHT_DECAY}
ctr_trainer = CTRTrainer(
    model,
    optimizer_params=optimizer_params,
    n_epoch=EPOCH,
    earlystop_patience=EARLYSTOP_PATIENCE,
    device=DEVICE,
    model_path="./",
    model_logger=model_logger,
)

# Train with the validation loader, then score the trainer's model on the
# test loader.
ctr_trainer.fit(train_dl, val_dl)
auc = ctr_trainer.evaluate(ctr_trainer.model, test_dl)
print(f"test auc: {auc}")


epoch: 0


train: 100%|██████████| 1/1 [00:00<00:00, 14.70it/s]
validation: 100%|██████████| 1/1 [00:00<00:00, 134.78it/s]


epoch: 0 validation: auc: 0.375
epoch: 1


train: 100%|██████████| 1/1 [00:00<00:00, 15.06it/s]
validation: 100%|██████████| 1/1 [00:00<00:00, 166.21it/s]


epoch: 1 validation: auc: 0.357421875


validation: 100%|██████████| 1/1 [00:00<00:00, 166.06it/s]

test auc: 0.87109375





In [8]:
# Optional: export the trained model to ONNX (disabled by default).

if EXPORT_ONNX:
    try:
        # Keep only the export call inside the try body so that the handler
        # below reports export failures and nothing else.
        ctr_trainer.export_onnx(ONNX_PATH, verbose=False, device=DEVICE)
    except Exception as e:
        # Best-effort step: report the failure and let the notebook continue.
        print("ONNX export failed:", repr(e))
    else:
        print("exported:", ONNX_PATH)
