# 02 匹配/召回：DSSM（MovieLens-1M） + Annoy 轻量检索

- **目标**：跑通双塔召回：MovieLens-1M 预处理 → `MatchDataGenerator` → `MatchTrainer` 训练 → 导出 user/item tower → 基于 embedding 做轻量 topk 检索（Annoy）。
- **数据**：`ml-1m_sample.csv`（仓库内置），跑通后可替换为 `ml-1m.csv`。

## MatchTrainer 的 mode 简述
- `mode=0`：point-wise，把每条 (user, item) 样本当作带 0/1 标签的二分类样本来训练（本教程使用）
- `mode=1`：pair-wise，每条样本由 (user, 正样本 item, 负样本 item) 组成，优化正负样本之间的相对排序
- `mode=2`：list-wise，每条样本由 1 个正样本和若干负样本组成，在候选列表上做多分类式的优化

注：本教程默认使用 `mode=0`，仅做最小可跑通示例。


In [1]:
import os
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder

from torch_rechub.basic.features import SparseFeature, SequenceFeature
from torch_rechub.models.matching import DSSM
from torch_rechub.trainers import MatchTrainer
from torch_rechub.utils.data import MatchDataGenerator, df_to_dict
from torch_rechub.utils.match import Annoy, gen_model_input, generate_seq_feature_match

# Run configuration for the DSSM tutorial, plus RNG seeding and output-dir setup.
import random  # stdlib; imported here because it is only needed for seeding

SEED = 2022
DEVICE = "cpu"  # switch to "cuda:0" to run on GPU

DATASET_PATH = "../examples/matching/data/ml-1m/ml-1m_sample.csv"  # or ml-1m.csv
SAVE_DIR = "../examples/matching/data/ml-1m/saved/"
RAW_ID_MAPS_PATH = os.path.join(SAVE_DIR, "raw_id_maps.npy")

SEQ_MAX_LEN = 50
NEG_RATIO = 3
MODE = 0

EPOCH = 2
BATCH_SIZE = 4096
LR = 1e-4
WEIGHT_DECAY = 1e-6

EXPORT_ONNX = False

# Seed every global RNG this notebook may touch, not only torch's.
# NOTE(review): the sample-generation helpers presumably draw negatives and
# shuffle through NumPy's / Python's global RNGs - confirm against
# torch_rechub.utils.match. Seeding them is harmless either way.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

os.makedirs(SAVE_DIR, exist_ok=True)
print("DATASET_PATH:", os.path.abspath(DATASET_PATH))
print("SAVE_DIR:", os.path.abspath(SAVE_DIR))


DATASET_PATH: e:\RecommendSystemProject\torch-rechub\examples\matching\data\ml-1m\ml-1m_sample.csv
SAVE_DIR: e:\RecommendSystemProject\torch-rechub\examples\matching\data\ml-1m\saved


In [2]:
# Preprocessing, kept aligned with tutorials/Matching.ipynb and
# examples/matching/run_ml_dssm.py:
# - cate_id is the first entry of the "|"-separated genres value
# - every sparse column is label-encoded and shifted by +1 (0 is left for padding)
# - the encoded-id -> raw-id maps are saved to raw_id_maps.npy


def _first_genre(genres):
    """Return the text before the first "|" of the stringified genres value."""
    return str(genres).partition("|")[0]


data = pd.read_csv(DATASET_PATH)
print("raw shape:", data.shape)

# genres -> cate_id
data["cate_id"] = data["genres"].map(_first_genre)

sparse_features = ["user_id", "movie_id", "gender", "age", "occupation", "zip", "cate_id"]
user_col, item_col = "user_id", "movie_id"

feature_max_idx = {}
# Filled in below for the two id columns only.
id_maps = {user_col: None, item_col: None}

for column in sparse_features:
    encoder = LabelEncoder()
    data[column] = encoder.fit_transform(data[column]) + 1
    # Largest encoded id + 1; used later as the embedding vocab size.
    feature_max_idx[column] = int(data[column].max() + 1)
    if column in id_maps:
        # Encoded ids start at 1, matching the +1 shift applied above.
        id_maps[column] = dict(enumerate(encoder.classes_, start=1))

user_map = id_maps[user_col]
item_map = id_maps[item_col]

np.save(RAW_ID_MAPS_PATH, np.array((user_map, item_map), dtype=object))
print("saved raw_id_maps:", RAW_ID_MAPS_PATH)

# One row per user / per movie.
user_profile_cols = ["user_id", "gender", "age", "occupation", "zip"]
item_profile_cols = ["movie_id", "cate_id"]
user_profile = data[user_profile_cols].drop_duplicates(subset="user_id")
item_profile = data[item_profile_cols].drop_duplicates(subset="movie_id")

print("user_profile:", user_profile.shape, "item_profile:", item_profile.shape)


raw shape: (100, 10)
saved raw_id_maps: ../examples/matching/data/ml-1m/saved/raw_id_maps.npy
user_profile: (2, 5) item_profile: (93, 2)


In [3]:
# Build the train/test samples (sliding window + negative sampling).
# Per the tutorial's own note, generate_seq_feature_match returns
# df_train/df_test that already carry a label column.

sampling_kwargs = {
    "user_col": user_col,
    "item_col": item_col,
    "time_col": "timestamp",
    "item_attribute_cols": [],
    "sample_method": 1,
    "mode": MODE,
    "neg_ratio": NEG_RATIO,
    "min_item": 0,
}
df_train, df_test = generate_seq_feature_match(data, **sampling_kwargs)


def _to_model_input(frame):
    """Run gen_model_input on one sample frame with this notebook's profiles and settings."""
    return gen_model_input(
        frame,
        user_profile,
        user_col,
        item_profile,
        item_col,
        seq_max_len=SEQ_MAX_LEN,
    )


x_train = _to_model_input(df_train)
y_train = x_train["label"]
x_test = _to_model_input(df_test)
y_test = x_test["label"]

# The training inputs keep every field except the label.
x_train = {field: values for field, values in x_train.items() if field != "label"}

all_item = df_to_dict(item_profile)
# test_user keeps label/movie_id etc.; it feeds the test dataloader and the evaluation below.
test_user = x_test

print("x_train keys:", list(x_train)[:10], "...", "len=", len(x_train))
print("test_user keys:", list(test_user)[:10], "...", "len=", len(test_user))
print("y_train shape:", np.asarray(y_train).shape, "y_test shape:", np.asarray(y_test).shape)


preprocess data


generate sequence features: 100%|██████████| 2/2 [00:00<00:00, 2770.35it/s]

n_train: 384, n_test: 2
0 cold start user dropped 
x_train keys: ['user_id', 'movie_id', 'hist_movie_id', 'histlen_movie_id', 'gender', 'age', 'occupation', 'zip', 'cate_id'] ... len= 9
test_user keys: ['user_id', 'movie_id', 'hist_movie_id', 'histlen_movie_id', 'label', 'gender', 'age', 'occupation', 'zip', 'cate_id'] ... len= 10
y_train shape: (384,) y_test shape: (2,)





In [4]:
# Build the two-tower features (aligned with examples/matching/run_ml_dssm.py),
# train DSSM, then compute user and item embeddings.

user_cols = ["user_id", "gender", "age", "occupation", "zip"]
item_cols = ["movie_id", "cate_id"]
embed_dim = 16


def _sparse_features(names):
    """Return one SparseFeature per column name, sized from feature_max_idx."""
    return [
        SparseFeature(col, vocab_size=feature_max_idx[col], embed_dim=embed_dim)
        for col in names
    ]


user_features = _sparse_features(user_cols)
# The watch history is mean-pooled and declared as shared with "movie_id".
history_feature = SequenceFeature(
    "hist_movie_id",
    vocab_size=feature_max_idx["movie_id"],
    embed_dim=embed_dim,
    pooling="mean",
    shared_with="movie_id",
)
user_features.append(history_feature)

item_features = _sparse_features(item_cols)

model = DSSM(
    user_features,
    item_features,
    temperature=0.02,
    user_params={"dims": [128, 64], "activation": "prelu"},
    item_params={"dims": [128, 64], "activation": "prelu"},
)

trainer_kwargs = {
    "mode": MODE,
    "optimizer_params": {"lr": LR, "weight_decay": WEIGHT_DECAY},
    "n_epoch": EPOCH,
    "device": DEVICE,
    "model_path": SAVE_DIR,
}
trainer = MatchTrainer(model, **trainer_kwargs)

dg = MatchDataGenerator(x=x_train, y=y_train)
# Note: generate_dataloader takes (x_test_user, x_all_item, batch_size, ...).
train_dl, test_dl, item_dl = dg.generate_dataloader(test_user, all_item, batch_size=BATCH_SIZE)

trainer.fit(train_dl)

print("inference embedding...")
# User tower first, then item tower - same order as the two separate calls.
tower_embeddings = {
    tower: trainer.inference_embedding(
        model=model,
        mode=tower,
        data_loader=loader,
        model_path=SAVE_DIR,
    )
    for tower, loader in (("user", test_dl), ("item", item_dl))
}
user_emb = tower_embeddings["user"]
item_emb = tower_embeddings["item"]

print("user_emb:", tuple(user_emb.shape), "item_emb:", tuple(item_emb.shape))


epoch: 0


train: 100%|██████████| 1/1 [00:18<00:00, 18.90s/it]


epoch: 1


train: 100%|██████████| 1/1 [00:20<00:00, 20.84s/it]


inference embedding...


user inference: 100%|██████████| 1/1 [00:04<00:00,  4.25s/it]
item inference: 100%|██████████| 1/1 [00:04<00:00,  4.22s/it]

user_emb: (2, 64) item_emb: (93, 64)





In [5]:
# Lightweight retrieval demo: Annoy top-k
# - build the index from the item-tower embeddings
# - run a top-k query for the first few user embeddings and report hit@k


def _to_native(value):
    """Return the Python scalar for a NumPy scalar; pass any other value through.

    Raw ids come from LabelEncoder.classes_, so they are NumPy scalars; inside a
    list they would print as e.g. "np.int64(2194)" on NumPy >= 2.
    """
    return value.item() if isinstance(value, np.generic) else value


annoy = Annoy(n_trees=10)
annoy.fit(item_emb)

# NOTE(review): allow_pickle=True unpickles the file. Acceptable here only
# because raw_id_maps.npy is written by this notebook; never point it at an
# untrusted file.
user_map, item_map = np.load(RAW_ID_MAPS_PATH, allow_pickle=True)

TOPK = 10
N_SHOW = 5

# Number of test users actually shown (the sample data has fewer than N_SHOW).
n_eval = min(N_SHOW, len(test_user[user_col]))

hits = 0
for i in range(n_eval):
    # Row i of test_user is queried with row i of user_emb.
    uid_enc = int(test_user[user_col][i])
    true_item_enc = int(test_user[item_col][i])

    # query() returns (indices, scores); only the indices are used here.
    idx, _ = annoy.query(v=user_emb[i], n=TOPK)
    # Map the returned positions to encoded item ids via all_item.
    rec_item_enc = all_item[item_col][idx].tolist()

    hit = int(true_item_enc in rec_item_enc)
    hits += hit

    rec_raw = [_to_native(item_map[int(enc)]) for enc in rec_item_enc]
    print(f"user(raw)={user_map[uid_enc]} true_item(raw)={item_map[true_item_enc]} hit@{TOPK}={hit}")
    print("rec:", rec_raw)

print(f"sample hit@{TOPK} over {n_eval} users: {hits}/{n_eval}")


user(raw)=2 true_item(raw)=434 hit@10=0
rec: [np.int64(2194), np.int64(1197), np.int64(3035), np.int64(292), np.int64(3578), np.int64(2916), np.int64(2881), np.int64(1270), np.int64(3071), np.int64(1103)]
user(raw)=1 true_item(raw)=48 hit@10=0
rec: [np.int64(2194), np.int64(1197), np.int64(3035), np.int64(292), np.int64(3578), np.int64(2916), np.int64(2881), np.int64(1270), np.int64(3071), np.int64(1103)]
sample hit@10 over 2 users: 0/2


In [6]:
# Optional: export to ONNX (off by default).
# - two-tower model: the user tower and the item tower are exported separately

if EXPORT_ONNX:
    # Best-effort: any failure is reported and the notebook keeps going.
    try:
        onnx_targets = [
            (tower, os.path.join(SAVE_DIR, filename))
            for tower, filename in (("user", "user_tower.onnx"), ("item", "item_tower.onnx"))
        ]
        for tower, onnx_path in onnx_targets:
            trainer.export_onnx(onnx_path, mode=tower)
        # Paths are printed only after both exports have succeeded.
        for _, onnx_path in onnx_targets:
            print("exported:", onnx_path)
    except Exception as e:
        print("ONNX export failed:", repr(e))
