# DeepMatch 样例代码
- https://github.com/shenweichen/DeepMatch
- https://deepmatch.readthedocs.io/en/latest/

# 下载movielens-1M数据 安装依赖包

In [None]:
# Download the MovieLens-1M archive to ./ml-1m.zip and unpack it in the current
# directory (`unzip -o` overwrites existing files without prompting).
! wget http://files.grouplens.org/datasets/movielens/ml-1m.zip -O ./ml-1m.zip
! unzip -o ml-1m.zip

# 导入需要的库

In [1]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm

from deepctr.feature_column import SparseFeat, VarLenSparseFeat
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from deepmatch.models import *
from deepmatch.utils import sampledsoftmaxloss, NegativeSampler

In [2]:
class MovieLenDataProcessor:
    """Turn a merged MovieLens ratings frame into sequence samples and model inputs.

    Every sample is an 8-tuple:
    ``(user_id, movie_id, label, hist_movie_ids, hist_len, hist_genres, genres, rating)``
    where the two history lists are ordered most-recent-first and truncated to
    ``seq_max_len`` items.

    Args:
        seq_max_len: Maximum number of history items kept per sample; also the
            length every history sequence is padded to in ``extract_features``.
        negsample: Number of explicit negative samples generated for each
            positive training sample (``0`` disables explicit negatives).
    """

    def __init__(self, seq_max_len=50, negsample=0):
        self.seq_max_len = seq_max_len
        self.negsample = negsample

    def gen_data_set(self, data):
        """Build shuffled train and test sample lists with a leave-one-out split.

        For every user the interactions are ordered by ``timestamp``; the last
        interaction becomes the single test sample and every earlier one (except
        the very first, which has no history) becomes a training sample.

        Args:
            data: DataFrame with at least the columns ``user_id``, ``movie_id``,
                ``genres``, ``rating`` and ``timestamp``.
                NOTE: the frame is sorted by ``timestamp`` in place.

        Returns:
            ``(train_set, test_set)`` - two lists of sample tuples.
        """
        data.sort_values("timestamp", inplace=True)

        # Unique movie ids (negative-sampling pool) and a movie_id -> genres lookup.
        item_ids = data['movie_id'].unique()
        item_id_genres_map = dict(zip(data['movie_id'].values, data['genres'].values))

        train_set, test_set = [], []
        for reviewerID, hist in tqdm(data.groupby('user_id')):
            pos_list = hist['movie_id'].tolist()
            genres_list = hist['genres'].tolist()
            rating_list = hist['rating'].tolist()

            neg_list = self.generate_neg_samples(pos_list, item_ids) if self.negsample > 0 else []
            # Append this user's samples to the shared train/test lists.
            self.append_train_and_test_sets(train_set, test_set, reviewerID, pos_list, genres_list,
                                            rating_list, neg_list, item_id_genres_map)

        random.shuffle(train_set)
        random.shuffle(test_set)

        # Guard the width lookup: either list can be empty (e.g. when no user
        # has more than two interactions), and indexing [0] would then raise.
        train_width = len(train_set[0]) if train_set else 0
        test_width = len(test_set[0]) if test_set else 0
        print(f'Train set entry size: {train_width}, Test set entry size: {test_width}')

        return train_set, test_set

    def generate_neg_samples(self, pos_list, item_ids):
        """Sample negatives for one user from the items the user never interacted with.

        Args:
            pos_list: Movie ids the user interacted with.
            item_ids: All candidate movie ids.

        Returns:
            Array of ``len(pos_list) * negsample`` movie ids drawn with replacement.
        """
        candidate_set = list(set(item_ids) - set(pos_list))
        return np.random.choice(candidate_set, size=len(pos_list) * self.negsample, replace=True)

    def append_train_and_test_sets(self, train_set, test_set, reviewerID, pos_list, genres_list,
                                   rating_list, neg_list, item_id_genres_map):
        """Create one user's samples and append them to ``train_set`` / ``test_set``.

        Step ``i`` uses ``pos_list[:i]`` as history and ``pos_list[i]`` as target.
        The final step goes to ``test_set``; all earlier steps go to ``train_set``
        together with ``negsample`` negatives taken from
        ``neg_list[i * negsample : (i + 1) * negsample]``.

        Args:
            train_set: List extended in place with training samples.
            test_set: List extended in place with the user's test sample.
            reviewerID: The user id stored in each sample.
            pos_list: The user's movie ids in chronological order.
            genres_list: Genres aligned with ``pos_list``.
            rating_list: Ratings aligned with ``pos_list``.
            neg_list: Pre-sampled negative movie ids (unused when ``negsample`` is 0).
            item_id_genres_map: Mapping from movie id to genres, used for negatives.
        """
        last_index = len(pos_list) - 1
        for i in range(1, len(pos_list)):
            hist = pos_list[:i]
            genres_hist = genres_list[:i]
            seq_len = min(i, self.seq_max_len)

            if i == last_index:
                test_set.append(self.create_entry(
                    reviewerID, pos_list, genres_list, rating_list, hist, genres_hist, seq_len, i, 1))
                continue

            train_set.append(self.create_entry(
                reviewerID, pos_list, genres_list, rating_list, hist, genres_hist, seq_len, i, 1))

            for negi in range(self.negsample):
                neg_id = neg_list[i * self.negsample + negi]
                # Pass the sampled id explicitly: looking the item up as
                # neg_list[i] would store a different movie than the one whose
                # genres are attached, and would reuse negatives across steps.
                train_set.append(self.create_entry(
                    reviewerID, neg_list, genres_list, rating_list, hist, genres_hist, seq_len, i, 0,
                    genres_id=item_id_genres_map[neg_id], item_id=neg_id))

    @staticmethod
    def create_entry(reviewerID, mov_list, genres_list, rating_list, hist, genres_hist, seq_len, index, label,
                     genres_id=None, item_id=None):
        """Assemble a single sample tuple.

        Args:
            reviewerID: User id of the sample.
            mov_list: List the target movie id is read from when ``item_id`` is not given.
            genres_list: List the target genres are read from when ``genres_id`` is not given.
            rating_list: Ratings; ``rating_list[index]`` is stored in the sample
                (for negatives this is the rating of the positive at the same step).
            hist: Chronological movie-id history preceding the target.
            genres_hist: Genres aligned with ``hist``.
            seq_len: Number of most recent history items to keep.
            index: Position of the target inside the per-user lists.
            label: 1 for a positive sample, 0 for a negative one.
            genres_id: Optional explicit genres for the target item.
            item_id: Optional explicit target movie id; defaults to ``mov_list[index]``.

        Returns:
            ``(reviewerID, item_id, label, reversed_hist, seq_len,
            reversed_genres_hist, genres_id, rating)``.
        """
        if genres_id is None:
            genres_id = genres_list[index]
        if item_id is None:
            item_id = mov_list[index]
        return (
            reviewerID,
            item_id,
            label,
            hist[::-1][:seq_len],  # most recent interaction first
            seq_len,
            genres_hist[::-1][:seq_len],
            genres_id,
            rating_list[index]
        )

    def gen_model_input(self, train_set, user_profile):
        """Convert sample tuples into a feature dict plus a label array.

        Args:
            train_set: List of sample tuples (train or test).
            user_profile: DataFrame indexed by user id with the columns
                ``gender``, ``age``, ``occupation`` and ``zip``.

        Returns:
            ``(model_input, labels)`` where ``model_input`` maps feature names to
            arrays and ``labels`` is the array of sample labels.
        """
        train_model_input = self.extract_features(train_set)
        train_model_input.update(self.add_user_profile_features(train_model_input['user_id'], user_profile))
        train_label = np.array([line[2] for line in train_set])
        return train_model_input, train_label

    def extract_features(self, train_set):
        """Extract ids, history sequences, history lengths and genres from the samples.

        History sequences are post-padded / post-truncated with 0 to ``seq_max_len``.
        """
        train_uid = np.array([line[0] for line in train_set])
        train_iid = np.array([line[1] for line in train_set])
        train_seq = [line[3] for line in train_set]
        train_hist_len = np.array([line[4] for line in train_set])
        train_seq_genres = [line[5] for line in train_set]
        train_genres = np.array([line[6] for line in train_set])

        train_seq_pad = pad_sequences(train_seq, maxlen=self.seq_max_len, padding='post', truncating='post', value=0)
        train_seq_genres_pad = pad_sequences(train_seq_genres, maxlen=self.seq_max_len, padding='post', truncating='post', value=0)

        return {
            "user_id": train_uid,
            "movie_id": train_iid,
            "hist_movie_id": train_seq_pad,
            "hist_genres": train_seq_genres_pad,
            "hist_len": train_hist_len,
            "genres": train_genres
        }

    @staticmethod
    def add_user_profile_features(user_ids, user_profile):
        """Look up the profile columns for every user id, in sample order.

        Args:
            user_ids: Array of user ids, one per sample.
            user_profile: DataFrame indexed by user id.

        Returns:
            Dict mapping ``gender``, ``age``, ``occupation`` and ``zip`` to value arrays.
        """
        # Resolve the rows once instead of repeating the label lookup per feature.
        profile_rows = user_profile.loc[user_ids]
        return {
            feature: profile_rows[feature].values
            for feature in ["gender", "age", "occupation", "zip"]
        }

# 读取数据

In [3]:
class MoiveLenDataLoader:
    """Load the MovieLens-1M ``.dat`` files and merge them into one ratings frame.

    Args:
        data_path: Directory that contains the extracted ``ml-1m`` folder. A
            trailing path separator is optional.
    """

    def __init__(self, data_path="./"):
        self.data_path = data_path
        self.user_cols = ['user_id', 'gender', 'age', 'occupation', 'zip']
        self.rating_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
        self.movie_cols = ['movie_id', 'title', 'genres']

    def _read_dat(self, filename, names, **read_kwargs):
        """Read one ``::``-separated, header-less file from ``<data_path>/ml-1m``."""
        import os  # local import keeps the notebook cell self-contained

        # os.path.join copes with data_path values that lack a trailing slash,
        # which plain string concatenation silently turned into a wrong path.
        file_path = os.path.join(self.data_path, "ml-1m", filename)
        # The multi-character separator requires the python parsing engine.
        return pd.read_csv(file_path, sep='::', header=None, names=names, engine='python', **read_kwargs)

    def load_users(self):
        """Load ``users.dat`` with the columns listed in ``user_cols``."""
        return self._read_dat("users.dat", self.user_cols)

    def load_ratings(self):
        """Load ``ratings.dat`` with the columns listed in ``rating_cols``."""
        return self._read_dat("ratings.dat", self.rating_cols)

    def load_movies(self):
        """Load ``movies.dat`` and reduce ``genres`` to the first listed genre.

        The ``genres`` column holds ``|``-separated values; only the first one is kept.
        """
        movies = self._read_dat("movies.dat", self.movie_cols, encoding="unicode_escape")
        movies['genres'] = movies['genres'].map(lambda x: x.split('|')[0])
        return movies

    def load_data(self):
        """Load all three files and merge them.

        Returns:
            DataFrame of ratings joined with movies on ``movie_id`` and with
            users on ``user_id`` (pandas default join type).
        """
        users = self.load_users()
        ratings = self.load_ratings()
        movies = self.load_movies()

        data = pd.merge(ratings, movies, on='movie_id')
        data = pd.merge(data, users, on='user_id')
        return data

# Usage example: load and merge users, ratings and movies from ./ml-1m
data_loader = MoiveLenDataLoader(data_path="./")
data = data_loader.load_data()

In [4]:
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,F,1,10,48067
1,1,661,3,978302109,James and the Giant Peach (1996),Animation,F,1,10,48067
2,1,914,3,978301968,My Fair Lady (1964),Musical,F,1,10,48067
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,F,1,10,48067
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation,F,1,10,48067


# 构建特征列，训练模型，导出embedding

In [5]:
# Maximum history length kept per sample (passed as seq_max_len below).
SEQ_LEN = 50
# Mini-batch size used when fitting the model.
BATCH_SIZE = 1024
# Explicit negatives per positive sample; 0 means the processor generates none.
negsample = 0

data_processor = MovieLenDataProcessor(seq_max_len=SEQ_LEN, negsample=negsample)

In [6]:
# 1. Label-encode the sparse features, and process sequence features with `gen_data_set` and `gen_model_input`

class FeatureProcessor:
    """Label-encode sparse columns on a private copy of the data and derive profile views.

    Args:
        data: Source DataFrame; it is copied, so the caller's frame is not modified.
        sparse_features: Names of the columns to label-encode.
    """

    def __init__(self, data, sparse_features):
        self.data = data.copy()
        self.sparse_features = sparse_features
        self.feature_max_idx = {}

    def label_encode_sparse_features(self):
        """Replace every sparse column by 1-based integer codes.

        Codes are shifted by one so that 0 never appears as an encoded value;
        ``feature_max_idx[name]`` is set to the largest code plus one.
        """
        frame = self.data
        for name in self.sparse_features:
            encoder = LabelEncoder()
            frame[name] = encoder.fit_transform(frame[name]) + 1
            self.feature_max_idx[name] = frame[name].max() + 1

    def extract_user_item_profiles(self):
        """Return the per-user profile table and the table of distinct movies.

        Returns:
            ``(user_profile, item_profile)``: ``user_profile`` is indexed by
            ``user_id`` and keeps the first row seen for each user;
            ``item_profile`` has one ``movie_id`` row per distinct movie.
        """
        user_columns = ["user_id", "gender", "age", "occupation", "zip"]
        distinct_users = self.data[user_columns].drop_duplicates('user_id')
        user_profile = distinct_users.set_index("user_id")

        item_profile = self.data[["movie_id"]].drop_duplicates('movie_id')
        return user_profile, item_profile

    def generate_user_item_lists(self):
        """Return a Series mapping each ``user_id`` to the list of its ``movie_id`` values."""
        movies_by_user = self.data.groupby("user_id")['movie_id']
        return movies_by_user.apply(list)

    def process_features(self):
        """Encode the sparse columns, then build the profiles and interaction lists.

        Returns:
            ``(user_profile, item_profile, user_item_list)``.
        """
        self.label_encode_sparse_features()
        profiles = self.extract_user_item_profiles()
        interactions = self.generate_user_item_lists()
        return profiles[0], profiles[1], interactions

# Columns treated as sparse features and label-encoded by FeatureProcessor
sparse_features = ["movie_id", "user_id",
                    "gender", "age", "occupation", "zip", "genres"]

# Create the feature processor and run the feature processing
feature_processor = FeatureProcessor(data, sparse_features)

user_profile, item_profile, user_item_list = feature_processor.process_features()
# Build the samples from the processor's encoded copy rather than the raw `data` frame
train_set, test_set = data_processor.gen_data_set(feature_processor.data)
train_model_input, train_label = data_processor.gen_model_input(train_set, user_profile)
test_model_input, test_label = data_processor.gen_model_input(test_set, user_profile)

100%|██████████| 6040/6040 [00:07<00:00, 844.38it/s] 


Train set entry size: 8, Test set entry size: 8


In [9]:
# 2.count #unique features for each sparse field and generate feature config for sequence feature

from collections import Counter

class FeatureConfig:
    """Build the user-side and item-side feature column lists for the model.

    Args:
        feature_max_idx: Mapping from feature name to the size value handed to
            each feature column.
        embedding_dim: Embedding size used for the movie and genres columns.
        seq_len: Length passed to the two history sequence columns.
    """

    def __init__(self, feature_max_idx, embedding_dim=32, seq_len=50):
        self.feature_max_idx = feature_max_idx
        self.embedding_dim = embedding_dim
        self.seq_len = seq_len

    def get_user_feature_columns(self):
        """Return the user feature columns.

        Five profile columns (fixed size argument 16) come first, followed by
        the ``hist_movie_id`` and ``hist_genres`` sequence columns. The sequence
        columns name ``movie_id`` / ``genres`` as their ``embedding_name`` and
        are built with ``'mean'`` and ``'hist_len'``.
        """
        sizes = self.feature_max_idx

        profile_columns = [
            SparseFeat(name, sizes[name], 16)
            for name in ('user_id', 'gender', 'age', 'occupation', 'zip')
        ]

        hist_movie_feat = SparseFeat('hist_movie_id', sizes['movie_id'], self.embedding_dim,
                                     embedding_name="movie_id")
        hist_movie_column = VarLenSparseFeat(hist_movie_feat, self.seq_len, 'mean', 'hist_len')

        hist_genres_feat = SparseFeat('hist_genres', sizes['genres'], self.embedding_dim,
                                      embedding_name="genres")
        hist_genres_column = VarLenSparseFeat(hist_genres_feat, self.seq_len, 'mean', 'hist_len')

        return profile_columns + [hist_movie_column, hist_genres_column]

    def get_item_feature_columns(self):
        """Return the item feature columns: a single ``movie_id`` column."""
        movie_column = SparseFeat('movie_id', self.feature_max_idx['movie_id'], self.embedding_dim)
        return [movie_column]

class NegativeSamplerConfig:
    """Collect per-item training frequencies and build the negative sampler from them.

    Args:
        train_model_input: Model input dict; only its ``'movie_id'`` entry is read.
        item_feature_columns: Item feature columns; the first one's
            ``vocabulary_size`` sets the length of ``item_count``.
    """

    def __init__(self, train_model_input, item_feature_columns):
        self.train_counter = Counter(train_model_input['movie_id'])
        vocabulary_size = item_feature_columns[0].vocabulary_size
        # A Counter yields 0 for ids that never occur, without inserting them.
        self.item_count = [self.train_counter[item_id] for item_id in range(vocabulary_size)]

    def get_sampler_config(self, num_sampled=255):
        """Return a ``'frequency'`` NegativeSampler over ``movie_id`` using ``item_count``.

        Args:
            num_sampled: Value forwarded as the sampler's ``num_sampled``.
        """
        return NegativeSampler(
            'frequency',
            num_sampled=num_sampled,
            item_name="movie_id",
            item_count=self.item_count,
        )
    
# Embedding size shared by the movie / genres columns and the final user DNN layer
embedding_dim = 32
feature_max_idx = feature_processor.feature_max_idx
    
# Create the FeatureConfig instance and build the user / item feature columns
feature_config = FeatureConfig(feature_max_idx, embedding_dim, SEQ_LEN)
user_feature_columns = feature_config.get_user_feature_columns()
item_feature_columns = feature_config.get_item_feature_columns()

# Create the NegativeSamplerConfig instance and build the sampler from training item counts
negative_sampler_config = NegativeSamplerConfig(train_model_input, item_feature_columns)
sampler_config = negative_sampler_config.get_sampler_config(num_sampled=255)

In [10]:
# 3. Define the model and train it

import tensorflow as tf

# Compare the major version numerically: comparing the version strings orders
# them lexicographically, so e.g. "10.0.0" would sort before "2.0.0".
if int(tf.__version__.split('.')[0]) >= 2:
    tf.compat.v1.disable_eager_execution()
else:
    K.set_learning_phase(True)

# The last user DNN layer has `embedding_dim` units.
# NOTE(review): presumably it must match the item embedding size for the inner-product search later - confirm.
model = YoutubeDNN(user_feature_columns, item_feature_columns,
                   user_dnn_hidden_units=(128, 64, embedding_dim), sampler_config=sampler_config)

model.compile(optimizer="adam", loss=sampledsoftmaxloss)

history = model.fit(train_model_input, train_label,
                    batch_size=BATCH_SIZE, epochs=40, verbose=1, validation_split=0.0)

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Train on 988129 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [11]:
# 4. Generate user features for testing and full item features for retrieval
test_user_model_input = test_model_input
# Every distinct (encoded) movie id, in item_profile order
all_item_model_input = {"movie_id": item_profile['movie_id'].values,}

# Sub-models built from the trained model's user / item inputs and embedding outputs
user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)
item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)

user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12)
# user_embs = user_embs[:, i, :]  # i in [0,k_max) if MIND
item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)

print(user_embs.shape)
print(item_embs.shape)

(6040, 32)
(3706, 32)




# 使用faiss进行ANN查找并评估结果

In [12]:
import numpy as np
import faiss
from tqdm import tqdm
from deepmatch.utils import recall_N

# Ground truth per test user: a one-element list holding the held-out movie id.
test_true_label = {line[0]: [line[1]] for line in test_set}

# Exact inner-product index over all item embeddings.
index = faiss.IndexFlatIP(embedding_dim)
# Uncomment both normalize_L2 calls to search by cosine similarity instead.
# faiss.normalize_L2(item_embs)
index.add(item_embs)
# faiss.normalize_L2(user_embs)
D, I = index.search(np.ascontiguousarray(user_embs), 50)

# Hoisted out of the loop: row position in the index -> encoded movie id.
item_ids = item_profile['movie_id'].values
test_user_ids = test_user_model_input['user_id']

s = []
hit = 0
# Wrap the iterable (not the enumerate object) so tqdm knows the total.
for i, uid in enumerate(tqdm(test_user_ids)):
    true_items = test_true_label.get(uid)
    if true_items is None:
        # No ground truth for this user: report its position and skip it,
        # rather than hiding every possible error behind a bare except.
        print(i)
        continue
    pred = [item_ids[x] for x in I[i]]
    s.append(recall_N(true_items, pred, N=50))
    # Test each true item for membership; testing the list itself against the
    # scalar predictions only worked through numpy's broadcasting comparison.
    if any(item in pred for item in true_items):
        hit += 1
print("")
print("recall", np.mean(s))
print("hit rate", hit / len(test_user_ids))

6040it [00:00, 6461.68it/s]


recall 0.3564569536423841
hit rate 0.3564569536423841



