# DeepMatch 样例代码
- https://github.com/shenweichen/DeepMatch
- https://deepmatch.readthedocs.io/en/latest/

# 下载movielens-1M数据 安装依赖包

In [None]:
! wget http://files.grouplens.org/datasets/movielens/ml-1m.zip -O ./ml-1m.zip
! unzip -o ml-1m.zip

# 导入需要的库

In [1]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm

from deepctr.feature_column import SparseFeat, VarLenSparseFeat
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from deepmatch.models import *
from deepmatch.utils import sampledsoftmaxloss, NegativeSampler

# 读取数据

In [2]:
class MoiveLenDataLoader:
    def __init__(self, data_path="./"):
        self.data_path = data_path
        self.user_cols = ['user_id', 'gender', 'age', 'occupation', 'zip']
        self.rating_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
        self.movie_cols = ['movie_id', 'title', 'genres']

    def load_users(self):
        """加载用户数据"""
        user_file = f"{self.data_path}ml-1m/users.dat"
        return pd.read_csv(user_file, sep='::', header=None, names=self.user_cols, engine='python')

    def load_ratings(self):
        """加载评分数据"""
        ratings_file = f"{self.data_path}ml-1m/ratings.dat"
        return pd.read_csv(ratings_file, sep='::', header=None, names=self.rating_cols, engine='python')

    def load_movies(self):
        """加载电影数据，并处理电影类型"""
        movies_file = f"{self.data_path}ml-1m/movies.dat"
        movies = pd.read_csv(movies_file, sep='::', header=None, names=self.movie_cols, encoding="unicode_escape", engine='python')
        movies['genres'] = movies['genres'].map(lambda x: x.split('|')[0])
        return movies

    def load_data(self):
        """加载并合并所有数据"""
        self.users = self.load_users()
        self.ratings = self.load_ratings()
        self.movies = self.load_movies()
        
        # 对3个表的数据执行inner join
        data = pd.merge(self.ratings, self.movies, on='movie_id')
        data = pd.merge(data, self.users, on='user_id')
        return data

# 使用示例
data_loader = MoiveLenDataLoader(data_path="./")
data = data_loader.load_data()
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,F,1,10,48067
1,1,661,3,978302109,James and the Giant Peach (1996),Animation,F,1,10,48067
2,1,914,3,978301968,My Fair Lady (1964),Musical,F,1,10,48067
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,F,1,10,48067
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation,F,1,10,48067


In [3]:
# 1. Movie Len 数据集处理方法
class MovieLenDataProcessor:

    @staticmethod
    def gen_data_set(data, seq_max_len):
        """生成训练样本集"""
        
        # 按时间顺序对用户行为进行排序
        data.sort_values("timestamp", inplace=True)

        all_samples = []
                
        for user_id, hist in tqdm(data.groupby('user_id')):
            # 根据每个用户的历史行为数据生成训练样本
            user_samples = MovieLenDataProcessor._gen_user_samples(user_id, hist, seq_max_len)
            all_samples.extend(user_samples)

        # 打乱顺序
        random.shuffle(all_samples)
        
        # 定义样本数据集的列名称
        columns = ['user_id', 'movie_id', 'genres_id', 'label', 
                   'movie_seq', 'seq_len', 'genre_seq', 'rating', 'timestamp']

        # 创建空 DataFrame  
        train_df = pd.DataFrame(all_samples, columns=columns)
        
        return train_df
    
    
    @staticmethod
    def _gen_user_samples(user_id, history, seq_max_len):
        user_movie_list = history['movie_id'].tolist()
        genre_list = history['genres'].tolist()
        rating_list = history['rating'].tolist()
        timestamp_list = history['timestamp'].tolist()
        
        train_samples = []
        
        for index in range(1, len(user_movie_list)):
            # 序列长度，不能超过最大序列长度
            seq_len = min(index, seq_max_len)
            
            movie_id = user_movie_list[index]
            genres_id = genre_list[index]
            rating = rating_list[index]
            label = 1 # 正样本
            timestamp = timestamp_list[index]
            
            # 历史行为序列特征
            movie_seq = user_movie_list[:index]
            genre_seq = genre_list[:index]
            
            # 生成的条目为一个元组，包含了用户的各类交互信息
            sample_entry = (
                user_id,  # 样本用户ID
                movie_id,  # 样本电影ID
                genres_id,  # 样本电影的类型ID
                label,  # 用户对电影的反馈标签（如喜欢或点击）
                movie_seq[::-1][:seq_len],  # 截断的历史电影ID列表，顺序为先逆序再截断
                seq_len,  # 历史记录的长度
                genre_seq[::-1][:seq_len],  # 截断的历史电影类型列表，顺序为先逆序再截断
                rating,  # 当前电影的评分
                timestamp # 行为时间戳
            )
            
            train_samples.append(sample_entry)
            
        return train_samples
            
    @staticmethod
    def gen_model_input(train_samples, user_profile):
        """生成模型输入数据和标签"""       
        MovieLenDataProcessor.extract_features(train_samples, 50)
        train_model_input = self.extract_features(train_samples)
        train_model_input.update(self.add_user_profile_features(train_model_input['user_id'], user_profile))
        train_label = np.array(train_samples["label"].values)
        
        return train_model_input, train_label
    
    @staticmethod
    def extract_features(train_set, seq_max_len):
        """从训练集提取用户和电影ID、历史记录、电影类型等特征"""
        train_uid = np.array(train_set["user_id"].values)
        train_iid = np.array(train_set["movie_id"])
        train_genres = np.array(train_set["genres_id"])
        train_hist_len = np.array(train_set['seq_len'].values)
        train_seq_movie = train_set["movie_seq"].values
        train_seq_genres = train_set["genre_seq"].values  
        

        train_seq_movie_pad = pad_sequences(train_seq_movie, maxlen=seq_max_len, padding='post', truncating='post', value=0)
        train_seq_genres_pad = pad_sequences(train_seq_genres, maxlen=seq_max_len, padding='post', truncating='post', value=0)

        return {
            "user_id": train_uid,
            "movie_id": train_iid,
            "hist_movie_id": train_seq_movie_pad,
            "hist_genres": train_seq_genres_pad,
            "hist_len": train_hist_len,
            "genres": train_genres
        }

    @staticmethod
    def map_user_profile_features(user_ids, user_profile):
        """根据用户ID添加用户特征"""
        user_features = {}
        for feature in ["gender", "age", "occupation", "zip"]:
            user_features[feature] = user_profile.loc[user_ids][feature].values
        return user_features

    @staticmethod
    def label_encode_sparse_features(data, sparse_features):
        """对稀疏特征进行 Label Encoding 并生成特征最大索引值"""
        _data = data.copy()
        feature_max_idx = {}
        
        for feature in sparse_features:
            lbe = LabelEncoder()
            _data[feature] = lbe.fit_transform(_data[feature]) + 1
            feature_max_idx[feature] = _data[feature].max() + 1
        
        return _data, feature_max_idx
    
    @staticmethod
    def extract_user_profiles(data):
        user_profile = data[["user_id", "gender", "age", "occupation", "zip"]].drop_duplicates('user_id').set_index("user_id")
        return user_profile
    
    @staticmethod
    def extract_item_profiles(data):
        item_profile = data[["movie_id"]].drop_duplicates('movie_id')
        return item_profile
    
    @staticmethod
    def generate_user_item_lists(data):
        """生成用户的物品交互列表"""
        return data.groupby("user_id")['movie_id'].apply(list)

In [34]:
# 数据预处理：标签化 + 用户行为采样
sparse_features = ["movie_id", "user_id",
                    "gender", "age", "occupation", "zip", "genres"]

data_label_encode, feature_max_idx = MovieLenDataProcessor.label_encode_sparse_features(data, sparse_features)
all_samples = MovieLenDataProcessor.gen_data_set(data_label_encode, 50)

# 将 rating 小于 3 的行的 label 设置为 0
all_samples.loc[all_samples['rating'] < 3, 'label'] = 0
all_samples.head()

100%|██████████| 6040/6040 [00:06<00:00, 885.28it/s] 


Unnamed: 0,user_id,movie_id,genres_id,label,movie_seq,seq_len,genre_seq,rating,timestamp
0,4658,613,3,0,"[233, 1853, 1912, 1376, 2370, 966, 1943, 48, 3...",50,"[3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...",1,963966876
1,2976,288,6,1,"[285, 1059, 1108, 3456, 584, 1338, 3295, 1965,...",50,"[6, 5, 1, 1, 5, 8, 1, 5, 8, 8, 8, 5, 5, 5, 2, ...",5,971022119
2,2496,1905,3,0,"[1892, 3671, 3672, 3673, 3677, 3684, 3682, 368...",18,"[5, 11, 11, 11, 11, 11, 5, 11, 2, 11, 11, 8, 1...",2,974090728
3,5220,514,8,1,"[343, 470, 3476, 3384, 3249, 1834, 2248, 1167,...",35,"[5, 8, 1, 1, 5, 1, 8, 11, 11, 11, 11, 1, 8, 5,...",5,961540508
4,5499,2203,5,1,"[1180, 1298, 1450, 627, 461, 1744, 2709, 348, ...",43,"[8, 6, 1, 8, 1, 5, 1, 5, 1, 8, 6, 8, 5, 1, 5, ...",4,959797713


In [48]:
# 根据 'timestamp' 列进行降序排列，拆分训练集与测试集
N_TEST_SAMPLE = 100000

all_samples.sort_values(by='timestamp', ascending=False, inplace=True)

# 仅选择正向样本
test_samples = all_samples.head(N_TEST_SAMPLE)
test_samples = test_samples[test_samples["label"] == 1]

# 剩余数据作为训练数据（包含正向与负向）
train_samples = all_samples.tail(len(all_samples) - N_TEST_SAMPLE)

train_samples.sample(frac=1).reset_index(drop=True)

Unnamed: 0,user_id,movie_id,genres_id,label,movie_seq,seq_len,genre_seq,rating,timestamp
0,5659,738,1,1,"[284, 2885, 217, 1987, 1909, 1018, 936, 2199, ...",50,"[1, 8, 5, 1, 3, 6, 6, 6, 8, 1, 1, 1, 5, 1, 5, ...",4,958790721
1,4510,2587,5,1,"[2980, 1733, 328, 869, 1694, 32, 516, 2027, 11...",50,"[5, 1, 8, 16, 8, 8, 8, 5, 1, 5, 8, 5, 3, 5, 8,...",4,965624115
2,3683,1827,1,0,"[508, 2566, 1697, 1488, 338, 335, 1293, 1571, ...",50,"[6, 5, 1, 6, 8, 5, 1, 11, 8, 16, 5, 1, 8, 1, 1...",1,966523231
3,4520,1306,11,1,"[1742, 2932, 1667, 1815, 1479, 1517, 47, 3295,...",50,"[15, 8, 5, 11, 1, 8, 6, 1, 8, 11, 1, 6, 2, 1, ...",4,964883752
4,1017,2722,8,0,"[2826, 34, 3295, 3239, 145, 893, 256, 3486, 31...",50,"[5, 4, 1, 8, 8, 8, 4, 5, 8, 5, 6, 11, 18, 8, 5...",1,975009808
...,...,...,...,...,...,...,...,...,...
894164,4575,709,3,1,"[2658, 999, 2150, 2730, 809, 715, 83, 2570, 10...",50,"[5, 5, 5, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, ...",5,964446371
894165,1950,2043,15,1,"[370, 3456, 1473, 1738, 499, 320, 1697, 1833, ...",31,"[1, 1, 15, 1, 11, 1, 1, 5, 15, 1, 1, 1, 1, 8, ...",4,974691698
894166,4520,288,6,1,"[310, 1251, 700, 2074, 2886, 1246, 224, 3244, ...",50,"[8, 11, 1, 16, 16, 16, 8, 11, 6, 11, 15, 8, 5,...",5,965928403
894167,5672,1469,1,0,"[2568, 2148, 860, 1121, 3221, 2339, 3215]",7,"[5, 14, 8, 1, 1, 1, 5]",1,958686394


In [37]:
# 构建训练数据
user_profiles = MovieLenDataProcessor.extract_user_profiles(data_label_encode)
train_model_input = MovieLenDataProcessor.extract_features(train_samples, 50) # 列存储形式
train_user_profile_intput = MovieLenDataProcessor.map_user_profile_features(train_model_input['user_id'], user_profiles)
train_model_input.update(train_user_profile_intput) # 添加user profile特征
train_label = np.array(train_samples["label"].values)

# 构建特征列，训练模型，导出embedding

In [38]:
SEQ_LEN = 50
BATCH_SIZE = 1024
N_EPOCHS = 20
negsample = 0

In [39]:
# 2.count #unique features for each sparse field and generate feature config for sequence feature

from collections import Counter

class FeatureConfig:
    def __init__(self, feature_max_idx, embedding_dim=32, seq_len=50):
        self.feature_max_idx = feature_max_idx
        self.embedding_dim = embedding_dim
        self.seq_len = seq_len
        
    def get_user_feature_columns(self):
        """生成用户特征列配置"""
        return [
            SparseFeat('user_id', self.feature_max_idx['user_id'], 16),
            SparseFeat("gender", self.feature_max_idx['gender'], 16),
            SparseFeat("age", self.feature_max_idx['age'], 16),
            SparseFeat("occupation", self.feature_max_idx['occupation'], 16),
            SparseFeat("zip", self.feature_max_idx['zip'], 16),
            VarLenSparseFeat(SparseFeat('hist_movie_id', self.feature_max_idx['movie_id'], self.embedding_dim,
                                        embedding_name="movie_id"), self.seq_len, 'mean', 'hist_len'),
            VarLenSparseFeat(SparseFeat('hist_genres', self.feature_max_idx['genres'], self.embedding_dim,
                                        embedding_name="genres"), self.seq_len, 'mean', 'hist_len')
        ]

    def get_item_feature_columns(self):
        """生成物品特征列配置"""
        return [SparseFeat('movie_id', self.feature_max_idx['movie_id'], self.embedding_dim)]

class NegativeSamplerConfig:
    def __init__(self, train_model_input, item_feature_columns):
        self.train_counter = Counter(train_model_input['movie_id'])
        self.item_count = [self.train_counter.get(i, 0) for i in range(item_feature_columns[0].vocabulary_size)]

    def get_sampler_config(self, num_sampled=255):
        """生成负采样器配置"""
        return NegativeSampler('frequency', num_sampled=num_sampled, item_name="movie_id", item_count=self.item_count)
    
embedding_dim = 32
    
# 创建 FeatureConfig 实例
feature_config = FeatureConfig(feature_max_idx, embedding_dim, SEQ_LEN)
user_feature_columns = feature_config.get_user_feature_columns()
item_feature_columns = feature_config.get_item_feature_columns()

# 创建 NegativeSamplerConfig 实例
negative_sampler_config = NegativeSamplerConfig(train_model_input, item_feature_columns)
sampler_config = negative_sampler_config.get_sampler_config(num_sampled=255)

In [40]:
# 3.Define Model and train

import tensorflow as tf
if tf.__version__ >= '2.0.0':
    tf.compat.v1.disable_eager_execution()
else:
    K.set_learning_phase(True)

model = YoutubeDNN(user_feature_columns, item_feature_columns, user_dnn_hidden_units=(128,64, embedding_dim), sampler_config=sampler_config)

model.compile(optimizer="adam", loss=sampledsoftmaxloss)

history = model.fit(train_model_input, train_label,  # train_label,
                    batch_size=BATCH_SIZE, epochs=N_EPOCHS, verbose=1, validation_split=0.0, )

Train on 894169 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [49]:
item_profiles = MovieLenDataProcessor.extract_item_profiles(data_label_encode)
test_model_input = MovieLenDataProcessor.extract_features(test_samples, 50) # 列存储形式
test_user_profile_intput = MovieLenDataProcessor.map_user_profile_features(test_model_input['user_id'], user_profiles)
test_model_input.update(test_user_profile_intput) # 添加user profile特征

In [50]:
# 4. Generate user features for testing and full item features for retrieval
test_user_model_input = test_model_input
all_item_model_input = {"movie_id": item_profiles['movie_id'].values,}

user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)
item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)

user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12)
# user_embs = user_embs[:, i, :]  # i in [0,k_max) if MIND
item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)

print(user_embs.shape)
print(item_embs.shape)



(82630, 32)
(3706, 32)


# 使用faiss进行ANN查找并评估结果

In [53]:
test_true_label = {row.user_id:[row.movie_id] for row in test_samples.itertuples(index=False)}

import numpy as np
import faiss
from tqdm import tqdm
from deepmatch.utils import recall_N

TOP_N = 100

index = faiss.IndexFlatIP(embedding_dim)
# faiss.normalize_L2(item_embs)
index.add(item_embs)
# faiss.normalize_L2(user_embs)
D, I = index.search(np.ascontiguousarray(user_embs), TOP_N)
s = []
hit = 0
for i, uid in tqdm(enumerate(test_user_model_input['user_id'])):
    try:
        pred_movie_ids = [item_profiles['movie_id'].values[x] for x in I[i]]
        filter_item = None
        recall_score = recall_N(test_true_label[uid], pred_movie_ids, N=TOP_N)
        s.append(recall_score)
        if test_true_label[uid] in pred_movie_ids:
            hit += 1
    except:
        print(i)
print("")
print("recall", np.mean(s))
print("hit rate", hit / len(test_user_model_input['user_id']))

82630it [00:25, 3189.99it/s]


recall 0.18965266852232845
hit rate 0.18965266852232845



