# 资料参考
- [Deep Neural Networks for YouTube Recommendations](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45530.pdf)

# 导入需要的库

In [1]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm

from deepctr.feature_column import SparseFeat, VarLenSparseFeat, get_feature_names
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from deepmatch.models import *
from deepmatch.utils import sampledsoftmaxloss, NegativeSampler

# 读取数据

In [2]:
import json

train_df = pd.read_parquet('ml1M-train.parquet', engine='pyarrow')
test_df = pd.read_parquet('ml1M-test.parquet', engine='pyarrow')

train_df['label'] = 1
test_df['label'] = 1

with open('ml1M_feature_max_idx.json', 'r') as json_file:
    feature_max_idx = json.load(json_file)
    
train_df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip,watch_movie_seq,watch_genre_seq,seq_len,label
0,5505,355,4,959732229,"Lion King, The (1994)",3,1,3,5,107,"[581, 741, 958, 1916, 1839, 968, 1, 2899, 2484...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 6, 6, 10,...",33,1
1,4446,1125,2,965089335,Alien (1979),1,2,2,19,2047,"[2064, 3208, 633, 1963, 3257, 2, 309, 59, 1788...","[1, 1, 1, 2, 2, 2, 4, 2, 2, 1, 2, 2, 5, 3, 2, ...",50,1
2,2231,673,5,974602217,Faces (1968),8,2,6,12,2680,"[3427, 2953, 3546, 2799, 3185, 3229, 149, 3237...","[8, 2, 8, 8, 8, 2, 8, 8, 8, 8, 6, 8, 8, 8, 8, ...",50,1
3,4942,2439,4,962642043,Superman (1978),1,2,5,13,1356,"[1730, 2427, 2262, 2461, 2215, 3459, 2335, 151...","[13, 1, 11, 1, 5, 8, 1, 8, 1, 1, 1, 2, 5, 1, 1...",44,1
4,1067,1766,4,974952597,On the Waterfront (1954),6,2,4,13,3082,"[2480, 1140, 241, 2786, 3271, 864, 1130, 2899,...","[8, 5, 8, 5, 8, 8, 11, 3, 15, 15, 10, 8, 5, 3,...",50,1


In [3]:
def get_model_input(data, seq_max_len = 50):
    fixlen_feature_columns = ['user_id', 'movie_id', 'genres', 
                               'rating','gender', 'age', 
                              'occupation', 'zip', "seq_len"]

    varlen_feature_columns = ['watch_movie_seq', 'watch_genre_seq']

    _dict = data[fixlen_feature_columns].to_dict(orient='list')

    for feature in fixlen_feature_columns:
        _dict[feature] = np.array(_dict[feature])

    for feature in varlen_feature_columns:
        _dict[feature] = \
            pad_sequences(data[feature].values, maxlen=seq_max_len, padding='post', truncating='post', value=0)
        
    return _dict

In [4]:
# 构建训练数据的输入
train_model_input = get_model_input(train_df)
train_label = np.array(train_df["label"].values)

# 构建特征列，训练模型，导出embedding

In [30]:
BATCH_SIZE = 1024
N_EPOCHS = 20
SEQ_LEN = 50
embedding_dim = 32

In [31]:
# 2.count #unique features for each sparse field and generate feature config for sequence feature

from collections import Counter

class FeatureConfig:
    def __init__(self, feature_max_idx, embedding_dim=32, seq_len=50):
        self.feature_max_idx = feature_max_idx
        self.embedding_dim = embedding_dim
        self.seq_len = seq_len
        
    def get_user_feature_columns(self):
        """生成用户特征列配置"""
        return [
            SparseFeat('user_id', self.feature_max_idx['user_id'], 16),
            SparseFeat("gender", self.feature_max_idx['gender'], 16),
            SparseFeat("age", self.feature_max_idx['age'], 16),
            SparseFeat("occupation", self.feature_max_idx['occupation'], 16),
            SparseFeat("zip", self.feature_max_idx['zip'], 16),
            SparseFeat("seq_len", SEQ_LEN + 1, 16),
            VarLenSparseFeat(SparseFeat('watch_movie_seq', self.feature_max_idx['movie_id'], self.embedding_dim,
                                        embedding_name="movie_ebm"), self.seq_len, 'mean'),
            VarLenSparseFeat(SparseFeat('watch_genre_seq', self.feature_max_idx['genres'], self.embedding_dim,
                                        embedding_name="genre_ebm"), self.seq_len, 'mean'),
        ]

    def get_item_feature_columns(self):
        """生成物品特征列配置"""
        return [SparseFeat('movie_id', self.feature_max_idx['movie_id'], self.embedding_dim)]

class NegativeSamplerConfig:
    def __init__(self, train_model_input, item_feature_columns):
        self.train_counter = Counter(train_model_input['movie_id'])
        self.item_count = [self.train_counter.get(i, 0) for i in range(item_feature_columns[0].vocabulary_size)]

    def get_sampler_config(self, num_sampled=255):
        """生成负采样器配置"""
        return NegativeSampler('frequency', num_sampled=num_sampled, item_name="movie_id", item_count=self.item_count)
    
# 创建 FeatureConfig 实例
feature_config = FeatureConfig(feature_max_idx, embedding_dim, SEQ_LEN)
user_feature_columns = feature_config.get_user_feature_columns()
item_feature_columns = feature_config.get_item_feature_columns()

# 创建 NegativeSamplerConfig 实例
negative_sampler_config = NegativeSamplerConfig(train_model_input, item_feature_columns)
sampler_config = negative_sampler_config.get_sampler_config(num_sampled=255)

In [32]:
# 3.Define Model and train

import tensorflow as tf
if tf.__version__ >= '2.0.0':
    tf.compat.v1.disable_eager_execution()
else:
    K.set_learning_phase(True)

model = YoutubeDNN(user_feature_columns, item_feature_columns, user_dnn_hidden_units=(512, 256, embedding_dim), sampler_config=sampler_config)

model.compile(optimizer="adam", loss=sampledsoftmaxloss)

history = model.fit(train_model_input, train_label,  # train_label,
                    batch_size=BATCH_SIZE, epochs=N_EPOCHS, verbose=1, validation_split=0.0, )

Train on 900189 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [33]:
# 构建训练数据
test_model_input = get_model_input(test_df)

In [34]:
# 合并训练集和测试集，取movie_id的唯一值
all_data = pd.concat([train_df, test_df], ignore_index=True)
movide_id_list = all_data['movie_id'].unique()

In [46]:
# 4. Generate user features for testing and full item features for retrieval
test_user_model_input = test_model_input
all_item_model_input = {"movie_id": movide_id_list,}

user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)
item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)

user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12)
item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)

print(user_embs.shape)
print(item_embs.shape)



(100020, 32)
(3706, 32)


In [36]:
# 最新的用户行为产生的user embedding
all_data.sort_values(by='timestamp', ascending=False, inplace=True)
all_users_latest = all_data.drop_duplicates(subset=["user_id"], keep='first', inplace=False, ignore_index=False)
all_user_model_input = get_model_input(all_users_latest)
all_user_embs = user_embedding_model.predict(all_user_model_input, batch_size=2 ** 12)
print(all_user_embs.shape)

(6040, 32)


In [37]:
user_id_idx = np.array(all_users_latest["user_id"].values)
movie_id_idx = np.array(movide_id_list)

In [14]:
! mkdir model

In [38]:
# 保存所有的embedding
np.save('model/item_embs.npy', item_embs)
np.save('model/user_embs.npy', all_user_embs)
np.save('model/user_id_idx.npy', user_id_idx)
np.save('model/movie_id_idx.npy', movie_id_idx)

# 使用faiss进行ANN查找并评估结果

In [47]:
test_true_label = {row.user_id:[row.movie_id] for row in test_df.itertuples(index=False)}

import numpy as np
import faiss
from tqdm import tqdm
from deepmatch.utils import recall_N

TOP_N = 100

index = faiss.IndexFlatIP(embedding_dim)
# faiss.normalize_L2(item_embs)
index.add(item_embs)
# faiss.normalize_L2(user_embs)
D, I = index.search(np.ascontiguousarray(user_embs), TOP_N)
s = []
hit = 0
for i, uid in tqdm(enumerate(test_user_model_input['user_id'])):
    try:
        pred_movie_ids = [movide_id_list[x] for x in I[i]]
        recall_score = recall_N(test_true_label[uid], pred_movie_ids, N=TOP_N)
        s.append(recall_score)
        if test_true_label[uid] in pred_movie_ids:
            hit += 1
    except Exception as e:
        print(e)
        raise e
print("")
print("recall", np.mean(s))
print("hit rate", hit / len(test_user_model_input['user_id']))

100020it [00:17, 5751.60it/s]


recall 0.15369926014797042
hit rate 0.15369926014797042





In [43]:
# 单个用户情况查看

user_id = 1998

user_model_input = get_model_input(test_df[test_df["user_id"] == user_id].head(n=1))

user_embs = user_embedding_model.predict(user_model_input)

D, I = index.search(np.ascontiguousarray(user_embs), TOP_N)

retrival_movie_index = I[0]
retrival_movie_ids = [movide_id_list[x] for x in retrival_movie_index]

watch_movie_ids = train_df[train_df["user_id"] == user_id]["movie_id"].values
furture_movie_ids = test_df[test_df["user_id"] == user_id]["movie_id"].values

same_ids_in_history = set(watch_movie_ids) & set(retrival_movie_ids)
same_ids_in_future = set(furture_movie_ids) & set(retrival_movie_ids)

print(len(same_ids_in_history))
print(len(same_ids_in_future))

63
20


In [48]:
model.summary()

Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
watch_movie_seq (InputLayer)    [(None, 50)]         0                                            
__________________________________________________________________________________________________
watch_genre_seq (InputLayer)    [(None, 50)]         0                                            
__________________________________________________________________________________________________
movie_id (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
user_id (InputLayer)            [(None, 1)]          0                                            
____________________________________________________________________________________________