# 资料参考
- [Deep Neural Networks for YouTube Recommendations](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45530.pdf)

# 导入需要的库

In [1]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm

from deepctr.feature_column import SparseFeat, VarLenSparseFeat, get_feature_names
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from deepmatch.models import *
from deepmatch.utils import sampledsoftmaxloss, NegativeSampler

# 读取数据

In [24]:
import json

train_df = pd.read_parquet('ml1M-train.parquet', engine='pyarrow')
test_df = pd.read_parquet('ml1M-test.parquet', engine='pyarrow')

train_df['label'] = 1
test_df['label'] = 1

with open('ml1M_feature_max_idx.json', 'r') as json_file:
    feature_max_idx = json.load(json_file)
    
train_df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip,watch_movie_seq,label
0,2456,50,5,974181965,"Usual Suspects, The (1995)",6,2,3,6,1598,"[2484, 2890, 1026, 1575, 709, 2558, 145, 1, 96...",1
1,4964,1154,4,962620051,Miller's Crossing (1990),8,2,4,1,3058,"[2163, 859, 2587, 1839, 2593, 690, 1179, 514, ...",1
2,4747,1030,3,963241794,Top Gun (1986),1,2,2,5,958,"[1108, 1179, 2970, 2558, 254, 2014, 2427, 1121...",1
3,5824,2327,1,957969367,Airport 1975 (1974),8,2,2,13,692,"[640, 854, 2163, 1108, 1196, 581, 859, 582, 97...",1
4,1058,2331,2,974957947,Alligator (1980),1,2,3,2,3303,"[1105, 1108, 859, 2587, 1179, 1026, 2489, 254,...",1


In [13]:
def get_model_input(data, seq_max_len = 50):
    fixlen_feature_columns = ['user_id', 'movie_id', 'genres', 
                               'rating','gender', 'age', 
                              'occupation', 'zip']

    varlen_feature_columns = ['watch_movie_seq', ]

    _dict = data[fixlen_feature_columns].to_dict(orient='list')

    for feature in fixlen_feature_columns:
        _dict[feature] = np.array(_dict[feature])

    for feature in varlen_feature_columns:
        _dict[feature] = \
            pad_sequences(data[feature].values, maxlen=seq_max_len, padding='post', truncating='post', value=0)
        
    return _dict

In [4]:
# 构建训练数据
train_model_input = get_model_input(train_df)
train_label = np.array(train_df["label"].values)

# 构建特征列，训练模型，导出embedding

In [5]:
BATCH_SIZE = 1024
N_EPOCHS = 20
SEQ_LEN = 50
embedding_dim = 32

In [6]:
# 2.count #unique features for each sparse field and generate feature config for sequence feature

from collections import Counter

class FeatureConfig:
    def __init__(self, feature_max_idx, embedding_dim=32, seq_len=50):
        self.feature_max_idx = feature_max_idx
        self.embedding_dim = embedding_dim
        self.seq_len = seq_len
        
    def get_user_feature_columns(self):
        """生成用户特征列配置"""
        return [
            SparseFeat('user_id', self.feature_max_idx['user_id'], 16),
            SparseFeat("gender", self.feature_max_idx['gender'], 16),
            SparseFeat("age", self.feature_max_idx['age'], 16),
            SparseFeat("occupation", self.feature_max_idx['occupation'], 16),
            SparseFeat("zip", self.feature_max_idx['zip'], 16),
            VarLenSparseFeat(SparseFeat('watch_movie_seq', self.feature_max_idx['movie_id'], self.embedding_dim,
                                        embedding_name="movie_id"), self.seq_len, 'mean'),
        ]

    def get_item_feature_columns(self):
        """生成物品特征列配置"""
        return [SparseFeat('movie_id', self.feature_max_idx['movie_id'], self.embedding_dim)]

class NegativeSamplerConfig:
    def __init__(self, train_model_input, item_feature_columns):
        self.train_counter = Counter(train_model_input['movie_id'])
        self.item_count = [self.train_counter.get(i, 0) for i in range(item_feature_columns[0].vocabulary_size)]

    def get_sampler_config(self, num_sampled=255):
        """生成负采样器配置"""
        return NegativeSampler('frequency', num_sampled=num_sampled, item_name="movie_id", item_count=self.item_count)
    
# 创建 FeatureConfig 实例
feature_config = FeatureConfig(feature_max_idx, embedding_dim, SEQ_LEN)
user_feature_columns = feature_config.get_user_feature_columns()
item_feature_columns = feature_config.get_item_feature_columns()

# 创建 NegativeSamplerConfig 实例
negative_sampler_config = NegativeSamplerConfig(train_model_input, item_feature_columns)
sampler_config = negative_sampler_config.get_sampler_config(num_sampled=255)

In [7]:
# 3.Define Model and train

import tensorflow as tf
if tf.__version__ >= '2.0.0':
    tf.compat.v1.disable_eager_execution()
else:
    K.set_learning_phase(True)

model = YoutubeDNN(user_feature_columns, item_feature_columns, user_dnn_hidden_units=(128,64, embedding_dim), sampler_config=sampler_config)

model.compile(optimizer="adam", loss=sampledsoftmaxloss)

history = model.fit(train_model_input, train_label,  # train_label,
                    batch_size=BATCH_SIZE, epochs=N_EPOCHS, verbose=1, validation_split=0.0, )

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Train on 900189 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [25]:
# 构建训练数据
test_model_input = get_model_input(test_df)
test_label = np.array(train_df["label"].values)

In [27]:
# 垂直训练集和测试集，取movie_id的唯一值
all_data = pd.concat([train_df, test_df], ignore_index=True)
movide_id_list = all_data['movie_id'].unique()

In [28]:
# 4. Generate user features for testing and full item features for retrieval
test_user_model_input = test_model_input
all_item_model_input = {"movie_id": movide_id_list,}

user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)
item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)

user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12)
# user_embs = user_embs[:, i, :]  # i in [0,k_max) if MIND
item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)

print(user_embs.shape)
print(item_embs.shape)



(100020, 32)
(3706, 32)


# 使用faiss进行ANN查找并评估结果

In [33]:
test_true_label = {row.user_id:[row.movie_id] for row in test_df.itertuples(index=False)}

import numpy as np
import faiss
from tqdm import tqdm
from deepmatch.utils import recall_N

TOP_N = 100

index = faiss.IndexFlatIP(embedding_dim)
# faiss.normalize_L2(item_embs)
index.add(item_embs)
# faiss.normalize_L2(user_embs)
D, I = index.search(np.ascontiguousarray(user_embs), TOP_N)
s = []
hit = 0
for i, uid in tqdm(enumerate(test_user_model_input['user_id'])):
    try:
        pred_movie_ids = [movide_id_list[x] for x in I[i]]
        filter_item = None
        recall_score = recall_N(test_true_label[uid], pred_movie_ids, N=TOP_N)
        s.append(recall_score)
        if test_true_label[uid] in pred_movie_ids:
            hit += 1
    except:
        print(i)
print("")
print("recall", np.mean(s))
print("hit rate", hit / len(test_user_model_input['user_id']))

100020it [00:17, 5843.14it/s]


recall 0.12580483903219356
hit rate 0.12580483903219356



