In [40]:
import sys
sys.path.append('../')

In [41]:
import pandas as pd
from deepmatch.models import YouTubeDNN
from deepctr.inputs import SparseFeat, VarLenSparseFeat
from sklearn.preprocessing import LabelEncoder
import torch.nn.functional as F


In [42]:
import random
import numpy as np
from tqdm import tqdm

In [43]:
def pad_sequences(sequences, maxlen=None, dtype='int32',
                  padding='pre', truncating='pre', value=0.):
    """
    填充序列到等长的 ndarray 数组。
    这是 tf.keras.preprocessing.sequence.pad_sequences 的 Pytorch 等效实现。

    :param sequences: 序列
    :param maxlen: 保留的最大长度
    :param dtype: 数据类型
    :param padding: 填充方位
    :param truncating: 截短方位
    :param value: 填充值或截短值
    :return: 填充或截短后的 ndarray
    """
    assert padding in ['pre', 'post'], f'无效填充方位={padding}，仅支持 pre|post'
    assert truncating in ['pre', 'post'], f'无效截短方位={truncating}，仅支持 pre|post'

    if maxlen is None:
        maxlen = max(len(x) for x in sequences)
    arr = np.full((len(sequences), maxlen), value, dtype=dtype)
    for idx, x in enumerate(sequences):
        if len(x) == 0:
            continue  # 空列表

        if truncating == 'pre':  # 截前
            trunc = x[-maxlen:]
        else:
            trunc = x[:maxlen]  # 截后
        trunc = np.asarray(trunc, dtype=dtype)

        if padding == 'pre':  # 填前
            arr[idx, -len(trunc):] = trunc
        else:
            arr[idx, :len(trunc)] = trunc  # 填后
    return arr


In [44]:
def gen_data_set(data, negsample=0):

    data.sort_values("timestamp", inplace=True)
    item_ids = data['movie_id'].unique()

    train_set = []
    test_set = []
    for reviewerID, hist in tqdm(data.groupby('user_id')):
        pos_list = hist['movie_id'].tolist()
        rating_list = hist['rating'].tolist()

        if negsample > 0:
            candidate_set = list(set(item_ids) - set(pos_list))
            neg_list = np.random.choice(candidate_set,size=len(pos_list)*negsample,replace=True)
        for i in range(1, len(pos_list)):
            hist = pos_list[:i]
            if i != len(pos_list) - 1:
                train_set.append((reviewerID, hist[::-1], pos_list[i], 1, len(hist[::-1]),rating_list[i]))
                for negi in range(negsample):
                    train_set.append((reviewerID, hist[::-1], neg_list[i*negsample+negi], 0,len(hist[::-1])))
            else:
                test_set.append((reviewerID, hist[::-1], pos_list[i],1,len(hist[::-1]),rating_list[i]))

    random.shuffle(train_set)
    random.shuffle(test_set)

    print(len(train_set[0]),len(test_set[0]))

    return train_set,

In [45]:
def gen_data_set_youteube(data, negsample=5):

    data.sort_values("timestamp", inplace=True)
    item_ids = data['movie_id'].unique()

    train_set = []
    test_set = []
    for reviewerID, hist in tqdm(data.groupby('user_id')):
        pos_list = hist['movie_id'].tolist()
        rating_list = hist['rating'].tolist()

        if negsample > 0:
            candidate_set = list(set(item_ids) - set(pos_list))
            neg_list = np.random.choice(candidate_set,size=len(pos_list)*negsample,replace=True)
        for i in range(1, len(pos_list)):
            hist = pos_list[:i]
            if i != len(pos_list) - 1:
                # 这里的 label = 1 其实相当于是多分类的 1
                train_set.append((reviewerID, hist[::-1], [pos_list[i]] + [neg_list[item_idx] for item_idx in np.random.choice(neg_list, negsample)], 0, len(hist[::-1]),rating_list[i]))
            else:
                test_set.append((reviewerID, hist[::-1], [pos_list[i]] + [neg_list[item_idx] for item_idx in np.random.choice(neg_list, negsample)], 0,len(hist[::-1]),rating_list[i]))

    random.shuffle(train_set)
    random.shuffle(test_set)

    print(len(train_set[0]),len(test_set[0]))

    return train_set,test_set

In [46]:
def gen_model_input(train_set,user_profile,seq_maxlen):

    train_uid = np.array([line[0] for line in train_set])
    train_seq = [line[1] for line in train_set]
    train_iid = np.array([line[2] for line in train_set])
    train_label = np.array([line[3] for line in train_set])
    train_hist_len = np.array([line[4] for line in train_set])

    train_seq_pad = pad_sequences(train_seq, maxlen=seq_maxlen, padding='post', truncating='post', value=0)
    train_model_input = {"user_id": train_uid, "movie_id": train_iid, "hist_movie_id": train_seq_pad,
                         "hist_len": train_hist_len}

    for key in ["gender", "age", "occupation", "zip"]:
        train_model_input[key] = user_profile.loc[train_model_input['user_id']][key].values

    return train_model_input, train_label

In [47]:
data = pd.read_csv('./movielens_sample.txt')
sparse_features = ['movie_id', 'user_id', 'gender', 'age', 'occupation', 'zip']
SEQ_LEN = 50

data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,F,1,10,48067
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,F,1,10,48067
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,F,1,10,48067
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,F,1,10,48067
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,F,1,10,48067


In [48]:
# 对稀疏特征进行标签编码
features = ['user_id', 'movie_id', 'gender', 'age', 'occupation', 'zip']
feature_max_idx = {}
for feat in features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat]) + 1
    feature_max_idx[feat] = data[feat].max() + 1

feature_max_idx

{'user_id': 4,
 'movie_id': 209,
 'gender': 3,
 'age': 4,
 'occupation': 4,
 'zip': 4}

In [49]:
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,1,66,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,1,1,1,1
1,1,39,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,1,1,1,1
2,1,47,3,978301968,My Fair Lady (1964),Musical|Romance,1,1,1,1
3,1,191,4,978300275,Erin Brockovich (2000),Drama,1,1,1,1
4,1,147,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,1,1,1,1


In [50]:
# 构建用户画像
user_profile = data[['user_id', 'gender', 'age', 'occupation', 'zip']].drop_duplicates('user_id')
user_profile

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,1,1,1,1
53,2,2,3,3,3
182,3,2,2,2,2


In [51]:
# 构建物品画像
item_profile = data[['movie_id']].drop_duplicates('movie_id')
item_profile

Unnamed: 0,movie_id
0,66
1,39
2,47
3,191
4,147
...,...
228,202
229,169
230,135
231,64


In [52]:
user_profile.set_index('user_id', inplace=True)
user_profile

Unnamed: 0_level_0,gender,age,occupation,zip
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,1,1,1
2,2,3,3,3
3,2,2,2,2


In [53]:
user_item_list = data.groupby('user_id')['movie_id'].apply(list)
user_item_list

user_id
1    [66, 39, 47, 191, 147, 68, 85, 166, 33, 48, 34...
2    [66, 180, 144, 127, 70, 132, 77, 89, 177, 100,...
3    [147, 68, 84, 126, 11, 183, 37, 71, 80, 168, 2...
Name: movie_id, dtype: object

In [54]:
train_set, test_set = gen_data_set_youteube(data, 5)
train_set

100%|██████████| 3/3 [00:00<00:00, 664.46it/s]

6 6





[(3,
  [108,
   23,
   81,
   105,
   137,
   71,
   86,
   69,
   68,
   185,
   169,
   67,
   11,
   31,
   93,
   92,
   202,
   83,
   97,
   126,
   128,
   197,
   168,
   32],
  [130, 132, 6, 62, 122, 179],
  0,
  24,
  4),
 (2,
  [175, 66, 14, 74, 173, 87, 160, 73, 71, 69],
  [168, 55, 56, 191, 5, 5],
  0,
  10,
  4),
 (1,
  [55,
   50,
   180,
   132,
   126,
   159,
   35,
   48,
   11,
   66,
   40,
   70,
   166,
   191,
   117,
   145,
   52,
   84,
   112,
   186],
  [127, 103, 76, 44, 89, 41],
  0,
  20,
  4),
 (3, [126, 128, 197, 168, 32], [97, 76, 34, 29, 181, 139], 0, 5, 3),
 (1, [112, 186], [84, 200, 114, 89, 6, 121], 0, 2, 5),
 (2,
  [46,
   128,
   161,
   123,
   113,
   80,
   205,
   196,
   201,
   116,
   58,
   127,
   49,
   122,
   125,
   67,
   89,
   180,
   78,
   184,
   6,
   133,
   176,
   154,
   59,
   195,
   25,
   118,
   32,
   179,
   70,
   72,
   121,
   168,
   175,
   66,
   14,
   74,
   173,
   87,
   160,
   73,
   71,
   69],
  [62, 

In [55]:
train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN)
train_model_input

{'user_id': array([3, 2, 1, 3, 1, 2, 2, 2, 2, 3, 1, 2, 1, 2, 2, 1, 2, 1, 2, 3, 2, 3,
        1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 2, 1, 2, 2, 1, 2, 3, 2,
        1, 2, 3, 2, 3, 2, 2, 1, 3, 3, 3, 1, 3, 1, 3, 2, 1, 2, 2, 2, 3, 2,
        2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 3, 1, 3, 2, 1, 1, 1, 2, 3, 1, 2, 3,
        2, 2, 1, 3, 3, 2, 1, 2, 2, 2, 2, 3, 2, 1, 2, 2, 1, 3, 1, 1, 3, 1,
        1, 1, 3, 2, 1, 2, 2, 3, 2, 3, 2, 1, 2, 2, 2, 1, 3, 1, 2, 2, 2, 1,
        1, 2, 2, 2, 3, 2, 2, 2, 3, 2, 2, 2, 2, 2, 1, 2, 2, 3, 2, 2, 3, 3,
        2, 3, 2, 2, 2, 3, 3, 3, 3, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2,
        2, 3, 1, 1, 2, 1, 3, 2, 3, 2, 1, 2, 3, 2, 2, 2, 3, 2, 2, 2, 2, 2,
        1, 1, 2, 2, 1, 3, 2, 3, 3, 2, 2, 3, 2, 2, 3, 2, 3, 2, 1, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 3]),
 'movie_id': array([[130, 132,   6,  62, 122, 179],
        [168,  55,  56, 191,   5,   5],
        [127, 103,  76,  44,  89,  41],
        ...,
        [175, 109, 142, 109,  35, 126],
        [196, 191,  40,  35, 162,

In [56]:
test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN)
test_model_input

{'user_id': array([2, 1, 3]),
 'movie_id': array([[110, 166,  81, 109,  28,  86],
        [  3,  10, 125,  41,  88, 188],
        [134, 143, 118, 151, 119,  19]]),
 'hist_movie_id': array([[120,  19, 101,   4,  13, 136,  42, 129, 189, 111, 157,  20, 106,
         115, 181,  17, 170,   9,  22,  44, 153,  94, 103,  91, 114,  37,
         151,  90, 141, 130, 146,  96,  99,   2, 188,  15, 107,  23,   8,
         192,  18, 171,  30, 132,  21, 155, 182, 206,  82,  16],
        [119, 104,  45, 147, 142,  34,  29,  43, 158,   1,  26, 102, 150,
          68,  33,  54, 144, 164, 183,  27, 172,  39, 163,  77, 165,  85,
          47,  61, 131,  53,   7, 127,  55,  50, 180, 132, 126, 159,  35,
          48,  11,  66,  40,  70, 166, 191, 117, 145,  52,  84],
        [  5, 207, 198, 147, 109,  82,  80,  57,  84, 200,  88, 174, 193,
          95, 183,  64, 162,  37,  28, 156,  56, 135, 152,  38,  41, 130,
         108,  23,  81, 105, 137,  71,  86,  69,  68, 185, 169,  67,  11,
          31,  93,  92,

In [57]:
# 计算每个稀疏字段的唯一特征，并为序列特征生成特征排至
embedding_dim = 16

user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim),
                        SparseFeat('gender', feature_max_idx['gender'], embedding_dim),
                        SparseFeat('age', feature_max_idx['age'], embedding_dim),
                        SparseFeat('occupation', feature_max_idx['occupation'], embedding_dim),
                        SparseFeat('zip', feature_max_idx['zip'], embedding_dim),
                        VarLenSparseFeat(SparseFeat('hist_movie_id',
                                                               vocabulary_size=feature_max_idx['movie_id'],
                                                               embedding_dim=embedding_dim,
                                                               embedding_name='movie_id'), maxlen=10, combiner='mean')]

item_feature_columns = [
    VarLenSparseFeat(SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim, embedding_name='movie_id'), maxlen=6, combiner='mean')]

In [58]:
# 定义模型并训练
model = YouTubeDNN(user_feature_columns,
                   item_feature_columns,
                   num_sampled=5,
                   user_dnn_hidden_units=(64, embedding_dim),
                   criterion=F.cross_entropy,
                   optimizer='Adam',
                   config={'gpus': -1, 'init_std': 0.002})

TypeError: normal_(): argument 'std' (position 2) must be float, not NoneType