In [None]:
import os
import numpy as np
from src.DSSM import dssm_model_keras_preprocess as dssm_model

In [None]:
import pandas as pd
import tensorflow as tf

## 数据预处理

In [None]:
# Directory (relative path) containing the CSV files loaded below:
# rating.csv, movie.csv and genome_scores.csv.
FOLDER = "movielen"

In [None]:
# Load the user/movie rating table and preview its first rows.
ratings = pd.read_csv(os.path.join(FOLDER, 'rating.csv'))
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [None]:
# Load the movie metadata table (title and pipe-separated genres) and preview it.
movies = pd.read_csv(os.path.join(FOLDER, 'movie.csv'))
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### tags

In [None]:
# Load the per-movie tag relevance scores (movieId, tagId, relevance) and preview them.
tags = pd.read_csv(os.path.join(FOLDER, 'genome_scores.csv'))
tags.head()

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.025
1,1,2,0.025
2,1,3,0.05775
3,1,4,0.09675
4,1,5,0.14675


In [None]:
def get_top10(df, k=10):
    """Return the ``k`` most relevant tags among the given tag-score rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows containing at least the columns ``tagId`` and ``relevance``.
    k : int, default 10
        Number of highest-relevance rows to keep. If ``df`` has fewer than
        ``k`` rows, the returned lists are correspondingly shorter.

    Returns
    -------
    pandas.Series
        Two entries: ``"tags"`` (list of ``tagId`` values) and
        ``"relevances"`` (list of the matching ``relevance`` values), both
        ordered by descending relevance.
    """
    top_rows = df.sort_values(by='relevance', ascending=False).head(k)
    return pd.Series({
        "tags": top_rows["tagId"].values.tolist(),
        "relevances": top_rows["relevance"].values.tolist(),
    })
# Run get_top10 on each movie's rows: one row per movieId holding a "tags"
# list and a "relevances" list.
tags_group = tags.groupby("movieId").apply(get_top10)

In [None]:
# Join ratings with movie metadata and the per-movie top tags. Both joins are
# inner joins on movieId, so only ratings whose movie appears in all three
# tables are kept.
# NOTE(review): the name `all` shadows Python's builtin all() for the rest of
# the notebook; it is kept because later cells refer to it.
all = (
    ratings
    .merge(movies, how="inner", on="movieId")
    .merge(tags_group, how="inner", on="movieId")
)
all.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,tags,relevances
0,1,2,3.5,2005-04-02 23:53:47,Jumanji (1995),Adventure|Children|Fantasy,"[29, 584, 204, 588, 951, 377, 62, 415, 203, 374]","[0.981, 0.967, 0.96425, 0.9585, 0.93475, 0.934..."
1,5,2,3.0,1996-12-25 15:26:09,Jumanji (1995),Adventure|Children|Fantasy,"[29, 584, 204, 588, 951, 377, 62, 415, 203, 374]","[0.981, 0.967, 0.96425, 0.9585, 0.93475, 0.934..."
2,13,2,3.0,1996-11-27 08:19:02,Jumanji (1995),Adventure|Children|Fantasy,"[29, 584, 204, 588, 951, 377, 62, 415, 203, 374]","[0.981, 0.967, 0.96425, 0.9585, 0.93475, 0.934..."
3,29,2,3.0,1996-06-23 20:36:14,Jumanji (1995),Adventure|Children|Fantasy,"[29, 584, 204, 588, 951, 377, 62, 415, 203, 374]","[0.981, 0.967, 0.96425, 0.9585, 0.93475, 0.934..."
4,34,2,3.0,1996-10-28 13:29:44,Jumanji (1995),Adventure|Children|Fantasy,"[29, 584, 204, 588, 951, 377, 62, 415, 203, 374]","[0.981, 0.967, 0.96425, 0.9585, 0.93475, 0.934..."


## genres 特征构造

In [None]:
def split_genres(genres):
    """Split a pipe-delimited genre string into a list of genre names."""
    genre_names = genres.split(sep="|")
    return genre_names

In [None]:
# Collect every distinct genre name and the largest number of genres attached
# to a single movie.
genres_set = set()
maxLen = 0
for genres_str in movies["genres"]:
    genre_tokens = split_genres(genres_str)
    # Measure the raw token list (not a de-duplicated set): the padding code
    # pads each row based on this same list length, so maxLen must bound it.
    maxLen = max(maxLen, len(genre_tokens))
    genres_set.update(genre_tokens)

# Sort the vocabulary: iteration order of a set of strings can differ between
# interpreter runs, which would change the genre -> index mapping each time
# the notebook is re-executed.
genre_vocabulary_list = sorted(genres_set)
# Padding token used to bring every genre list up to maxLen entries.
genre_vocabulary_list.append("<PAD>")

In [None]:
# Show the maximum genre count per movie; genre lists are padded to this length.
print(maxLen)

10


## Keras preprocessing layers

In [None]:
# Maps raw user ids to integer indices; its vocabulary is learned later via adapt().
userId_layer = tf.keras.layers.IntegerLookup(max_tokens=200000, output_mode="int")

# 32-dimensional embedding table with as many rows as the lookup's max_tokens.
user_embedding_layer = tf.keras.layers.Embedding(input_dim=200000, output_dim=32)

In [None]:
# Genre string -> vocabulary index. No OOV bucket is reserved, so every input
# token (including "<PAD>") must be present in genre_vocabulary_list.
movie_genre_IDs_layer = tf.keras.layers.StringLookup(
    vocabulary=genre_vocabulary_list,
    num_oov_indices=0,
)
# Index list -> per-token count vector (weighted when count_weights is given).
# NOTE(review): the +16 slack over the vocabulary size is unexplained — confirm.
weighted_movie_genre_IDs_layer = tf.keras.layers.CategoryEncoding(
    num_tokens=len(genre_vocabulary_list) + 16,
    output_mode="count",
)
# Bias-free linear projection of the count vector to 32 dimensions.
movie_genre_embedding_layer = tf.keras.layers.Dense(units=32, use_bias=False)

In [None]:
# Upper bound on the tag vocabulary size used by the lookup layer.
maxTags = 1200
# Tag id -> vocabulary index; the vocabulary is filled in later via adapt().
tag_IDs_layer = tf.keras.layers.IntegerLookup(max_tokens=maxTags, output_mode="int")
# Index list -> per-token count vector (weighted when count_weights is given).
# NOTE(review): the +16 slack over maxTags is unexplained — confirm.
weighted_movie_tag_IDs_layer = tf.keras.layers.CategoryEncoding(
    num_tokens=maxTags + 16,
    output_mode="count",
)
# Bias-free linear projection of the count vector to 32 dimensions.
tag_embedding_layer = tf.keras.layers.Dense(units=32, use_bias=False)

In [None]:
EPS = 0  # weight for <PAD> slots; CategoryEncoding can handle zero-valued weights

# Pad every movie's genre list to maxLen entries and build a matching weight
# row: 1 for real genres, EPS for padding.
genre_rows = []
genre_weight_rows = []
for genres_str in all["genres"].values:
    genres_list = split_genres(genres_str)
    genres_list_len = len(genres_list)
    pad_len = maxLen - genres_list_len
    genre_rows.append(genres_list + ["<PAD>"] * pad_len)
    genre_weight_rows.append([1] * genres_list_len + [EPS] * pad_len)

# One array per model input, keyed by the input layer name. Every feature is
# stored as a NumPy array (not a nested Python list) so the dict can be passed
# directly to Keras fit/predict and sliced consistently.
train_model_input = {}
train_model_input["userId"] = all["userId"].values[:, np.newaxis]
train_model_input["genres"] = np.array(genre_rows)
train_model_input["genre_weights"] = np.array(genre_weight_rows, dtype=np.float32)
train_model_input["tags"] = np.array(all["tags"].values.tolist())
train_model_input["tags_weights"] = np.array(all["relevances"].values.tolist())

# tf.keras.layers.IntegerLookup must be initialised with adapt() before it is
# used in the model; only the first 1k rows are adapted here so the notebook
# runs quickly.
# NOTE(review): user ids outside those first 1k rows are not in the vocabulary
# — confirm this shortcut is acceptable for real training.
userId_layer.adapt(train_model_input["userId"][:1000])

In [None]:
# Register tag ids 1 .. maxTags-1 as the lookup vocabulary. The layer was
# created with max_tokens=maxTags, and that budget includes the OOV slot, so at
# most maxTags-1 distinct ids fit. Adapting on more ids than fit (all with
# equal frequency) makes the layer truncate the vocabulary and silently map
# some ids to the OOV index. Tag ids >= maxTags are treated as OOV.
tag_IDs_layer.adapt(np.arange(1, maxTags))

In [None]:
# Symbolic model inputs, one per feature. The three lists below are parallel:
# entry i gives the name, dtype and per-example length of feature i.
inputs = {}
feature_names = [
    "userId", "genres", "genre_weights", "tags", "tags_weights"
]
feature_dtypes = [
    tf.dtypes.int32, tf.dtypes.string, tf.dtypes.float32, tf.dtypes.int32, tf.dtypes.float32
]
feature_dimensions = [
    1, maxLen, maxLen, 10, 10
]

for feature_name, feature_dtype, feature_dimension in zip(feature_names, feature_dtypes, feature_dimensions):
    # shape must be a tuple; a bare "(n)" is just the integer n.
    inputs[feature_name] = tf.keras.layers.Input(
        shape=(feature_dimension,), name=feature_name, dtype=feature_dtype)

# User side: id -> vocabulary index -> embedding.
user_embedding = user_embedding_layer(userId_layer(inputs["userId"]))

# Item side: genres and tags are each turned into a weighted count vector and
# projected by a bias-free Dense layer.
genres_embedding = movie_genre_embedding_layer(
    weighted_movie_genre_IDs_layer(
        movie_genre_IDs_layer(inputs["genres"]),
        count_weights=inputs["genre_weights"]))
tags_embedding = tag_embedding_layer(
    weighted_movie_tag_IDs_layer(
        tag_IDs_layer(inputs["tags"]),
        count_weights=inputs["tags_weights"]))

# Item representation = genre projection concatenated with tag projection.
item_embedding = tf.keras.layers.Concatenate(axis=-1)([genres_embedding, tags_embedding])

# NOTE(review): keyword names differ in plurality (item_hidden_unit vs.
# user_hidden_units); they are kept as-is — confirm against dssm_model's signature.
model = dssm_model(inputs, item_embedding, user_embedding,
                   item_hidden_unit=[64, 64],
                   user_hidden_units=[],
                   output_hidden_units=[128,],
                   activation="relu")
model.compile(optimizer='adam', loss='mse', metrics=['mse'])

In [None]:
# Print the model's layers, output shapes, parameter counts and connections.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 genres (InputLayer)            [(None, 10)]         0           []                               
                                                                                                  
 tags (InputLayer)              [(None, 10)]         0           []                               
                                                                                                  
 string_lookup (StringLookup)   (None, 10)           0           ['genres[0][0]']                 
                                                                                                  
 genre_weights (InputLayer)     [(None, 10)]         0           []                               
                                                                                              

In [None]:
# train_model_input is the training feature set; the rating column is the label.
# The labels must come from the merged frame `all`, the same frame the features
# were built from. The inner joins reorder rows and can drop rows, so
# ratings['rating'] is not aligned row-for-row with the features.
train_labels = all["rating"].values
assert len(train_labels) == len(train_model_input["userId"]), \
    "features and labels must have the same number of rows"
history = model.fit(train_model_input, train_labels, batch_size=16, epochs=1, verbose=True, validation_split=0.2)



In [19]:
# Small sample batch: the first 32 rows of every feature, as constant tensors.
dummy_input = {
    feature_name: tf.constant(feature_values[:32])
    for feature_name, feature_values in train_model_input.items()
}

In [20]:
# Score the 32-row sample batch with the model.
# NOTE(review): the recorded output is all zeros — check whether the model was
# actually trained before this cell ran and whether the output collapses to zero.
model.predict(dummy_input)



array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.]], dtype=float32)