In [1]:
import os
import numpy as np
from src.DSSM import dssm_model_feature_column as dssm_model

In [2]:
import pandas as pd
import tensorflow as tf

## 数据预处理

In [3]:
# Directory (relative to the working directory) that holds the CSV files
# rating.csv and movie.csv read by the cells below.
FOLDER = "movielen"

In [4]:
# Rating records: one row per (userId, movieId) with the rating and its timestamp.
ratings = pd.read_csv(os.path.join(FOLDER, 'rating.csv'))

ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [5]:
# Movie metadata: movieId, title and a pipe-delimited genres string.
movies = pd.read_csv(os.path.join(FOLDER, 'movie.csv'))

movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Attach each rating to its movie's title and genres. The inner join keeps only
# ratings whose movieId also appears in `movies`.
# NOTE(review): `all` shadows the Python builtin of the same name; the name is
# kept because later cells read this variable.
all = pd.merge(ratings, movies, how="inner", on="movieId")
all.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,2,3.5,2005-04-02 23:53:47,Jumanji (1995),Adventure|Children|Fantasy
1,5,2,3.0,1996-12-25 15:26:09,Jumanji (1995),Adventure|Children|Fantasy
2,13,2,3.0,1996-11-27 08:19:02,Jumanji (1995),Adventure|Children|Fantasy
3,29,2,3.0,1996-06-23 20:36:14,Jumanji (1995),Adventure|Children|Fantasy
4,34,2,3.0,1996-10-28 13:29:44,Jumanji (1995),Adventure|Children|Fantasy


## genres 特征构造

In [7]:
def split_genres(genres):
    """Split a pipe-delimited genre string into its individual genre names.

    Args:
        genres: Genre field such as "Comedy|Drama|Romance".

    Returns:
        list[str]: The genre names in their original order.
    """
    separator = "|"
    return genres.split(separator)

In [8]:
# Build the genre vocabulary and find the largest number of genres attached to
# a single movie (used by later cells as the padded sequence length).
genres_set = set()
maxLen = 0
for genres_str in movies["genres"]:
    genres = split_genres(genres_str)
    # Measure the raw list: the padding step pads the raw (non-deduplicated)
    # list, so it is the list length that has to fit within maxLen.
    maxLen = max(maxLen, len(genres))
    genres_set.update(genres)

# Sort so that every run assigns the same index to each genre; iterating a set
# of strings directly yields an order that can change between interpreter runs,
# which would make the learned embedding rows non-reproducible.
genre_vocabulary_list = sorted(genres_set)
# The padding token is always the last vocabulary entry.
genre_vocabulary_list.append("<PAD>")

In [9]:
# Show the maximum genre count per movie, i.e. the padded genre-sequence length.
print(maxLen)

10


## feature column 构造

In [10]:
# Users are fed in as raw integer ids: ids in [0, 200000) map to themselves and
# any id outside that range is replaced by bucket 0.
userId = tf.feature_column.categorical_column_with_identity(
    key="userId", num_buckets=200000, default_value=0)

# Learn a 32-dimensional dense vector per user id.
user_embedding = tf.feature_column.embedding_column(userId, dimension=32)

In [11]:
# movieId = tf.feature_column.categorical_column_with_identity(
#     "movieId",
#     num_buckets=50000,
#     default_value=0,
# )

# movie_embedding = tf.feature_column.embedding_column(
#     movieId,
#     dimension=32,
# )

In [12]:
# Map each genre string to its position in genre_vocabulary_list.
# NOTE(review): default_value=0 sends any out-of-vocabulary string to index 0,
# which is a real genre rather than "<PAD>" — confirm this is intended.
movie_genre_IDs = tf.feature_column.categorical_column_with_vocabulary_list(
    key="genres",
    vocabulary_list=genre_vocabulary_list,
    default_value=0,
)

# Pair every genre id with a weight read from the "genre_weights" feature so
# that the padded positions can be masked out.
weighted_movie_genre_col = tf.feature_column.weighted_categorical_column(
    categorical_column=movie_genre_IDs,
    weight_feature_key='genre_weights',
)

# Weighted sum of the 32-dimensional genre embeddings of each example.
movie_genres_embedding = tf.feature_column.embedding_column(
    weighted_movie_genre_col, dimension=32, combiner='sum')

In [13]:
# Weight given to padded genre slots. It must not be exactly 0: a zero weight
# makes weighted_categorical_column drop the entry, which leaves the embedding
# ids and the weights with mismatched shapes.
EPS = 1e-5
dnn_feature_columns = [user_embedding, movie_genres_embedding]

# Model inputs keyed by feature name. userId becomes a column vector; the genre
# features are filled in row by row below.
train_model_input = {
    "userId": all["userId"].values[:, np.newaxis],
    "genres": [],
    "genre_weights": [],
}

for raw_genres in all["genres"].values:
    tokens = split_genres(raw_genres)
    n_real = len(tokens)
    n_pad = maxLen - n_real
    # Right-pad the genre names to maxLen; real genres get weight 1 and padded
    # slots get EPS.
    train_model_input["genres"].append(tokens + ["<PAD>"] * n_pad)
    train_model_input["genre_weights"].append([1] * n_real + [EPS] * n_pad)



In [14]:
# Symbolic Keras inputs, one per raw feature consumed by the feature columns.
inputs = {}
feature_names = [
    "userId", "genres", "genre_weights",
]
feature_dtypes = [
    tf.dtypes.int32, tf.dtypes.string, tf.dtypes.float32,
]
# Per-example width of each feature: a single user id, and maxLen padded genre
# slots with one weight per slot.
feature_dimensions = [
    1, maxLen, maxLen
]

for feature_name, feature_dtype, feature_dimension in zip(feature_names, feature_dtypes, feature_dimensions):
    # `shape` is documented as a tuple; `(x)` without the trailing comma is just
    # the bare integer `x`, so spell the one-element tuple explicitly.
    inputs[feature_name] = tf.keras.layers.Input(
        shape=(feature_dimension,), name=feature_name, dtype=feature_dtype)

# Two-tower model: the genre embedding feeds the item tower and the user
# embedding feeds the user tower.
# NOTE(review): `item_hidden_unit` is singular while `user_hidden_units` is
# plural — verify both spellings against the dssm_model signature.
model = dssm_model(inputs, [movie_genres_embedding], [user_embedding],
                   item_hidden_unit=[64, 64],
                   user_hidden_units=[],
                   output_hidden_units=[128,],
                   activation="relu")
# Rating prediction is treated as regression, optimised with mean squared error.
model.compile(optimizer='adam', loss='mse', metrics=['mse'])

In [15]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 genre_weights (InputLayer)     [(None, 10)]         0           []                               
                                                                                                  
 genres (InputLayer)            [(None, 10)]         0           []                               
                                                                                                  
 userId (InputLayer)            [(None, 1)]          0           []                               
                                                                                                  
 dense_features (DenseFeatures)  (None, 32)          672         ['genre_weights[0][0]',          
                                                                  'genres[0][0]',             

In [None]:
# Train with train_model_input as the features and the rating as the target.
#
# The labels are read from `all`, the merged frame the features were built
# from. The merge reorders rows relative to `ratings` (and can drop some), so
# ratings['rating'] would pair each example with another row's target.
#
# The genre features were collected as nested Python lists; Keras'
# `validation_split` only accepts NumPy arrays or tensors, so convert them
# first. dtype=object keeps the strings as references instead of expanding
# them into a fixed-width unicode array.
fit_inputs = {
    "userId": np.asarray(train_model_input["userId"]),
    "genres": np.asarray(train_model_input["genres"], dtype=object),
    "genre_weights": np.asarray(train_model_input["genre_weights"], dtype=np.float32),
}
fit_labels = all["rating"].values
# NOTE(review): validation_split takes the trailing 20% of the rows before any
# shuffling, and the rows of `all` appear to be grouped by movieId — consider
# shuffling the frame first so the validation set is representative.
history = model.fit(fit_inputs, fit_labels, batch_size=256, epochs=1, verbose=True, validation_split=0.2)

In [16]:
# Take the first 32 examples of every input feature as a small smoke-test batch.
dummy_input = {
    name: tf.constant(values[:32])
    for name, values in train_model_input.items()
}

In [17]:
# Run the model on the 32-example smoke-test batch to check that the input
# pipeline and the model agree end to end.
model.predict(dummy_input)



array([[0.        ],
       [0.        ],
       [0.02157721],
       [0.06062543],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.00323882],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.01674271],
       [0.        ]], dtype=float32)

## feature columns 构建正确性的校验

In [17]:
def call_feature_columns(feature_columns, inputs):
    """Evaluate feature columns directly on raw inputs for inspection.

    Wraps the columns in a fresh ``DenseFeatures`` layer so their dense output
    can be displayed without building a full model.

    Args:
        feature_columns: Feature column(s) handed to ``DenseFeatures``.
        inputs: Dict mapping feature names to the tensors the columns read.

    Returns:
        The dense tensor produced by the feature layer.
    """
    return tf.keras.layers.DenseFeatures(feature_columns)(inputs)

In [18]:
# Hand-built batch of four padded genre sequences (10 slots each). Rows 1-3
# hold identical genres so that only the weights differ between them.
G = tf.constant([
    ['Adventure', 'Animation'] + ['<PAD>'] * 8,
    ['Adventure', 'Children', 'Fantasy'] + ['<PAD>'] * 7,
    ['Adventure', 'Children', 'Fantasy'] + ['<PAD>'] * 7,
    ['Adventure', 'Children', 'Fantasy'] + ['<PAD>'] * 7,
])
# Matching weights: row 0 is all ones; rows 1-3 put EPS on slot 0, 1 and 2
# respectively, to show how down-weighting one slot changes the summed embedding.
W = tf.constant([
    [1.0] * 10,
    [EPS] + [1.0] * 9,
    [1.0, EPS] + [1.0] * 8,
    [1.0, 1.0, EPS] + [1.0] * 7,
])
call_feature_columns(movie_genres_embedding, {"genres": G, "genre_weights": W})

<tf.Tensor: shape=(4, 32), dtype=float32, numpy=
array([[-0.01025002, -0.19998565, -0.04607623, -0.04126164, -0.07219312,
        -0.04555692,  0.02510051,  0.03658491, -0.05945265, -0.23975222,
         0.12487745,  0.2613276 , -0.05230153,  0.07742523, -0.01574746,
         0.04816418,  0.10785003, -0.03619203, -0.00247046,  0.01726955,
         0.04688532,  0.14554428,  0.03743019,  0.13508302,  0.07975156,
        -0.02729838,  0.23728755,  0.03629925,  0.19420603,  0.10194741,
         0.0269358 ,  0.17606667],
       [-0.01084621, -0.1139962 , -0.03842422, -0.06041563, -0.05343094,
        -0.06304001,  0.01288665,  0.04135886, -0.05852029, -0.21205896,
         0.13876   ,  0.26895538, -0.02417302,  0.01237551,  0.04414184,
         0.0321498 ,  0.101023  , -0.01964763, -0.04555282,  0.02190899,
         0.04260078,  0.13064252,  0.05654093,  0.0866235 ,  0.08348385,
        -0.03203323,  0.30250815,  0.09829962,  0.13228078,  0.09878932,
         0.00320143,  0.18916929],
     

In [18]:
class FM(tf.keras.layers.Layer):
    """Explicit second-order feature crossing (Factorization Machine term).

    Implements the optimised FM formula directly: for each example,
    ``0.5 * sum_over_emb_dim((sum_over_fields v)^2 - sum_over_fields(v^2))``.

    Notes:
        1. The layer receives embedding vectors rather than the raw features
           that appear in the formula. An id feature is effectively one-hot
           encoded, so looking up its embedding is equivalent to multiplying
           the feature by its weight; the computation below therefore works
           on the embeddings directly.
        2. Mind the axis of each reduction: the "square of sum" and "sum of
           squares" terms reduce over the field axis (axis 1), and the final
           reduction runs over the embedding axis (axis 2).
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Validate that the layer is wired to a list of at least two inputs.

        Raises:
            ValueError: If ``input_shape`` is not a list or has fewer than two
                entries.
        """
        if not isinstance(input_shape, list) or len(input_shape) < 2:
            # Implicit string concatenation keeps the message free of the
            # indentation that a backslash continuation would embed in it.
            raise ValueError('`FM` layer should be called '
                             'on a list of at least 2 inputs')
        super().build(input_shape)  # Be sure to call this somewhere!

    def call(self, inputs, **kwargs):
        """Compute the pairwise-interaction term for a batch.

        Args:
            inputs: List of ``field_num`` tensors, each of shape
                (None, emb_dim). The field axis is added here, so callers must
                not pre-expand the tensors to (None, 1, emb_dim).

        Returns:
            Tensor of shape (None, 1) holding the FM cross term per example.
        """
        # Give every field its own axis, then stack the fields along it.
        field_embeddings = [tf.expand_dims(embedding, 1) for embedding in inputs]
        stacked = tf.concat(field_embeddings, axis=1)  # (None, field_num, emb_dim)

        square_of_sum = tf.square(
            tf.reduce_sum(stacked, axis=1, keepdims=True))  # (None, 1, emb_dim)
        sum_of_square = tf.reduce_sum(
            stacked * stacked, axis=1, keepdims=True)  # (None, 1, emb_dim)

        cross_term = square_of_sum - sum_of_square
        # Collapse the embedding axis to a single score per example.
        cross_term = 0.5 * tf.reduce_sum(cross_term, axis=2, keepdims=False)  # (None, 1)
        return cross_term

    def compute_output_shape(self, input_shape):
        """Return the static output shape: one scalar per example."""
        return (None, 1)

    def get_config(self):
        """Return the layer config; FM adds no hyper-parameters of its own."""
        return super().get_config()

In [21]:
# Instantiate the FM feature-crossing layer with default layer arguments.
fm_layer = FM()