In [8]:
# 在原版paper的deepfm算法做了一点改进，即fm_component和deep component输出的都不只是一个数了，而是一个响亮
# 大量参照https://github.com/ChenglongChen/tensorflow-DeepFM
# 他对deep fm进行了一点点优化，虽然数据描述不太明白，但是其数据结构设计得确实好，很精简也非常适合这个算法
# deep fm只能处理one-hot类型的，无法直接处理multi-hot特征
import numpy as np
import tensorflow as tf

In [9]:
class EmbeddingLayer(tf.keras.layers.Layer):
    def __init__(self,feat_dim,field_num,emb_dim,*args,**kwargs):
        """

        :param feat_dim: int 每个样本的特征维度，假如每个样本可以被表征为[0,0,0,1,0,1,2]，那feat_dim就应该是7
        :param field_num: int 每个样本的特征的field_num，可以理解为有多少种特征，例如一个样本有性别和年龄两类特征(特征向量可能为[0,1,12])，那field_num就是2
        :param emb_dim: int 对于每个field的嵌入向量维度
        :param args:
        :param kwargs:
        """
        super(EmbeddingLayer,self).__init__(*args,**kwargs)
        self.feat_dim=feat_dim
        self.field_num=field_num
        self.emb_dim=emb_dim
        self.emb_layer=tf.keras.layers.Embedding(input_dim=feat_dim,output_dim=emb_dim)

    def call(self, inputs, **kwargs):

        """

        :param inputs: (feat_indices_batch, feat_value_batch)，分为两部分，你可以把他看做一个手动构造的稀疏矩阵
            例如，如果feat_indices_batch的数据为[[1,5,9],[2,7,8]]；feat_value_batch的数据为[[1,1,2.3],[1,1,0.98]]
            假设第一个样本的特征向量是x，那么x[1]=1, x[5]=1, x[9]=2.3，其余位置取值均为0。
            这样构造是因为每个样本都有field_num个field，每个field的取值只有一种（one-hot或者连续值）
            也就是说每个样本都有field_num个不为0的特征维度。而deep fm算法的嵌入方法是对每一个field嵌入，不管是不是连续值都要嵌入，然后再乘以特征取值
            例如x[9]=2.3，那就要从emb_table里找到第9个emb_vector，然后乘以2.3
        :param kwargs:
        :return:
        """

        # feat_indices_batch: [batch_size, field_num]
        # feat_value_batch: [batch_size, field_num]
        feat_indices_batch,feat_value_batch=inputs
        # 两者形状要相同，并且二者的第二个轴取值维度都是field_num个
        assert feat_indices_batch.shape==feat_value_batch.shape
        assert feat_indices_batch.shape[1:]==[self.field_num]

        emb_vectors=self.emb_layer(feat_indices_batch) # [batch_size, field_num, emb_dim]
        feat_value_batch = tf.expand_dims(feat_value_batch,axis=-1) # [batch_size, field_num, 1]

        # broadcast性质 feat_value_batch会被看做[batch_size, field_num, emb_dim]
        emb_vectors = tf.multiply(emb_vectors,feat_value_batch) # [batch_size, field_num, emb_dim]
        return emb_vectors


feat_indices_arr=[np.random.choice(range(10),size=[1,3],replace=False) for _ in range(9)]
feat_indices_arr=np.concatenate(feat_indices_arr,axis=0).astype(np.float32)
print("feat_indices_arr")
print(feat_indices_arr) #[10,3]

feat_vals_arr=np.concatenate((np.ones(shape=[9,2]),
                              np.random.random(size=[9,1])),axis=1).astype(np.float32)
print("\nfeat_vals_arr")
print(feat_vals_arr) # [10,3]

input_ds=tf.data.Dataset.from_tensor_slices((feat_indices_arr,feat_vals_arr))
batched_ds=input_ds.batch(5)
iterator=iter(batched_ds)
input_batch=next(iterator)


print("\nemb vectors")
emb_layer=EmbeddingLayer(feat_dim=10,field_num=3,emb_dim=4)
emb_vectors=emb_layer(input_batch)
print(emb_vectors)


feat_indices_arr
[[5. 1. 9.]
 [9. 7. 0.]
 [8. 7. 3.]
 [7. 9. 1.]
 [1. 0. 9.]
 [9. 5. 3.]
 [9. 5. 0.]
 [3. 5. 1.]
 [9. 8. 2.]]

feat_vals_arr
[[1.         1.         0.7929278 ]
 [1.         1.         0.89248794]
 [1.         1.         0.5548318 ]
 [1.         1.         0.2731279 ]
 [1.         1.         0.46400025]
 [1.         1.         0.6120409 ]
 [1.         1.         0.51371866]
 [1.         1.         0.6585211 ]
 [1.         1.         0.74477696]]

emb vectors
tf.Tensor(
[[[ 0.01684952 -0.04023438  0.03148881 -0.00977103]
  [-0.00602516 -0.00801171 -0.02596194 -0.04596521]
  [ 0.02065054  0.02518673 -0.01142569 -0.01360821]]

 [[ 0.0260434   0.03176421 -0.01440949 -0.01716198]
  [-0.02003713 -0.02100695  0.02142284  0.01265253]
  [ 0.03413467  0.02389419 -0.00540351  0.02527387]]

 [[ 0.01591739  0.00670235  0.04473504  0.00274844]
  [-0.02003713 -0.02100695  0.02142284  0.01265253]
  [ 0.01365053 -0.00405248 -0.01630789 -0.00757939]]

 [[-0.02003713 -0.02100695  0.021422

In [10]:
class FMComponent(tf.keras.layers.Layer):
    def __init__(self,feat_dim,field_num,emb_dim,*args,**kwargs):
        """
        同EmbeddingLayer
        :param feat_dim:
        :param field_num:
        :param emb_dim:
        :param args:
        :param kwargs:
        """
        super(FMComponent,self).__init__(*args,**kwargs)
        self.feat_dim=feat_dim
        self.field_num=field_num
        self.emb_dim=emb_dim

    def build(self, input_shape):
        self.w=tf.Variable(initial_value=tf.random.truncated_normal(shape=[self.feat_dim, self.emb_dim]))

    def call(self, inputs, **kwargs):
        """

        :param inputs: (raw_input_batch,emb_vectors)
            其中raw_input_batch是feat_indices_batch,feat_value_batch 其实就是EmbeddingLayer的输入 用于计算一阶term
            emb_vectors是后者是emb_layer的输出
        :param kwargs:
        :return:
        """
        raw_input_batch,emb_vectors=inputs # emb_vectors: [batch_size, field_num, emb_dim]
        # feat_indices_batch: [batch_size, field_num]
        # feat_value_batch: [batch_size, field_num]
        feat_indices_batch,feat_value_batch=raw_input_batch

        # first order term
        # 使用feat_indices找到embedding_lookup快速找到field_num个权重然后做相乘
        # 例如，如果一个样本x，他在特征维度1、3、5上有取值，那么他的feat_indices=[1,3,5]。那只需要从self.w找到第1、3、5个数就可以了
        # 这样的计算方法更加快速
        # 改进1 一阶输出也是一个向量而非一个标量，这就要求self.w的shape为[self.feat_dim, self.emb_dim]
        # 原本deepFM之中女
        # 一阶做的事情实际上就是，假如一个样本有三个field上的取值为[1,1,1.3]，特征id分别是1，3，5，那么一阶结果就是w1*1+w3*1+w5*1.3
        # 拓展版本就是将w1换成了一个长度为emb_size的向量
        weights=tf.nn.embedding_lookup(params=self.w,ids=tf.cast(feat_indices_batch,tf.int32)) # [batch_size, field_num, self.emb_dim]
        # 需要对feat_value_batch扩充一下，不然无法进行broadcast
        first_order_term = tf.multiply(tf.expand_dims(feat_value_batch,axis=2),weights) # [batch_size, field_num, emb_dim]
        first_order_term = tf.reduce_sum(first_order_term,axis=1) # [batch_size, emb_dim]

        # second order term
        # 下面这个是fm算法的优化算法 和平方减去平方和
        sum_square=tf.square(tf.reduce_sum(emb_vectors,axis=1)) # [batch_size, emb_dim]
        square_sum=tf.reduce_sum(tf.square(emb_vectors),axis=1) # [batch_size, emb_dim]

        second_order_term=1/2*tf.subtract(sum_square,square_sum) # [batch_size, emb_dim]
        y_fm=first_order_term+second_order_term
        return y_fm


# 突出一个问题，如果num_fields过多，会导致fm_output的数值膨胀
num_fields=3000
num_features=30000
feat_indices_arr=[np.random.choice(range(1,num_features),size=[1,num_fields],replace=False) for _ in range(9)]
feat_indices_arr=np.concatenate(feat_indices_arr,axis=0).astype(np.float32)
feat_vals_arr=np.concatenate((np.ones(shape=[9,2000]),
                              np.random.random(size=[9,1000])),axis=1).astype(np.float32)

input_ds=tf.data.Dataset.from_tensor_slices((feat_indices_arr,feat_vals_arr))
batched_ds=input_ds.batch(5)
iterator=iter(batched_ds)
input_batch=next(iterator)


print("\nemb vectors")
emb_layer=EmbeddingLayer(feat_dim=num_features,field_num=num_fields,emb_dim=4)
emb_vectors=emb_layer(input_batch)
print(emb_vectors)

fm_component=FMComponent(feat_dim=num_features,field_num=num_fields,emb_dim=4)
fm_inputs=(input_batch,emb_vectors)
fm_outputs=fm_component(fm_inputs)
print("\nfm_outputs")
print(fm_outputs)



emb vectors
tf.Tensor(
[[[-0.03016762 -0.02599541 -0.00467795 -0.04329629]
  [ 0.00803028 -0.03629998  0.00394214  0.03103927]
  [-0.01044718 -0.00198661  0.0081759   0.02610639]
  ...
  [-0.00034495 -0.03942304 -0.00878468 -0.00130062]
  [ 0.00112212 -0.00829253 -0.00185618  0.00145497]
  [-0.01404082  0.03171968  0.01547828  0.02390836]]

 [[-0.00799986 -0.01253077 -0.00314962  0.01862652]
  [ 0.02455476 -0.02964035 -0.02909072  0.0354126 ]
  [ 0.04982335  0.00999033 -0.02765146 -0.01071627]
  ...
  [ 0.03765546 -0.01602382  0.03813593  0.00277154]
  [ 0.00711776  0.02643858 -0.02162612 -0.00900532]
  [ 0.00067139 -0.00327853  0.00467062  0.00072237]]

 [[-0.03388724  0.02102203 -0.00727738 -0.02720662]
  [ 0.03614216 -0.00052707  0.00784385  0.03627434]
  [-0.04883645  0.03580966 -0.04068707  0.01569316]
  ...
  [-0.00240174 -0.00995342  0.00579038  0.00817747]
  [-0.00467994  0.01093904  0.00651143 -0.00203572]
  [-0.00718298 -0.00533949  0.00218272 -0.00767821]]

 [[ 0.0497408   

In [4]:
class DeepComponent(tf.keras.layers.Layer):
    """
    深层网络，没啥好说的
    """
    def __init__(self,deep_units_list,*args,**kwargs):
        super(DeepComponent,self).__init__(*args,**kwargs)
        self.deep_layers=list()
        for deep_units in deep_units_list:
            self.deep_layers.append(tf.keras.layers.Dense(units=deep_units,activation=tf.nn.relu))

    def call(self, inputs, **kwargs):
        for deep_layer in self.deep_layers:
            inputs=deep_layer(inputs)
        return inputs

In [6]:

class DeepFM(tf.keras.Model):
    def __init__(self,feat_dim,field_num,emb_dim,deep_units_list,scoring_units=2,*args,**kwargs):
        """
        同EmbeddingLayer描述
        :param feat_dim:
        :param field_num:
        :param emb_dim:
        :param deep_units_list:
        :param scoring_units: int 输出的类别数目，两类问题就是2，三类问题就是3
        :param args:
        :param kwargs:
        """
        super(DeepFM,self).__init__(*args,**kwargs)

        self.emb_layer=EmbeddingLayer(feat_dim=feat_dim,field_num=field_num,emb_dim=emb_dim)
        self.fm_component=FMComponent(feat_dim=feat_dim,field_num=field_num,emb_dim=emb_dim)
        self.deep_component=DeepComponent(deep_units_list=deep_units_list)
        self.scoring_layer=tf.keras.layers.Dense(units=scoring_units,activation=None)

    def call(self, inputs, training=None, mask=None):
        emb_vectors=self.emb_layer(inputs)

        fm_inputs=(inputs,emb_vectors)
        y_fm=self.fm_component(fm_inputs)

        deep_inputs=tf.reshape(emb_vectors,shape=[emb_vectors.shape[0],-1])
        y_deep=self.deep_component(deep_inputs)
        y = self.scoring_layer(tf.concat((y_fm,y_deep),axis=1))

        return y


feat_indices_arr=[np.random.choice(range(10),size=[1,3],replace=False) for _ in range(9)]
feat_indices_arr=np.concatenate(feat_indices_arr,axis=0).astype(np.float32)
feat_vals_arr=np.concatenate((np.ones(shape=[9,2]),
                              np.random.random(size=[9,1])),axis=1).astype(np.float32)

input_ds=tf.data.Dataset.from_tensor_slices((feat_indices_arr,feat_vals_arr))
batched_ds=input_ds.batch(5)
iterator=iter(batched_ds)
input_batch=next(iterator)

deep_fm_model=DeepFM(feat_dim=10,field_num=3000,emb_dim=4,deep_units_list=[10,8])
deep_fm_model(input_batch)



<tf.Tensor: shape=(5, 2), dtype=float32, numpy=
array([[-0.26829186, -0.108273  ],
       [-1.1350449 ,  2.1167572 ],
       [-1.3979748 ,  1.5567256 ],
       [-0.7603167 ,  1.079642  ],
       [-0.16347432, -1.9003195 ]], dtype=float32)>

[[0.67044261 0.75686147]
 [0.14368512 0.98200156]
 [0.7414385  0.31037816]]
tf.Tensor(
[[0.47840872 0.52159128]
 [0.30188948 0.69811052]
 [0.60612684 0.39387316]], shape=(3, 2), dtype=float64)
