In [33]:
"""
FM模型TensorFlow2.X构建
"""

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Layer, Input
from tensorflow.keras.regularizers import l2
import tensorflow as tf
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC
import os

# The dataset can be downloaded from: https://www.kaggle.com/datasets/mrkmakr/criteo-dataset

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [8]:
"""
criteo数据集预处理

criteo数据特征说明：
- Label - 标签列，「点击」取值为1，「未点击」取值为0
- I1-I13 - 总共13列整型特征（绝大多数是计数特征）
- C1-C26 - 总共26列类别型特征，基于脱敏原因，数据经过哈希处理得到32位串
"""

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, KBinsDiscretizer
from sklearn.model_selection import train_test_split

def sparse_feature(feat, feat_num, embed_dim=4):
    """
    Describe one sparse (categorical) feature as a plain dict.

    :@param feat: feature name
    :@param feat_num: number of distinct values the feature can take
    :@param embed_dim: embedding dimension to use for the feature
    :return: dict with the keys 'feat_name', 'feat_num' and 'embed_dim'
    """
    return dict(feat_name=feat, feat_num=feat_num, embed_dim=embed_dim)


def dense_feature(feat):
    """
    Describe one dense (numeric) feature as a plain dict.

    :@param feat: feature name
    :return: dict with the single key 'feat_name'
    """
    return dict(feat_name=feat)


def create_criteo_dataset(file, embed_dim=8, read_part=True, sample_num=100000, test_size=0.2,
                          random_state=None):
    """
    Load the Criteo click dataset and turn every column into an integer category index.

    Dense columns (I1-I13) are bucketed into 100 uniform-width bins and sparse
    columns (C1-C26) are label-encoded, so all 39 features end up as integer ids
    that can be used for embedding lookups.

    Note: the discretizer and the label encoders are fitted on the whole loaded
    frame *before* the train/test split, so test rows influence the encoding.

    :@param file: path to the tab-separated data file (no header row)
    :@param embed_dim: embedding dimension recorded for every feature
    :@param read_part: if True, read only the first ``sample_num`` rows
        (recommended when the file is very large)
    :@param sample_num: number of rows to read when ``read_part`` is True
    :@param test_size: fraction of rows held out for the test set
    :@param random_state: seed forwarded to ``train_test_split``; the default
        ``None`` keeps the split random on every call
    :return: ``(feature_columns, (train_X, train_y), (test_X, test_y))`` where
        ``feature_columns`` is a list of ``sparse_feature`` dicts (sparse features
        first, then dense) and the X / y arrays are int32 numpy arrays
    """
    # Column layout of the raw file: label, 13 integer features, 26 categorical features.
    dense_features = ['I' + str(i) for i in range(1, 14)]
    sparse_features = ['C' + str(i) for i in range(1, 27)]
    names = ['label'] + dense_features + sparse_features
    features = sparse_features + dense_features

    # `nrows` limits the read to the first rows and lets pandas close the file
    # itself, unlike an iterator reader that is never closed.
    data_df = pd.read_csv(file, sep='\t', header=None, names=names,
                          nrows=sample_num if read_part else None)

    # Fill missing values: a sentinel category for sparse columns, 0 for dense ones.
    data_df[sparse_features] = data_df[sparse_features].fillna('nan')
    data_df[dense_features] = data_df[dense_features].fillna(0)

    # Discretize dense columns into ordinal bin ids.
    est = KBinsDiscretizer(n_bins=100, encode='ordinal', strategy='uniform')
    data_df[dense_features] = est.fit_transform(data_df[dense_features])

    # Map every distinct sparse value to a consecutive integer id.
    for feat in sparse_features:
        data_df[feat] = LabelEncoder().fit_transform(data_df[feat])

    # After encoding, each column's vocabulary size is its largest id + 1.
    feature_columns = [sparse_feature(feat, int(data_df[feat].max()) + 1, embed_dim=embed_dim)
                       for feat in features]

    train, test = train_test_split(data_df, test_size=test_size, random_state=random_state)

    # Build the train / test arrays.
    train_X = train[features].values.astype('int32')
    train_y = train['label'].values.astype('int32')
    test_X = test[features].values.astype('int32')
    test_y = test['label'].values.astype('int32')

    return feature_columns, (train_X, train_y), (test_X, test_y)

In [21]:
class MyLayer(Layer):
    """
    Factorization Machine (FM) layer over integer-encoded categorical fields.

    All fields share one global index space: the ids of each field are shifted
    by the total vocabulary size of the fields before it, so a single weight
    table can be used for every field.
    """

    def __init__(self, feature_columns, k, w_r=1e-6, v_r=1e-6):
        """
        :@param feature_columns: A list. sparse column feature information
            (each entry must provide a 'feat_num' key).
        :@param k: dimension of the latent vectors
        :@param w_r: L2 regularization coefficient for the first-order weights w
        :@param v_r: L2 regularization coefficient for the latent matrix V
        """
        super().__init__()
        self.sparse_feature_columns = feature_columns

        # Running total of vocabulary sizes gives each field its start offset.
        field_offsets = []
        vocab_total = 0
        for column in self.sparse_feature_columns:
            field_offsets.append(vocab_total)
            vocab_total += column['feat_num']
        self.index_mapping = field_offsets
        self.feature_length = vocab_total

        self.k = k
        self.w_r = w_r
        self.v_r = v_r

    def build(self, input_shape):
        """Create the bias w0, the first-order weights w and the latent matrix V."""
        self.w0 = self.add_weight(
            name='w0',
            shape=(1,),
            initializer=tf.zeros_initializer(),
            trainable=True,
        )
        self.w = self.add_weight(
            name='w',
            shape=(self.feature_length, 1),
            initializer=tf.random_normal_initializer(),
            regularizer=l2(self.w_r),
            trainable=True,
        )
        self.V = self.add_weight(
            name='V',
            shape=(self.feature_length, self.k),
            initializer=tf.random_normal_initializer(),
            regularizer=l2(self.v_r),
            trainable=True,
        )

    def call(self, inputs, **kwargs):
        """
        Compute the raw FM score (no activation applied).

        :@param inputs: integer tensor of per-field ids, one column per field
        :return: first-order term plus second-order interaction term
        """
        # Shift per-field ids into the shared global index space.
        global_ids = inputs + tf.convert_to_tensor(self.index_mapping)

        # First-order term: bias + sum of the selected linear weights.
        linear_weights = tf.nn.embedding_lookup(self.w, global_ids)
        first_order = self.w0 + tf.reduce_sum(linear_weights, axis=1)  # (batch_size, 1)

        # Second-order term: 0.5 * sum_k[(sum_i v_ik)^2 - sum_i v_ik^2].
        latent = tf.nn.embedding_lookup(self.V, global_ids)  # (batch_size, fields, k)
        squared_of_sum = tf.square(tf.reduce_sum(latent, axis=1, keepdims=True))  # (batch_size, 1, k)
        sum_of_squares = tf.reduce_sum(tf.square(latent), axis=1, keepdims=True)  # (batch_size, 1, k)
        second_order = 0.5 * tf.reduce_sum(squared_of_sum - sum_of_squares, axis=2)  # (batch_size, 1)

        return first_order + second_order

In [17]:
class FM(Model):
    """Factorization Machine binary classifier: an FM layer followed by a sigmoid."""

    def __init__(self, feature_columns, k, w_r=1e-6, v_r=1e-6):
        """
        Factorization Machines
        :param feature_columns: A list. sparse column feature information.
        :param k: dimension of the latent vectors
        :param w_r: the regularization coefficient of parameter w
        :param v_r: the regularization coefficient of parameter v
        """
        super().__init__()
        self.sparse_feature_columns = feature_columns
        self.fm = MyLayer(feature_columns, k, w_r, v_r)

    def call(self, inputs, **kwargs):
        """Return the sigmoid of the FM layer's raw score for ``inputs``."""
        fm_outputs = self.fm(inputs)
        return tf.nn.sigmoid(fm_outputs)

    def summary(self, **kwargs):
        """
        Print a layer-by-layer summary of the model.

        A subclassed model has no static graph, so a temporary functional model is
        built around a symbolic int32 input with one column per feature and that
        model is summarized instead.

        :param kwargs: forwarded unchanged to the functional model's ``summary``
            (previously these arguments were accepted but silently ignored)
        """
        sparse_inputs = Input(shape=(len(self.sparse_feature_columns),), dtype=tf.int32)
        Model(inputs=sparse_inputs, outputs=self.call(sparse_inputs)).summary(**kwargs)

In [19]:
# Environment setup: make only CUDA device 0 visible
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# Data folder. Set the CRITEO_DATA_DIR environment variable to point at your own
# copy of the data instead of editing this machine-specific default.
dataDir = os.environ.get('CRITEO_DATA_DIR', '/Users/hayden/Documents/data')

# Hyper-parameters: data loading
file = os.path.join(dataDir, 'Criteo_dataset', 'train.txt')
read_part = True
sample_num = 200000
test_size = 0.2

# Hyper-parameters: latent vector dimension of the FM model
k = 8

# Hyper-parameters: optimization
learning_rate = 0.001
batch_size = 512
epochs = 10

# Build the dataset
feature_columns, train, test = create_criteo_dataset(file=file,
                                                     read_part=read_part,
                                                     sample_num=sample_num,
                                                     test_size=test_size)
train_X, train_y = train
test_X, test_y = test

In [22]:
# Build the model
model = FM(feature_columns=feature_columns, k=k)
model.summary()

adam_optimizer = Adam(learning_rate=learning_rate)
model.compile(optimizer=adam_optimizer, loss=binary_crossentropy, metrics=[AUC()])

# Train the model. Early stopping watches the validation loss and restores the
# best weights once it has not improved for 2 consecutive epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
model.fit(
    x=train_X,
    y=train_y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.15,
    callbacks=[early_stopping],
)



Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 39)]              0         
                                                                 
 my_layer (MyLayer)          (None, 1)                 3751795   
                                                                 
 tf.math.sigmoid (TFOpLambd  (None, 1)                 0         
 a)                                                              
                                                                 
Total params: 3751795 (14.31 MB)
Trainable params: 3751795 (14.31 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________




Epoch 1/10


2023-09-23 21:49:21.413246: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-09-23 21:49:28.052561: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10


<keras.src.callbacks.History at 0x2a82a9900>

In [23]:
# Evaluate on the held-out test set; evaluate() returns [loss, AUC] for this compile setup
test_metrics = model.evaluate(test_X, test_y, batch_size=batch_size)
print('test AUC: %f' % test_metrics[1])

test AUC: 0.760714


In [26]:
# Display the encoded training feature matrix (one row per sample, one column per feature)
train_X

array([[   12,   118, 83059, ...,     1,     0,     0],
       [  148,    62, 62800, ...,    12,     0,     0],
       [   12,   296, 10578, ...,     6,     0,     0],
       ...,
       [  301,   118, 68646, ...,     2,     0,     0],
       [   12,   443, 56302, ...,     0,     0,     0],
       [   12,   487, 35936, ...,     3,     0,     0]], dtype=int32)

In [34]:
# Shapes of the train / test feature matrices; both values are displayed
# because ast_node_interactivity was set to "all" in the first cell
train_X.shape
test_X.shape

(160000, 39)

(40000, 39)

In [36]:
# Inspect the FM layer's latent-factor matrix V (one row per global feature index, k columns)
model.fm.V


<tf.Variable 'my_layer/V:0' shape=(416866, 8) dtype=float32, numpy=
array([[-0.07061051,  0.03911226,  0.00684353, ..., -0.02671424,
         0.0013881 ,  0.08055133],
       [ 0.01250868, -0.01755876,  0.01218676, ..., -0.02601379,
         0.01278361, -0.03148063],
       [ 0.04311165,  0.01400692,  0.00356985, ...,  0.00036511,
         0.00878077,  0.04960729],
       ...,
       [ 0.07237215,  0.0586164 , -0.050928  , ...,  0.00987792,
        -0.04890952, -0.01900741],
       [ 0.00903635, -0.00232638,  0.00441977, ..., -0.00183274,
        -0.00590627, -0.00448157],
       [-0.03557094,  0.04855461, -0.03319177, ...,  0.07543939,
         0.01555497, -0.00623946]], dtype=float32)>

In [38]:
# Inspect the FM layer's first-order weights w (one scalar per global feature index)
model.fm.w

<tf.Variable 'my_layer/w:0' shape=(416866, 1) dtype=float32, numpy=
array([[ 0.06552668],
       [-0.04119697],
       [ 0.05113696],
       ...,
       [ 0.00861098],
       [-0.02402387],
       [-0.03423011]], dtype=float32)>