In [23]:
"""
TensorFlow2.Xで構築
"""

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Embedding, Dropout, Dense, Input, Layer
import tensorflow as tf
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC
import os

# The full dataset can be downloaded from: https://www.kaggle.com/datasets/mrkmakr/criteo-dataset

# Make IPython display the value of every top-level expression in a cell
# rather than only the last one. This is why the bare docstring strings at
# the top of the cells are echoed back as output.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

'\nTensorFlow2.Xで構築\n'

In [22]:
"""
criteo dataset preprocessing

Criteo data features:
- Label - target label: "Click" = 1, "Non-Click" = 0
- I1-I13 - 13 integer feature columns (mostly count features)
- C1-C26 - 26 categorical feature columns; for anonymization, the original values were hashed to 32 bits
"""

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, KBinsDiscretizer
from sklearn.model_selection import train_test_split

def sparse_feature(feat, feat_num, embed_dim=4):
    """
    Describe one sparse (categorical) feature as a plain dictionary.

    :@param feat: name of the feature
    :@param feat_num: number of distinct values the feature takes
    :@param embed_dim: size of the embedding vector for this feature
    :return: dict with the keys 'feat_name', 'feat_num' and 'embed_dim'
    """
    return dict(feat_name=feat, feat_num=feat_num, embed_dim=embed_dim)


def dense_feature(feat):
    """
    Describe one dense (numerical) feature as a plain dictionary.

    :@param feat: name of the feature
    :return: dict with the single key 'feat_name'
    """
    return dict(feat_name=feat)


def create_criteo_dataset(file, embed_dim=8, read_part=True, sample_num=100000, test_size=0.2,
                          random_state=None):
    """
    Load and preprocess the Criteo click dataset.

    The tab-separated file is expected to hold, per row, the label followed by
    13 integer columns (I1-I13) and 26 categorical columns (C1-C26). Dense
    columns are bucketised into 100 uniform-width bins and categorical columns
    are label-encoded, so every returned feature is an integer index that can
    be fed to an embedding table.

    :@param file: data file path
    :@param embed_dim: The embedding dimension of sparse features
    :@param read_part: If True, only the first `sample_num` rows are read
        (best set to True if the data set is large)
    :@param sample_num: Number of rows read when `read_part` is True
    :@param test_size: Test set ratio
    :@param random_state: Seed forwarded to `train_test_split`; pass an int for
        a reproducible split. The default (None) keeps the split random.
    :return: `(feature_columns, (train_X, train_y), (test_X, test_y))` where
        `feature_columns` is a list of `sparse_feature` dicts ordered as
        C1..C26 followed by I1..I13, and the X/y arrays are int32.
    """
    # Specify sparse and dense features
    dense_features = ['I' + str(i) for i in range(1, 14)]
    sparse_features = ['C' + str(i) for i in range(1, 27)]
    names = ['label'] + dense_features + sparse_features
    # Model input column order: categorical features first, then the numeric ones.
    features = sparse_features + dense_features

    # Partial read and full read. `nrows` reads the first rows and closes the
    # file, whereas an iterator reader would stay open after `get_chunk`.
    # Categorical columns are forced to str so that type inference can never
    # produce a mixed-type column, which LabelEncoder cannot sort.
    data_df = pd.read_csv(file, sep='\t', header=None, names=names,
                          nrows=sample_num if read_part else None,
                          dtype={feat: str for feat in sparse_features})

    # Missing value filling
    data_df[sparse_features] = data_df[sparse_features].fillna('nan')
    data_df[dense_features] = data_df[dense_features].fillna(0)

    # NOTE: the discretizer and the label encoders below are fit on the whole
    # frame, i.e. before the train/test split.

    # Discretization of the dense features into ordinal bin indices
    est = KBinsDiscretizer(n_bins=100, encode='ordinal', strategy='uniform')
    data_df[dense_features] = est.fit_transform(data_df[dense_features])

    # Map every category string to an integer id, column by column
    for feat in sparse_features:
        le = LabelEncoder()
        data_df[feat] = le.fit_transform(data_df[feat])

    # Feature engineering: every column is now discrete, so each one gets an
    # embedding table sized by its largest index + 1.
    feature_columns = [sparse_feature(feat, int(data_df[feat].max()) + 1, embed_dim=embed_dim) for feat in features]
    train, test = train_test_split(data_df, test_size=test_size, random_state=random_state)

    # Generate training and test sets
    train_X = train[features].values.astype('int32')
    train_y = train['label'].values.astype('int32')
    test_X = test[features].values.astype('int32')
    test_y = test['label'].values.astype('int32')

    return feature_columns, (train_X, train_y), (test_X, test_y)

'\ncriteo dataset preprocessing\n\ncriteo data feature introduce：\n- Label - Y label，「Click」value=1，「Non-Click」value=0\n- I1-I13 - Totally 13 col integer data feature（most of them are counting features）\n- C1-C26 - Totally 26 col catagory features，for secure reason，original data was transformed to 32 bit data\n'

In [12]:
class MyFM(Layer):
    """
    Factorization-machine layer (the "wide" part of the model).

    Adds a first-order term, looked up from a per-index weight table, to the
    second-order pairwise interaction term computed from field embeddings.
    """
    def __init__(self, feature_length, w_r=1e-6):
        """
        :@param feature_length: number of rows in the first-order weight table
        :@param w_r: L2 regularization coefficient applied to that table
        """
        super().__init__()
        self.w_r = w_r
        self.feature_length = feature_length

    def build(self, input_shape):
        # One scalar weight per index; created lazily on the first call.
        self.w = self.add_weight(
            name='w',
            shape=(self.feature_length, 1),
            initializer='random_normal',
            regularizer=l2(self.w_r),
            trainable=True,
        )

    def call(self, inputs, **kwargs):
        """
        :@param inputs: dict with two entries:
          'sparse_inputs' - 2D integer tensor of row indices into `w`,
              one column per field
          'embed_inputs' - 3D tensor `(batch_size, fields, embed_dim)`
        :return: tensor of shape `(batch_size, 1)`
        """
        indices = inputs['sparse_inputs']
        embeddings = inputs['embed_inputs']

        # First-order term: sum the looked-up weights over the field axis.
        looked_up = tf.nn.embedding_lookup(self.w, indices)
        first_order = tf.reduce_sum(looked_up, axis=1)  # (batch_size, 1)

        # Second-order term: 0.5 * sum_k((sum_i v_ik)^2 - sum_i v_ik^2)
        field_sum = tf.reduce_sum(embeddings, axis=1, keepdims=True)
        square_of_sum = tf.square(field_sum)  # (batch_size, 1, embed_dim)
        sum_of_squares = tf.reduce_sum(tf.square(embeddings), axis=1, keepdims=True)  # (batch_size, 1, embed_dim)
        second_order = 0.5 * tf.reduce_sum(square_of_sum - sum_of_squares, axis=2)  # (batch_size, 1)

        return first_order + second_order

In [13]:
class MyDNN(Layer):
    """
    Deep part: a stack of fully connected layers followed by one dropout.
    """
    def __init__(self, hidden_units, activation='relu', dnn_dropout=0.):
        """
        :@param hidden_units: sizes of the hidden layers, e.g. '[unit1, unit2,...,]'
        :@param activation: activation used by every hidden layer
        :@param dnn_dropout: dropout rate applied to the output of the stack
        """
        super().__init__()
        stack = []
        for unit in hidden_units:
            stack.append(Dense(units=unit, activation=activation))
        self.dnn_network = stack
        self.dropout = Dropout(dnn_dropout)

    def call(self, inputs, **kwargs):
        hidden = inputs
        for layer in self.dnn_network:
            hidden = layer(hidden)
        # Dropout is applied once, after the last hidden layer.
        return self.dropout(hidden)

In [14]:
class DeepFM(Model):
    """
    DeepFM: an FM ("wide") branch and a DNN ("deep") branch that share the
    same per-field embeddings; their outputs are averaged and passed through
    a sigmoid.
    """
    def __init__(self, feature_columns, hidden_units=(200, 200, 200), dnn_dropout=0.,
                 activation='relu', fm_w_r=1e-6, embed_reg=1e-6):
        """
        DeepFM
        :@param feature_columns: Sparse feature list; each item is a dict with
            'feat_num' and 'embed_dim' keys, in input-column order
        :@param hidden_units: List of hidden layer neurons
        :@param dnn_dropout: Dropout rate
        :@param activation: Activation function
        :@param fm_w_r: The regularization coefficient of w in FM
        :@param embed_reg: embedding regularization coefficient
        """
        super(DeepFM, self).__init__()
        self.sparse_feature_columns = feature_columns
        # One embedding table per input column. No `input_length` is given:
        # each table is called on a rank-1 `(batch_size,)` slice, so a fixed
        # sequence length of 1 would not describe the real input.
        self.embed_layers = {
            'embed_' + str(i): Embedding(input_dim=feat['feat_num'],
                                         output_dim=feat['embed_dim'],
                                         embeddings_initializer='random_normal',
                                         embeddings_regularizer=l2(embed_reg))
            for i, feat in enumerate(self.sparse_feature_columns)
        }
        # index_mapping[i] is the offset of field i inside the single
        # concatenated first-order weight table used by the FM layer.
        self.index_mapping = []
        self.feature_length = 0
        for feat in self.sparse_feature_columns:
            self.index_mapping.append(self.feature_length)
            self.feature_length += feat['feat_num']
        self.embed_dim = self.sparse_feature_columns[0]['embed_dim']  # all sparse features have the same embed_dim
        self.fm = MyFM(self.feature_length, fm_w_r)
        self.dnn = MyDNN(hidden_units, activation, dnn_dropout)
        self.dense = Dense(1, activation=None)

    def call(self, inputs, **kwargs):
        """
        :@param inputs: integer tensor of shape `(batch_size, fields)` holding,
            per column, the index of the active value of that field
        :return: tensor of shape `(batch_size, 1)` with values in (0, 1)
        """
        sparse_inputs = inputs
        num_fields = len(self.sparse_feature_columns)

        # embedding
        sparse_embed = tf.concat(
            [self.embed_layers['embed_{}'.format(i)](sparse_inputs[:, i]) for i in range(num_fields)],
            axis=-1)  # (batch_size, embed_dim * fields)

        # wide: shift each column by its field offset so indices address the
        # shared weight table. Offsets are built in the input dtype so that
        # int64 inputs work as well as int32 ones.
        offsets = tf.constant(self.index_mapping, dtype=sparse_inputs.dtype)
        wide_inputs = {'sparse_inputs': sparse_inputs + offsets,
                       'embed_inputs': tf.reshape(sparse_embed, shape=(-1, num_fields, self.embed_dim))}
        wide_outputs = self.fm(wide_inputs)  # (batch_size, 1)

        # deep
        deep_outputs = self.dnn(sparse_embed)
        deep_outputs = self.dense(deep_outputs)  # (batch_size, 1)

        # combination: equal-weight average of both logits, then sigmoid
        outputs = tf.nn.sigmoid(tf.add(0.5 * wide_outputs, 0.5 * deep_outputs))
        return outputs

    def summary(self, *args, **kwargs):
        """
        Print a layer summary by tracing `call` on a symbolic input.

        Any arguments are forwarded to `Model.summary` of the traced model, so
        the usual options (e.g. `line_length`, `print_fn`) are accepted.
        Calling it without arguments behaves as before.
        """
        sparse_inputs = Input(shape=(len(self.sparse_feature_columns),), dtype=tf.int32)
        Model(inputs=sparse_inputs, outputs=self.call(sparse_inputs)).summary(*args, **kwargs)

In [8]:

 # Environment setting
# Expose only CUDA device 0 to this process.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# Data folder. Set the CRITEO_DATA_DIR environment variable to point at your
# own copy of the data; the fallback is the original author's local path.
dataDir = os.environ.get('CRITEO_DATA_DIR', '/Users/hayden/Documents/data')

# Hyperparameter setting
file = os.path.join(dataDir, 'Criteo_dataset', 'train.txt')
read_part = True
sample_num = 200000
test_size = 0.2

embed_dim = 8
dnn_dropout = 0.5
hidden_units = [256, 128, 64]

learning_rate = 0.001
batch_size = 4096
epochs = 10

# Build data set
feature_columns, train, test = create_criteo_dataset(file=file,
                                                     embed_dim=embed_dim,
                                                     read_part=read_part,
                                                     sample_num=sample_num,
                                                     test_size=test_size)
train_X, train_y = train
test_X, test_y = test

In [15]:
 # model building
model = DeepFM(feature_columns, hidden_units=hidden_units, dnn_dropout=dnn_dropout)
model.summary()

# Binary cross-entropy objective, tracked with AUC.
optimizer = Adam(learning_rate=learning_rate)
model.compile(loss=binary_crossentropy, optimizer=optimizer, metrics=[AUC()])

# Stop once validation loss has not improved for 2 epochs and roll back to
# the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# model training; the last 10% of the training data is held out for validation
model.fit(
    train_X,
    train_y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
    callbacks=[early_stopping],
)


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 39)]                 0         []                            
                                                                                                  
 tf.__operators__.getitem (  (None,)                      0         ['input_1[0][0]']             
 SlicingOpLambda)                                                                                 
                                                                                                  
 tf.__operators__.getitem_1  (None,)                      0         ['input_1[0][0]']             
  (SlicingOpLambda)                                                                               
                                                                                              



Epoch 1/10


2023-09-23 22:22:48.491269: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-09-23 22:23:01.017742: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10


<keras.src.callbacks.History at 0x16a38b280>

In [16]:
# Performance on testing set: evaluate() returns [loss, AUC] for this model,
# so index 1 is the AUC.
test_metrics = model.evaluate(test_X, test_y, batch_size=batch_size)
print('test AUC: %f' % test_metrics[1])

test AUC: 0.755763
