In [1]:
!pip install tensorflow

from IPython.display import clear_output
from google.colab import drive
import os.path
from os import path
import collections
import pathlib
import tensorflow as tf
from tensorflow.keras import utils
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

drive.mount('/content/gdrive')
gdrive_path = '/content/gdrive/My Drive/recommender_dataset'

if not path.exists(gdrive_path):
  os.mkdir(gdrive_path)

os.chdir(gdrive_path)
%cd '/content/gdrive/MyDrive/recommender_dataset/'

# Set the destination path
destination_path='/content/gdrive/MyDrive/recommender_dataset/'

# Create the destination directory if it doesn't exist
!mkdir -p "$destination_path"

# Download the tar.gz file
!wget -O "$destination_path/criteo-research-kaggle-display-advertising-challenge-dataset.tar.gz" "https://go.criteo.net/criteo-research-kaggle-display-advertising-challenge-dataset.tar.gz"

# Extract the tar.gz file
!tar -xzvf "$destination_path/criteo-research-kaggle-display-advertising-challenge-dataset.tar.gz" -C "$destination_path"

# Remove the downloaded tar.gz file if desired
!rm "$destination_path/criteo-research-kaggle-display-advertising-challenge-dataset.tar.gz"

clear_output()

In [2]:
destination_path='/content/gdrive/MyDrive/recommender_dataset/'
file = destination_path + 'train.txt'
print(file)
columns = ['label', *(f'I{i}' for i in range(1, 14)), *(f'C{i}' for i in range(1, 27))]
df = pd.read_csv(file, nrows=1000000, sep='\t', names=columns)

/content/gdrive/MyDrive/recommender_dataset/train.txt


Preprocessing

In [13]:
cat_columns = [f'C{i}' for i in range(1, 27)]
int_columns = [f'I{i}' for i in range(1, 14)]

In [8]:
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

X = df.fillna(0)
labels = X['label']
integer_features = X.iloc[:, 1:14]  # I1-I13
categorical_features = X.iloc[:, 14:]  # C1-C26

# Convert categorical features to numerical representations (hashing)
label_encoders = {}
for col in cat_columns:
    le = preprocessing.LabelEncoder()
    categorical_features[col] = le.fit_transform(categorical_features[col].astype(str))
    label_encoders[col] = le

features = pd.concat([integer_features, categorical_features], axis=1)

# Split dataset into train and test sets
train_data, test_data, train_labels, test_labels = train_test_split(features, labels, test_size=0.2, random_state=42)

# Prepare target and features
train_numerical_features = train_data[['I' + str(i) for i in range(1, 14)]]
train_categorical_features = train_data[cat_columns]

test_numerical_features = test_data[['I' + str(i) for i in range(1, 14)]]
test_categorical_features = test_data[cat_columns]

In [26]:
from tensorflow.keras.layers import Input, Embedding, Dense, Concatenate, Flatten
from tensorflow import keras
from tensorflow.keras.models import Model

def build_mlp(input_dim):
    model = keras.Sequential([
        keras.layers.Dense(128, activation='relu', input_dim=input_dim),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dense(1, activation='sigmoid')
    ])
    return model

In [28]:
embedding_dim = 8
num_cat_cols = len(cat_columns)
num_int_cols = len(int_columns)

input_dim = num_cat_cols + num_int_cols

mlp_model = build_mlp(input_dim)
mlp_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])

mlp_model.fit(
    x=tf.concat([train_categorical_features, train_numerical_features], axis=1),
    y=train_labels,
    epochs=10,
    batch_size=128,
    validation_split=0.1
)
mlp_model.save('/content/gdrive/MyDrive/recommender_res/q1/mlp_model.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [30]:
# Copy the dataframe
eval_data_mlp = test_data.copy()

# Drop the column "col2" from the copied dataframe
mlp_auc, mlp_loss = mlp_model.evaluate(eval_data_mlp, test_labels)
print("MLP AUC:", mlp_auc, "Loss:", mlp_loss)

MLP AUC: 803.6530151367188 Loss: 0.4881417751312256


In [31]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Layer

class FactorizationMachine(Layer):
    def __init__(self, input_dim, num_factors, **kwargs):
        super(FactorizationMachine, self).__init__(**kwargs)
        self.input_dim = input_dim
        self.num_factors = num_factors

    def build(self, input_shape):
        self.V = self.add_weight(name='V',
                                 shape=(self.input_dim, self.num_factors),
                                 initializer='uniform',
                                 trainable=True)
        super(FactorizationMachine, self).build(input_shape)

    def call(self, inputs):
        linear_terms = tf.reduce_sum(inputs, axis=1, keepdims=True)
        interactions = 0.5 * tf.reduce_sum(
            tf.square(tf.matmul(inputs, self.V) -
                      tf.matmul(tf.square(inputs), tf.square(self.V))),
            axis=1, keepdims=True)
        output = linear_terms + interactions
        return output

def build_mlp_fm(input_dim, fm_num_factors):
    input_layer = keras.layers.Input(shape=(input_dim,))

    # MLP layers
    mlp = keras.Sequential([
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dense(64, activation='relu')
    ])
    mlp_output = mlp(input_layer)

    # FM layer
    fm_output = FactorizationMachine(input_dim=input_dim, num_factors=fm_num_factors)(input_layer)

    # Concatenate MLP and FM outputs
    concatenated = keras.layers.Concatenate()([mlp_output, fm_output])

    # Final MLP layer
    final_output = keras.layers.Dense(1, activation='sigmoid')(concatenated)

    model = keras.Model(inputs=input_layer, outputs=final_output)
    return model

fm_num_factors = 10  # Number of factors for FM

mlp_fm_model = build_mlp_fm(input_dim, fm_num_factors)
mlp_fm_model.summary()


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 39)]         0           []                               
                                                                                                  
 sequential_2 (Sequential)      (None, 64)           13376       ['input_1[0][0]']                
                                                                                                  
 factorization_machine (Factori  (None, 1)           390         ['input_1[0][0]']                
 zationMachine)                                                                                   
                                                                                                  
 concatenate_2 (Concatenate)    (None, 65)           0           ['sequential_2[0][0]',     

In [32]:
mlp_fm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])

mlp_fm_model.fit(
    x=tf.concat([train_categorical_features, train_numerical_features], axis=1),
    y=train_labels,
    epochs=10,
    batch_size=128,
    validation_split=0.1
)
mlp_fm_model.save('/content/gdrive/MyDrive/recommender_res/q2/mlp_fm_model.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [33]:
# Copy the dataframe
eval_data_mlp_fm = test_data.copy()

# Drop the column "col2" from the copied dataframe
mlp_fm_auc, mlp_fm_loss = mlp_fm_model.evaluate(eval_data_mlp_fm, test_labels)
print("MLP+FM AUC:", mlp_fm_auc, "MLP+FM Loss:", mlp_fm_loss)

MLP+FM AUC: 31842531328.0 MLP+FM Loss: 0.5
