In [1]:
import random
import time
import os

import tensorflow as tf
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from tqdm.notebook import tqdm

from AFM import AFM, DeepAFM

In [2]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
    # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
        print(e)


1 Physical GPUs, 1 Logical GPUs


In [3]:
def negative_sample(df, columns_unqiue):
    x = df.groupby(['userId', 'movieId'])
    negative_df = pd.DataFrame()
    for name, group in tqdm(x):
        for index, row in tqdm(group.iterrows(), leave=False):
            tag_options = list(set(columns_unqiue['tag']) - set(group.tag.tolist()))
           

            negative_row1 = row.copy()
            negative_row2 = row.copy()

            if tag_options:
                new_tag = random.choices(tag_options, k=2)
                negative_row1.tag = new_tag[0]
                negative_row2.tag = new_tag[1]
    
            negative_df = negative_df.append(negative_row1)
            negative_df = negative_df.append(negative_row2)
        
    
    return negative_df

In [4]:
if not os.path.exists('movielens_all.csv'):
    
    movielens_df = pd.read_csv('ml-20m/tags.csv', sep=',')
    
    columns_unqiue = {}
    for column in movielens_df.columns:
        columns_unqiue[column] = movielens_df[column].unique().tolist()

    
    negative_df = negative_sample(movielens_df, columns_unqiue)

    negative_df['label'] = [0] * negative_df.shape[0]

    negative_df = negative_df[['userId', 'movieId', 'tag']]

    movielens_df['label'] = [1] * movielens_df.shape[0]

    df = movielens_df.append(negative_df)

    df.reset_index(drop=True, inplace=True)

    df.to_csv('movielens_all.csv', index=False)

In [5]:
df = pd.read_csv('movielens_all.csv')

In [None]:
labels = df['label']
df.drop(columns=['label'], inplace=True)

In [6]:
df

Unnamed: 0,userId,movieId,tag,label
0,124998.0,76251.0,foul language,1.0
1,131620.0,5707.0,Sally Field,1.0
2,121164.0,30848.0,David E. Durston,0.0
3,91544.0,94466.0,==============,0.0
4,93258.0,924.0,high-tech firms,0.0
...,...,...,...,...
1396687,33119.0,97938.0,AFI 10 (courtroom drama),0.0
1396688,19356.0,593.0,prescient,0.0
1396689,57124.0,67197.0,watch again before hating,0.0
1396690,127138.0,70451.0,add to prospects list,1.0


In [7]:
labels = df['label']
df.drop(columns=['label'], inplace=True)

In [8]:
encoders = {}
for column in df.columns:
    encoders[column] = LabelEncoder()
    df[column] = encoders[column].fit_transform(df[column].values)

In [9]:
features = df.nunique().to_dict()

In [10]:
loss_object = tf.keras.losses.MeanSquaredError()
optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.01)


In [11]:
def loss_function(y_true, y_pred):
    
    loss = loss_object(y_true=y_true, y_pred=y_pred)
    
    rmse = tf.math.sqrt(loss)
    
    return rmse

In [12]:
afm = AFM(features, embedding_size=256, attention_factor=16, rate=0.1, reg=0)


In [13]:
x_train, x_test, y_train, y_test = train_test_split(df, labels, test_size=0.1)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2)

In [14]:
train_loss = tf.keras.metrics.BinaryCrossentropy(name='train_loss')
train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_accuracy')
train_auc = tf.keras.metrics.AUC(name='train_auc')

val_loss = tf.keras.metrics.BinaryCrossentropy(name='val_loss')
val_accuracy = tf.keras.metrics.BinaryAccuracy(name='val_accuracy')
val_auc = tf.keras.metrics.AUC(name='val_auc')

test_loss = tf.keras.metrics.BinaryCrossentropy(name='test_loss')
test_accuracy = tf.keras.metrics.BinaryAccuracy(name='test_accuracy')
test_auc = tf.keras.metrics.AUC(name='test_auc')



In [15]:
def train_step(inputs, target):

    with tf.GradientTape() as tape:
        predictions  = afm(inputs, True)
        loss = loss_function(y_true=target, y_pred=predictions)
                
    gradients = tape.gradient(loss, afm.trainable_variables)    
    optimizer.apply_gradients(zip(gradients, afm.trainable_variables))

    train_loss(y_true=target, y_pred=predictions)
    train_accuracy(y_true=target, y_pred=predictions)
    train_auc(y_true=target, y_pred=predictions)


In [16]:
EPOCHS = 5
BATCH_SIZE = 128
STEPS = x_train.shape[0] // BATCH_SIZE

In [17]:
checkpoint_path = "./checkpoints/ml/AFM"

ckpt = tf.train.Checkpoint(transformer=afm,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print ('Latest checkpoint restored!!')

In [18]:
x_train

Unnamed: 0,userId,movieId,tag
1118326,7168,6809,36404
1105932,7602,15559,24884
1108919,5791,1033,12634
407713,6560,17891,5715
867650,6833,319,28477
...,...,...,...
1141924,6191,1124,24895
373252,1534,46,27805
75574,960,11577,10007
342824,6943,13810,33387


In [19]:
for epoch in range(EPOCHS):
    start = time.time()

    train_loss.reset_states()
    train_accuracy.reset_states()
    train_auc.reset_states()

    
    val_loss.reset_states()
    val_accuracy.reset_states()
    val_auc.reset_states()

    for batch in range(STEPS):
        
        sample = x_train.sample(n=BATCH_SIZE)
        indexs = sample.index
        y = y_train[indexs].values.reshape((-1,1))
        x = {k: np.array(list(v.values())) for k, v in sample.to_dict().items()}
        train_step(x, y)

        if batch % 2500 == 0:
            print ('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f} AUC {:.4f}'.format(
              epoch + 1, batch, train_loss.result(), train_accuracy.result(), train_auc.result()))

    for batch in range(x_val.shape[0] // BATCH_SIZE):

        sample = x_val.sample(n=BATCH_SIZE)
        indexs = sample.index
        y = y_val[indexs].values.reshape((-1,1))
        x = {k: np.array(list(v.values())) for k, v in sample.to_dict().items()}
        val_predictions = afm(x, False)

        val_loss(y_true=y, y_pred=val_predictions)
        val_accuracy(y_true=y, y_pred=val_predictions)
        val_auc(y_true=y, y_pred=val_predictions)

    print()
    print('Validation Loss {:.4f} Accuracy {:.4f} AUC {:.4f}'.format(
      val_loss.result(), val_accuracy.result(), val_auc.result()))
    print()

    
    if (epoch + 1) % 5 == 0:
        ckpt_save_path = ckpt_manager.save()
        print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,
                                                             ckpt_save_path))

        print ('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1, 
                                                    train_loss.result(), 
                                                    train_accuracy.result()))

        print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))


Epoch 1 Batch 0 Loss 0.6913 Accuracy 0.5156 AUC 0.5535
Epoch 1 Batch 2500 Loss 0.6498 Accuracy 0.6509 AUC 0.5103
Epoch 1 Batch 5000 Loss 0.6445 Accuracy 0.6545 AUC 0.5230
Epoch 1 Batch 7500 Loss 0.6420 Accuracy 0.6562 AUC 0.5364

Validation Loss 0.6319 Accuracy 0.6664 AUC 0.6086

Epoch 2 Batch 0 Loss 0.6533 Accuracy 0.6328 AUC 0.5782
Epoch 2 Batch 2500 Loss 0.6361 Accuracy 0.6602 AUC 0.5979
Epoch 2 Batch 5000 Loss 0.6352 Accuracy 0.6607 AUC 0.6063
Epoch 2 Batch 7500 Loss 0.6341 Accuracy 0.6616 AUC 0.6132

Validation Loss 0.6269 Accuracy 0.6660 AUC 0.6608

Epoch 3 Batch 0 Loss 0.6392 Accuracy 0.6641 AUC 0.5418
Epoch 3 Batch 2500 Loss 0.6305 Accuracy 0.6648 AUC 0.6397
Epoch 3 Batch 5000 Loss 0.6295 Accuracy 0.6655 AUC 0.6432
Epoch 3 Batch 7500 Loss 0.6286 Accuracy 0.6660 AUC 0.6474

Validation Loss 0.6221 Accuracy 0.6660 AUC 0.6935

Epoch 4 Batch 0 Loss 0.6400 Accuracy 0.6484 AUC 0.6939
Epoch 4 Batch 2500 Loss 0.6268 Accuracy 0.6658 AUC 0.6611
Epoch 4 Batch 5000 Loss 0.6252 Accuracy 0.66

In [20]:
test_loss.reset_states()
test_accuracy.reset_states()
test_auc.reset_states()

for batch in range(x_test.shape[0] // BATCH_SIZE):

        sample = x_test.sample(n=BATCH_SIZE)
        indexs = sample.index
        y = y_test[indexs].values.reshape((-1,1))
        x = {k: np.array(list(v.values())) for k, v in sample.to_dict().items()}
        test_predictions = afm(x, False)

        test_loss(y_true=y, y_pred=test_predictions)
        test_accuracy(y_true=y, y_pred=test_predictions)
        test_auc(y_true=y, y_pred=test_predictions)

print('Test Loss {:.4f} Accuracy {:.4f} AUC {:.4f}'.format(
      test_loss.result(), test_accuracy.result(), test_auc.result()))

Test Loss 0.6136 Accuracy 0.6735 AUC 0.7270
