In [63]:
from datetime import datetime
# Log the wall-clock time at which this run of the notebook started.
print(datetime.now())
import numpy as np
# os.environ["TF_CPP_MIN_LOG_LEVEL"] ="3"
import tensorflow as tf
import tensorflow_recommenders as tfrs
import pandas as pd
import math

import os
import glob

2022-08-23 22:56:13.979795


In [5]:
# tf.test.is_gpu_available() is deprecated; the TensorFlow docs recommend
# listing the physical GPU devices instead. An empty list means no GPU.
gpu_available = bool(tf.config.list_physical_devices("GPU"))

# Columns fed to the models: the item id plus the three user-side features.
req_cols = ['ITEM_ID', 'USER_ID', 'CABIN_TYPE', 'USER_RESIDENCE']

In [18]:
# Load both interaction files.
data_set_df_updated = pd.read_csv("interaction_demo.csv")
test_df = pd.read_csv("interaction_test_demo.csv")

# Missing residences become the literal string 'None' in both source frames.
for frame in (data_set_df_updated, test_df):
    frame["USER_RESIDENCE"] = frame["USER_RESIDENCE"].fillna('None')

# Stack both files and order the rows newest-first by TIMESTAMP.
train_df = pd.concat(
    [data_set_df_updated, test_df], ignore_index=True
).sort_values("TIMESTAMP", ascending=False)

In [22]:
# One row per distinct item id.
item_df = train_df[["ITEM_ID"]].drop_duplicates()

# Candidate dataset: dicts with a single "ITEM_ID" key, in batches of 32.
item_ds = tf.data.Dataset.from_tensor_slices(
    {"ITEM_ID": item_df["ITEM_ID"].tolist()}
).batch(32)

item_df.shape

(350, 1)

In [23]:
# Column-oriented dict of the required features, sliced row-wise and batched by 256.
train_features = train_df[req_cols].to_dict("list")
train_ds = tf.data.Dataset.from_tensor_slices(train_features).batch(256)

In [24]:
def _unique_feature_values(dataset, feature):
    """Return the sorted unique values of `feature` over every batch of `dataset`."""
    batches = [batch.numpy() for batch in dataset.map(lambda row: row[feature])]
    return np.unique(np.concatenate(batches))


# Vocabularies for the user-side string features.
USER_ID_unique = _unique_feature_values(train_ds, "USER_ID")
CABIN_TYPE_unique = _unique_feature_values(train_ds, "CABIN_TYPE")
USER_RESIDENCE_unique = _unique_feature_values(train_ds, "USER_RESIDENCE")

In [26]:
# Vocabulary of item ids seen in the interaction data.
item_id_batches = list(train_ds.map(lambda batch: batch["ITEM_ID"]))
item_unique = np.unique(np.concatenate(item_id_batches))

In [27]:
class UserModel(tf.keras.Model):
    """Query-side feature model.

    Embeds USER_ID, CABIN_TYPE and USER_RESIDENCE separately (32 dimensions
    each) and concatenates the three embeddings along axis 1.
    """

    def __init__(self):
        super().__init__()

        emb_dim = 32
        self.user_id_embedding = self._string_embedding(USER_ID_unique, emb_dim)
        self.cabin_type_embedding = self._string_embedding(CABIN_TYPE_unique, emb_dim)
        self.user_residence_embedding = self._string_embedding(
            USER_RESIDENCE_unique, emb_dim)

    @staticmethod
    def _string_embedding(vocabulary, emb_dim):
        """Build a string-lookup -> embedding stack for one categorical feature."""
        lookup = tf.keras.layers.experimental.preprocessing.StringLookup(
            vocabulary=vocabulary, mask_token=None)
        # One row more than the vocabulary size, so the lookup layer's
        # out-of-vocabulary index also has an embedding.
        embedding = tf.keras.layers.Embedding(len(vocabulary) + 1, emb_dim)
        return tf.keras.Sequential([lookup, embedding])

    def call(self, user_interation_data):
        """Return the concatenated feature embeddings for a batch of feature dicts."""
        towers = (
            ("USER_ID", self.user_id_embedding),
            ("CABIN_TYPE", self.cabin_type_embedding),
            ("USER_RESIDENCE", self.user_residence_embedding),
        )
        embedded = [embed(user_interation_data[key]) for key, embed in towers]
        return tf.concat(embedded, axis=1)





In [28]:
class ItemModel(tf.keras.Model):
    """Candidate-side feature model: a 32-dimensional embedding of ITEM_ID."""

    def __init__(self):
        super().__init__()

        lookup = tf.keras.layers.experimental.preprocessing.StringLookup(
            vocabulary=item_unique, mask_token=None)
        # One row more than the vocabulary size, so the lookup layer's
        # out-of-vocabulary index also has an embedding.
        embedding = tf.keras.layers.Embedding(len(item_unique) + 1, 32)
        self.item_embedding = tf.keras.Sequential([lookup, embedding])

    def call(self, user_interation_data):
        """Return the item embedding for a batch of dicts holding "ITEM_ID"."""
        item_vectors = self.item_embedding(user_interation_data["ITEM_ID"])
        # Single-element concat, kept as in the original forward pass.
        return tf.concat([item_vectors], axis=1)

In [88]:
class TRFSRetrievalModel(tfrs.models.Model):
    """Two-tower retrieval model built on TensorFlow Recommenders.

    The query tower is ``UserModel`` followed by a Dense(32) projection; the
    candidate tower is ``ItemModel`` followed by a Dense(32) projection. The
    retrieval task reports factorized top-k metrics for k in (3, 5, 10, 15, 25).

    Args:
        UserModel: Zero-argument callable returning the query feature model.
        ItemModel: Zero-argument callable returning the candidate feature model.
        item_ds: Batched dataset of ``{"ITEM_ID": ...}`` dicts; it is mapped
            through the candidate tower to form the top-k candidate set.
    """

    def __init__(self, UserModel, ItemModel, item_ds):
        super().__init__()

        self.query_model = tf.keras.Sequential([
            UserModel(),
            tf.keras.layers.Dense(
                32, kernel_initializer=tf.keras.initializers.RandomNormal(seed=99)),
        ])

        self.candidate_model = tf.keras.Sequential([
            ItemModel(),
            tf.keras.layers.Dense(
                32, kernel_initializer=tf.keras.initializers.RandomNormal(seed=1)),
        ])

        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                item_ds.map(self.candidate_model),
                ks=(3, 5, 10, 15, 25),
            )
        )

    def _embed(self, features):
        """Run both towers on one batch without modifying ``features``.

        Returns:
            Tuple ``(query_embeddings, item_embeddings)``.
        """
        # Build new dicts instead of popping from the caller's dict, so the
        # input batch still contains "ITEM_ID" afterwards.
        item_features = {"ITEM_ID": features["ITEM_ID"]}
        query_features = {
            key: value for key, value in features.items() if key != "ITEM_ID"
        }
        query_embeddings = self.query_model(query_features)
        item_embeddings = self.candidate_model(item_features)
        return query_embeddings, item_embeddings

    def compute_loss(self, features, training=True):
        """Return the retrieval task loss for one batch of feature dicts.

        ``training`` is accepted for interface compatibility and is not used;
        top-k metrics are computed on every call.
        """
        query_embeddings, item_embeddings = self._embed(features)
        return self.task(
            query_embeddings,
            item_embeddings,
            compute_metrics=True,
        )

    def call(self, test):
        """Return ``(query_embeddings, item_embeddings)`` for a batch of feature dicts."""
        return self._embed(test)


In [89]:
# Hold out the first 10% of rows (train_df is sorted newest-first) for testing.
n_rows = len(train_df)
test_max_index = math.floor(n_rows * 0.1)
train_split_len = n_rows - test_max_index

data_set_tf = tf.data.Dataset.from_tensor_slices(
    train_df[req_cols].to_dict("list")
)

test = data_set_tf.take(test_max_index)
train = data_set_tf.skip(test_max_index).take(train_split_len)

# Full-buffer shuffle of the training rows, reshuffled every epoch.
shuffled = train.shuffle(
    train_split_len,
    seed=42,
    reshuffle_each_iteration=True,
)
cached_train = shuffled.batch(512).prefetch(4096)
cached_test = test.batch(512).cache()


In [90]:
# Total number of interactions and the size of the held-out test slice.
len(train_df), test_max_index

(34853, 3485)

In [91]:
def delete_all_check_points(fpath='new_amazon_check_points/*'):
    """Delete every file matched by the glob pattern ``fpath``.

    Only regular files and symlinks are removed. Directories matched by the
    pattern are left alone, because ``os.remove`` raises on a directory and
    would abort the clean-up part-way through. A pattern that matches
    nothing is a no-op.

    Args:
        fpath: Glob pattern selecting the checkpoint files to delete.
    """
    for path in glob.glob(fpath):
        if os.path.isfile(path) or os.path.islink(path):
            os.remove(path)
        

In [92]:
def get_call_back_fun(K):
    """Build the training callbacks that track validation top-K accuracy.

    Calling this first deletes any existing files under
    ``new_amazon_check_points/`` via ``delete_all_check_points``.

    Args:
        K: The k of the ``val_factorized_top_k/top_{K}_categorical_accuracy``
            metric that all three callbacks monitor.

    Returns:
        Tuple ``(model_path, model_checkpoint_callback, early_stoping, reduce_lr)``.
    """
    delete_all_check_points()
    model_path = f"new_amazon_check_points/best_check_point_{K}k"
    monitored_metric = f'val_factorized_top_k/top_{K}_categorical_accuracy'

    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=model_path,
        save_weights_only=True,
        monitor=monitored_metric,
        mode='max',
        save_best_only=True)

    # The monitored value is an accuracy, so improvement means it goes up.
    # With mode='min' the callback would stop while accuracy is still rising.
    early_stoping = tf.keras.callbacks.EarlyStopping(monitor=monitored_metric,
                                                     mode='max',
                                                     patience=5)

    # Explicit 'max' rather than relying on Keras inferring the direction
    # from the metric name.
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor=monitored_metric,
                                                     factor=0.6,
                                                     mode='max',
                                                     patience=9,
                                                     min_lr=1e-6)
    return model_path, model_checkpoint_callback, early_stoping, reduce_lr

In [93]:
# Callbacks that monitor validation top-10 accuracy.
(
    model_path,
    model_checkpoint_callback,
    early_stoping,
    reduce_lr,
) = get_call_back_fun(K=10)

In [94]:
# Build the two-tower model and compile it with Adam at learning rate 0.01.
model = TRFSRetrievalModel(UserModel, ItemModel, item_ds)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=optimizer)



In [95]:
# `workers` / `use_multiprocessing` only apply to generator or
# keras.utils.Sequence inputs; they are dropped here because both inputs
# are tf.data.Dataset pipelines.
# NOTE(review): `early_stoping` is created alongside these callbacks but is
# not passed to fit, so training always runs the full 100 epochs.
model.fit(
    cached_train,
    validation_data=cached_test,
    epochs=100,
    verbose=1,
    callbacks=[model_checkpoint_callback, reduce_lr],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100


Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100


Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100


Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100


Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100


Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100


Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100


Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100


Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100


Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100


Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x7efecc39c700>

In [96]:
# Restore the best checkpoint from the path returned by get_call_back_fun,
# so the path cannot drift from the one ModelCheckpoint wrote to.
model.load_weights(model_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7efeec76a7f0>

In [97]:
# Evaluate the restored weights on the held-out split; returns a list of metric values.
result_summary = model.evaluate(x=cached_test, return_dict=False)



In [99]:
# Look the top-k accuracies up by metric name instead of assuming they sit
# in the first five positions of the list returned by model.evaluate.
metrics_by_name = dict(zip(model.metrics_names, result_summary))
{
    k: metrics_by_name[f"factorized_top_k/top_{k}_categorical_accuracy"]
    for k in (3, 5, 10, 15, 25)
}

{3: 0.20459111034870148,
 5: 0.2622668445110321,
 10: 0.34318506717681885,
 15: 0.39856529235839844,
 25: 0.4837876558303833}