In [None]:
from datetime import datetime
print(str(datetime.now()))
import numpy as np
# os.environ["TF_CPP_MIN_LOG_LEVEL"] ="3"
import tensorflow as tf
import tensorflow_recommenders as tfrs
import pandas as pd
import math

import os
import glob
import matplotlib.pyplot as plt


In [None]:
# !pip install matplotlib

In [None]:
# Base feature columns pulled from the interaction frame for the TF datasets.
req_cols = ['ITEM_ID', 'USER_ID', 'CABIN_TYPE', 'USER_RESIDENCE']

# tf.test.is_gpu_available() is deprecated; listing the physical GPU devices is
# the supported replacement. Kept as a bool so the variable's type is unchanged.
gpu_available = bool(tf.config.list_physical_devices("GPU"))
# Last expression of the cell so the notebook actually displays the result.
gpu_available

In [None]:
# Load both interaction files.
data_set_df_updated = pd.read_csv("dataset/interaction_demo.csv")
test_df = pd.read_csv("dataset/interaction_test_demo.csv")

# Replace missing residences with the literal string 'None' in each frame.
for frame in (data_set_df_updated, test_df):
    frame.loc[frame["USER_RESIDENCE"].isnull(), "USER_RESIDENCE"] = 'None'

# Stack the two frames and order the rows by TIMESTAMP, newest first.
train_df = pd.concat([data_set_df_updated, test_df], ignore_index=True)
train_df = train_df.sort_values("TIMESTAMP", ascending=False)

In [None]:
# Number of interaction rows per ITEM_ID, as a two-column frame (ITEM_ID, counts).
item_interaction_sizes = train_df.groupby("ITEM_ID").size()
train_df_item_count = item_interaction_sizes.reset_index(name='counts')

In [None]:
# Share of all interactions that belongs to each item (the shares sum to 1).
total_interactions = train_df_item_count["counts"].sum()
train_df_item_count["probability"] = train_df_item_count["counts"] / total_interactions

In [None]:
# Attach each item's interaction share to every interaction row.
# Any "probability" column left over from a previous run of this cell is dropped
# first; otherwise re-running would yield probability_x / probability_y columns
# and break the later `train_df[req_cols]` selection.
train_df = (
    train_df
    .drop(columns="probability", errors="ignore")
    .merge(train_df_item_count[["ITEM_ID", "probability"]], how='left', on='ITEM_ID')
)


In [None]:
# Display the interaction frame after the per-item "probability" merge.
train_df

In [None]:
# One row per distinct ITEM_ID; this is the candidate catalogue.
item_ids_only = train_df[["ITEM_ID"]]
item_df = item_ids_only.drop_duplicates(subset="ITEM_ID")

# Candidate dataset of {"ITEM_ID": ...} dicts in batches of 32.
item_features = item_df.to_dict("list")
item_ds = tf.data.Dataset.from_tensor_slices(item_features).batch(32)

item_df.shape

In [None]:
# Add the sampling probability and event value to the selected columns.
# Only missing names are appended, so re-running this cell does not duplicate
# columns in `train_df[req_cols]`.
req_cols = req_cols + [col for col in ("probability", "EVENT_VALUE") if col not in req_cols]

In [None]:
# Full interaction set as a dataset of feature dicts, 256 rows per batch.
train_features = train_df[req_cols].to_dict("list")
train_ds = tf.data.Dataset.from_tensor_slices(train_features).batch(256)

In [None]:
def _unique_feature_values(dataset, feature_name):
    """Return the sorted unique values of one feature across all batches of `dataset`."""
    per_batch_values = list(dataset.map(lambda batch: batch[feature_name]))
    return np.unique(np.concatenate(per_batch_values))


# Vocabularies for the query-side string features.
USER_ID_unique = _unique_feature_values(train_ds, "USER_ID")
CABIN_TYPE_unique = _unique_feature_values(train_ds, "CABIN_TYPE")
USER_RESIDENCE_unique = _unique_feature_values(train_ds, "USER_RESIDENCE")

In [None]:
# Vocabulary of item ids: collect the ITEM_ID tensor of every batch, then
# de-duplicate (np.unique also sorts the values).
item_id_batches = list(train_ds.map(lambda batch: batch["ITEM_ID"]))
item_unique = np.unique(np.concatenate(item_id_batches))

In [None]:
# def rankL(np_rank):
#     r = int(np_rank[-1])
#     _l = 0
#     for k in range(1, r+1):
#         _l += 1./k
#     return np.float32(_l)


# """
# Labels are assumed to be one-hot encoded.
# """
# def warp_loss(labels, logits):
#     # for easy broadcasting
#     labels, logits = tf.transpose(labels, [1, 0]), tf.transpose(logits, [1, 0])
#     f_y = tf.reduce_sum(logits*labels, axis=0)
#     rank = tf.reduce_sum(tf.maximum(tf.sign(1+logits-f_y), 0), axis=0)
#     diff = tf.reduce_sum(tf.maximum(1+logits-f_y, 0), axis=0)
#     with tf.control_dependencies([tf.assert_greater(rank, tf.zeros_like(rank))]):
#         return tf.py_func(rankL, [rank], tf.float32) * diff/rank

In [None]:
class UserModel(tf.keras.Model):
    """Query tower: embeds USER_ID, CABIN_TYPE and USER_RESIDENCE and concatenates them.

    The three embeddings have widths 16, 8 and 8, so each row of the output is a
    32-dimensional vector.
    """

    def __init__(self):
        super().__init__()

        emb_dim = 8
        self.user_id_embedding = self._lookup_then_embed(USER_ID_unique, 16)
        self.cabin_type_embedding = self._lookup_then_embed(CABIN_TYPE_unique, emb_dim)
        self.user_residence_embedding = self._lookup_then_embed(USER_RESIDENCE_unique, emb_dim)

    @staticmethod
    def _lookup_then_embed(vocabulary, output_dim):
        """Build a string-lookup -> embedding stack for one categorical feature."""
        return tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.StringLookup(
                vocabulary=vocabulary, mask_token=None),
            # One extra row beyond the vocabulary size for the lookup's
            # out-of-vocabulary index.
            tf.keras.layers.Embedding(len(vocabulary) + 1, output_dim),
        ])

    def call(self, user_interation_data):
        """Embed the three query features of a batch and join them along axis 1."""
        embedded_features = [
            self.user_id_embedding(user_interation_data["USER_ID"]),
            self.cabin_type_embedding(user_interation_data["CABIN_TYPE"]),
            self.user_residence_embedding(user_interation_data["USER_RESIDENCE"]),
        ]
        return tf.concat(embedded_features, axis=1)





In [None]:
class ItemModel(tf.keras.Model):
    """Candidate tower: maps ITEM_ID strings to 32-dimensional embeddings."""

    def __init__(self):
        super().__init__()

        id_lookup = tf.keras.layers.experimental.preprocessing.StringLookup(
            vocabulary=item_unique, mask_token=None)
        # One extra row beyond the vocabulary size for the lookup's
        # out-of-vocabulary index.
        id_embedding = tf.keras.layers.Embedding(len(item_unique) + 1, 32)
        self.item_embedding = tf.keras.Sequential([id_lookup, id_embedding])

    def call(self, user_interation_data):
        """Return the item embedding for the batch's "ITEM_ID" feature."""
        embedded_features = [self.item_embedding(user_interation_data["ITEM_ID"])]
        # Kept as a concat over a one-element list so more item features can be
        # appended later without changing the return statement.
        return tf.concat(embedded_features, axis=1)

In [None]:
class TRFSRetrievalModel(tfrs.models.Model):
    """Two-tower retrieval model built on a TFRS ``Retrieval`` task.

    Args:
        UserModel: Zero-argument callable (e.g. a model class) that builds the
            query tower.
        ItemModel: Zero-argument callable that builds the candidate tower.
        item_ds: Batched dataset of candidate feature dicts. It is mapped through
            the candidate tower to provide the candidates for the
            ``FactorizedTopK`` metrics (top 3/5/10/15/25).
    """

    # Keys of a feature batch that must not be forwarded to the query tower.
    _NON_QUERY_KEYS = ("ITEM_ID", "probability", "EVENT_VALUE")

    def __init__(self, UserModel,ItemModel, item_ds ):
        super().__init__()

        # Both towers are wrapped in Sequential so further layers (e.g. a Dense
        # projection) can be stacked on top without touching the rest of the class.
        self.query_model = tf.keras.Sequential([
            UserModel(),
        ])
        self.candidate_model = tf.keras.Sequential([
            ItemModel(),
        ])

        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                item_ds.map(self.candidate_model),
                ks=(3, 5, 10, 15, 25)),
        )

    def _split_features(self, features):
        """Split a feature batch into (query_features, item_features).

        New dicts are built instead of popping keys, so the caller's batch is
        never mutated.
        """
        item_features = {"ITEM_ID": features["ITEM_ID"]}
        query_features = {
            name: value
            for name, value in features.items()
            if name not in self._NON_QUERY_KEYS
        }
        return query_features, item_features

    def compute_loss(self, features, training= True):
        """Compute the retrieval loss for one batch.

        Args:
            features: Dict with "ITEM_ID", "probability" and the query features.
                "EVENT_VALUE", if present, is ignored (it is not used as a
                sample weight).
            training: Accepted for interface compatibility; not used.

        Returns:
            The value returned by the retrieval task for this batch.
        """
        query_features, item_features = self._split_features(features)
        query_embeddings = self.query_model(query_features)
        item_embeddings = self.candidate_model(item_features)
        return self.task(
            query_embeddings,
            item_embeddings,
            # Metrics are computed on every call, including training steps, so
            # training-time top-k values are recorded alongside validation ones.
            compute_metrics=True,
            candidate_sampling_probability=features["probability"],
        )

    def call(self, test):
        """Return (query_embeddings, item_embeddings) for a feature batch."""
        query_features, item_features = self._split_features(test)
        query_embeddings = self.query_model(query_features)
        item_embeddings = self.candidate_model(item_features)

        return query_embeddings, item_embeddings


In [None]:
# Hold out the first 10% of rows for evaluation; the remainder is used for training.
# NOTE(review): presumably train_df is still ordered newest-first at this point,
# which would make the hold-out the most recent interactions. Confirm.
n_rows = train_df.shape[0]
test_max_index = math.floor(n_rows * 0.1)
train_split_len = n_rows - test_max_index

data_set_tf = tf.data.Dataset.from_tensor_slices(train_df[req_cols].to_dict("list"))
test = data_set_tf.take(test_max_index)
train = data_set_tf.skip(test_max_index).take(train_split_len)

# Shuffle over the whole training split (buffer == split size), reshuffled each epoch.
shuffled = train.shuffle(
    buffer_size=train_split_len,
    seed=42,
    reshuffle_each_iteration=True,
)
cached_train = shuffled.batch(512).prefetch(4096)
cached_test = test.batch(512).cache()


In [None]:
# Display (total number of rows, number of rows held out for testing).
train_df.shape[0], test_max_index

In [None]:
def delete_all_check_points(fpath= 'new_amazon_check_points/*'):
    """Delete every file matched by the glob pattern ``fpath``.

    Directories matched by the pattern are left in place: ``os.remove`` cannot
    delete a directory and would raise, aborting the clean-up half-way.
    A pattern that matches nothing is a no-op.

    Args:
        fpath: Glob pattern selecting the checkpoint files to remove.
    """
    for matched_path in glob.glob(fpath):
        # Symlinks are removed as links (their targets are not touched).
        if os.path.isfile(matched_path) or os.path.islink(matched_path):
            os.remove(matched_path)
        

In [None]:
def get_call_back_fun(K):
    """Clear old checkpoints and build the training callbacks for a top-K metric.

    All three callbacks monitor ``val_factorized_top_k/top_{K}_categorical_accuracy``.
    It is an accuracy, so every callback uses ``mode='max'`` (higher is better).

    Args:
        K: The k of the monitored top-k accuracy; also used in the checkpoint name.

    Returns:
        Tuple ``(model_path, model_checkpoint_callback, early_stoping, reduce_lr)``.
    """
    # Side effect: removes previously saved checkpoint files.
    delete_all_check_points()
    model_path = f"new_amazon_check_points/best_check_point_{K}k"
    monitored_metric = f'val_factorized_top_k/top_{K}_categorical_accuracy'

    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=model_path,
        save_weights_only=True,
        monitor=monitored_metric,
        mode='max',
        save_best_only=True)

    # Was mode='min', which treats a *rising* accuracy as "no improvement" and
    # would stop training while the model is still getting better.
    early_stoping = tf.keras.callbacks.EarlyStopping(monitor=monitored_metric,
                                                     mode='max',
                                                     patience=5)

    # Direction stated explicitly instead of relying on 'auto' inference.
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor=monitored_metric,
                                                     factor=0.6,
                                                     mode='max',
                                                     patience=9,
                                                     min_lr=1e-6
    )
    return model_path, model_checkpoint_callback, early_stoping, reduce_lr

In [None]:
# Build the checkpoint path and callbacks that track top-25 validation accuracy.
checkpoint_artifacts = get_call_back_fun(25)
(model_path,
 model_checkpoint_callback,
 early_stoping,
 reduce_lr) = checkpoint_artifacts

In [None]:
# Instantiate the two-tower model and compile it with Adagrad (learning rate 0.01).
model = TRFSRetrievalModel(UserModel, ItemModel, item_ds)
optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.01)
model.compile(optimizer=optimizer)

In [None]:
# Train the model; `data` holds the returned History object.
# `workers` / `use_multiprocessing` were dropped: they only apply to generator or
# keras.utils.Sequence inputs (the input here is a tf.data.Dataset) and newer
# Keras versions no longer accept them.
data = model.fit(
    cached_train,
    validation_data=cached_test,
    epochs=1,
    verbose=1,
    callbacks=[model_checkpoint_callback, reduce_lr],
)

In [None]:
# Restore the best weights written by the checkpoint callback. `model_path` is
# the path the callback was configured with, so this stays correct if the
# monitored K (and therefore the file name) changes.
model.load_weights(model_path)

In [None]:
# Export the candidate tower as a SavedModel.
path = "model/candidate"
tf.saved_model.save(obj=model.candidate_model, export_dir=path)

In [None]:
# Export the query tower as a SavedModel.
qpath = "model/query"
tf.saved_model.save(obj=model.query_model, export_dir=qpath)

In [None]:
# Reload both exported towers from disk.
user_model = tf.saved_model.load(export_dir=qpath)
candidate_model = tf.saved_model.load(export_dir=path)

In [None]:
# Manually score the first test user against the item catalogue using the
# reloaded SavedModels.
item_test_ds = tf.data.Dataset.from_tensor_slices(item_df.to_dict("list")).batch(5000)

# Collect names and embeddings over *all* batches so that position i of
# `item_names` always corresponds to row i of `item_vectors`. The previous
# version kept the names of the last batch but scored the first batch, which
# only lined up when the catalogue fit into a single batch.
item_names = np.concatenate([batch["ITEM_ID"].numpy() for batch in item_test_ds])
item_vectors = tf.concat(list(item_test_ds.map(candidate_model)), axis=0)

query_feature_names = ['USER_ID', 'CABIN_TYPE', 'USER_RESIDENCE']
for user_batch in cached_test.take(1):
    # Keep only the inputs the query tower consumes.
    user = {name: user_batch[name] for name in query_feature_names}
    print("User keys",user.keys())
    user_vector = user_model(user)

    # Dot-product scores of the first user in the batch against every item.
    score = tf.matmul(user_vector[0:1], item_vectors, transpose_b=True)
    sorted_index = tf.argsort(score,axis=-1,direction='DESCENDING',stable=False)

    # Top-5 item ids, followed by their scores.
    print("sorted_list",tf.gather(item_names, sorted_index[0,0:5]), sorted_index.shape )
    print("shape of score",tf.gather(tf.reshape(score,[-1]), sorted_index[0,0:5]) )

In [None]:
# Brute-force retrieval layer: takes raw query features, runs them through the
# query tower and returns the top 10 candidates out of the indexed item set.
index = tfrs.layers.factorized_top_k.BruteForce(model.query_model, k=10)

# Index every item as an (identifier, embedding) pair.
item_ids_ds = item_test_ds.map(lambda x: x["ITEM_ID"])
item_embeddings_ds = item_test_ds.map(model.candidate_model)
index.index_from_dataset(tf.data.Dataset.zip((item_ids_ds, item_embeddings_ds)))

In [None]:
# Print the first test interaction and what the index returns for it.
for sample in test.batch(1).take(1):
    print(sample)
    print(index(sample))

In [None]:
# Evaluate on the held-out batches; `result_summary` holds the returned metric values.
result_summary = model.evaluate(x=cached_test)

In [None]:
# Label the first five evaluation values with their k.
# NOTE(review): assumes model.evaluate lists the top-k metrics first, in this order. Confirm.
top_k_values = [3, 5, 10, 15, 25]
{
    top_k: result_summary[position]
    for position, top_k in enumerate(top_k_values)
}

In [None]:
# Display the per-epoch metric values recorded by model.fit.
data.history

In [None]:
# One-shot iterator over the training-side top-k metric names (history keys that
# start with "factorized_top", i.e. without the "val_" prefix). Once consumed it
# is empty; re-run this cell to rebuild it.
keys = iter([key for key in data.history.keys() if key.startswith("factorized_top")])

In [None]:
# Plot train vs. validation curves, one figure per top-k metric still in `keys`.
for key in keys:
    train_curve = data.history[key]
    # Take the x axis from the recorded history length. A fixed 200-epoch axis
    # fails with a shape mismatch whenever training ran a different number of epochs.
    epochs_axis = range(1, len(train_curve) + 1)

    # A fresh figure per metric keeps curves of different metrics off the same axes.
    fig, ax = plt.subplots()
    ax.plot(epochs_axis, train_curve, "-b", label="Train")
    ax.plot(epochs_axis, data.history["val_"+key], "-r", label="Test")
    ax.legend(loc="upper left")
    ax.set_title(key.split("/")[1])
    ax.set_xlabel("Epoch")

    