## Imports

In [1]:
# Colab Imports
# !pip install tensorflow_recommenders

In [2]:
import pprint
import tensorflow as tf
import pandas as pd

import numpy as np
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs
import tempfile
import pathlib
from typing import Dict, Text

## Dataset Pre-processing

### Old Code

In [3]:
# # load the metadata
# data = []
# with gzip.open("datasets\Arts_Crafts_and_Sewing_5.json.gz") as f:
#     for l in f:
#         data.append(json.loads(l.strip()))

# # total length of list, this number equals total number of products
# print(len(data))

# # first row of the list
# print(data[0])

# # convert list into pandas dataframe
# df = pd.DataFrame.from_dict(data)
# # print(len(df))

# # remove rows with NaN values
# df3 = df.dropna()

# reviewTimeTest = df3.sort_values(by=['reviewTime'], key=pd.to_datetime)
# reviewTimeTest

# # function to smoothen the distribution of user preference
# def smooth_user_preference(x):
#     return math.log(1+x, 2)

# # counting the total amount of user interactions
# users_interactions_count = df3.groupby(['reviewerID', 'asin']).size().groupby('reviewerID').size()
# print('# users: %d' % len(users_interactions_count))

# #  counting the amount of users with at least 5 interactions
# users_with_enough_interactions = users_interactions_count[users_interactions_count >= 5].reset_index()[['reviewerID']]
# print('# users with at least 5 interactions: %d' % len(users_with_enough_interactions))


# print('# of interactions: %d' % len(df3))

# # counting the amount of interactions from users with at least 5 interactions
# interactions_from_selected_users = df3.merge(users_with_enough_interactions, 
#                how = 'right',
#                left_on = 'reviewerID',
#                right_on = 'reviewerID')
# print('# of interactions from users with at least 5 interactions: %d' % len(interactions_from_selected_users))

# # counting the amount of unique user-to-item interactions
# interactions_full = interactions_from_selected_users \
#                     .groupby(['reviewerID', 'asin'])['overall'].sum() \
#                     .apply(smooth_user_preference).reset_index()
# print('# of unique user/item interactions: %d' % len(interactions_full))
# interactions_full


### Import Dataset

In [4]:
# # Toys Section Importing From amazon_us_reviews dataset
# tools = (tfds.load("amazon_us_reviews/Personal_Care_Appliances_v1_00", split='train')
#             # Cache for efficiency.
#             .cache(tempfile.NamedTemporaryFile().name))

test_ds = tfds.load('amazon_us_reviews/Personal_Care_Appliances_v1_00', split=[
    f'train[{k}%:{k+10}%]' for k in range(0, 100, 10)
])
train_ds = tfds.load('amazon_us_reviews/Personal_Care_Appliances_v1_00', split=[
    f'train[:{k}%]+train[{k+10}%:]' for k in range(0, 100, 10)
])

In [5]:
# print(train_ds)
# dataset = tf.data.Dataset.from_tensor_slices(train_ds).prefetch(tf.data.AUTOTUNE)

# # for x in dataset:
# unique_train_df = pd.DataFrame(dataset.customer_id.unique(), columns=['customer_id'])

In [6]:
# for y in dataset:
#   for x in y.take(1).as_numpy_iterator():
#     pprint.pprint(x)

### Data Mapping

In [7]:
# tools = tools.map(lambda x: {
#     "customer_id": x["customer_id"],
#     "product_title": x["product_title"],
#     "product_id": x["product_id"],
# })

tools1 = toolsProcessed = list()

for k in range(10):
  tools1.append(train_ds[k].map(lambda x: x["data"]))
  toolsProcessed[k] = tools1[k].map(lambda x: {
      "customer_id": x["customer_id"],
      "product_title": x["product_title"],
      "star_rating": x["star_rating"]
  })
  tf.random.set_seed(1)
  shuffledTools = toolsProcessed[k].shuffle(10_000, seed=1, reshuffle_each_iteration=False)

  # Determine Unique Customer and Product ID
  customerID = (toolsProcessed[k]
              # Retain only the fields we need.
              .map(lambda x: x["customer_id"])
              )
  product = (toolsProcessed[k]
            .map(lambda x: x["product_title"])
            )

  uniqueCustomerID = np.unique(np.concatenate(list(customerID.batch(1_000))))
  uniqueProduct = np.unique(np.concatenate(list(product.batch(1_000))))

### Split into Training and Testing Sets using Cross Validation

In [8]:
trainNum = 8_000
testNum = 2_000

train = shuffledTools.take(trainNum)

test = shuffledTools.skip(trainNum).take(testNum)

### Model

In [9]:
class Model(tfrs.Model):
  def __init__(self, rating_weight: float, retrieval_weight: float) -> None:
    super().__init__()
    embeddingDim = 32

    ## Model that represents customers with Matrix Factorization
    self.customer_model = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=uniqueCustomerID, mask_token=None),
      # Embedding for unknown tokens
      tf.keras.layers.Embedding(len(uniqueCustomerID) + 1, embeddingDim)
    ])

    ## Model that represents products
    self.product_model = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=uniqueProduct, mask_token=None),
      # Embedding for unknown tokens
      tf.keras.layers.Embedding(len(uniqueProduct) + 1, embeddingDim)
    ])

    # RELU-based DNN
    self.rating_model = tf.keras.Sequential([
      tf.keras.layers.Dense(256, activation="relu"),
      tf.keras.layers.Dense(128, activation="relu"),
      tf.keras.layers.Dense(64, activation="relu"),
      tf.keras.layers.Dense(1, activation="relu"),
    ])

    # Loss function used to train the models using the Factorized Top-k Method for Retrieval
    self.retrieval_task = tfrs.tasks.Retrieval(
      metrics=tfrs.metrics.FactorizedTopK(
        candidates=product.batch(128).cache().map(self.product_model)
      )
    )

    # Loss function for rating
    self.rating_task = tfrs.tasks.Ranking(
        loss = tf.keras.losses.MeanAbsoluteError(),
        metrics = [tf.keras.metrics.RootMeanSquaredError()],
    )

    # The loss weights
    self.rating_weight = rating_weight
    self.retrieval_weight = retrieval_weight

  def call(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:

      # We pick out the user features and pass them into the user model.
      customer_embeddings = self.customer_model(features["customer_id"])
        
      # And pick out the item features and pass them into the item model.
      product_embeddings = self.product_model(features["product_title"])

      return (
          customer_embeddings,
          product_embeddings,
          # We apply the multi-layered rating model to a concatentation of
          # user and item embeddings.
          self.rating_model(
              tf.concat([customer_embeddings, product_embeddings], axis=1)
          ),
      )

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    ## ratings go here as a method to compute loss
    ratings = features.pop("star_rating")
      
    user_embeddings, item_embeddings, rating_predictions = self(features)
      
    # We compute the loss for each task.
    rating_loss = self.rating_task(
      labels=ratings,
      predictions=rating_predictions,
    )
    
    retrieval_loss = self.retrieval_task(user_embeddings, item_embeddings)

    # And combine them using the loss weights.
    return (self.rating_weight * rating_loss
            + self.retrieval_weight * retrieval_loss)

In [10]:
learningRate = 0.1

earlystopping = tf.keras.callbacks.EarlyStopping(monitor ="loss", 
                                        mode ="min", patience = 5, 
                                        restore_best_weights = True)

model = Model(retrieval_weight=0.5, rating_weight=0.5)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=learningRate), loss="root_mean_squared_error")

### Model Fitting and Evaluation

In [11]:
cachedTrain = train.shuffle(10_000).batch(8192).cache()
cachedTest = test.batch(4096).cache()

# model.fit(cachedTrain, epochs=10, callbacks=[earlystopping])
model.fit(cachedTrain, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1a9569f01f0>

In [12]:
metrics = model.evaluate(cachedTest, return_dict=True)

print(f"Retrieval top-100 accuracy: {metrics['factorized_top_k/top_100_categorical_accuracy']:.3f}.")
print(f"Ranking RMSE: {metrics['root_mean_squared_error']:.3f}.")

Retrieval top-100 accuracy: 0.004.
Ranking RMSE: 1.887.


### Convert into Tensorflow Lite

In [13]:
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.target_spec.supported_ops = [
  tf.lite.OpsSet.TFLITE_BUILTINS, # enable TensorFlow Lite ops.
  tf.lite.OpsSet.SELECT_TF_OPS # enable TensorFlow ops.
]

converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.target_spec.supported_types = [tf.float16]

tflite_model_dir = pathlib.Path("/tflite_models/")
tflite_model_dir.mkdir(exist_ok = True, parents = True)

tflite_f16_model = converter.convert()
tflite_f16_file = tflite_model_dir/"recommender_f16.tflite"
tflite_f16_file.write_bytes(tflite_f16_model)

FileNotFoundError: [WinError 2] The system cannot find the file specified: '\\tflite_models'

ls: cannot access '{tflite_models_dir}': No such file or directory


### Retrieving Top-K Candidates

In [None]:
# Dummy values created to simulate larger dataset
uniqueProduct = tf.data.Dataset.from_tensor_slices(uniqueProduct)

toolsWithDummy = tf.data.Dataset.concatenate(
    uniqueProduct.batch(4096),
    uniqueProduct.batch(4096).repeat(1_000).map(lambda x: tf.zeros_like(x))
)

toolsWithDummyEmb = tf.data.Dataset.concatenate(
    uniqueProduct.batch(4096).map(model.product_model),
    uniqueProduct.batch(4096).repeat(1_000)
      .map(lambda x: model.product_model(x))
      .map(lambda x: x * tf.random.uniform(tf.shape(x)))
)

In [None]:
brute_force = tfrs.layers.factorized_top_k.BruteForce(model.customer_model)
brute_force.index_from_dataset(
    uniqueProduct.batch(100).map(lambda prod: (prod, model.product_model(prod)))
)

In [None]:
# Get predictions for user.
id_input = input("Enter the customer ID: ")
_, titles = brute_force(np.array([str(id_input)]), k=3)

print(f"Top recommendations: {titles[0]}")

## Print Out Graph

In [None]:
# plt.figure(figsize=(12, 4))
# plt.barh(corr_similar_count['asin'].head(10),
#          abs(corr_similar_count['Correlation'].head(10)), 
#          align='center',
#          color='red')
# plt.xlabel("Popularity")
# plt.title("Top 10 Popular Movies")
# plt.gca().invert_yaxis()