In [None]:
!pip install tensorflow==2.15.1
!pip install tensorflow-recommenders==0.7.3

# Summary

Previously, I have built a couple of recommendation systems on Amazon Reviews dataset. In each time, I made the system more complex and comprehensive. This time I created a multitask recommendation system. This system, optimizes the retrieval and the ranking objectives at the same time.

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
import matplotlib.pyplot as plt

tf.random.set_seed(42)
np.random.seed(42)
plt.style.use("ggplot")

In [3]:
!wget https://raw.githubusercontent.com/imsreecharan/datasets_/refs/heads/master/amazon_reviews.csv

--2025-01-06 06:31:29--  https://raw.githubusercontent.com/imsreecharan/datasets_/refs/heads/master/amazon_reviews.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 903820 (883K) [text/plain]
Saving to: ‘amazon_reviews.csv.1’


2025-01-06 06:31:30 (19.9 MB/s) - ‘amazon_reviews.csv.1’ saved [903820/903820]



In [4]:
reviews = pd.read_csv('amazon_reviews.csv',index_col = 0).rename(columns = {'asin': 'product_id'})[["product_id","overall","reviewerID","summary"]]
reviews.head()

Unnamed: 0,product_id,overall,reviewerID,summary
0,528881469,5.0,AO94DHGC771SJ,Gotta have GPS!
1,528881469,1.0,AMO214LNFCEI4,Very Disappointed
2,528881469,3.0,A3N7T0DY83Y4IG,1st impression
3,528881469,2.0,A1H8PY3QHMQQA0,"Great grafics, POOR GPS"
4,528881469,1.0,A24EV6RXELQZ63,"Major issues, only excuses for support"


In [5]:
unique_product_id = np.unique(reviews["product_id"])
unique_reviewerID = np.unique(reviews["reviewerID"])

In [6]:
dataset = tf.data.Dataset.from_tensor_slices({
    "reviewerID": reviews["reviewerID"].values,
    "product_id": reviews["product_id"].values,
    "summary": reviews["summary"].values,
    "overall": reviews["overall"].values
})

candidate_dataset = tf.data.Dataset.from_tensor_slices({"product_id": unique_product_id})

shuffled = dataset.shuffle(len(dataset), seed=42, reshuffle_each_iteration=False)
train = shuffled.take(int(len(dataset) * 0.8)).batch(32)
test = shuffled.skip(int(len(dataset) * 0.8)).batch(32)

In [7]:
class UserModelFFN(tf.keras.Model):
  def __init__(self,layer_sizes):
    super().__init__()

    self.user_model = tf.keras.Sequential([tf.keras.layers.StringLookup(vocabulary = unique_reviewerID),
                                           tf.keras.layers.Embedding(len(unique_reviewerID) +1,32)])

    self.text_vectorizer = tf.keras.layers.TextVectorization(max_tokens = 2000)
    self.text_vectorizer.adapt(reviews["summary"].values)
    self.summary_model = tf.keras.Sequential([self.text_vectorizer,
                                             tf.keras.layers.Embedding(self.text_vectorizer.vocabulary_size(),32,mask_zero = True),
                                             tf.keras.layers.GlobalAveragePooling1D()])

    self.combined_model = tf.keras.Sequential()
    for layer_size in layer_sizes[:-1]:
      self.combined_model.add(tf.keras.layers.Dense(layer_size,activation = "relu"))

    self.combined_model.add(tf.keras.layers.Dense(layer_sizes[-1]))

  def call(self,features):
    reviewerID = features["reviewerID"]
    summary = features["summary"]
    user_embeddings = self.user_model(reviewerID)
    summary_embeddings = self.summary_model(summary)
    user_summary_combined = self.combined_model(tf.concat([user_embeddings,summary_embeddings],axis = 1))
    return user_summary_combined

In [8]:
class ProductModel(tf.keras.Model):
  def __init__(self):
    super().__init__()
    self.product_model = tf.keras.Sequential([tf.keras.layers.StringLookup(vocabulary = unique_product_id),
                                             tf.keras.layers.Embedding(len(unique_product_id) +1,32)])

  def call(self,features):
    product_id = features["product_id"]
    product_embeddings = self.product_model(product_id)
    return product_embeddings

In [9]:
class MultiTaskModel(tfrs.models.Model):
  def __init__(self,rating_weight,retrieval_weight,layer_sizes):
    super().__init__()

    self.rating_weight = rating_weight
    self.retrieval_weight = retrieval_weight

    self.user_summary_model = UserModelFFN(layer_sizes)
    self.product_model = ProductModel()

    self.rating_task = tfrs.tasks.Ranking(loss = tf.keras.losses.MeanSquaredError(),
                                        metrics = [tf.keras.metrics.RootMeanSquaredError()])
    self.retrieval_task = tfrs.tasks.Retrieval(metrics = tfrs.metrics.FactorizedTopK(candidates = candidate_dataset.batch(32).map(self.product_model)))

    self.rating_model = tf.keras.Sequential([tf.keras.layers.Dense(32,activation = "relu"),
                                            tf.keras.layers.Dense(16,activation = "relu"),
                                            tf.keras.layers.Dense(1)])

  def call(self,features):
    user_summary_embeddings = self.user_summary_model(features)
    product_embeddings = self.product_model(features)
    rating_predictions = self.rating_model(tf.concat([user_summary_embeddings,product_embeddings],axis = 1))
    return user_summary_embeddings,product_embeddings,rating_predictions

  def compute_loss(self,features,training = False):
    user_summary_embeddings,product_embeddings,rating_predictions = self(features)
    rating_loss = self.rating_task(labels = features["overall"],predictions = rating_predictions)
    retrieval_loss = self.retrieval_task(user_summary_embeddings,product_embeddings)
    return (self.rating_weight * rating_loss) + (self.retrieval_weight * retrieval_loss)


In [10]:
model = MultiTaskModel(rating_weight = 0.5,retrieval_weight = 0.5,layer_sizes = [32])
model.compile(optimizer = tf.keras.optimizers.Adagrad(0.1))

In [11]:
model.fit(train,epochs = 3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7f4ff6d95f60>

In [12]:
model.evaluate(test,return_dict = True)



{'root_mean_squared_error': 1.247291088104248,
 'factorized_top_k/top_1_categorical_accuracy': 0.08500000089406967,
 'factorized_top_k/top_5_categorical_accuracy': 0.2800000011920929,
 'factorized_top_k/top_10_categorical_accuracy': 0.41999998688697815,
 'factorized_top_k/top_50_categorical_accuracy': 1.0,
 'factorized_top_k/top_100_categorical_accuracy': 1.0,
 'loss': 9.581385612487793,
 'regularization_loss': 0,
 'total_loss': 9.581385612487793}