In [None]:
!pip install tensorflow==2.15.1
!pip install tensorflow-recommenders==0.7.3

# Summary

The purpose of this notebook is to build more complex forms of retrieval and ranking models that I've built previously on Amazon Reviews dataset. The main difference is that this time the summary texts are included in the models too.

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs

In [3]:
!wget https://raw.githubusercontent.com/imsreecharan/datasets_/refs/heads/master/amazon_reviews.csv

--2025-01-03 12:45:43--  https://raw.githubusercontent.com/imsreecharan/datasets_/refs/heads/master/amazon_reviews.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 903820 (883K) [text/plain]
Saving to: ‘amazon_reviews.csv.2’


2025-01-03 12:45:43 (131 MB/s) - ‘amazon_reviews.csv.2’ saved [903820/903820]



In [4]:
reviews = pd.read_csv('amazon_reviews.csv',index_col = 0).rename(columns = {'asin': 'product_id'})[["product_id","overall","reviewerID","summary"]]
reviews.head()

Unnamed: 0,product_id,overall,reviewerID,summary
0,528881469,5.0,AO94DHGC771SJ,Gotta have GPS!
1,528881469,1.0,AMO214LNFCEI4,Very Disappointed
2,528881469,3.0,A3N7T0DY83Y4IG,1st impression
3,528881469,2.0,A1H8PY3QHMQQA0,"Great grafics, POOR GPS"
4,528881469,1.0,A24EV6RXELQZ63,"Major issues, only excuses for support"


In [5]:
unique_product_id = np.unique(reviews["product_id"])
unique_reviewerID = np.unique(reviews["reviewerID"])

In [6]:
retrieval_dataset = tf.data.Dataset.from_tensor_slices({
    "reviewerID": reviews["reviewerID"].values,
    "product_id": reviews["product_id"].values,
    "summary": reviews["summary"].values
})

candidate_dataset = tf.data.Dataset.from_tensor_slices({"product_id": unique_product_id})

shuffled = retrieval_dataset.shuffle(len(retrieval_dataset), seed=42, reshuffle_each_iteration=False)
train_retrieval = shuffled.take(int(len(retrieval_dataset) * 0.8)).batch(32)
test_retrieval = shuffled.skip(int(len(retrieval_dataset) * 0.8)).batch(32)

# 1. Product Model

In [7]:
product_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(vocabulary = unique_product_id),
    tf.keras.layers.Embedding(len(unique_product_id) + 1, 32)
])

print(product_model(reviews["product_id"][0]))
print(tf.expand_dims(product_model(reviews["product_id"][0]),axis = 0))

tf.Tensor(
[-0.04181057  0.00613198 -0.01580423  0.0428285   0.00778408  0.0392483
 -0.01950583 -0.01486744  0.02464231  0.03471922 -0.03536431  0.02868606
 -0.00068168  0.03673505 -0.04824916  0.01919064  0.00385328  0.03569541
 -0.04817854 -0.02764463  0.03993749  0.03238017  0.01970936 -0.02609757
  0.01877946  0.03225947  0.04883577  0.02146194 -0.03961407  0.02023555
 -0.04718411  0.02892954], shape=(32,), dtype=float32)
tf.Tensor(
[[-0.04181057  0.00613198 -0.01580423  0.0428285   0.00778408  0.0392483
  -0.01950583 -0.01486744  0.02464231  0.03471922 -0.03536431  0.02868606
  -0.00068168  0.03673505 -0.04824916  0.01919064  0.00385328  0.03569541
  -0.04817854 -0.02764463  0.03993749  0.03238017  0.01970936 -0.02609757
   0.01877946  0.03225947  0.04883577  0.02146194 -0.03961407  0.02023555
  -0.04718411  0.02892954]], shape=(1, 32), dtype=float32)


# 2. Summary Model (Text Vectorization + Embedding + GlobalAverage1D)

In [8]:
text_vectorizer = tf.keras.layers.TextVectorization(max_tokens = 10000)
text_vectorizer.adapt(reviews["summary"].values)
text_vectorizer(reviews["summary"][0])

<tf.Tensor: shape=(3,), dtype=int64, numpy=array([869,  69, 384])>

In [9]:
summary_model = tf.keras.Sequential([text_vectorizer,
                                     tf.keras.layers.Embedding(10000, 32,mask_zero = True),
                                     tf.keras.layers.GlobalAveragePooling1D()])
summary_model(tf.constant([reviews["summary"][0]]))

<tf.Tensor: shape=(1, 32), dtype=float32, numpy=
array([[ 0.00030636,  0.01723851, -0.01490012, -0.00807221, -0.00255157,
        -0.01997457, -0.00225286, -0.00447299, -0.00549032,  0.02629794,
        -0.03398538, -0.00827839, -0.00120654, -0.0111842 , -0.01154557,
        -0.01757006,  0.01046104, -0.00840172,  0.00088971,  0.00361286,
         0.02737132, -0.00062442,  0.00638324, -0.00867405,  0.01707853,
         0.02028238, -0.00175454,  0.0218857 , -0.00444392,  0.03047863,
         0.01414151, -0.00941967]], dtype=float32)>

# 3. User Model

In [10]:
user_model = tf.keras.Sequential([tf.keras.layers.StringLookup(vocabulary = unique_reviewerID),
                                  tf.keras.layers.Embedding(len(unique_reviewerID) + 1, 32)])

print(user_model(reviews["reviewerID"][0]))
print(tf.expand_dims(user_model(reviews["reviewerID"][0]),axis = 0))

tf.Tensor(
[-0.02986693 -0.02979692 -0.01532043  0.04924932 -0.02070814 -0.01078987
  0.00057954 -0.03435329 -0.03419303 -0.04441765 -0.03568858 -0.00528691
 -0.00776166  0.00930915  0.04178662  0.02857419  0.01260522  0.00234913
 -0.01912628 -0.0217849  -0.00707304 -0.00443298 -0.03219974 -0.0386531
 -0.01271708  0.01447986  0.00828339  0.011569   -0.02881675 -0.02913202
  0.03719957  0.01463187], shape=(32,), dtype=float32)
tf.Tensor(
[[-0.02986693 -0.02979692 -0.01532043  0.04924932 -0.02070814 -0.01078987
   0.00057954 -0.03435329 -0.03419303 -0.04441765 -0.03568858 -0.00528691
  -0.00776166  0.00930915  0.04178662  0.02857419  0.01260522  0.00234913
  -0.01912628 -0.0217849  -0.00707304 -0.00443298 -0.03219974 -0.0386531
  -0.01271708  0.01447986  0.00828339  0.011569   -0.02881675 -0.02913202
   0.03719957  0.01463187]], shape=(1, 32), dtype=float32)


# 4. Stacking the Models in a Class Object

In [11]:
class UserModel(tf.keras.Model):
  def __init__(self):
    super().__init__()

    self.user_model = tf.keras.Sequential([tf.keras.layers.StringLookup(vocabulary = unique_reviewerID),
                                           tf.keras.layers.Embedding(len(unique_reviewerID) + 1, 32)])

    self.text_vectorizer = tf.keras.layers.TextVectorization(max_tokens = 10000)
    self.text_vectorizer.adapt(reviews["summary"].values)

    self.summary_model = tf.keras.Sequential([self.text_vectorizer,
                                              tf.keras.layers.Embedding(10000, 32,mask_zero = True),
                                              tf.keras.layers.GlobalAveragePooling1D()])

  def call(self,inputs):
    return tf.concat([self.user_model(inputs["reviewerID"]),
                     self.summary_model(inputs["summary"])],axis = 1)





In [12]:
UserModel()({"reviewerID": tf.constant([reviews["reviewerID"][0]]),"summary": tf.constant([reviews["summary"][0]])})

<tf.Tensor: shape=(1, 64), dtype=float32, numpy=
array([[-0.03941888,  0.00555389,  0.04971308, -0.01112105,  0.01875227,
         0.03149897,  0.04983838,  0.0415214 ,  0.02211056,  0.01191982,
         0.04536425,  0.03971851,  0.01094618,  0.04215712,  0.03751919,
         0.01000056, -0.02822317, -0.04830662,  0.01903229,  0.01250613,
         0.03184104, -0.01903722,  0.0473478 ,  0.02164492,  0.02620583,
        -0.01028949, -0.00583061, -0.0183398 ,  0.04627231, -0.02192235,
         0.02941177,  0.04654131, -0.02743609, -0.02914637, -0.0074079 ,
        -0.01874513,  0.02003949, -0.00562538, -0.01637788, -0.00098066,
         0.04014473,  0.00038821,  0.01450654, -0.00279236, -0.0319703 ,
         0.01541704,  0.0048491 , -0.00965881,  0.02179919,  0.0100096 ,
        -0.0052073 , -0.00259396,  0.01986673,  0.00423572, -0.00790298,
        -0.00715874, -0.00500179, -0.01505018,  0.02665781, -0.01729001,
         0.00890473,  0.02580987,  0.00678342,  0.00126406]],
      dtype=f

# 5. Retrieval Model

In [13]:
candidate_dataset = tf.data.Dataset.from_tensor_slices(unique_product_id).batch(32)
metrics = tfrs.metrics.FactorizedTopK(candidates = candidate_dataset.map(lambda x: (x, product_model(x))))
task = tfrs.tasks.Retrieval(metrics = metrics)

In [14]:
class Retrieval(tfrs.Model):
  def __init__(self,user_model,product_model):
    super().__init__()
    self.user_model = user_model
    self.product_model = product_model
    self.task = task

  def compute_loss(self,inputs,training = False):
    user_embeddings_and_summary = self.user_model({"reviewerID": inputs["reviewerID"],"summary": inputs["summary"]})
    positive_embeddings = self.product_model(inputs["product_id"])
    return self.task(user_embeddings_and_summary,positive_embeddings)

#to reduce dimensions of the UserModel (1,64) to (1,32)
user_model_retrieval = tf.keras.Sequential([UserModel(),
                                  tf.keras.layers.Dense(32)])

retrieval_model = Retrieval(user_model_retrieval,product_model)
retrieval_model.compile(optimizer = tf.keras.optimizers.Adagrad(0.1))

In [15]:
retrieval_model.fit(train_retrieval, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7fd6245c7850>

In [16]:
retrieval_model.evaluate(test_retrieval,return_dict = True)



{'factorized_top_k/top_1_categorical_accuracy': 0.05000000074505806,
 'factorized_top_k/top_5_categorical_accuracy': 0.20999999344348907,
 'factorized_top_k/top_10_categorical_accuracy': 0.2800000011920929,
 'factorized_top_k/top_50_categorical_accuracy': 1.0,
 'factorized_top_k/top_100_categorical_accuracy': 1.0,
 'loss': 18.4489688873291,
 'regularization_loss': 0,
 'total_loss': 18.4489688873291}

# 6. Ranking Model

In [17]:
ranking_dataset = tf.data.Dataset.from_tensor_slices({
    "reviewerID": reviews["reviewerID"].values,
    "product_id": reviews["product_id"].values,
    "summary": reviews["summary"].values,
    "overall": reviews["overall"].values
})

for i in ranking_dataset.take(1).as_numpy_iterator():
  print(i)

{'reviewerID': b'AO94DHGC771SJ', 'product_id': b'0528881469', 'summary': b'Gotta have GPS!', 'overall': 5.0}


In [18]:
ranking_dataset = ranking_dataset.shuffle(len(ranking_dataset),seed = 42)
train = ranking_dataset.take(int(len(ranking_dataset) * 0.8))
test = ranking_dataset.skip(int(len(ranking_dataset) * 0.8)).take(int(len(ranking_dataset) * 0.2))

In [19]:
class RankingModel(tf.keras.Model):
  def __init__(self):
    super().__init__()

    self.product_id_embedding = tf.keras.Sequential([
        tf.keras.layers.StringLookup(vocabulary = unique_product_id),
        tf.keras.layers.Embedding(len(unique_product_id) + 1, 32)
    ]) #output = (1,32)

    self.user_id_summary_embedding = tf.keras.Sequential([UserModel(),
                                                        tf.keras.layers.Dense(32)]) #output = (1,32)

    self.ranking_model = tf.keras.Sequential([tf.keras.layers.Dense(32,activation = "relu"),
                                              tf.keras.layers.Dense(16,activation = "relu"),
                                              tf.keras.layers.Dense(1)]) #output = (1,1)

  def call(self,inputs):
    return self.ranking_model(tf.concat([
                      self.product_id_embedding(inputs["product_id"]),
                      self.user_id_summary_embedding({"reviewerID": inputs["reviewerID"],"summary": inputs["summary"]})],axis = 1))

In [20]:
print(RankingModel()({"reviewerID": tf.constant([reviews["reviewerID"][0]]),
               "summary": tf.constant([reviews["summary"][0]]),
               "product_id": tf.constant([reviews["product_id"][0]])})) # output = (1,1)

print(tf.squeeze(RankingModel()({"reviewerID": tf.constant([reviews["reviewerID"][0]]),
               "summary": tf.constant([reviews["summary"][0]]),
               "product_id": tf.constant([reviews["product_id"][0]])}))) # output = ()

tf.Tensor([[-0.07688543]], shape=(1, 1), dtype=float32)
tf.Tensor(0.028314583, shape=(), dtype=float32)


In [21]:
class AmazonRankModel(tfrs.models.Model):
  def __init__(self):
    super().__init__()

    self.ranking_model = RankingModel()
    self.task = tfrs.tasks.Ranking(loss = tf.keras.losses.MeanSquaredError(),
                                   metrics = [tf.keras.metrics.RootMeanSquaredError()])

  def call(self,inputs):
    return tf.squeeze(self.ranking_model(inputs)) #output = ()

  def compute_loss(self,inputs,training = False):
    labels = inputs.pop("overall") #output = ()
    scores = self(inputs) #output = ()
    return self.task(labels = labels,predictions = scores)

In [22]:
rank_model = AmazonRankModel()
rank_model.compile(optimizer = tf.keras.optimizers.Adagrad(0.1))

In [23]:
rank_model.fit(train.batch(32).cache(),epochs = 3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7fd6241f78e0>

In [24]:
rank_model.evaluate(test.batch(32).cache(),return_dict = True)



{'root_mean_squared_error': 0.6402822136878967,
 'loss': 0.06442855298519135,
 'regularization_loss': 0,
 'total_loss': 0.06442855298519135}

In [25]:
rank_model({"reviewerID": tf.constant([reviews["reviewerID"][0]]),
               "summary": tf.constant([reviews["summary"][0]]),
               "product_id": tf.constant([reviews["product_id"][0]])})

<tf.Tensor: shape=(), dtype=float32, numpy=4.667259>

In [26]:
rank_model({"reviewerID": tf.constant([reviews["reviewerID"][1]]),
               "summary": tf.constant([reviews["summary"][1]]),
               "product_id": tf.constant([reviews["product_id"][1]])})

<tf.Tensor: shape=(), dtype=float32, numpy=1.1716633>