In [None]:
!pip install tensorflow==2.15.1
!pip install tensorflow-recommenders==0.7.3

# Summary

The main purpose of this notebook is to apply retrieval and ranking models on Amazon Reviews dataset. The retrieval model filters out the redundant products given the user. The ranking model orders the best possible products that have been selected by the retrieval model considering other features (like product rankings in this case).


## Models

**Retrieval Model**

UserId ----> StringLookup (equvalent to the OrdinalEncoder) ----> Embedding (1,16)  => User Embedding $u_v$ (1,16)

ProductId ----> StringLookup ----> Embedding (1,16)  => Product Embedding $u_i$ (1,16)

During the learning process, depending on the previous interactions $u_i$ and $u_v$ come closer or further from each other. At the end of the learning process, if user $i$ and product $v$ are highly correlated, the degree between each vector decreases hence the dot product ($u_i . u_v$) increases. 

Retrieval model objective:

max Top-K Categorical Accuracy

**Ranking Model**

Concatenate($u_v$, $u_i$) = ProductUser Embedding ($u_m$) (1,32)

FFN($u_m$) ----> Rating Prediction

Ranking model objective:

min RMSE(Rating Prediction, Rating)

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs

In [3]:
!wget https://raw.githubusercontent.com/imsreecharan/datasets_/refs/heads/master/amazon_reviews.csv

--2025-01-02 08:59:48--  https://raw.githubusercontent.com/imsreecharan/datasets_/refs/heads/master/amazon_reviews.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 903820 (883K) [text/plain]
Saving to: ‘amazon_reviews.csv.3’


2025-01-02 08:59:48 (18.7 MB/s) - ‘amazon_reviews.csv.3’ saved [903820/903820]



In [4]:
reviews = pd.read_csv('amazon_reviews.csv',index_col = 0).rename(columns = {'asin': 'product_id'})[["product_id","overall","reviewerID"]]
reviews.head()

Unnamed: 0,product_id,overall,reviewerID
0,528881469,5.0,AO94DHGC771SJ
1,528881469,1.0,AMO214LNFCEI4
2,528881469,3.0,A3N7T0DY83Y4IG
3,528881469,2.0,A1H8PY3QHMQQA0
4,528881469,1.0,A24EV6RXELQZ63


# 1. Retrieval Model

In [5]:
unique_product_id = np.unique(reviews.product_id.copy())
unique_reviewerID = np.unique(reviews.reviewerID.copy())

In [6]:
retrieval_dataset = tf.data.Dataset.from_tensor_slices({
    "reviewerID": reviews["reviewerID"].values,
    "product_id": reviews["product_id"].values,
})
candidate_dataset = tf.data.Dataset.from_tensor_slices(unique_product_id).batch(32)

shuffled = retrieval_dataset.shuffle(len(retrieval_dataset), seed=42, reshuffle_each_iteration=False)
train_retrieval = shuffled.take(int(len(retrieval_dataset) * 0.8)).batch(32)
test_retrieval = shuffled.skip(int(len(retrieval_dataset) * 0.8)).batch(32)

In [7]:
product_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(vocabulary=unique_product_id, mask_token=None),
    tf.keras.layers.Embedding(len(unique_product_id) + 1, output_dim=16)
])

user_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(vocabulary=unique_reviewerID, mask_token=None),
    tf.keras.layers.Embedding(len(unique_reviewerID) + 1, output_dim=16)
])

In [8]:
#feed products and their embeddings to the metric
metrics = tfrs.metrics.FactorizedTopK(
    candidates=candidate_dataset.map(lambda x: (x, product_model(x)))
)

#define task
task = tfrs.tasks.Retrieval(metrics=metrics)

class Retrieval(tfrs.Model):
    def __init__(self, user_model, product_model):
        super().__init__()
        self.user_model = user_model
        self.product_model = product_model
        self.task = task

    def compute_loss(self, features, training=False): #No dropout, BatchNormalization or activation function only train for embeddings
        user_embeddings = self.user_model(features["reviewerID"])
        positive_embeddings = self.product_model(features["product_id"])
        return self.task(user_embeddings, positive_embeddings)

#define the model
retrieval_model = Retrieval(user_model, product_model)
retrieval_model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

#train the model
retrieval_model.fit(train_retrieval, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7b308823d6f0>

In [9]:
retrieval_model.evaluate(test_retrieval,return_dict = True)



{'factorized_top_k/top_1_categorical_accuracy': 0.07500000298023224,
 'factorized_top_k/top_5_categorical_accuracy': 0.2800000011920929,
 'factorized_top_k/top_10_categorical_accuracy': 0.38499999046325684,
 'factorized_top_k/top_50_categorical_accuracy': 1.0,
 'factorized_top_k/top_100_categorical_accuracy': 1.0,
 'loss': 16.507652282714844,
 'regularization_loss': 0,
 'total_loss': 16.507652282714844}

In [10]:
#brute force model computes **all** of the user-product vector similarities given the metric (FactorizedTopK) in this case
index = tfrs.layers.factorized_top_k.BruteForce(retrieval_model.user_model)

#add candidate products and their embeddings to the brute force
index.index_from_dataset(
    candidate_dataset.map(
        lambda x: (x, product_model(x))
    )
)

scores, items = index(tf.constant(["A250YP5XTKH243"]))
print(f"Recommended products: {np.array(items[0, :3].numpy()).astype(str)}")
print(f"Recommendation scores: {scores[0,:3].numpy()}")

Recommended products: ['9983891212' '9573212919' '140053271X']
Recommendation scores: [0.71267945 0.26443994 0.12621395]


# 2. Ranking Model

In [11]:
ranking_dataset = tf.data.Dataset.from_tensor_slices(dict(reviews))

In [12]:
for i in ranking_dataset.take(1).as_numpy_iterator():
  print(i)

{'product_id': b'0528881469', 'overall': 5.0, 'reviewerID': b'AO94DHGC771SJ'}


In [13]:
ranking_dataset = ranking_dataset.shuffle(buffer_size = len(ranking_dataset), seed = 42)
train = ranking_dataset.take(int(len(ranking_dataset) * 0.8)).batch(32)
test = ranking_dataset.skip(int(len(ranking_dataset) * 0.8)).take(int(len(reviews) * 0.2)).batch(32)

In [14]:
class RankingModel(tf.keras.Model):
  def __init__(self):
    super().__init__()

    self.product_id_embedding = tf.keras.Sequential([tf.keras.layers.StringLookup(vocabulary = unique_product_id),
                                                     tf.keras.layers.Embedding(len(unique_product_id) + 1, 32),
                                                     ])

    self.reviewerID_embedding = tf.keras.Sequential([tf.keras.layers.StringLookup(vocabulary = unique_reviewerID),
                                                     tf.keras.layers.Embedding(len(unique_reviewerID) + 1, 32),
                                                     ])

    self.ratings = tf.keras.Sequential([tf.keras.layers.Dense(32, activation = 'relu'),
                                        tf.keras.layers.Dense(16,activation = 'relu'),
                                        tf.keras.layers.Dense(1)
                                        ])

  def call(self,inputs):
    product_id,reviewerID = inputs
    product_id_embedding = self.product_id_embedding(product_id)
    reviewerID_embedding = self.reviewerID_embedding(reviewerID)
    return self.ratings(tf.concat([product_id_embedding,reviewerID_embedding],axis = 1))

In [15]:
class AmazonRankModel(tfrs.models.Model):
  def __init__(self):
    super().__init__()
    self.ranking_model = RankingModel()
    self.task = tfrs.tasks.Ranking(loss = tf.keras.losses.MeanSquaredError(),
                                    metrics = [tf.keras.metrics.RootMeanSquaredError()])

  def call(self,features):
    return self.ranking_model((features['product_id'],features['reviewerID']))

  def compute_loss(self,features,training = False):
    labels = features.pop('overall')
    scores = self(features)
    return self.task(labels = labels,predictions = scores)

In [16]:
ranking_model = AmazonRankModel()
ranking_model.compile(optimizer = tf.keras.optimizers.Adagrad(learning_rate = 0.1))

In [17]:
ranking_model.fit(train,epochs = 3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7b308168bc40>

In [18]:
ranking_model.evaluate(test,return_dict = True)



{'root_mean_squared_error': 0.36079248785972595,
 'loss': 0.1360926330089569,
 'regularization_loss': 0,
 'total_loss': 0.1360926330089569}

In [19]:
ratings = {}
items_to_rank = items.numpy()[0].astype(str)[:3]
for product_id in items_to_rank:
  ratings[product_id] = ranking_model({'reviewerID': np.array(['A250YP5XTKH243']),'product_id': np.array([product_id])})

print("Ratings:")
for product_id, rating in sorted(ratings.items(),key = lambda x: x[1],reverse = True):
  print(f"{product_id}: {rating[0]}")

Ratings:
9983891212: [5.021841]
9573212919: [4.551152]
140053271X: [4.537936]
