## Imports

In [2]:
# import pandas as pd
# import os
import pprint
# import json
# import gzip
# import matplotlib.pyplot as plt
# import math
# import random
# import scipy
# import sklearn
import tensorflow as tf
# from sklearn.model_selection import train_test_split

# from nltk.corpus import stopwords
import numpy as np
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs
import tempfile
from typing import Dict, Text
# from scipy.sparse import csr_matrix
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity
# from scipy.sparse.linalg import svds
# from sklearn.preprocessing import MinMaxScaler
# from tensorflow.compat.v1 import metrics


## Dataset Pre-processing

### Old Code

In [3]:
# # load the metadata
# data = []
# with gzip.open("datasets\Arts_Crafts_and_Sewing_5.json.gz") as f:
#     for l in f:
#         data.append(json.loads(l.strip()))

# # total length of list, this number equals total number of products
# print(len(data))

# # first row of the list
# print(data[0])

# # convert list into pandas dataframe
# df = pd.DataFrame.from_dict(data)
# # print(len(df))

# # remove rows with NaN values
# df3 = df.dropna()

# reviewTimeTest = df3.sort_values(by=['reviewTime'], key=pd.to_datetime)
# reviewTimeTest

# # function to smoothen the distribution of user preference
# def smooth_user_preference(x):
#     return math.log(1+x, 2)

# # counting the total amount of user interactions
# users_interactions_count = df3.groupby(['reviewerID', 'asin']).size().groupby('reviewerID').size()
# print('# users: %d' % len(users_interactions_count))

# #  counting the amount of users with at least 5 interactions
# users_with_enough_interactions = users_interactions_count[users_interactions_count >= 5].reset_index()[['reviewerID']]
# print('# users with at least 5 interactions: %d' % len(users_with_enough_interactions))


# print('# of interactions: %d' % len(df3))

# # counting the amount of interactions from users with at least 5 interactions
# interactions_from_selected_users = df3.merge(users_with_enough_interactions, 
#                how = 'right',
#                left_on = 'reviewerID',
#                right_on = 'reviewerID')
# print('# of interactions from users with at least 5 interactions: %d' % len(interactions_from_selected_users))

# # counting the amount of unique user-to-item interactions
# interactions_full = interactions_from_selected_users \
#                     .groupby(['reviewerID', 'asin'])['overall'].sum() \
#                     .apply(smooth_user_preference).reset_index()
# print('# of unique user/item interactions: %d' % len(interactions_full))
# interactions_full


### Import Dataset

In [4]:
# Toys Section Importing From amazon_us_reviews dataset
tools = (tfds.load("amazon_us_reviews/Tools_v1_00", split='train')
            # Cache for efficiency.
            .cache(tempfile.NamedTemporaryFile().name))

for x in tools.take(1).as_numpy_iterator():
    pprint.pprint(x)

{'data': {'customer_id': b'8839363',
          'helpful_votes': 0,
          'marketplace': b'US',
          'product_category': b'Tools',
          'product_id': b'B0029HJAZ8',
          'product_parent': b'733806910',
          'product_title': b'Stanley 060864R Folding Sawhorse (2-Pack)',
          'review_body': b'These things broke after only a few uses. I called t'
                         b'o see if I could get replaced. I was transferred and'
                         b" whoever I talked to, said he couldn't help me.",
          'review_date': b'2015-08-24',
          'review_headline': b'Cheaply made.',
          'review_id': b'R30ZK5V4C0BJWY',
          'star_rating': 1,
          'total_votes': 0,
          'verified_purchase': 0,
          'vine': 1}}


### Data Mapping

In [5]:
# tools = tools.map(lambda x: {
#     "customer_id": x["customer_id"],
#     "product_title": x["product_title"],
#     "product_id": x["product_id"],
# })

tools1 = tools.map(lambda x: x["data"])
toolsProcessed = tools1.map(lambda x: {
    "customer_id": x["customer_id"],
    "product_title": x["product_title"],
    "star_rating": x["star_rating"]
})

tf.random.set_seed(1)
shuffledTools = toolsProcessed.shuffle(10_000, seed=1, reshuffle_each_iteration=False)

### Split into Training and Testing Sets

In [6]:
trainNum = 8_000
testNum = 2_000

train = shuffledTools.take(trainNum)
# for x,y,z in train:
#     print(x,y,z)

test = shuffledTools.skip(trainNum).take(testNum)

### Determine Unique Customer IDs and Products

In [7]:
customerID = (toolsProcessed
              # Retain only the fields we need.
              .map(lambda x: x["customer_id"])
              )
product = (toolsProcessed 
           .map(lambda x: x["product_title"])
           )

uniqueCustomerID = np.unique(np.concatenate(list(customerID.batch(1_000))))
uniqueProduct = np.unique(np.concatenate(list(product.batch(1_000))))


### Model

In [None]:
class Model(tfrs.Model):
    def __init__(self):
        super().__init__()

        embeddingDim = 32

        ## Model that represents customers with Matrix Factorization
        self.customer_model = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=uniqueCustomerID, mask_token=None),
            # Embedding for unknown tokens
            tf.keras.layers.Embedding(len(uniqueCustomerID) + 1, embeddingDim)
        ])

        ## Model that represents products
        self.product_model = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=uniqueProduct, mask_token=None),
            # Embedding for unknown tokens
            tf.keras.layers.Embedding(len(uniqueProduct) + 1, embeddingDim)
        ])

        # Loss function used to train the models using the Factorized Top-k Method
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=product.batch(128).cache().map(self.product_model)
            )
        )

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
            customerEmbeddings = self.customer_model(features["customer_id"])
            productEmbeddings = self.product_model(features["product_title"])

            return self.task(customerEmbeddings, productEmbeddings, compute_metrics=not training)

In [None]:
learningRate = 0.1

model = Model()
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=learningRate))

### Model Fitting and Evaluation

In [None]:
cachedTrain = train.shuffle(10_000).batch(8192).cache()
cachedTest = test.batch(4096).cache()

model.fit(cachedTrain, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1d54de16460>

In [None]:
model.evaluate(cachedTest, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.0005000000237487257,
 'factorized_top_k/top_5_categorical_accuracy': 0.0005000000237487257,
 'factorized_top_k/top_10_categorical_accuracy': 0.0005000000237487257,
 'factorized_top_k/top_50_categorical_accuracy': 0.0005000000237487257,
 'factorized_top_k/top_100_categorical_accuracy': 0.0005000000237487257,
 'loss': 15201.9599609375,
 'regularization_loss': 0,
 'total_loss': 15201.9599609375}

### Retrieving Top-K Candidates

In [None]:
# Dummy values created to simulate larger dataset
toolsWithDummy = tf.data.Dataset.concatenate(
    product.batch(4096),
    product.batch(4096).repeat(1_000).map(lambda x: tf.zeros_like(x))
)

toolsWithDummyEmb = tf.data.Dataset.concatenate(
    product.batch(4096).map(model.product_model),
    product.batch(4096).repeat(1_000)
      .map(lambda x: model.product_model(x))
      .map(lambda x: x * tf.random.uniform(tf.shape(x)))
)

In [None]:
brute_force = tfrs.layers.factorized_top_k.BruteForce(model.customer_model)
brute_force.index_from_dataset(
    product.batch(128).map(lambda prod: (prod, model.product_model(prod)))
)

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x1d54de597f0>

In [None]:
# Get predictions for user 42.
_, titles = brute_force(np.array(["8839363"]), k=3)

print(f"Top recommendations: {titles[0]}")

Top recommendations: [b'SE JT6218 5-Piece Watch Band Link Pin Tool Set'
 b'SE JT6218 5-Piece Watch Band Link Pin Tool Set'
 b'SE JT6218 5-Piece Watch Band Link Pin Tool Set']


### Evaluating the Approximation

In [None]:
# Override the existing streaming candidate source.
model.task.factorized_metrics = tfrs.metrics.FactorizedTopK(
    candidates=toolsWithDummyEmb
)


NameError: name 'tfrs' is not defined

In [None]:
# Need to recompile the model for the changes to take effect.
model.compile()


In [None]:

%time baseline_result = model.evaluate(test.batch(8192), return_dict=True, verbose=False)


## Print Out Graph

In [None]:
# plt.figure(figsize=(12, 4))
# plt.barh(corr_similar_count['asin'].head(10),
#          abs(corr_similar_count['Correlation'].head(10)), 
#          align='center',
#          color='red')
# plt.xlabel("Popularity")
# plt.title("Top 10 Popular Movies")
# plt.gca().invert_yaxis()

# Evaluation

In [None]:
# interactions_train, interactions_test = train_test_split(interactions_full,
#                                    stratify=interactions_full['reviewerID'], 
#                                    test_size=0.20,
#                                    random_state=42)

# print('# interactions on Train set: %d' % len(interactions_train))
# print('# interactions on Test set: %d' % len(interactions_test))

# #Indexing by reviewerID to speed up the searches during evaluation
# interactions_full_indexed = interactions_full.set_index('reviewerID')
# interactions_train_indexed = interactions_train.set_index('reviewerID')
# interactions_test_indexed = interactions_test.set_index('reviewerID')

# def get_items_interacted(reviewerID, interactions):
#     # Get the user's data and merge in the item information.
#     interacted_items = interactions.loc[reviewerID]['asin']
#     return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

# # Top-N Accuracy Metrics
# rand_non_interacted_items = 200

# class ModelEvaluator:
#     def get_non_interacted_items_sample(self, reviewerID, sample_size, seed=40):
#         interacted_items = get_items_interacted(reviewerID, interactions_full_indexed)
#         total_items = set(df3["asin"])
#         non_interacted_items = total_items - interacted_items

#         random.seed(seed)
#         non_interacted_items_sample = random.sample(non_interacted_items, k=sample_size)
#         return set(non_interacted_items_sample)

#     # def _verify_hit_top_n (self, ):


In [None]:
labels = tf.constant([[0, 3]], tf.int64)
predictions = tf.constant([[.10, .50, .30, .04, .05, .01]])
for k in range(1, 6):
    rec_at_k = tf.compat.v1.metrics.recall_at_k(labels, predictions, k)
    prec_at_k = tf.compat.v1.metrics.precision_at_k(labels, predictions, k)
    with tf.compat.v1.Session() as sess:
        sess.run([tf.compat.v1.local_variables_initializer(), tf.compat.v1.global_variables_initializer()])
        r, p = sess.run([rec_at_k, prec_at_k])
        print('recall@{} = {:.2f}, precision@{} = {:.2f}'.format(k, r[1], k, p[1]))

# Average Precision@k
k = 4
labels = tf.constant([[1, 0, 4],
                      [0, 0, 0],
                      [0, 2, 2]], tf.int64)
predictions = tf.constant([[.10, .50, .30, .04, .05, .01],
                           [.20, .60, .03, .10, .06, .01],
                           [.08, .25, .50, .15, .01, .01]])
avg_prec = tf.compat.v1.metrics.average_precision_at_k(labels, predictions, k)
with tf.compat.v1.Session() as sess:
    sess.run([tf.compat.v1.local_variables_initializer(), tf.compat.v1.global_variables_initializer()])
    print('{:.4f}'.format(sess.run(avg_prec[1])))

RuntimeError: tf.metrics.recall_at_k is not supported when eager execution is enabled.