## 1. Data Preprocessing 

### 1.1 Load Libraries

In [2]:
import pandas as pd
import numpy as np
import pprint

from typing import Dict, Text
import tensorflow as tf
import tensorflow_recommenders as tfrs
import hnswlib

# min rating to consider
min_rating = 8
# top k popular books
top_k_popular_books = 10_000

# parameters
output_dimension = 64
batch_size = 256
learning_rate = 0.1


### 1.2 Load Dataset

In [3]:
# Datasets
books = pd.read_csv("dataset/Books.csv")

ratings = pd.read_csv("dataset/Ratings.csv")

users = pd.read_csv("dataset/Users.csv")

  books = pd.read_csv("dataset/Books.csv")


In [4]:
# Visualization and Type Standardization
users["User-ID"] = users["User-ID"].apply(lambda x: f"user_{x}")

# Filter out books with missing or corrupted information
books["ISBN"] = books["ISBN"].apply(lambda x: f"book_{x}")
books.drop(["Image-URL-S", "Image-URL-M", "Image-URL-L"], axis=1, inplace=True)
books.dropna(inplace=True)

def clean_year(year):
    try:
        return int(year)
    except ValueError:
        return -1

books['Year-Of-Publication'] = books['Year-Of-Publication'].apply(clean_year)
books = books[books['Year-Of-Publication'] != -1].reset_index(drop=True)


ratings["ISBN"] = ratings["ISBN"].apply(lambda x: f"book_{x}")
ratings["User-ID"] = ratings["User-ID"].apply(lambda x: f"user_{x}")
ratings["Book-Rating"] = ratings["Book-Rating"].apply(lambda x: float(x))
ratings = ratings[ratings.ISBN.isin(books['ISBN'].unique())]
# Filtering products for simplicity
# Only consider high ratings
ratings = ratings[ratings["Book-Rating"] >= 8]
# Get top 10k products
popular_books = ratings.groupby('ISBN')['User-ID'].count().sort_values(ascending=False)[:10_000].index.tolist()
ratings = ratings[ratings.ISBN.isin(popular_books)]

# Consider user & book that has rating in ratings dataset
books = books[books.ISBN.isin(ratings['ISBN'].unique())]
users = users[users['User-ID'].isin(ratings['User-ID'].unique())]
print(f"Number of Users: {users['User-ID'].nunique()}")
print(f"Number of Books: {books['ISBN'].nunique()}")
print(f"Number of Ratings: {ratings.shape[0]}")


Number of Users: 32188
Number of Books: 10000
Number of Ratings: 109041


In [5]:
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
1,book_0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
3,book_0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
5,book_0399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group
18,book_0440234743,The Testament,John Grisham,1999,Dell
19,book_0452264464,Beloved (Plume Contemporary Fiction),Toni Morrison,1994,Plume


### 1.3 Book to Book Matches

In [6]:
# Group books which are read from same user
book_groups_raw = ratings.groupby('User-ID')
book_groups = pd.DataFrame(
    data={
        "User-ID": list(book_groups_raw.groups.keys()),
        "ISBN_list": list(book_groups_raw.ISBN.apply(list)),
    }
)
# Eliminate if user has read one book
book_groups = book_groups[book_groups['ISBN_list'].apply(len) > 1].reset_index(drop=True)
print(f"Number of Groups: {book_groups.shape[0]}")
book_groups.head()


Number of Groups: 12168


Unnamed: 0,User-ID,ISBN_list
0,user_100004,"[book_0345339703, book_0399146652, book_042508..."
1,user_100009,"[book_0060392452, book_0060977337, book_031298..."
2,user_100053,"[book_0312422156, book_0316769487, book_038549..."
3,user_100088,"[book_0006550576, book_0007110928, book_006016..."
4,user_100115,"[book_0060173289, book_0399144463, book_044047..."


In [7]:

book_matches = []
# for each book in our isbn_list we generate pairs
for isbn_list in book_groups['ISBN_list'].values:
    if len(isbn_list) <= 1:
        continue
    for i, main_isbn in enumerate(isbn_list[:-1]):
        for similar_isbn in isbn_list[i+1:]:
            book_matches.append([main_isbn, similar_isbn])

# Dataset generation and visualization
book_pairs_dataset = pd.DataFrame(book_matches, columns=["main_ISBN", "similar_ISBN"])
data_size = book_pairs_dataset.shape[0]
print(f"Number of Matches: {data_size}")
book_pairs_dataset.head()

Number of Matches: 2811412


Unnamed: 0,main_ISBN,similar_ISBN
0,book_0345339703,book_0399146652
1,book_0345339703,book_0425083837
2,book_0345339703,book_0439064872
3,book_0345339703,book_059035342X
4,book_0399146652,book_0425083837


In [8]:
# Our final dataset to train our model 
# Main book features
main_books = books.rename(columns=lambda x: 'main_' + x if x != 'ISBN' else x).copy()
book_pairs = pd.merge(book_pairs_dataset, main_books,
                              left_on='main_ISBN',
                              right_on='ISBN')
book_pairs.drop("ISBN", axis=1, inplace=True)

# Similar book features
similar_books = books.rename(columns=lambda x: 'similar_' + x if x != 'ISBN' else x).copy()
book_pairs = pd.merge(book_pairs, similar_books,
                              left_on='similar_ISBN',
                              right_on='ISBN')

book_pairs.drop("ISBN", axis=1, inplace=True)
book_pairs.head()

Unnamed: 0,main_ISBN,similar_ISBN,main_Book-Title,main_Book-Author,main_Year-Of-Publication,main_Publisher,similar_Book-Title,similar_Book-Author,similar_Year-Of-Publication,similar_Publisher
0,book_0345339703,book_0399146652,The Fellowship of the Ring (The Lord of the Ri...,J.R.R. TOLKIEN,1986,Del Rey,The Cat Who Smelled a Rat,Lilian Jackson Braun,2001,Putnam Publishing Group
1,book_0064400557,book_0399146652,Charlotte's Web (Trophy Newbery),E. B. White,1974,HarperTrophy,The Cat Who Smelled a Rat,Lilian Jackson Braun,2001,Putnam Publishing Group
2,book_0064400042,book_0399146652,On the Banks of Plum Creek,Laura Ingalls Wilder,1953,HarperTrophy,The Cat Who Smelled a Rat,Lilian Jackson Braun,2001,Putnam Publishing Group
3,book_0385492081,book_0399146652,Into Thin Air : A Personal Account of the Mt. ...,JON KRAKAUER,1998,Anchor,The Cat Who Smelled a Rat,Lilian Jackson Braun,2001,Putnam Publishing Group
4,book_0061092614,book_0399146652,Finding Moon,Tony Hillerman,1996,HarperTorch,The Cat Who Smelled a Rat,Lilian Jackson Braun,2001,Putnam Publishing Group


### 1.4 Convert Dataset to TFDS

In [9]:
batch_size = 256
# Pairs dataset
book_pairs_final = tf.data.Dataset.from_tensor_slices({
    # main book features
    'main_ISBN': tf.cast(book_pairs['main_ISBN'], dtype=tf.string),
    'main_Book-Author': tf.cast(book_pairs['main_Book-Author'], dtype=tf.string),
    'main_Publisher': tf.cast(book_pairs['main_Publisher'], dtype=tf.string),

    # similar book features
    'similar_ISBN': tf.cast(book_pairs['similar_ISBN'], dtype=tf.string),
    'similar_Book-Author': tf.cast(book_pairs['similar_Book-Author'], dtype=tf.string),
    'similar_Publisher': tf.cast(book_pairs['similar_Publisher'], dtype=tf.string),
})

book_pairs_final = book_pairs_final.batch(batch_size)
# Book information dataset
book_infos = tf.data.Dataset.from_tensor_slices({
    'ISBN': tf.cast(books['ISBN'], dtype=tf.string),
    'Book-Title': tf.cast(books['Book-Title'], dtype=tf.string),
    'Book-Author': tf.cast(books['Book-Author'], dtype=tf.string),
    'Year-Of-Publication': tf.cast(books['Year-Of-Publication'], dtype=tf.int32),
    'Publisher': tf.cast(books['Publisher'], dtype=tf.string),
})
book_infos = book_infos.batch(batch_size)


In [12]:
tf.random.set_seed(1002)
train_percentage = 0.8
shuffled = book_pairs_final.shuffle(data_size, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(int(data_size * train_percentage))
test = shuffled.skip(int(data_size * train_percentage)).take(int(data_size * (1 - train_percentage)))

## 2. Model Definition

### Lookup Embeddings

In [13]:
print(f"Number of unique books {books['ISBN'].nunique()}")
print(f"Number of unique authors {books['Book-Author'].nunique()}")
print(f"Number of unique publishers {books['Publisher'].nunique()}")


Number of unique books 10000
Number of unique authors 3585
Number of unique publishers 912


In [14]:
# Book ID Model
book_embedding_dimension = 32
unique_book_ids = books['ISBN'].unique()
book_id_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_book_ids, mask_token=None),
  tf.keras.layers.Embedding(len(unique_book_ids) + 1, book_embedding_dimension)
])


# Author Model
author_embedding_dimension = 16
unique_book_authors = books['Book-Author'].unique()
book_author_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_book_authors, mask_token=None),
  tf.keras.layers.Embedding(len(unique_book_authors) + 1, author_embedding_dimension)
])

# Publisher Model
publisher_embedding_dimension = 8
unique_book_publishers = books['Publisher'].unique()
book_publisher_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_book_publishers, mask_token=None),
  tf.keras.layers.Embedding(len(unique_book_publishers) + 1, publisher_embedding_dimension)
])


In [15]:
# Author Embedding Visualization
book_author_model('Richard Bruce Wright')

<tf.Tensor: shape=(16,), dtype=float32, numpy=
array([ 0.02610968, -0.01355533, -0.00070776,  0.01639757, -0.01837927,
        0.00612495,  0.02453266,  0.00846382,  0.04046288, -0.04418833,
       -0.02671603, -0.03624015, -0.03614105, -0.04618943,  0.02791945,
       -0.04999537], dtype=float32)>

### Book Model

In [16]:
class BookModel(tfrs.Model):

  def __init__(self, book_id_model, book_author_model, book_publisher_model):
    super().__init__()
    # assigning sub models to convert ids to embeddings
    self.book_id_model = book_id_model
    self.book_author_model = book_author_model
    self.book_publisher_model = book_publisher_model

  def call(self, features: Dict[Text, tf.Tensor]):

    # concatenation of embeddings
    return tf.concat([
        self.book_id_model(features["ISBN"]),
        self.book_author_model(features["Book-Author"]),
        self.book_publisher_model(features["Publisher"])
    ], axis=1)

In [17]:
# Initialization of our book model
book_model = BookModel(book_id_model, book_author_model, book_publisher_model)

# Sample example
book_model({'Book-Author': ['Richard Bruce Wright'],
            'Book-Title': ['Clara Callan'],
            'ISBN': ['book_0002005018'],
            'Publisher': ['HarperFlamingo Canada'],
            'Year-Of-Publication': [2001]})

<tf.Tensor: shape=(1, 56), dtype=float32, numpy=
array([[-0.02277664, -0.03058398,  0.01887276, -0.03647789, -0.01543517,
         0.04707909, -0.04193626,  0.04556403,  0.03679964,  0.04722789,
        -0.03953106,  0.02931954, -0.00718392, -0.02253139, -0.03164486,
        -0.02387388,  0.04915477,  0.02391762,  0.04796959,  0.01057738,
         0.0180179 , -0.04633288, -0.02984225,  0.00789521, -0.03926926,
        -0.03618088,  0.03907367,  0.0187014 ,  0.0269802 ,  0.00521718,
         0.02127628,  0.04673828,  0.02610968, -0.01355533, -0.00070776,
         0.01639757, -0.01837927,  0.00612495,  0.02453266,  0.00846382,
         0.04046288, -0.04418833, -0.02671603, -0.03624015, -0.03614105,
        -0.04618943,  0.02791945, -0.04999537, -0.03544075, -0.02194861,
         0.01883854,  0.04143132,  0.00809699,  0.02718485, -0.01315432,
         0.00781249]], dtype=float32)>

In [18]:
# Metrics & Task
metrics = tfrs.metrics.FactorizedTopK(candidates=book_infos.map(lambda features: book_model(features)))
task = tfrs.tasks.Retrieval(metrics=metrics)

### Book to Book Model

In [19]:
class Book2BookModel(tfrs.Model):
    def __init__(self, book_id_model, book_author_model, book_publisher_model, task, output_dimension=64):
        super().__init__()
        self.book_id_model = book_id_model
        self.book_author_model = book_author_model
        self.book_publisher_model = book_publisher_model
        # combining book model with output dimension to fix output dimension
        self.book_model = tf.keras.Sequential([BookModel(self.book_id_model,
                                                         self.book_author_model,
                                                         self.book_publisher_model),
                                               tf.keras.layers.Dense(output_dimension)])
        # Metrics & Task
        self.candidates = book_infos.map(lambda x: self.book_model(x))
        metrics = tfrs.metrics.FactorizedTopK(candidates=self.candidates)
        self.task = tfrs.tasks.Retrieval(metrics=metrics)
    
    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False):
        # Generation of main book embedding from main item features
        main_book_embedding = self.book_model({'ISBN':features['main_ISBN'],
                                          'Book-Author': features['main_Book-Author'],
                                          'Publisher': features['main_Publisher']})

        # Generation of similar book embedding from similar item features
        similar_book_embedding = self.book_model({'ISBN':features['similar_ISBN'],
                                                 'Book-Author': features['similar_Book-Author'],
                                                 'Publisher': features['similar_Publisher']})

        # loss and the metric calculation
        # compute metrics set false to skyrocket training speed
        return self.task(main_book_embedding,
                         similar_book_embedding,
                         compute_metrics=False)

In [24]:
# Book to Book Model initialization
book2book_model = Book2BookModel(book_id_model, book_author_model, book_publisher_model, task)
book2book_model.compile(optimizer=tf.keras.optimizers.legacy.Adagrad(learning_rate=0.1))

In [25]:
book2book_model.fit(book_pairs_final, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x2ddfa2730>

In [22]:
book_embeddings = book_infos.map(lambda features: {'ISBN':features['ISBN'], 'embedding':book2book_model.book_model(features)})


In [33]:
# Function to apply the model to each book feature set and return ISBN with embeddings
def extract_embeddings_with_isbn(features):
    # Apply the book model to extract embeddings
    embeddings = book2book_model.book_model(features)
    # Return a tuple of ISBN and its corresponding embedding
    return features['ISBN'], embeddings

# Mapping the function over the dataset
book_embeddings = book_infos.map(extract_embeddings_with_isbn)
isbn_list = []
embeddings_list = []

# Example to inspect or use the embeddings with ISBNs
for isbn, embedding in book_embeddings:
    isbn_list.extend(list(isbn.numpy().astype(str)))  # Assuming ISBN is a string or properly decoded byte array
    embeddings_list.extend(list(embedding.numpy()))

book_embedding_dataset = pd.DataFrame({'ISBN':isbn_list, 'embedding':embeddings_list})
book_embedding_dataset.head()

Unnamed: 0,ISBN,embedding
0,book_0002005018,"[-0.12340857, -0.05511024, -0.0061488114, -0.0..."
1,book_0374157065,"[0.14642659, 0.21776597, 0.30584982, -0.323751..."
2,book_0399135782,"[-0.07726827, 0.0828505, -0.23633176, -0.42332..."
3,book_0440234743,"[-0.0023646206, 0.17485964, -0.12851611, -0.71..."
4,book_0452264464,"[0.030823871, -0.04213976, -0.27128625, -0.084..."


In [38]:
book_embedding_dict = dict(zip(book_embedding_dataset.ISBN, book_embedding_dataset.embedding))
book_title_dict = dict(zip(books['ISBN'], books['Book-Title']))

# ANN

In [30]:
dim = output_dimension

num_elements = book_embedding_dataset.shape[0]
# hnswlib initialization with cosine similarity
p = hnswlib.Index(space='cosine', dim=dim)

p.init_index(max_elements=num_elements, ef_construction=100, M=16)

p.set_ef(10)

embeddings = np.vstack(book_embedding_dataset["embedding"].values)
p.add_items(embeddings)

## Similar Book Search

In [56]:
def book_search(isbn, k=3):
    """Gets input embeddings and return top k similar items"""

    # Generate embedding for the user query
    query_embedding = book_embedding_dict[isbn]

    if query_embedding is None:
        return "Invalid query or embedding generation failed."

    labels, _ = p.knn_query(query_embedding, k=k+1)
    results = book_embedding_dataset.iloc[list(labels[0][1:])].to_dict('records')
    similar_isbns = [similar_isbn['ISBN'] for similar_isbn in results]
    return similar_isbns

In [60]:
# Visualization of Recommendation
main_book = book_embedding_dataset['ISBN'][0]
print(f"Main Book:\n-{book_title_dict[main_book]} \nSimilar Books:")
similar_books = book_search(main_book)
for i, similar_book in enumerate(similar_books):
    print(f"{i+1}. {book_title_dict[similar_book]}")

Main Book:
-Clara Callan 
Similar Books:
1. Angelas Ashes
2. Postcards
3. Before and After


## References
- https://www.tensorflow.org/recommenders/examples/basic_retrieval
- https://www.tensorflow.org/recommenders/examples/featurization