## 1. Data Preprocessing 

### 1.1 Load Libraries

In [121]:
import pandas as pd
import numpy as np
import pprint

from typing import Dict, Text
import tensorflow as tf
import tensorflow_recommenders as tfrs

### 1.2 Load Dataset

In [180]:
# Datasets
# The raw CSV tables are loaded as-is; the original column names are kept.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns (and the
# DtypeWarning this load appears to emit for Books.csv).
books = pd.read_csv("dataset/Books.csv", low_memory=False)

ratings = pd.read_csv("dataset/Ratings.csv")

users = pd.read_csv("dataset/Users.csv")

  books = pd.read_csv("dataset/Books.csv")


In [181]:
# Type standardization: turn raw user ids into namespaced string keys
users["User-ID"] = users["User-ID"].map("user_{}".format)

# Namespace ISBNs the same way, discard the three cover-image URL columns,
# then remove every book row that still contains a missing value
books["ISBN"] = books["ISBN"].map("book_{}".format)
books.drop(columns=["Image-URL-S", "Image-URL-M", "Image-URL-L"], inplace=True)
books.dropna(inplace=True)

def clean_year(year):
    """Convert a raw publication-year value to an ``int``.

    Args:
        year: Value read from the ``Year-Of-Publication`` column; may be an
            int, a numeric string, or corrupted content.

    Returns:
        The year as an ``int``, or ``-1`` when the value cannot be converted.
        Callers use ``-1`` as the sentinel for rows to discard.
    """
    try:
        return int(year)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / non-numeric objects; ValueError: non-numeric
        # strings and NaN; OverflowError: infinite floats.
        return -1

# Parse publication years and discard rows whose year could not be parsed
# (clean_year marks those with -1)
books['Year-Of-Publication'] = books['Year-Of-Publication'].map(clean_year)
has_valid_year = books['Year-Of-Publication'].ne(-1)
books = books[has_valid_year].reset_index(drop=True)

# Bring the ratings keys into the same namespaced form as books / users
ratings["ISBN"] = ratings["ISBN"].map("book_{}".format)
ratings["User-ID"] = ratings["User-ID"].map("user_{}".format)
ratings["Book-Rating"] = ratings["Book-Rating"].astype(float)

# Keep ratings that point at a surviving book and are high (>= 8)
in_catalogue = ratings["ISBN"].isin(books["ISBN"].unique())
highly_rated = ratings["Book-Rating"] >= 8
ratings = ratings[in_catalogue & highly_rated].reset_index(drop=True)

# Restrict books and users to those that appear in the remaining ratings
rated_isbns = ratings["ISBN"].unique()
active_user_ids = ratings["User-ID"].unique()
books = books[books["ISBN"].isin(rated_isbns)]
users = users[users["User-ID"].isin(active_user_ids)]

print(f"Number of Users: {users['User-ID'].nunique()}")
print(f"Number of Books: {books['ISBN'].nunique()}")
print(f"Number of Ratings: {ratings.shape[0]}")


Number of Users: 47074
Number of Books: 98413
Number of Ratings: 223803


In [182]:
# Preview the first rows of the filtered books table
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
1,book_0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,book_0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,book_0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
5,book_0399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group
6,book_0425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group


### 1.3 Book to Book Matches

In [183]:
# For every user, collect the ISBNs of the books that user rated
book_groups_raw = ratings.groupby('User-ID')
isbn_lists_by_user = book_groups_raw['ISBN'].apply(list)
book_groups = isbn_lists_by_user.rename("ISBN_list").reset_index()

# A user with a single book cannot produce a pair, so drop those groups
has_multiple_books = book_groups['ISBN_list'].map(len) > 1
book_groups = book_groups[has_multiple_books].reset_index(drop=True)
print(f"Number of Groups: {book_groups.shape[0]}")
book_groups.head()


Number of Groups: 18578


Unnamed: 0,User-ID,ISBN_list
0,user_100004,"[book_0345339703, book_0399146652, book_042508..."
1,user_100009,"[book_0060392452, book_0060977337, book_031298..."
2,user_10001,"[book_0399143165, book_0684195976]"
3,user_10003,"[book_068483068X, book_0743446593]"
4,user_100043,"[book_0060937734, book_0375727345]"


In [184]:

# Within each user's list, pair every book with each book that follows it.
# Lists with fewer than two books contribute nothing (the slices are empty).
book_matches = [
    [main_isbn, similar_isbn]
    for isbn_list in book_groups['ISBN_list'].values
    for position, main_isbn in enumerate(isbn_list[:-1])
    for similar_isbn in isbn_list[position + 1:]
]

book_pairs_dataset = pd.DataFrame(book_matches, columns=["main_ISBN", "similar_ISBN"])
data_size = book_pairs_dataset.shape[0]
print(f"Number of Matches: {data_size}")
book_pairs_dataset.head()

Number of Matches: 30467210


Unnamed: 0,main_ISBN,similar_ISBN
0,book_0345339703,book_0399146652
1,book_0345339703,book_0425083837
2,book_0345339703,book_0439064872
3,book_0345339703,book_059035342X
4,book_0399146652,book_0425083837


In [185]:
# Attach book metadata to the "main" side of each pair. Every metadata column
# except the join key gets a "main_" prefix.
main_books = books.rename(
    columns={col: 'main_' + col for col in books.columns if col != 'ISBN'}
).copy()
book_pairs_final = (
    book_pairs_dataset
    .merge(main_books, left_on='main_ISBN', right_on='ISBN')
    .drop(columns="ISBN")
)

# Same again for the "similar" side, with a "similar_" prefix
similar_books = books.rename(
    columns={col: 'similar_' + col for col in books.columns if col != 'ISBN'}
).copy()
book_pairs_final = (
    book_pairs_final
    .merge(similar_books, left_on='similar_ISBN', right_on='ISBN')
    .drop(columns="ISBN")
)
book_pairs_final.head()

Unnamed: 0,main_ISBN,similar_ISBN,main_Book-Title,main_Book-Author,main_Year-Of-Publication,main_Publisher,similar_Book-Title,similar_Book-Author,similar_Year-Of-Publication,similar_Publisher
0,book_0345339703,book_0399146652,The Fellowship of the Ring (The Lord of the Ri...,J.R.R. TOLKIEN,1986,Del Rey,The Cat Who Smelled a Rat,Lilian Jackson Braun,2001,Putnam Publishing Group
1,book_0345339703,book_0425083837,The Fellowship of the Ring (The Lord of the Ri...,J.R.R. TOLKIEN,1986,Del Rey,The Hunt for Red October,Tom Clancy,1985,Berkley Publishing Group
2,book_0345339703,book_0439064872,The Fellowship of the Ring (The Lord of the Ri...,J.R.R. TOLKIEN,1986,Del Rey,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,2000,Scholastic
3,book_0345339703,book_059035342X,The Fellowship of the Ring (The Lord of the Ri...,J.R.R. TOLKIEN,1986,Del Rey,Harry Potter and the Sorcerer's Stone (Harry P...,J. K. Rowling,1999,Arthur A. Levine Books
4,book_0399146652,book_0425083837,The Cat Who Smelled a Rat,Lilian Jackson Braun,2001,Putnam Publishing Group,The Hunt for Red October,Tom Clancy,1985,Berkley Publishing Group


### 1.4 Convert Dataset to tf.data

In [186]:
# String features of each (main, similar) pair that the model consumes
pair_string_columns = [
    # main item features
    'main_ISBN', 'main_Book-Author', 'main_Publisher',
    # similar item features
    'similar_ISBN', 'similar_Book-Author', 'similar_Publisher',
]
book_pairs = tf.data.Dataset.from_tensor_slices({
    column: tf.cast(book_pairs_final[column], dtype=tf.string)
    for column in pair_string_columns
})

# Per-book metadata: four string columns plus the integer publication year
book_string_columns = ['ISBN', 'Book-Title', 'Book-Author', 'Publisher']
book_features = {
    column: tf.cast(books[column], dtype=tf.string)
    for column in book_string_columns
}
book_features['Year-Of-Publication'] = tf.cast(books['Year-Of-Publication'], dtype=tf.int32)
book_infos = tf.data.Dataset.from_tensor_slices(book_features)


In [187]:
# Show a single element of the per-book dataset as a dict of numpy values
for book_example in book_infos.take(1).as_numpy_iterator():
    pprint.pprint(book_example)

{'Book-Author': b'Richard Bruce Wright',
 'Book-Title': b'Clara Callan',
 'ISBN': b'book_0002005018',
 'Publisher': b'HarperFlamingo Canada',
 'Year-Of-Publication': 2001}


In [188]:
# Show a single element of the pair dataset as a dict of numpy values
for pair_example in book_pairs.take(1).as_numpy_iterator():
    pprint.pprint(pair_example)

{'main_Book-Author': b'J.R.R. TOLKIEN',
 'main_ISBN': b'book_0345339703',
 'main_Publisher': b'Del Rey',
 'similar_Book-Author': b'Lilian Jackson Braun',
 'similar_ISBN': b'book_0399146652',
 'similar_Publisher': b'Putnam Publishing Group'}


In [189]:
tf.random.set_seed(1002)
train_percentage = 0.8
# Shuffle once with a fixed seed; reshuffle_each_iteration=False keeps the
# order stable so that take/skip below produce disjoint splits.
shuffled = book_pairs.shuffle(data_size, seed=42, reshuffle_each_iteration=False)

# The first `train_size` examples form the training split and everything after
# them is the test split. Deriving the test size from
# int(data_size * (1 - train_percentage)) truncates a float product and can
# silently drop an example, so the remainder is taken with skip() alone.
train_size = int(data_size * train_percentage)
train = shuffled.take(train_size)
test = shuffled.skip(train_size)

In [190]:
# Vocabulary lookups (string -> integer index) for ids, authors and
# publishers; they feed the embedding models
def _fit_string_lookup(feature_name):
    """Return a StringLookup layer adapted on one feature of `book_infos`."""
    lookup = tf.keras.layers.StringLookup()
    lookup.adapt(book_infos.map(lambda book: book[feature_name]))
    return lookup


book_id_lookup = _fit_string_lookup("ISBN")
book_author_lookup = _fit_string_lookup("Book-Author")
book_publisher_lookup = _fit_string_lookup("Publisher")

## 2. Model Definition

### 2.1 Lookup Embeddings

In [191]:
# Distinct value counts of the three features that receive an embedding table
n_unique_books = books['ISBN'].nunique()
n_unique_authors = books['Book-Author'].nunique()
n_unique_publishers = books['Publisher'].nunique()

print(f"Number of unique books {n_unique_books}")
print(f"Number of unique authors {n_unique_authors}")
print(f"Number of unique publishers {n_unique_publishers}")


Number of unique books 98413
Number of unique authors 42694
Number of unique publishers 8749


In [192]:

# Width of each feature's embedding vector
book_embedding_dimension = 64
author_embedding_dimension = 32
publisher_embedding_dimension = 16

# Book ID Model: ISBN string -> vocabulary index -> embedding vector
book_id_embedding = tf.keras.layers.Embedding(
    book_id_lookup.vocabulary_size(), book_embedding_dimension
)
book_id_model = tf.keras.Sequential([book_id_lookup, book_id_embedding])

# Author Model: author string -> vocabulary index -> embedding vector
book_author_embedding = tf.keras.layers.Embedding(
    book_author_lookup.vocabulary_size(), author_embedding_dimension
)
book_author_model = tf.keras.Sequential([book_author_lookup, book_author_embedding])

# Publisher Model: publisher string -> vocabulary index -> embedding vector
book_publisher_embedding = tf.keras.layers.Embedding(
    book_publisher_lookup.vocabulary_size(), publisher_embedding_dimension
)
book_publisher_model = tf.keras.Sequential([book_publisher_lookup, book_publisher_embedding])


In [193]:
# Sanity check: embed one author name passed as a scalar string (not a batch)
book_author_model('Richard Bruce Wright')

<tf.Tensor: shape=(32,), dtype=float32, numpy=
array([-0.04628872,  0.00980667,  0.03819538, -0.04937264,  0.03588755,
       -0.01652881, -0.01204226,  0.00336093,  0.02372298,  0.04973009,
       -0.03273942, -0.04899427,  0.03525858, -0.0451337 ,  0.00672914,
        0.01025388,  0.0317633 ,  0.00057109,  0.021229  ,  0.00911748,
       -0.00168433, -0.01043292, -0.03307284, -0.04889264, -0.02043271,
       -0.0362847 ,  0.0450996 , -0.04497403, -0.01856596, -0.01401292,
       -0.02213863,  0.04905016], dtype=float32)>

In [194]:
class BookModel(tf.keras.Model):
  """Embeds a book by concatenating its id, author and publisher embeddings."""

  def __init__(self, book_id_model, book_author_model, book_publisher_model):
    """Store the three per-feature embedding sub-models.

    Args:
      book_id_model: Callable applied to ``features["ISBN"]``.
      book_author_model: Callable applied to ``features["Book-Author"]``.
      book_publisher_model: Callable applied to ``features["Publisher"]``.
    """
    super().__init__()

    self.book_id_model = book_id_model
    self.book_author_model = book_author_model
    self.book_publisher_model = book_publisher_model

  def call(self, features: Dict[Text, tf.Tensor]):
    """Return the concatenated embedding for the given book features.

    Args:
      features: Mapping that contains at least the keys ``"ISBN"``,
        ``"Book-Author"`` and ``"Publisher"``; any other keys are ignored.

    Returns:
      The three sub-model outputs joined along the last axis.
    """
    # Concatenate on the last axis so that both batched inputs (rank-2
    # embeddings) and a single un-batched example (rank-1 embeddings) work;
    # for batched inputs this is identical to axis=1.
    return tf.concat([
        self.book_id_model(features["ISBN"]),
        self.book_author_model(features["Book-Author"]),
        self.book_publisher_model(features["Publisher"])
    ], axis=-1)

In [195]:
book_model = BookModel(book_id_model, book_author_model, book_publisher_model)

# Sanity check on a batch of one book; keys the model does not read are ignored
sample_book_batch = {
    'Book-Author': ['Richard Bruce Wright'],
    'Book-Title': ['Clara Callan'],
    'ISBN': ['book_0002005018'],
    'Publisher': ['HarperFlamingo Canada'],
    'Year-Of-Publication': [2001],
}
book_model(sample_book_batch)

<tf.Tensor: shape=(1, 112), dtype=float32, numpy=
array([[-0.0134122 , -0.04224319, -0.03837799,  0.02480504,  0.03014285,
         0.02383741, -0.00601326,  0.01344645,  0.03237287,  0.02488359,
         0.02762369,  0.02722977, -0.03640391, -0.03811057,  0.00512516,
        -0.01192813, -0.04547076, -0.02449411,  0.00400566,  0.02495438,
        -0.00774561,  0.03269542, -0.04670668,  0.04599568, -0.02222877,
         0.00293582,  0.04730506, -0.0312807 ,  0.019148  ,  0.01982771,
        -0.02242998, -0.04490442, -0.03699709, -0.03562649,  0.01551062,
         0.01284896,  0.01619971, -0.03642218,  0.03533652, -0.00893278,
        -0.04676353,  0.02842263,  0.02641115,  0.04681584, -0.01512507,
         0.00619549, -0.00829776,  0.03972492, -0.04693673, -0.03643564,
        -0.04171269,  0.04280094, -0.01155259, -0.01512391, -0.03418152,
         0.0188534 , -0.03014343,  0.04709292, -0.04209763, -0.00666716,
        -0.04999206, -0.03589796,  0.01303006,  0.0235743 , -0.04628872,
 

In [196]:
# Metrics & Task
# Candidate set for top-K retrieval metrics: every book, embedded in batches
# of 128 by `book_model`.
# NOTE(review): `book_model` emits the concatenated 64+32+16 = 112-d vector,
# whereas Book2BookModel projects its query embeddings through a Dense layer
# (64-d by default) — confirm the candidate and query dimensions agree.
candidate_embeddings = book_infos.batch(128).map(book_model)
metrics = tfrs.metrics.FactorizedTopK(candidates=candidate_embeddings)
task = tfrs.tasks.Retrieval(metrics=metrics)

In [197]:
class Book2BookModel(tfrs.Model):
    """Item-to-item retrieval model that embeds both books of a pair with one shared tower.

    Subclasses ``tfrs.Model`` (not plain ``tf.keras.Model``) so that
    ``fit``/``evaluate`` drive training through ``compute_loss``; a plain
    Keras model would need a ``call`` method, which this class does not define.
    """

    def __init__(self, book_model, task, embedding_dimension=64):
        """Build the shared book tower.

        Args:
            book_model: Model mapping a dict with keys ``"ISBN"``,
                ``"Book-Author"`` and ``"Publisher"`` to a book embedding.
            task: Callable taking (query embeddings, candidate embeddings)
                and returning the loss, e.g. ``tfrs.tasks.Retrieval``. Any
                top-K candidates attached to it must have the same width as
                this tower's output (``embedding_dimension``).
            embedding_dimension: Output size of the final Dense projection.
        """
        super().__init__()

        self.book_model = tf.keras.Sequential([book_model,
                                               tf.keras.layers.Dense(embedding_dimension)])
        self.task = task

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False):
        """Compute the retrieval loss for a batch of (main, similar) book pairs.

        Args:
            features: Batch dict with ``main_*`` and ``similar_*`` entries for
                ``ISBN``, ``Book-Author`` and ``Publisher``.
            training: Passed by ``tfrs.Model`` as ``True`` from ``train_step``
                and ``False`` from ``test_step``.

        Returns:
            The loss tensor produced by ``self.task``.
        """
        # Generation of main book embedding from main item features
        main_book_embedding = self.book_model({'ISBN': features['main_ISBN'],
                                               'Book-Author': features['main_Book-Author'],
                                               'Publisher': features['main_Publisher']})

        # Generation of similar book embedding from similar item features
        similar_book_embedding = self.book_model({'ISBN': features['similar_ISBN'],
                                                  'Book-Author': features['similar_Book-Author'],
                                                  'Publisher': features['similar_Publisher']})

        # The task computes the loss and the metrics. Top-K metrics scan the
        # whole candidate set, so they are only computed outside training.
        return self.task(main_book_embedding,
                         similar_book_embedding,
                         compute_metrics=not training)

In [198]:
book2book_model = Book2BookModel(book_model, task)

# Queries are embedded by the model's full tower (`book2book_model.book_model`,
# i.e. the feature embeddings followed by the Dense projection). The top-K
# candidates must be embedded by that same tower, otherwise candidate and query
# vectors have different dimensions, so the task's metrics are re-pointed at it.
task.factorized_metrics = tfrs.metrics.FactorizedTopK(
    candidates=book_infos.batch(128).map(book2book_model.book_model)
)

book2book_model.compile(optimizer=tf.keras.optimizers.legacy.Adagrad(learning_rate=0.1))

In [199]:
# Train on the prepared tf.data training split. `book_pairs_final` is the
# merged pandas DataFrame, not the dict-of-tensors dataset that compute_loss
# reads, and the examples have to be batched before they reach the model.
TRAIN_BATCH_SIZE = 4096
book2book_model.fit(
    train.batch(TRAIN_BATCH_SIZE).prefetch(tf.data.AUTOTUNE),
    epochs=1,
)

## References
- https://www.tensorflow.org/recommenders/examples/basic_retrieval
- https://www.tensorflow.org/recommenders/examples/featurization