## 1. Data Preprocessing 

### 1.1 Load Libraries

In [1]:
import pandas as pd
import numpy as np
import pprint

import tensorflow as tf

### 1.2 Load Dataset

In [2]:
# Datasets
# Column names are converted for simplicity
books = pd.read_csv("dataset/Books.csv")

ratings = pd.read_csv("dataset/Ratings.csv")

users = pd.read_csv("dataset/Users.csv")

  books = pd.read_csv("dataset/Books.csv")


In [3]:
# Visualization and Type Standardization
users["User-ID"] = users["User-ID"].apply(lambda x: f"user_{x}")
print(f"Number of Users: {users['User-ID'].nunique()}")

# Filter out books with missing or corrupted information
books["ISBN"] = books["ISBN"].apply(lambda x: f"book_{x}")
books.drop(["Image-URL-S", "Image-URL-M", "Image-URL-L"], axis=1, inplace=True)
books.dropna(inplace=True)
is_integer_rows = books["Year-Of-Publication"].apply(lambda x: isinstance(x, int))
books = books[is_integer_rows].reset_index(drop=True)
books["Year-Of-Publication"] = books["Year-Of-Publication"].astype(int)


print(f"Number of Books: {books['ISBN'].nunique()}")

ratings["ISBN"] = ratings["ISBN"].apply(lambda x: f"book_{x}")
ratings["User-ID"] = ratings["User-ID"].apply(lambda x: f"user_{x}")
ratings["Book-Rating"] = ratings["Book-Rating"].apply(lambda x: float(x))
ratings = ratings[ratings.ISBN.isin(books['ISBN'].unique())]
# Only consider books with high ratings
ratings = ratings[ratings["Book-Rating"] >= 8].reset_index(drop=True)
print(f"Number of Ratings: {ratings.shape[0]}")


Number of Users: 278858
Number of Books: 205820
Number of Ratings: 208037


In [4]:
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,book_0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,book_0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,book_0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,book_0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,book_0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


### 1.3 Book to Book Matches

In [5]:
# Group books which are read from same user
book_groups_raw = ratings.groupby('User-ID')
book_groups = pd.DataFrame(
    data={
        "User-ID": list(book_groups_raw.groups.keys()),
        "ISBN_list": list(book_groups_raw.ISBN.apply(list)),
    }
)
# Eliminate if user has read one book
book_groups = book_groups[book_groups['ISBN_list'].apply(len) > 1].reset_index(drop=True)
print(f"Number of Groups: {book_groups.shape[0]}")
book_groups.head()


Number of Groups: 17935


Unnamed: 0,User-ID,ISBN_list
0,user_100004,"[book_0345339703, book_0399146652, book_042508..."
1,user_100009,"[book_0060392452, book_0060977337, book_031298..."
2,user_10001,"[book_0399143165, book_0684195976]"
3,user_10003,"[book_068483068X, book_0743446593]"
4,user_100043,"[book_0060937734, book_0375727345]"


In [6]:

book_matches = []
# for each book in our isbn_list we generate pairs
for isbn_list in book_groups['ISBN_list'].values:
    for i, main_isbn in enumerate(isbn_list[:-1]):
        for similar_isbn in isbn_list[i+1:]:
            book_matches.append([main_isbn, similar_isbn])
            
book_pairs_dataset = pd.DataFrame(book_matches, columns=["main_book", "similar_book"])
print(f"Number of Matches: {book_pairs_dataset.shape[0]}")
book_pairs_dataset.head()

Number of Matches: 26911272


Unnamed: 0,main_book,similar_book
0,book_0345339703,book_0399146652
1,book_0345339703,book_0425083837
2,book_0345339703,book_0439064872
3,book_0345339703,book_059035342X
4,book_0399146652,book_0425083837


### 1.4 Convert Dataset to TFDS

In [7]:
book_pairs = tf.data.Dataset.from_tensor_slices({
    'main_book': tf.cast(book_pairs_dataset['main_book'], dtype=tf.string),
    'similar_book': tf.cast(book_pairs_dataset['similar_book'], dtype=tf.string)
})

book_infos = tf.data.Dataset.from_tensor_slices({
    'ISBN': tf.cast(books['ISBN'], dtype=tf.string),
    'Book-Title': tf.cast(books['Book-Title'], dtype=tf.string),
    'Book-Author': tf.cast(books['Book-Author'], dtype=tf.string),
    'Year-Of-Publication': tf.cast(books['Year-Of-Publication'], dtype=tf.int32),
    'Publisher': tf.cast(books['Publisher'], dtype=tf.string),
})


In [8]:
for x in book_infos.take(1).as_numpy_iterator():
  pprint.pprint(x)

{'Book-Author': b'Mark P. O. Morford',
 'Book-Title': b'Classical Mythology',
 'ISBN': b'book_0195153448',
 'Publisher': b'Oxford University Press',
 'Year-Of-Publication': 2002}


In [9]:
for x in book_pairs.take(1).as_numpy_iterator():
  pprint.pprint(x)

{'main_book': b'book_0345339703', 'similar_book': b'book_0399146652'}


## 2. Model Definition