In [None]:
import gzip
import json
import os
import random
import sys

import numpy as np
import pandas as pd

Files obtained from UCSD Book Graph dataset from Goodreads:
https://sites.google.com/eng.ucsd.edu/ucsdbookgraph/home

Load the large gzipped JSON-lines file containing the comics and graphic novel interaction data

In [None]:
# lines=True: the file is parsed as one JSON object per line.
df_interactions = pd.read_json("./goodreads_interactions_comics_graphic.json.gz",lines=True)

In [None]:
# Keep a row only when all three conditions hold:
#   * its user appears in at least 5 interaction rows,
#   * its book appears in at least 75 interaction rows,
#   * the row itself has "is_read" set.
# The counts are taken over the unfiltered frame (read and unread rows alike).
# Afterwards only "user_id", "book_id", "is_read" and "rating" are kept.
user_counts = df_interactions["user_id"].value_counts()
book_counts = df_interactions["book_id"].value_counts()
is_frequent_user = df_interactions["user_id"].map(user_counts >= 5)
is_frequent_book = df_interactions["book_id"].map(book_counts >= 75)
df_interactions = df_interactions.loc[
    is_frequent_user & is_frequent_book & df_interactions["is_read"],
    ["user_id", "book_id", "is_read", "rating"],
]

Replace all "book_id" with title

In [None]:
# Create a lookup table from "book_id" to "title" out of the book metadata file.
# Requires the top-of-notebook `import gzip`. The file is opened in text mode with
# an explicit encoding so every line reaches json.loads as a decoded str.
book_table = {}
with gzip.open("./goodreads_books_comics_graphic.json.gz", "rt", encoding="utf-8") as fin:
    for line in fin:
        record = json.loads(line)
        book_table[record["book_id"]] = record["title"]

# Map each "book_id" to its "title". The ids are converted to str first, because
# the lookup keys are presumably stored as strings in the metadata (TODO confirm).
df_interactions["book_title"] = df_interactions["book_id"].apply(str).map(book_table)

In [None]:
# Label all books with integers 0, ..., n_books-1.
# .unique() keeps first-appearance order, so the numbering is identical on every
# run; iterating a set of strings is not (string hashing is randomized per process),
# which would silently mismatch a saved model's embedding rows after a restart.
book_list = np.asarray(df_interactions["book_title"].unique())
n_books = len(book_list)
# NOTE: from here on book_table maps title -> book number (it previously mapped
# book_id -> title); similarBooks relies on this newer meaning.
book_table = {title: num for num, title in enumerate(book_list)}
df_interactions["book_num"] = df_interactions["book_title"].map(book_table)

# Label all users with integers 0, ..., n_users-1.
# The source column is "user_id"; "user_num" only exists once this mapping is done.
user_list = np.asarray(df_interactions["user_id"].unique())
user_table = {user_id: num for num, user_id in enumerate(user_list)}
n_users = len(user_list)
df_interactions["user_num"] = df_interactions["user_id"].map(user_table)

# drop "user_id", "is_read", and "book_id"
df_interactions = df_interactions[["user_num","book_num","book_title","rating"]]

# Popularity Model

Find the top 10 most read graphic novels:

In [None]:
# Count the rows per title and show the ten largest counts.
df_interactions.book_title.value_counts().head(10)

Watchmen                                                          23530
Saga, Vol. 1 (Saga, #1)                                           15704
V for Vendetta                                                    13165
Preludes & Nocturnes (The Sandman #1)                             12767
Saga, Vol. 2 (Saga, #2)                                           12003
Maus I: A Survivor's Tale: My Father Bleeds History (Maus, #1)    10715
Saga, Vol. 3 (Saga, #3)                                           10460
The Walking Dead, Vol. 01: Days Gone Bye                          10173
Batman: The Killing Joke                                           9939
Persepolis: The Story of a Childhood (Persepolis, #1)              9227
Name: book_title, dtype: int64

# Training and Testing Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Sorted list of (user_num, book_num) interaction pairs. The ordering (by user,
# then by book) is what binarySearch relies on. The pairs are built column-wise
# instead of with iterrows(), which creates one Series per row and is very slow
# on a frame of this size.
pairs_sorted = df_interactions[["user_num","book_num"]].sort_values(by=["user_num","book_num"],ascending=True)
pairs = list(zip(pairs_sorted["user_num"].tolist(), pairs_sorted["book_num"].tolist()))

In [None]:
def binarySearch(pairs, el):
  """Return True if the (user, book) pair `el` occurs in `pairs`, else False.

  `pairs` must be sorted ascending by user and, for equal users, by book.
  The candidate range is halved on every step.
  """
  target_user, target_book = el
  lo, hi = 0, len(pairs) - 1
  while lo <= hi:
    mid = (lo + hi) // 2
    entry = pairs[mid]
    # Compare on the user first; only when the user matches does the book decide.
    probe, wanted = entry[0], target_user
    if not (probe < wanted or probe > wanted):
      probe, wanted = entry[1], target_book
    if probe < wanted:
      lo = mid + 1
    elif probe > wanted:
      hi = mid - 1
    else:
      return True
  return False

# Hold 2.5% of the pairs aside as testing data.
# The split is shuffled so the test pairs are spread over all users; an unshuffled
# split of the sorted list would hold out only its tail, i.e. (almost) every
# interaction of the highest-numbered users and nothing from anyone else.
# Both parts are sorted again afterwards because binarySearch needs sorted input.
pairs_train, pairs_test = train_test_split(pairs, test_size=0.025, random_state=99, shuffle=True)
pairs_train.sort()
pairs_test.sort()

# Also generate negative pairs (user/book combinations with no recorded
# interaction) for the testing data, neg_ratio negatives per positive test pair.
neg_ratio = 7
n_neg_test = neg_ratio*len(pairs_test)
# Seed the sampler so the held-out negatives are the same on every run.
random.seed(99)
# Collect into a set so the same negative pair is never held out twice.
neg_test_seen = set()
while len(neg_test_seen) < n_neg_test:
    rand_user = random.randrange(len(user_list))
    rand_book = random.randrange(len(book_list))
    # accept the candidate only if it is not a known interaction (train or test)
    if not binarySearch(pairs, (rand_user, rand_book)):
        neg_test_seen.add((rand_user, rand_book))
# sorted, so that binarySearch can be used on neg_test as well
neg_test = sorted(neg_test_seen)

# use positive pairs and negative pairs to create validation set
def val_set(pos, neg, classification=False):
    """Stack positive and negative pairs into one labelled array.

    Parameters
    ----------
    pos : sequence of (user, book) pairs, labelled 1.
    neg : sequence of (user, book) pairs, labelled 0 if `classification`
        is true, otherwise -1.
    classification : bool, selects the label used for negative pairs.

    Returns
    -------
    numpy array of shape (len(pos) + len(neg), 3) with columns
    (user, book, label); positives come first, then negatives.
    """
    neg_val = 0 if classification else -1
    # Size the array from the arguments, not from module-level lists.
    batch = np.zeros((len(pos) + len(neg), 3))
    for row, (user, book) in enumerate(pos):
        batch[row, :] = (user, book, 1)
    # Continue numbering after the positives so every negative gets its own row.
    for row, (user, book) in enumerate(neg, start=len(pos)):
        batch[row, :] = (user, book, neg_val)
    return batch

# Built with the default classification=False, i.e. negatives are labelled -1.
validation_set = val_set(pairs_test, neg_test)

# for the remaining training pairs, define a generator to select positive and negative pairs
# do not include pairs that occur in testing sets: pairs_test and neg_test
def generate_pairs(pairs_train, n_positive=64, neg_ratio=1, pairs_test=[], neg_test=[], classification=False):
    """Endlessly yield shuffled training batches of positive and negative pairs.

    Parameters
    ----------
    pairs_train : sorted list of (user, book) training interactions; positives
        are sampled from it and it is searched with binarySearch.
    n_positive : number of positive pairs per batch.
    neg_ratio : number of negative pairs generated per positive pair.
    pairs_test, neg_test : sorted held-out positive / negative pairs; a sampled
        negative is rejected if it occurs in either (or in pairs_train).
        The list defaults are never mutated here.
    classification : if true negatives are labelled 0, otherwise -1.

    Yields
    ------
    ({"user_input": users, "book_input": books}, labels) where each array has
    int(n_positive*(1 + neg_ratio)) entries and positives are labelled 1.
    """
    # Use one integer size for both the allocation and the fill loop, so a
    # fractional neg_ratio cannot make the loop run past the end of the array.
    size = int(n_positive*(1 + neg_ratio))
    neg_val = 0 if classification else -1

    while True:
        # Fresh array per batch: arrays handed out by an earlier yield are
        # never overwritten while the consumer may still be reading them.
        batch = np.zeros((size, 3))
        # Positives come from the training pairs only; sampling from the full
        # pair list would leak held-out test interactions into training.
        for i, (user, book) in enumerate(random.sample(pairs_train, n_positive)):
            batch[i,:] = (user, book, 1)
        i = n_positive
        while i < size:
            rand_user = random.randrange(len(user_list))
            rand_book = random.randrange(len(book_list))
            # accept only pairs that are neither training interactions nor part
            # of the held-out positive / negative test pairs
            if not binarySearch(pairs_train, (rand_user, rand_book)) and \
                not binarySearch(pairs_test, (rand_user, rand_book)) and \
                not binarySearch( neg_test, (rand_user, rand_book) ):
                batch[i,:] = (rand_user, rand_book, neg_val)
                i+=1
        np.random.shuffle(batch)
        yield {"user_input": batch[:,0], "book_input": batch[:,1]}, batch[:,2]

In [None]:
# Smoke test: draw a single batch with n_positive=2 and neg_ratio=2.
next(generate_pairs(pairs_train,2,2,pairs_test,neg_test))

({'book_input': array([8142., 1593., 8191., 4864., 1105., 2437.]),
  'user_input': array([49244., 86833., 26391., 43765., 47495., 44097.])},
 array([-1.,  1., -1., -1., -1.,  1.]))

# Matrix Factorization Model

In [None]:
# Take the layers and Model from the same Keras implementation (tf.keras);
# combining tf.keras layers with the standalone `keras` Model is only safe when
# both packages happen to resolve to the same code.
from tensorflow.keras.layers import Input, Embedding, Dot, Flatten, Dense, Reshape, Concatenate
from tensorflow.keras.models import Model

In [None]:
# Length of the embedding vector learned for every book and every user.
feature_dim = 64


# Book branch: one index per sample -> embedding of length feature_dim -> flattened.
book_input = Input(shape = [1], name="book_input")
book_embedding = Embedding(n_books,feature_dim, name="book_embedding")(book_input)
book_vec = Flatten(name="book_vec")(book_embedding)
# User branch, built the same way.
user_input = Input(shape = [1], name="user_input")
user_embedding = Embedding(n_users,feature_dim,name = "user_embedding")(user_input)
user_vec = Flatten(name="user_vec")(user_embedding)
# The model output is the dot product of the user vector and the book vector.
x = Dot(axes=1,name="DotProduct")([user_vec,book_vec])

MatrixFactorization = Model(inputs = [user_input,book_input], outputs=x, name="FeatureEmbeddings")
# Trained as a regression with mean squared error.
MatrixFactorization.compile(optimizer="adam",loss="mean_squared_error")

MatrixFactorization.summary()

Model: "FeatureEmbeddings"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
book_input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
user_embedding (Embedding)      (None, 1, 64)        6136768     user_input[0][0]                 
__________________________________________________________________________________________________
book_embedding (Embedding)      (None, 1, 64)        664896      book_input[0][0]                 
__________________________________________________________________________________

Train the model using only train data, evaluate fit using test data

In [None]:
# Future runs use epochs=7
n_positive = 2048
neg_ratio = 7

# classification is left at its default, so generated negatives are labelled -1.
gen = generate_pairs(pairs_train, n_positive, neg_ratio, pairs_test, neg_test)

# steps_per_epoch is set so that one epoch draws about len(pairs_train) positives;
# validation uses the (user, book, label) columns of validation_set.
hist = MatrixFactorization.fit(gen, epochs=7, steps_per_epoch = len(pairs_train)//n_positive, 
    validation_data = ( [validation_set[:,0], validation_set[:,1] ], validation_set[:,2] ) )

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [None]:
# NOTE(review): hard-coded absolute path; presumably a mounted Google Drive in
# Colab. Confirm or adjust before running elsewhere.
MatrixFactorization.save("/content/drive/MyDrive/models/MatrixFactorization")

INFO:tensorflow:Assets written to: /content/drive/MyDrive/models/MatrixFactorization/assets


**Get weights and find similar books**

In [None]:
# Take the trained weight matrix of the "book_embedding" layer.
book_layer = MatrixFactorization.get_layer("book_embedding")
book_weights = book_layer.get_weights()[0]
# Scale every row to unit length, so a dot product of two rows is their cosine similarity.
book_weights = book_weights / np.linalg.norm(book_weights, axis = 1).reshape((-1, 1))

# Find books similar to "book_name"
def similarBooks(book_name, n_similar=10):
  """Print the `n_similar` books whose embeddings are closest to `book_name`.

  Similarity is the dot product between rows of the module-level
  `book_weights` matrix. The query book itself is excluded explicitly rather
  than assumed to be the single highest-scoring row.

  Raises KeyError if `book_name` is not a key of `book_table`.
  """
  book_num = book_table[book_name]

  similarity = np.dot(book_weights, book_weights[book_num])

  # Most similar first, with the query book removed by index.
  ranked = np.argsort(similarity)[::-1]
  closest = ranked[ranked != book_num][:n_similar]

  # Width of the title column; default=0 covers the case of no other books.
  maxlength = max((len(book_list[c]) for c in closest), default=0)

  print(f"Books closest to {book_name}:")
  for c in closest:
    print(f"Book: {book_list[c]:{maxlength+2}}Sim: {similarity[c]:.{3}}")

In [None]:
# Print the nearest neighbours of a few well-known titles, with a blank
# separator between consecutive listings (none after the last one).
query_titles = [
  "The Hard Goodbye (Sin City #1)",
  "Watchmen",
  "Batman: The Dark Knight Returns (The Dark Knight Saga, #1)",
  "Captain America: Winter Soldier, Volume 1",
  "300",
]
for position, title in enumerate(query_titles):
  if position > 0:
    print("\n")
  similarBooks(title)


Books closest to The Hard Goodbye (Sin City #1):
Book: A Dame to Kill For (Sin City #2)                            Sim: 0.869
Book: That Yellow Bastard (Sin City #4)                           Sim: 0.818
Book: 300                                                         Sim: 0.784
Book: The Big Fat Kill (Sin City #3)                              Sim: 0.784
Book: Family Values (Sin City #5)                                 Sim: 0.712
Book: Booze, Broads, and Bullets (Sin City #6)                    Sim: 0.683
Book: Hell and Back (Sin City #7)                                 Sim: 0.653
Book: From Hell                                                   Sim: 0.646
Book: V for Vendetta                                              Sim: 0.613
Book: Batman: The Dark Knight Returns (The Dark Knight Saga, #1)  Sim: 0.603


Books closest to Watchmen:
Book: V for Vendetta                                                  Sim: 0.812
Book: Preludes & Nocturnes (The Sandman #1)                           S

# Deep Neural Collaborative Filtering

In [None]:
# Take the layers and Model from the same Keras implementation (tf.keras);
# combining tf.keras layers with the standalone `keras` Model is only safe when
# both packages happen to resolve to the same code.
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate
from tensorflow.keras.models import Model

In [None]:
# Length of the embedding vector learned for every book and every user.
feature_dim = 16


# Book branch: one index per sample -> embedding of length feature_dim -> flattened.
book_input = Input(shape = [1], name="book_input")
book_embedding = Embedding(n_books,feature_dim, name="book_embedding")(book_input)
book_vec = Flatten(name="book_vec")(book_embedding)
# User branch, built the same way.
user_input = Input(shape = [1], name="user_input")
user_embedding = Embedding(n_users,feature_dim,name = "user_embedding")(user_input)
user_vec = Flatten(name="user_vec")(user_embedding)
# User and book vectors are concatenated and passed through six ReLU layers.
x = Concatenate(axis=1,name="Cat")([user_vec,book_vec])
x = Dense(16,activation="relu", name="hidden_layer_1")(x)
x = Dense(16,activation="relu", name="hidden_layer_2")(x)
x = Dense(32,activation="relu", name="hidden_layer_3")(x)
x = Dense(32,activation="relu", name="hidden_layer_4")(x)
x = Dense(16,activation="relu", name="hidden_layer_5")(x)
x = Dense(16,activation="relu", name="hidden_layer_6")(x)
# Single sigmoid unit: the output is a value between 0 and 1.
x = Dense(1, activation="sigmoid", name="sigmoid_out")(x)

dncf = Model(inputs = [user_input,book_input], outputs=x, name="Deep_Neural_Collaborative_Filtering")
# Binary cross-entropy expects targets of 0 and 1.
dncf.compile(optimizer="adam",loss="binary_crossentropy")

dncf.summary()

Model: "Deep_Neural_Collaborative_Filtering"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
book_input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
user_embedding (Embedding)      (None, 1, 16)        1534192     user_input[0][0]                 
__________________________________________________________________________________________________
book_embedding (Embedding)      (None, 1, 16)        166224      book_input[0][0]                 
________________________________________________________________

In [None]:
# total epochs = 8

n_positive = 2048
neg_ratio = 7

gen = generate_pairs(pairs_train, n_positive, neg_ratio, pairs_test, neg_test, classification=True)

# The network ends in a sigmoid and is trained with binary cross-entropy, so the
# validation negatives must be labelled 0, like the generator's. The shared
# validation_set labels them -1, which makes val_loss meaningless here, so a
# classification-labelled validation set is built from the same held-out pairs.
validation_set_clf = val_set(pairs_test, neg_test, classification=True)

hist = dncf.fit(gen, epochs=8, steps_per_epoch = len(pairs_train)//n_positive,
    validation_data=([validation_set_clf[:,0],validation_set_clf[:,1]],validation_set_clf[:,2]) )

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [None]:
# NOTE(review): hard-coded absolute path; presumably a mounted Google Drive in
# Colab. Confirm or adjust before running elsewhere.
dncf.save("/content/drive/MyDrive/models/dncf")

INFO:tensorflow:Assets written to: /content/drive/MyDrive/models/dncf/assets


# Model Evaluation

In [None]:
## Evaluate Top-10 Hit Rate for a given model 

def topKeval_test(model, k=10, n=100, times = 10000):
  """Estimate the top-k hit rate of `model` on the held-out test pairs.

  Each trial picks one pair from `pairs_test`, adds n-1 books the same user has
  no recorded interaction with, scores all n candidates with `model.predict`,
  and counts a hit when the held-out book is among the k highest scores.

  Parameters
  ----------
  model : object with a Keras-style predict([users, books]) method.
  k : size of the top list.
  n : number of candidates per trial (1 positive + n-1 negatives).
  times : number of trials.

  Returns
  -------
  Hit rate in percent (100 * hits / times).
  """
  ct = 0
  batch = np.zeros((n,2))
  # Progress is reported five times per run; max(..., 1) keeps the modulo
  # defined when times < 5.
  report_every = max(times//5, 1)
  for j in range(times):
    # select a user and book pair at random from test set; it goes in row 0
    user, book = random.choice(pairs_test)
    batch[0,:] = (user, book)
    i = 1
    while i < n:
      # keep user the same, select book at random
      rand_book = random.randrange(len(book_list))
      # accept the book only if (user, rand_book) is not a known interaction
      if not binarySearch(pairs, (user, rand_book)):
        batch[i,:] = (user, rand_book)
        i+=1

    # score every candidate; verbose=0 avoids one progress log per trial
    scores = np.asarray(model.predict([ batch[:,0], batch[:,1]], verbose=0)).reshape(-1)
    ranking = np.argsort(scores)

    # Row 0 is the held-out positive: a hit if it is among the k best scores.
    # Slicing from max(n-k, 0) gives an empty top list for k=0 (not the whole array).
    if 0 in ranking[max(n - k, 0):]:
      ct+=1

    if j%report_every==0 and j>0:
      print("Completed iteration: ",j)
  return 100*ct/times

In [None]:
# Top-10 hit rate (in percent) of the matrix factorization model, default settings.
topKeval_test(model = MatrixFactorization)

Completed iteration:  2000
Completed iteration:  4000
Completed iteration:  6000
Completed iteration:  8000


93.17

In [None]:
# Top-10 hit rate (in percent) of the deep collaborative filtering model, default settings.
topKeval_test(model = dncf)

Completed iteration:  2000
Completed iteration:  4000
Completed iteration:  6000
Completed iteration:  8000


85.34

# Top-10 Evaluation of Popularity Model

In [None]:
# Top-k hit rate of the popularity baseline: a candidate's score is simply the
# number of interaction rows its title has.
n=100
k=10
times = 10000
ct = 0
popularity_rank = df_interactions.book_title.value_counts()
# Collect every user's titles once, instead of scanning the whole frame in
# each of the `times` iterations.
titles_by_user = df_interactions.groupby("user_num")["book_title"].apply(list)
# Progress is reported five times per run; max(..., 1) keeps the modulo defined
# when times < 5.
report_every = max(times//5, 1)
for j in range(times):
    similarities = np.zeros(int(n))
    # choose user at random
    user = user_table[np.random.choice(user_list)]
    # Pick a book the user has read; its score goes in slot 0
    book = np.random.choice(titles_by_user[user])
    i=0
    similarities[i] = popularity_rank[book]
    i+=1
    while i < n:
        # keep user the same, select book at random
        rand_book = random.randrange(len(book_list))
        title = book_list[rand_book]
        # accept the book only if (user, rand_book) is not a known interaction
        if not binarySearch(pairs, (user, rand_book)):
            similarities[i] = popularity_rank[title]
            i+=1
    sorted_similarities = np.argsort(similarities,axis=0)

    # hit if the read book (index 0) is among the k most popular candidates;
    # uses k rather than a hard-coded 10 so changing k above takes effect
    if 0 in sorted_similarities[max(int(n) - k, 0):]:
        ct+=1
    if j%report_every==0 and j>0:
        print("Completed iteration: ",j)
print(100*ct/times)

Completed iteration:  2000
Completed iteration:  4000
Completed iteration:  6000
Completed iteration:  8000
0.6217
