In [1]:
# This notebook shows how to make use of the Goodreads co-occurrence rating matrix.
# Please download the files 'user_book_matrix.npy', 'user_information.p' and 'dmc_to_goodreads.p'
# from the drive and place them in a folder named "goodreads".

# Disclaimer: Goodreads books were matched with DMC books via fuzzy string matching on title and author,
# so some mismatched cases are possible.

In [88]:
import pickle
import numpy as np
from scipy.sparse import csr_matrix, load_npz
import csv
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine

#### Load DMC Data

In [65]:
# Build a lookup from DMC itemID (int) to book title from the pipe-delimited items file.
dmc_task_books = dict()
# newline="" lets the csv module handle line endings itself, as its documentation recommends.
with open("DMC-2021-Task/items.csv", newline="") as items_file:
    csvreader = csv.reader(items_file, delimiter="|")
    next(csvreader)  # skip the header row
    for line in csvreader:
        if not line:
            # csv.reader yields an empty list for blank lines; unpacking it would raise.
            continue
        # Only the first two fields (itemID, title) are used; any others are ignored.
        itemID, title = line[:2]
        dmc_task_books[int(itemID)] = title

# Show the first (itemID, title) pair without materialising the whole item list.
next(iter(dmc_task_books.items()))

(21310, 'Princess Poppy: The Big Mix Up')

#### Load User Information
-> Dictionary mapping each userID to a pair of "avg rating" and "no. of rated books" for that user

In [4]:
# Deserialize the per-user statistics shipped with the Goodreads data.
# NOTE: unpickling can execute arbitrary code, so only load this file from a trusted source.
with open("goodreads/user_information.p", mode="rb") as user_info_file:
    user_information = pickle.load(user_info_file)

# Peek at the entry stored under key 0.
user_information[0]

(4.335329341317365, 167)

#### Load DMC Books that are on Goodreads

In [70]:
# Load the DMC -> Goodreads id mapping. The context manager makes sure the file
# handle is closed again (a bare pickle.load(open(...)) leaves it open).
# NOTE: unpickling can execute arbitrary code, so only load this file from a trusted source.
with open("goodreads/dmc_to_goodreads.p", "rb") as mapping_file:
    dmc_to_gr_string = pickle.load(mapping_file)

# Normalise both sides of the mapping to int so they can be compared with the
# integer itemIDs used elsewhere in the notebook.
dmc_to_gr = {int(k): int(v) for k, v in dmc_to_gr_string.items()}

# Show the first ten DMC ids that have a Goodreads match.
list(dmc_to_gr)[:10]

[71270, 39784, 64739, 4766, 79008, 7514, 50034, 71111, 50208, 63005]

#### Load Book-User-Matrix

In [55]:
# np.load with allow_pickle=True returns a NumPy array wrapping the stored object;
# .item() unwraps that single element to get the actual rating matrix.
# (Loading via scipy's load_npz was tried and left disabled in an earlier version of this cell.)
matrix_container = np.load("goodreads/user_book_matrix.npy", allow_pickle=True)
user_book_matrix = matrix_container.item()

# Sanity checks: overall dimensions and the type returned when slicing a single row.
print(user_book_matrix.shape)
first_row = user_book_matrix[0, :]
print(type(first_row))

(79058, 304959)
<class 'scipy.sparse.csr.csr_matrix'>


In [75]:
# Retrieve the rating vector of every DMC book that has at least one stored rating.
# The matrix is indexed as [book, user] here, so each value in `book_columns` is a
# 1 x n_users sparse row keyed by the book's integer id.
book_columns = dict()
coo_matrix = user_book_matrix.tocoo()

# NOTE(review): the loop below no longer needs this set of (book, user) pairs; it is
# kept only in case later cells rely on it. Drop it if unused, since it holds one
# tuple per stored rating.
ratings_non_zero = set(zip(coo_matrix.row, coo_matrix.col))

# Iterate once per distinct book instead of once per rating: the previous loop
# re-sliced (and re-assigned) the same row for every single rating of a book.
for book in np.unique(coo_matrix.row):
    book = int(book)
    if book in dmc_task_books:
        book_columns[book] = user_book_matrix[book, :]

In [101]:
# Two example books, looked up by their id in `book_columns`:
#   13834 -> The Last Dragon
#   54197 -> Fire & Ice

# Left in sparse form: printing lists the stored (row, col) value entries.
last_dragon = book_columns[13834]
# Converted to a dense array: printing shows the full (mostly zero) vector.
fire_and_ice = book_columns[54197].toarray()

for example_vector in (last_dragon, fire_and_ice):
    print(example_vector)


  (0, 202)	3
  (0, 740)	3
  (0, 6072)	4
  (0, 6934)	4
  (0, 8338)	4
  (0, 12898)	3
  (0, 14907)	4
  (0, 15571)	4
  (0, 15851)	3
  (0, 16887)	3
  (0, 17051)	3
  (0, 20808)	4
  (0, 21799)	5
  (0, 22188)	3
  (0, 22555)	5
  (0, 23150)	5
  (0, 25152)	4
  (0, 26634)	5
  (0, 27000)	4
  (0, 28159)	2
  (0, 29866)	3
  (0, 30203)	3
  (0, 30940)	4
  (0, 34845)	5
  (0, 36547)	4
  :	:
  (0, 272276)	1
  (0, 272500)	3
  (0, 272665)	5
  (0, 273315)	4
  (0, 273410)	3
  (0, 273695)	4
  (0, 273864)	3
  (0, 275605)	4
  (0, 276949)	3
  (0, 277558)	5
  (0, 277638)	3
  (0, 278631)	5
  (0, 279222)	4
  (0, 279389)	4
  (0, 280068)	4
  (0, 280604)	5
  (0, 284607)	3
  (0, 287991)	4
  (0, 288795)	3
  (0, 289350)	4
  (0, 289810)	4
  (0, 290390)	4
  (0, 290925)	3
  (0, 291061)	3
  (0, 292009)	4
[[0 0 0 ... 0 0 0]]
