In [96]:
# Download these files

# books_titles.json
# https://drive.google.com/file/d/1Iqv9TROqNgYbUDijSaDegv4EPpxO97t3/view?usp=sharing

# goodreads_interactions.csv
# https://drive.google.com/open?id=1zmylV7XW2dfQVCLeg1LbllfQtHD2KUon

# book_id_map.csv
# https://drive.google.com/uc?id=1CHTAaNwyzvbi1TR08MJrJ03BxA266Yxr

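# liked_books.csv
# https://drive.google.com/file/d/1dhPhfD5hAOJjrdf8JhvbOPxDpF4qWYnb/view?usp=sharing
# NOTE(review): the loading code below reads "liked_books_full.csv", not
# "liked_books.csv" -- rename the downloaded file or adjust the path so they match.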

# Full code is at https://github.com/dataquestio/project-walkthroughs/tree/master/books

import pandas as pd

# Load our own liked books; book_id is cast to str so it compares equal to the
# string ids parsed from the raw CSV files further down.
my_books = pd.read_csv("liked_books_full.csv", index_col=0).astype({"book_id": str})

In [97]:
my_books

Unnamed: 0,user_id,book_id,rating,title
0,-1,2517439,5,"The Forever War (The Forever War, #1)"
1,-1,113576,5,The Smartest Guys in the Room: The Amazing Ris...
2,-1,35100,5,Battle Cry of Freedom
3,-1,228221,5,The Mask of Command
4,-1,128717,5,Endurance
...,...,...,...,...
528,-1,28187,1,The Lightning Thief (Percy Jackson and the Ol...
529,-1,7094569,1,"Feed (Newsflesh, #1)"
530,-1,18925933,1,Mastering the VC Game: A Venture Capital Insid...
531,-1,25659450,1,Arkwright


In [98]:
# Lookup from the id in the first column of book_id_map.csv to the id in its
# second column. Both are kept as strings. No header handling is done, so a
# header row (if the file has one) is stored as an ordinary entry.
csv_book_mapping = {}

with open("book_id_map.csv", "r") as f:
    # Iterating the file object yields one line at a time, replacing the
    # manual readline()/break loop.
    for line in f:
        row = line.strip()
        if not row:
            # Skip blank lines (e.g. a stray newline at the end of the file)
            # instead of failing the two-value unpack below.
            continue
        csv_id, book_id = row.split(",")
        csv_book_mapping[csv_id] = book_id

In [99]:
# Our liked book ids as a set, used for membership tests while scanning the
# interactions file.
book_set = set(my_books["book_id"])

In [100]:
# For every user in the interactions file, count how many rows refer to a book
# that is also in `book_set` (i.e. a book we liked).
overlap_users = {}

with open("goodreads_interactions.csv", 'r') as f:
    # Stream the file line by line; it is far too large to load at once.
    for line in f:
        if not line.strip():
            # Skip blank lines rather than crashing on the five-value unpack.
            continue
        # Each row has five comma-separated fields; only the first two are
        # needed for this pass.
        user_id, csv_id, _, _, _ = line.split(",")

        # .get() yields None for ids missing from the mapping; None is not a
        # member of book_set (which holds strings), so those rows are ignored.
        book_id = csv_book_mapping.get(csv_id)

        if book_id in book_set:
            overlap_users[user_id] = overlap_users.get(user_id, 0) + 1

In [101]:
len(overlap_users)

581443

In [105]:
# Keep only users whose overlap with our list exceeds one fifth of its length.
min_overlap = my_books.shape[0]/5
filtered_overlap_users = {
    user for user, n_shared in overlap_users.items() if n_shared > min_overlap
}

In [106]:
len(filtered_overlap_users)

128

In [9]:
# Second pass over the interactions file: collect every [user_id, book_id,
# rating] row that belongs to one of the overlapping users selected earlier.
interactions_list = []

with open("goodreads_interactions.csv", 'r') as f:
    # Stream the file line by line instead of a manual readline()/break loop.
    for line in f:
        if not line.strip():
            # Skip blank lines rather than crashing on the five-value unpack.
            continue
        user_id, csv_id, _, rating, _ = line.split(",")

        if user_id in filtered_overlap_users:
            # Direct indexing is intentional: an id missing from the mapping
            # raises KeyError instead of silently producing a bad row.
            book_id = csv_book_mapping[csv_id]
            interactions_list.append([user_id, book_id, rating])

In [10]:
len(interactions_list)

4774073

In [11]:
interactions_list[0]

['394', '375802', '5']

In [12]:
# Assemble the collected [user_id, book_id, rating] rows into a DataFrame.
# All three columns still hold the raw strings parsed from the CSV at this point.
interactions = pd.DataFrame(interactions_list, columns=["user_id", "book_id", "rating"])

In [13]:
# Put our own ratings (my_books) in front of the similar users' rows so that
# everything lives in one frame.
# NOTE(review): `interactions` is rebuilt from itself here, so re-running this
# cell without first re-running the cell that creates `interactions` adds the
# my_books rows a second time.
interactions = pd.concat([my_books[["user_id", "book_id", "rating"]], interactions])

In [14]:
interactions

Unnamed: 0,user_id,book_id,rating
0,-1,2517439,5
1,-1,113576,5
2,-1,35100,5
3,-1,228221,5
4,-1,128717,5
...,...,...,...
4774068,441398,25790952,0
4774069,441398,28385685,0
4774070,441398,28525188,0
4774071,441398,584217,0


In [15]:
# Normalise dtypes after the concat: both id columns become strings and the
# ratings become numbers.
for id_col in ("book_id", "user_id"):
    interactions[id_col] = interactions[id_col].astype(str)
interactions["rating"] = pd.to_numeric(interactions["rating"])

In [16]:
# Give each distinct user_id an integer code; it is used as the row position
# when the sparse ratings matrix is built.
interactions["user_index"] = interactions["user_id"].astype("category").cat.codes

In [17]:
# Give each distinct book_id an integer code; it is used as the column position
# when the sparse ratings matrix is built.
interactions["book_index"] = interactions["book_id"].astype("category").cat.codes

In [18]:
from scipy.sparse import coo_matrix

# Sparse user x book matrix: each `rating` is placed at (user_index, book_index).
ratings_mat_coo = coo_matrix((interactions["rating"], (interactions["user_index"], interactions["book_index"])))

In [19]:
ratings_mat_coo.shape

(686, 772730)

In [20]:
# Convert to CSR format, which supports the row indexing used when the
# similarity against our own row is computed.
ratings_mat = ratings_mat_coo.tocsr()

In [21]:
interactions[interactions["user_id"] == "-1"]

Unnamed: 0,user_id,book_id,rating,user_index,book_index
0,-1,2517439,5,0,399666
1,-1,113576,5,0,36168
2,-1,35100,5,0,562316
3,-1,228221,5,0,340271
4,-1,128717,5,0,74530
...,...,...,...,...,...
528,-1,28187,1,0,466089
529,-1,7094569,1,0,680552
530,-1,18925933,1,0,259961
531,-1,25659450,1,0,416334


In [22]:
# Row of the ratings matrix that holds our own ratings (user_id "-1").
# Looked up from the data instead of hard-coding 0, so it stays correct if the
# category codes are ever assigned in a different order. Raises IndexError if
# no "-1" rows are present.
my_index = int(interactions.loc[interactions["user_id"] == "-1", "user_index"].iloc[0])

In [23]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between our ratings row and every row of the matrix (our
# own row included), flattened to a 1-D array with one score per user_index.
similarity = cosine_similarity(ratings_mat[my_index,:], ratings_mat).flatten()

In [24]:
similarity[0]

0.999999999999994

In [25]:
import numpy as np

# Positions of the 15 highest similarity scores. argpartition does not return
# them in sorted order. Our own row compares against itself as well, so it is
# expected among them; rows with user_id "-1" are filtered out afterwards.
indices = np.argpartition(similarity, -15)[-15:]

In [26]:
indices

array([141, 237, 469, 200, 257, 500, 674, 599, 559, 307, 668, 111, 369,
       503,   0])

In [27]:
# All interaction rows that belong to the most similar users; copied so later
# edits do not touch `interactions`.
is_neighbor = interactions["user_index"].isin(indices)
similar_users = interactions[is_neighbor].copy()

In [28]:
# Drop our own rows (user_id "-1"); only other users' ratings should drive the
# recommendations.
not_me = similar_users["user_id"] != "-1"
similar_users = similar_users[not_me]

In [29]:
similar_users

Unnamed: 0,user_id,book_id,rating,user_index,book_index
0,394,375802,5,503,576845
1,394,11,5,503,25840
2,394,234225,5,503,360859
3,394,29580,5,503,489982
4,394,77565,5,503,701689
...,...,...,...,...,...
4599501,420503,29936142,0,559,495529
4599502,420503,28811016,0,559,476963
4599503,420503,31185918,0,559,513869
4599504,420503,18952341,3,559,260907


In [89]:
# Per book: how many of the similar users have a row for it (count) and the
# average of their rating values (mean).
ratings_by_book = similar_users.groupby("book_id")["rating"]
book_recs = ratings_by_book.agg(["count", "mean"])

In [90]:
book_recs

Unnamed: 0_level_0,count,mean
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,6,3.666667
10016013,1,4.000000
100344,3,1.666667
100365,11,2.818182
1005,1,0.000000
...,...,...
9968073,1,0.000000
9969571,12,3.750000
99702,2,2.000000
9975779,1,4.000000


In [73]:
# Book metadata; book_id is cast to str so it can be joined against the string
# ids in book_recs.
books_titles = pd.read_json("books_titles.json").astype({"book_id": str})

In [91]:
# Attach the book metadata; the inner join drops recommendations that have no
# metadata row.
book_recs = pd.merge(book_recs, books_titles, how="inner", on="book_id")

In [92]:
book_recs

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title
0,1,6,3.666667,Harry Potter and the Half-Blood Prince (Harry ...,1713866,https://www.goodreads.com/book/show/1.Harry_Po...,https://images.gr-assets.com/books/1361039191m...,harry potter and the halfblood prince harry po...
1,10016013,1,4.000000,Harry Potter and the Methods of Rationality,8691,https://www.goodreads.com/book/show/10016013-h...,https://images.gr-assets.com/books/1293582551m...,harry potter and the methods of rationality
2,100344,3,1.666667,Protector (Known Space),7858,https://www.goodreads.com/book/show/100344.Pro...,https://images.gr-assets.com/books/1403179200m...,protector known space
3,100365,11,2.818182,The Mote in God's Eye,48736,https://www.goodreads.com/book/show/100365.The...,https://images.gr-assets.com/books/1399490037m...,the mote in gods eye
4,1005,1,0.000000,Think and Grow Rich,87634,https://www.goodreads.com/book/show/1005.Think...,https://s.gr-assets.com/assets/nophoto/book/11...,think and grow rich
...,...,...,...,...,...,...,...,...
4549,9968073,1,0.000000,"The Unholy Consult (Aspect-Emperor, #4)",434,https://www.goodreads.com/book/show/9968073-th...,https://images.gr-assets.com/books/1498753251m...,the unholy consult aspectemperor 4
4550,9969571,12,3.750000,Ready Player One,376328,https://www.goodreads.com/book/show/9969571-re...,https://images.gr-assets.com/books/1500930947m...,ready player one
4551,99702,2,2.000000,"Island in the Sea of Time (Nantucket, #1)",5889,https://www.goodreads.com/book/show/99702.Isla...,https://images.gr-assets.com/books/1340627722m...,island in the sea of time nantucket 1
4552,9975779,1,4.000000,"Portrait of a Spy (Gabriel Allon, #11)",17057,https://www.goodreads.com/book/show/9975779-po...,https://images.gr-assets.com/books/1468408664m...,portrait of a spy gabriel allon 11


In [76]:
# Scale the neighbour count by (count / ratings) so that books which are simply
# popular with everyone are down-weighted. `ratings` comes from books_titles --
# presumably the book's overall number of Goodreads ratings; confirm against
# that file.
book_recs["adjusted_count"] = book_recs["count"] * (book_recs["count"] / book_recs["ratings"])

In [77]:
# Combined ranking score: mean neighbour rating times the popularity-adjusted count.
# NOTE(review): no later cell visible here reads `score`; the final ordering
# sorts by `mean` instead -- confirm which one is intended.
book_recs["score"] = book_recs["mean"] * book_recs["adjusted_count"]

In [78]:
# Remove books we have already rated ourselves (matched by book_id).
already_rated = book_recs["book_id"].isin(my_books["book_id"])
book_recs = book_recs[~already_rated]

In [79]:
# Normalised title: strip everything except ASCII letters, digits and spaces,
# then lower-case the result.
stripped_titles = my_books["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True)
my_books["mod_title"] = stripped_titles.str.lower()

In [80]:
# Collapse each run of whitespace in the normalised title into a single space.
# The pattern is a raw string: "\s" in a plain literal is an invalid escape
# sequence, which Python warns about and intends to reject in a future version.
my_books["mod_title"] = my_books["mod_title"].str.replace(r"\s+", " ", regex=True)

In [81]:
# Also remove other editions of books we rated: same normalised title, but a
# different book_id.
same_title = book_recs["mod_title"].isin(my_books["mod_title"])
book_recs = book_recs[~same_title]

In [82]:
# Keep only books whose mean rating among the similar users is at least 4.
book_recs = book_recs.loc[book_recs["mean"].ge(4)]

In [93]:
# Require more than two similar users per book so that a single rating cannot
# produce a recommendation.
book_recs = book_recs.loc[book_recs["count"].gt(2)]

In [94]:
# Final ordering: highest mean rating among the similar users first.
top_recs = book_recs.sort_values("mean", ascending=False)

In [95]:
def make_clickable(val):
    """Return an HTML anchor labelled "Goodreads" that opens ``val`` in a new tab.

    Args:
        val: URL to link to; inserted into the ``href`` attribute as-is,
            without any escaping.

    Returns:
        str: The ``<a>`` element markup.
    """
    # The template has a single placeholder, so exactly one argument is passed
    # (a second copy of `val` used to be passed and was silently ignored).
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val)

def show_image(val):
    """Return HTML that shows ``val`` as a 50px-wide image linking to itself.

    Args:
        val: Image URL; used unescaped for both the link target and the
            image source.

    Returns:
        str: An ``<a>`` element wrapping an ``<img>`` element.
    """
    thumbnail = '<img src="{0}" width=50></img>'.format(val)
    return '<a href="{0}">{1}</a>'.format(val, thumbnail)

# Display the recommendations with `url` rendered as a link and `cover_image`
# rendered as a thumbnail.
top_recs.style.format({'url': make_clickable, 'cover_image': show_image})

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title
3943,7235533,13,4.769231,"The Way of Kings (The Stormlight Archive, #1)",151473,Goodreads,,the way of kings the stormlight archive 1
1976,234225,11,4.727273,Dune (Dune Chronicles #1),494553,Goodreads,,dune dune chronicles 1
1416,186074,14,4.642857,"The Name of the Wind (The Kingkiller Chronicle, #1)",416634,Goodreads,,the name of the wind the kingkiller chronicle 1
2578,29580,5,4.6,Second Foundation (Foundation #3),98612,Goodreads,,second foundation foundation 3
625,13569581,10,4.6,"Blood Song (Raven's Shadow, #1)",45244,Goodreads,,blood song ravens shadow 1
1910,23129080,7,4.571429,"The Dread Wyrm (The Traitor Son Cycle, #3)",2954,Goodreads,,the dread wyrm the traitor son cycle 3
371,1215032,14,4.571429,"The Wise Man's Fear (The Kingkiller Chronicle, #2)",253462,Goodreads,,the wise mans fear the kingkiller chronicle 2
802,15241,4,4.5,"The Two Towers (The Lord of the Rings, #2)",490005,Goodreads,,the two towers the lord of the rings 2
3822,68428,12,4.5,"The Final Empire (Mistborn, #1)",216149,Goodreads,,the final empire mistborn 1
630,13578175,4,4.5,The Emperor's Soul,35260,Goodreads,,the emperors soul
