In [1]:
import pandas as pd
# Load my own rated books; index_col=0 uses the first CSV column as the row index.
my_books = pd.read_csv("liked_books.csv", index_col=0)

In [2]:
# Show the liked-books frame that was just loaded.
my_books

Unnamed: 0,user_id,book_id,rating,title
0,-1,464164,5,Harry Potter and the Prisoner of Azkaban (Harr...
1,-1,11250317,5,The Song of Achilles
2,-1,556323,5,Pride and Prejudice
3,-1,3,5,Harry Potter and the Sorcerer's Stone (Harry P...
4,-1,2181379,5,Harry Potter and the Deathly Hallows (Harry Po...
...,...,...,...,...
96,-1,16068755,1,Rebels: City of Indra (The Story of Lex and Li...
97,-1,22886110,1,I Hate Myselfie: A Collection of Essays
98,-1,30357799,1,Sidemen: The Book
99,-1,13707645,1,The Illicit Happiness of Other People


In [3]:
# Book ids are compared against string ids read from the CSV files below,
# so store them as strings here as well.
book_id_as_text = my_books["book_id"].astype(str)
my_books["book_id"] = book_id_as_text

In [4]:
# Build a lookup from the first column of book_id_map.csv to the second
# (csv id -> book id). Every line is expected to hold exactly two
# comma-separated fields; anything else raises ValueError on unpacking.
csv_book_mapping = {}

with open("book_id_map.csv", "r") as map_file:
    for row in map_file:
        csv_key, mapped_book_id = row.strip().split(",")
        csv_book_mapping[csv_key] = mapped_book_id

In [5]:
# Set of my book ids (strings) for O(1) membership tests in the file scan below.
book_set = {*my_books["book_id"]}

In [6]:
# First pass over the interactions file: for every user, count how many
# interaction rows refer to a book that is also in my list.
overlap_users = {}

with open("goodreads_interactions.csv", "r") as interactions_file:
    for record in interactions_file:
        # Each line must have exactly five comma-separated fields; only the
        # first two (user id, csv book id) matter for the overlap count.
        user_id, csv_id, _, _, _ = record.split(",")

        # Unknown csv ids map to None, which is never in book_set.
        if csv_book_mapping.get(csv_id) in book_set:
            overlap_users[user_id] = overlap_users.get(user_id, 0) + 1

In [7]:
# Number of distinct users with at least one interaction on a book from my list.
len(overlap_users)

483438

In [8]:
# Keep only users whose overlap count exceeds one fifth of the size of my list.
min_overlap = my_books.shape[0] / 5
filtered_overlap_users = {
    user for user, shared_count in overlap_users.items() if shared_count > min_overlap
}

In [9]:
# How many users passed the overlap threshold.
len(filtered_overlap_users)

39

In [10]:
# Second pass over the interactions file: collect every interaction row
# ([user_id, book_id, rating], all as strings) of the users kept above.
interactions_list = []

with open("goodreads_interactions.csv", "r") as interactions_file:
    for record in interactions_file:
        user_id, csv_id, _, rating, _ = record.split(",")

        if user_id not in filtered_overlap_users:
            continue

        # Direct indexing: a csv id missing from the mapping raises KeyError.
        interactions_list.append([user_id, csv_book_mapping[csv_id], rating])

In [11]:
# Number of interaction rows collected for the filtered users.
len(interactions_list)

1168606

In [12]:
# Peek at the first record: [user_id, book_id, rating], all still strings.
interactions_list[0]

['2794', '682745', '5']

In [13]:
# Turn the collected [user_id, book_id, rating] records into a DataFrame.
interaction_columns = ["user_id", "book_id", "rating"]
interactions = pd.DataFrame(interactions_list, columns=interaction_columns)

In [14]:
# Put my own ratings in front of the other users' interactions so that I
# become one more user in the same frame.
# NOTE(review): `interactions` is rebuilt from itself, so re-running this cell
# prepends my rows a second time.
my_ratings = my_books[["user_id", "book_id", "rating"]]
interactions = pd.concat([my_ratings, interactions])

In [15]:
# Show the combined frame (my rows first, then the other users' rows).
interactions

Unnamed: 0,user_id,book_id,rating
0,-1,464164,5
1,-1,11250317,5
2,-1,556323,5
3,-1,3,5
4,-1,2181379,5
...,...,...,...
1168601,436968,34273236,0
1168602,436968,34637340,0
1168603,436968,20930755,0
1168604,436968,32319716,0


In [16]:
# After the concat the id columns mix values from my_books with strings read
# from the CSV; make both id columns strings and the rating numeric.
for id_column in ("book_id", "user_id"):
    interactions[id_column] = interactions[id_column].astype(str)
interactions["rating"] = pd.to_numeric(interactions["rating"])

In [17]:
# Integer code per distinct user_id; used below as the row position in the
# ratings matrix.
user_categories = interactions["user_id"].astype("category")
interactions["user_index"] = user_categories.cat.codes

In [18]:
# Integer code per distinct book_id; used below as the column position in the
# ratings matrix.
book_categories = interactions["book_id"].astype("category")
interactions["book_index"] = book_categories.cat.codes

In [19]:
from scipy.sparse import coo_matrix

# Sparse user-by-book matrix: row = user_index, column = book_index,
# stored value = rating.
rating_values = interactions["rating"]
matrix_positions = (interactions["user_index"], interactions["book_index"])
ratings_mat_coo = coo_matrix((rating_values, matrix_positions))

In [20]:
# Matrix dimensions: (number of users, number of books).
ratings_mat_coo.shape

(40, 442698)

In [21]:
# Convert to CSR; the similarity cell below takes a single row slice
# (ratings_mat[my_index, :]) from this matrix.
ratings_mat = ratings_mat_coo.tocsr()

In [22]:
# Show the rows with user_id "-1" (the id used in my_books) together with
# their user_index / book_index codes.
interactions[interactions["user_id"] == "-1"]

Unnamed: 0,user_id,book_id,rating,user_index,book_index
0,-1,464164,5,0,349564
1,-1,11250317,5,0,17732
2,-1,556323,5,0,361652
3,-1,3,5,0,292206
4,-1,2181379,5,0,179910
...,...,...,...,...,...
96,-1,16068755,1,0,88990
97,-1,22886110,1,0,199483
98,-1,30357799,1,0,296476
99,-1,13707645,1,0,63141


In [23]:
# Row of the ratings matrix that holds my own ratings. Look it up from the
# frame instead of hard-coding 0, so it stays correct even if the category
# ordering of user ids changes. Raises IndexError if no "-1" rows exist.
my_index = int(
    interactions.loc[interactions["user_id"] == "-1", "user_index"].iloc[0]
)

In [24]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between my ratings row and every row of the matrix,
# flattened to one value per user_index.
my_ratings_row = ratings_mat[my_index, :]
similarity = cosine_similarity(my_ratings_row, ratings_mat).flatten()

In [25]:
# Similarity at position 0; with my_index = 0 this is my own row compared
# with itself, so the value should be approximately 1.
similarity[0]

0.9999999999999999

In [26]:
import numpy as np

# Positions (user_index values) of the most similar users, in no particular
# order. The count is capped at the number of available users because
# np.argpartition raises when asked for more elements than the array holds.
# My own row is part of `similarity`, so it is normally among the results and
# is filtered out later by user_id.
n_similar_users = min(15, len(similarity))
indices = np.argpartition(similarity, -n_similar_users)[-n_similar_users:]

In [27]:
# Show the selected user_index values.
indices

array([39, 26, 15, 14, 29, 13, 31, 12, 33, 34, 11, 10,  9, 38,  0],
      dtype=int64)

In [28]:
# All interaction rows that belong to one of the selected user_index values;
# copy so later edits do not touch `interactions`.
is_similar_user = interactions["user_index"].isin(indices)
similar_users = interactions[is_similar_user].copy()

In [29]:
# Drop my own rows (user_id "-1"); only other users' ratings feed the
# recommendations.
is_other_user = similar_users["user_id"] != "-1"
similar_users = similar_users[is_other_user]

In [30]:
# Show the interactions of the similar users (my own rows already removed).
similar_users

Unnamed: 0,user_id,book_id,rating,user_index,book_index
0,2794,682745,5,15,388848
1,2794,11466,5,15,21228
2,2794,50275,0,15,354766
3,2794,79090,0,15,408878
4,2794,4980,0,15,354146
...,...,...,...,...,...
1168601,436968,34273236,0,33,322911
1168602,436968,34637340,0,33,325372
1168603,436968,20930755,0,33,172099
1168604,436968,32319716,0,33,309935


In [31]:
# Per book: number of rating rows from similar users (count) and their
# average rating (mean).
# NOTE(review): rows with rating 0 are included in the mean; presumably 0
# means "not rated" - confirm whether those rows should be excluded.
ratings_by_book = similar_users.groupby("book_id")["rating"]
book_recs = ratings_by_book.agg(["count", "mean"])

In [32]:
# Show the per-book count and mean rating among similar users.
book_recs

Unnamed: 0_level_0,count,mean
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,12,4.5
10,1,5.0
10000,1,0.0
10000063,1,0.0
10000191,2,0.0
...,...,...
9999107,6,0.5
9999458,1,0.0
9999576,1,0.0
9999795,3,0.0


In [33]:
# Load the book metadata and store its ids as strings so they can be matched
# against the string book_id values of the recommendations.
books_titles = pd.read_json("books_titles.json")
title_ids_as_text = books_titles["book_id"].astype(str)
books_titles["book_id"] = title_ids_as_text

In [34]:
# Attach the metadata columns; the inner join drops books that have no entry
# in books_titles.
book_recs = book_recs.merge(
    books_titles,
    on="book_id",
    how="inner",
)

In [35]:
# Show the recommendations after the metadata columns were merged in.
book_recs

Unnamed: 0,book_id,count,mean,title,pages,ratings,url,cover_image,mod_title
0,1,12,4.5,Harry Potter and the Half-Blood Prince (Harry ...,652,1713866,https://www.goodreads.com/book/show/1.Harry_Po...,https://images.gr-assets.com/books/1361039191m...,harry potter and the halfblood prince harry po...
1,10,1,5.0,"Harry Potter Collection (Harry Potter, #1-6)",3342,25245,https://www.goodreads.com/book/show/10.Harry_P...,https://images.gr-assets.com/books/1328867351m...,harry potter collection harry potter 16
2,10000,1,0.0,The Face of Another,238,2079,https://www.goodreads.com/book/show/10000.The_...,https://images.gr-assets.com/books/1320415026m...,the face of another
3,10000063,1,0.0,The Two Deaths of Daniel Hayes,390,1698,https://www.goodreads.com/book/show/10000063-t...,https://s.gr-assets.com/assets/nophoto/book/11...,the two deaths of daniel hayes
4,10000191,2,0.0,Yellow Crocus,229,17787,https://www.goodreads.com/book/show/10000191-y...,https://s.gr-assets.com/assets/nophoto/book/11...,yellow crocus
...,...,...,...,...,...,...,...,...,...
91755,9998891,1,0.0,Cat Calls,16,623,https://www.goodreads.com/book/show/9998891-ca...,https://images.gr-assets.com/books/1327250539m...,cat calls
91756,9999,3,0.0,The Box Man,178,2791,https://www.goodreads.com/book/show/9999.The_B...,https://images.gr-assets.com/books/1320459929m...,the box man
91757,9999107,6,0.5,The American Heiress,468,24522,https://www.goodreads.com/book/show/9999107-th...,https://images.gr-assets.com/books/1307342832m...,the american heiress
91758,9999795,3,0.0,"Venom (Secrets of the Eternal Rose, #1)",435,4251,https://www.goodreads.com/book/show/9999795-venom,https://images.gr-assets.com/books/1331321794m...,venom secrets of the eternal rose 1


In [36]:
# Scale the similar-user count by its share of the book's `ratings` figure
# (presumably the book's overall number of ratings - confirm), so books that
# are merely popular everywhere are weighted down.
similar_count = book_recs["count"]
book_recs["adjusted_count"] = similar_count * (similar_count / book_recs["ratings"])

In [37]:
# Combined score: mean rating among similar users times the adjusted count.
mean_rating = book_recs["mean"]
book_recs["score"] = mean_rating * book_recs["adjusted_count"]

In [38]:
# Remove books whose id is already in my own list.
already_in_my_list = book_recs["book_id"].isin(my_books["book_id"])
book_recs = book_recs[~already_in_my_list]

In [39]:
# Normalised title for matching: strip every character that is not an ASCII
# letter, digit or space, then lower-case the result.
alnum_title = my_books["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True)
my_books["mod_title"] = alnum_title.str.lower()

In [40]:
# Collapse runs of whitespace in the normalised title into a single space.
# The pattern is a raw string: "\s" in a normal string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
my_books["mod_title"] = my_books["mod_title"].str.replace(r"\s+", " ", regex=True)

In [41]:
# Also remove books whose normalised title matches one of mine; this catches
# entries that share a title but have a different book_id.
title_in_my_list = book_recs["mod_title"].isin(my_books["mod_title"])
book_recs = book_recs[~title_in_my_list]

In [42]:
# Keep only books that similar users rated 4 or higher on average.
well_rated = book_recs["mean"] >= 4
book_recs = book_recs[well_rated]

In [43]:
# Keep only books that have more than two rating rows from similar users.
enough_ratings = book_recs["count"] > 2
book_recs = book_recs[enough_ratings]

In [44]:
# Rank the remaining candidates by mean rating, best first.
# NOTE(review): the `score` column computed earlier is not used for the
# ranking - confirm that sorting by "mean" is intended.
top_recs = book_recs.sort_values("mean", ascending=False)

In [45]:
def make_clickable(val):
    """Return an HTML link labelled "Goodreads" that opens `val` in a new tab.

    Args:
        val: The target URL. It is converted with ``str`` and HTML-escaped
            before being placed in the ``href`` attribute, so quotes or
            ampersands in the value cannot break the markup.

    Returns:
        The anchor element as a string.
    """
    import html

    href = html.escape(str(val), quote=True)
    return '<a target="_blank" href="{}">Goodreads</a>'.format(href)

def show_image(val):
    """Return HTML that shows the image at `val` as a 50px-wide thumbnail.

    The thumbnail is wrapped in a link to the same URL.

    Args:
        val: The image URL. It is converted with ``str`` and HTML-escaped
            before being placed in the ``href`` and ``src`` attributes, so
            quotes or ampersands in the value cannot break the markup.

    Returns:
        The anchor-plus-image markup as a string.
    """
    import html

    src = html.escape(str(val), quote=True)
    return '<a href="{0}"><img src="{0}" width=50></img></a>'.format(src)

# Render the final table with the URL column as a link and the cover column
# as a thumbnail; the Styler is the cell's last expression so it is displayed.
column_formatters = {'url': make_clickable, 'cover_image': show_image}
top_recs.style.format(column_formatters)

Unnamed: 0,book_id,count,mean,title,pages,ratings,url,cover_image,mod_title,adjusted_count,score
12317,132391,4,4.75,The Little Mermaid,48,14582,Goodreads,,the little mermaid,0.001097,0.005212
42361,21480930,3,4.666667,Gone Girl,555,416225,Goodreads,,gone girl,2.2e-05,0.000101
58021,30,3,4.666667,J.R.R. Tolkien 4-Book Boxed Set: The Hobbit and The Lord of the Rings,1728,92172,Goodreads,,jrr tolkien 4book boxed set the hobbit and the lord of the rings,9.8e-05,0.000456
80050,730596,3,4.666667,Snow White,48,7988,Goodreads,,snow white,0.001127,0.005258
21021,15881,12,4.5,"Harry Potter and the Chamber of Secrets (Harry Potter, #2)",341,1821802,Goodreads,,harry potter and the chamber of secrets harry potter 2,7.9e-05,0.000356
72857,6,12,4.5,"Harry Potter and the Goblet of Fire (Harry Potter, #4)",734,1792561,Goodreads,,harry potter and the goblet of fire harry potter 4,8e-05,0.000361
30780,18051352,3,4.333333,"To All the Boys I've Loved Before (To All the Boys I've Loved Before, #1)",355,6262,Goodreads,,to all the boys ive loved before to all the boys ive loved before 1,0.001437,0.006228
31577,18135,9,4.222222,Romeo and Juliet,283,1656919,Goodreads,,romeo and juliet,4.9e-05,0.000206
34433,18512,9,4.222222,"The Return of the King (The Lord of the Rings, #3)",490,473101,Goodreads,,the return of the king the lord of the rings 3,0.000171,0.000723
18196,15241,9,4.0,"The Two Towers (The Lord of the Rings, #2)",322,490005,Goodreads,,the two towers the lord of the rings 2,0.000165,0.000661
