In [1]:
# Download these files

# books_titles.json
# https://drive.google.com/file/d/1Iqv9TROqNgYbUDijSaDegv4EPpxO97t3/view?usp=sharing

# goodreads_interactions.csv
# https://drive.google.com/open?id=1zmylV7XW2dfQVCLeg1LbllfQtHD2KUon

# book_id_map.csv
# https://drive.google.com/uc?id=1CHTAaNwyzvbi1TR08MJrJ03BxA266Yxr

# liked_books.csv
# https://drive.google.com/file/d/1dhPhfD5hAOJjrdf8JhvbOPxDpF4qWYnb/view?usp=sharing

# Full code is at https://github.com/dataquestio/project-walkthroughs/tree/master/books

import pandas as pd

# Load my liked books; the first CSV column becomes the index.
# book_id is cast to str so it compares equal to the string ids read from the
# other CSV files further down.
my_books = pd.read_csv("liked_books.csv", index_col=0).astype({"book_id": str})

In [2]:
# Show the liked-books table to check that it loaded as expected.
my_books

Unnamed: 0,user_id,book_id,rating,title
0,-1,2517439,5,"The Forever War (The Forever War, #1)"
1,-1,113576,5,The Smartest Guys in the Room: The Amazing Ris...
2,-1,35100,5,Battle Cry of Freedom
3,-1,228221,5,The Mask of Command
5,-1,17662739,5,"2001: A Space Odyssey (Space Odyssey, #1)"
6,-1,356824,5,India After Gandhi: The History of the World's...
7,-1,12125412,5,The Lady or the Tiger?: and Other Logic Puzzles
8,-1,139069,5,Endurance: Shackleton's Incredible Voyage
10,-1,76680,5,"Foundation (Foundation, #1)"
11,-1,1898,5,Into Thin Air: A Personal Account of the Mount...


In [3]:
# Map each csv_id (the id looked up from goodreads_interactions.csv rows) to its
# book_id. Both sides stay strings, exactly as they appear in the file.
csv_book_mapping = {}

with open("book_id_map.csv", "r") as f:
    # Iterating the file object streams it one line at a time.
    for line in f:
        line = line.strip()
        if not line:
            # A blank line (e.g. a trailing newline) would otherwise fail the
            # two-value unpack below with ValueError.
            continue
        csv_id, book_id = line.split(",")
        csv_book_mapping[csv_id] = book_id

In [5]:
# Put the liked book ids in a set so the membership test in the file scan
# further down is a hash lookup.
book_set = set(my_books["book_id"])
book_set

{'113576',
 '12125412',
 '1215032',
 '128029',
 '139069',
 '1685995',
 '17662739',
 '18949861',
 '1898',
 '228221',
 '228665',
 '2517439',
 '25659450',
 '28187',
 '2913377',
 '35100',
 '356824',
 '437143',
 '5096865',
 '5439',
 '5578108',
 '6448772',
 '76680',
 '77203',
 '8161140',
 '82599',
 '883438'}

In [6]:
# For every user in the interactions file, count how many of my liked books
# they have an interaction with.
overlap_users = {}

with open("goodreads_interactions.csv", 'r') as f:
    # Stream the file line by line.
    for line in f:
        user_id, csv_id, _, rating, _ = line.split(",")
        # .get() returns None for a csv id missing from the mapping instead of
        # raising; None is never a member of book_set. Background:
        # https://stackoverflow.com/questions/11041405/why-dict-getkey-instead-of-dictkey
        book_id = csv_book_mapping.get(csv_id)

        if book_id not in book_set:
            continue
        overlap_users[user_id] = overlap_users.get(user_id, 0) + 1

In [7]:
# Number of users with at least one interaction on a liked book.
len(overlap_users)

316341

In [14]:
# Keep only users whose overlap count exceeds (number of liked books) / 3.5.
filtered_overlap_users = {
    user
    for user, shared in overlap_users.items()
    if shared > len(my_books) / 3.5
}

In [15]:
# Number of users that passed the overlap threshold.
len(filtered_overlap_users)

133

In [16]:
# Second pass over the interactions file: collect every row that belongs to one
# of the users kept by the overlap filter.
interactions_list = []

with open("goodreads_interactions.csv", 'r') as f:
    for line in f:
        user_id, csv_id, _, rating, _ = line.split(",")

        if user_id not in filtered_overlap_users:
            continue
        # Direct indexing: an unmapped csv id raises KeyError here.
        book_id = csv_book_mapping[csv_id]
        interactions_list.append([user_id, book_id, rating])

In [17]:
# Total number of interaction rows collected for the kept users.
len(interactions_list)

1887046

In [18]:
# Peek at one record: [user_id, book_id, rating], all still strings at this point.
interactions_list[0]

['2001', '11084145', '5']

In [19]:
# Turn the collected [user_id, book_id, rating] rows into a DataFrame with named columns.
interactions = pd.DataFrame(interactions_list, columns=["user_id", "book_id", "rating"])

In [20]:
# Put my own ratings (user_id -1) in front so I appear as one more user.
# NOTE(review): this cell rebuilds `interactions` from itself, so re-running it
# on its own adds the my_books rows a second time. Both frames keep their
# original row labels, so index values repeat in the result.
interactions = pd.concat([my_books[["user_id", "book_id", "rating"]], interactions])

In [21]:
# Inspect the combined table (my rows first, then the other users' rows).
interactions

Unnamed: 0,user_id,book_id,rating
0,-1,2517439,5
1,-1,113576,5
2,-1,35100,5
3,-1,228221,5
5,-1,17662739,5
...,...,...,...
1887041,439355,33574122,0
1887042,439355,32446437,0
1887043,439355,35906271,0
1887044,439355,760309,0


In [22]:
# The combined frame mixes my_books values with strings parsed from the CSV, so
# make the dtypes uniform: ids as strings, ratings as numbers.
interactions = interactions.astype({"book_id": str, "user_id": str})
interactions["rating"] = pd.to_numeric(interactions["rating"])

In [23]:
# Give every distinct user_id an integer code; these codes are used as the row
# coordinates of the ratings matrix built further down.
interactions["user_index"] = interactions["user_id"].astype("category").cat.codes

In [24]:
# Give every distinct book_id an integer code; these codes are used as the
# column coordinates of the ratings matrix built further down.
interactions["book_index"] = interactions["book_id"].astype("category").cat.codes

In [25]:
# Check the two new index columns on the first few rows.
interactions.head()

Unnamed: 0,user_id,book_id,rating,user_index,book_index
0,-1,2517439,5,0,287124
1,-1,113576,5,0,26386
2,-1,35100,5,0,400541
3,-1,228221,5,0,244752
5,-1,17662739,5,0,146326


In [28]:
# List the distinct user codes, in order of first appearance.
interactions["user_index"].unique()

array([  0,  37,  69,  93, 133,  14,  28,  47,  55,  62,  65,  89,  96,
       103, 104, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128,
       129, 130, 131, 132,   1,   2,   3,   4,   5,   6,   7,   8,   9,
        10,  11,  12,  13,  15,  16,  17,  18,  19,  20,  21,  22,  23,
        24,  25,  26,  27,  29,  30,  31,  32,  33,  34,  35,  36,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  48,  49,  50,  51,  52,
        53,  54,  56,  57,  58,  59,  60,  61,  63,  64,  66,  67,  68,
        70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,
        83,  84,  85,  86,  87,  88,  90,  91,  92,  94,  95,  97,  98,
        99, 100, 101, 102, 105, 106, 107, 108, 109, 110, 111, 112, 113,
       114, 115, 116, 117], dtype=int16)

In [29]:
from scipy.sparse import coo_matrix
# Build a sparse users-by-books matrix in COO form from three parallel columns:
#   A[i[k], j[k]] = data[k]
# with i = user_index (row), j = book_index (column) and data = rating.
ratings_mat_coo = coo_matrix((interactions["rating"], (interactions["user_index"], interactions["book_index"])))

In [30]:
# Shape is (rows indexed by user_index, columns indexed by book_index).
ratings_mat_coo.shape

(134, 560211)

In [31]:
# Convert to CSR; the similarity cell further down slices a single row out of this matrix.
ratings_mat = ratings_mat_coo.tocsr()

In [32]:
# Show my own rows (user_id "-1") to read off which user_index I was given.
interactions[interactions["user_id"] == "-1"]

Unnamed: 0,user_id,book_id,rating,user_index,book_index
0,-1,2517439,5,0,287124
1,-1,113576,5,0,26386
2,-1,35100,5,0,400541
3,-1,228221,5,0,244752
5,-1,17662739,5,0,146326
6,-1,356824,5,0,404515
7,-1,12125412,5,0,40358
8,-1,139069,5,0,84308
10,-1,76680,5,0,505713
11,-1,1898,5,0,189244


In [33]:
# Look up my row number in the ratings matrix from the data instead of
# hard-coding 0, so this still points at user "-1" if the user codes are ever
# assigned in a different order. int() keeps it a plain Python integer.
my_index = int(interactions.loc[interactions["user_id"] == "-1", "user_index"].iloc[0])

In [34]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between my ratings row and every row of the matrix
# (my own row included), flattened to one value per user.
my_ratings = ratings_mat[my_index, :]
similarity = cosine_similarity(my_ratings, ratings_mat).flatten()

In [35]:
# Entry 0 is user_index 0, which the table above shows is my own row, so this
# should be 1 up to floating-point rounding.
similarity[0]

0.9999999999999999

In [36]:
# One similarity value per row of the ratings matrix.
similarity.shape

(134,)

In [37]:
import numpy as np
# Positions of the (up to) 15 largest similarity values, in no particular order.
# k is capped at the number of users because argpartition raises when asked for
# more elements than the array holds, e.g. when the overlap filter leaves fewer
# than 15 users.
n_similar = min(15, similarity.shape[0])
indices = np.argpartition(similarity, -n_similar)[-n_similar:]

In [38]:
# Inspect the selected user codes; my own code is expected to be among them.
indices

array([ 87,  50, 118,  48,  93,  47,  43,  32,  80,  27,  26,  18,  96,
        22,   0], dtype=int64)

In [39]:
# Pull every interaction made by the selected users; .copy() detaches the
# result from `interactions`.
is_selected_user = interactions["user_index"].isin(indices)
similar_users = interactions[is_selected_user].copy()

In [40]:
# Drop my own rows so only other users' ratings feed the recommendations.
is_me = similar_users["user_id"] == "-1"
similar_users = similar_users[~is_me]

In [41]:
# Inspect the interactions of the similar users.
similar_users

Unnamed: 0,user_id,book_id,rating,user_index,book_index
13628,3653,2657,0,93,317560
13629,3653,4671,0,93,429298
13630,3653,5107,0,93,437438
13631,3653,18135,2,93,162129
13632,3653,9712,0,93,554232
...,...,...,...,...,...
1421930,339404,340243,0,87,392161
1421931,339404,11700593,0,87,32740
1421932,339404,19288043,0,87,193443
1421933,339404,16793,4,87,127863


In [42]:
# Per book: how many rows the similar users have for it, and their mean rating.
ratings_by_book = similar_users.groupby("book_id")["rating"]
book_recs = ratings_by_book.agg(['count', 'mean'])

In [43]:
# Inspect the per-book count and mean (book_id is the index here).
book_recs

Unnamed: 0_level_0,count,mean
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,13,4.0
10000063,1,3.0
10000191,2,0.0
10002296,2,0.0
100027,1,0.0
...,...,...
9998,1,0.0
9999,1,0.0
9999107,1,0.0
9999576,1,0.0


In [44]:
# Load the book metadata and cast book_id to str so it can be matched against
# the string ids used in book_recs.
books_titles = pd.read_json("books_titles.json").astype({"book_id": str})

In [45]:
# Attach the metadata to each candidate book. The inner join drops books that
# have no entry in books_titles. On the left side "book_id" is the index
# produced by the groupby; after the merge it is an ordinary column.
book_recs = book_recs.merge(books_titles, how="inner", on="book_id")

In [46]:
# Inspect the merged table.
book_recs

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title
0,1,13,4.0,Harry Potter and the Half-Blood Prince (Harry ...,1713866,https://www.goodreads.com/book/show/1.Harry_Po...,https://images.gr-assets.com/books/1361039191m...,harry potter and the halfblood prince harry po...
1,10000063,1,3.0,The Two Deaths of Daniel Hayes,1698,https://www.goodreads.com/book/show/10000063-t...,https://s.gr-assets.com/assets/nophoto/book/11...,the two deaths of daniel hayes
2,10000191,2,0.0,Yellow Crocus,17787,https://www.goodreads.com/book/show/10000191-y...,https://s.gr-assets.com/assets/nophoto/book/11...,yellow crocus
3,10002296,2,0.0,Wildflower Hill,9475,https://www.goodreads.com/book/show/10002296-w...,https://images.gr-assets.com/books/1314025082m...,wildflower hill
4,100027,1,0.0,Quantum Enigma: Physics Encounters Consciousness,860,https://www.goodreads.com/book/show/100027.Qua...,https://s.gr-assets.com/assets/nophoto/book/11...,quantum enigma physics encounters consciousness
...,...,...,...,...,...,...,...,...
21686,9998,1,0.0,The Woman in the Dunes,11841,https://www.goodreads.com/book/show/9998.The_W...,https://images.gr-assets.com/books/1361254930m...,the woman in the dunes
21687,9999,1,0.0,The Box Man,2791,https://www.goodreads.com/book/show/9999.The_B...,https://images.gr-assets.com/books/1320459929m...,the box man
21688,9999107,1,0.0,The American Heiress,24522,https://www.goodreads.com/book/show/9999107-th...,https://images.gr-assets.com/books/1307342832m...,the american heiress
21689,9999576,1,0.0,Long Gone,3953,https://www.goodreads.com/book/show/9999576-lo...,https://s.gr-assets.com/assets/nophoto/book/11...,long gone


In [47]:
# adjusted_count = count * (count / ratings).
# `ratings` is presumably the book's overall number of ratings (TODO confirm);
# dividing by it weights down books that are simply popular with everyone.
count_per_rating = book_recs["count"] / book_recs["ratings"]
book_recs["adjusted_count"] = book_recs["count"] * count_per_rating

In [48]:
# Combine the similar users' mean rating with the popularity-adjusted count.
book_recs["score"] = book_recs["mean"] * book_recs["adjusted_count"]

In [49]:
# Remove books that are already in my liked list (matched by id).
already_liked = book_recs["book_id"].isin(my_books["book_id"])
book_recs = book_recs[~already_liked]

In [50]:
# Normalised title, step 1: drop everything except ASCII letters, digits and
# spaces, then lowercase.
titles_without_punctuation = my_books["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True)
my_books["mod_title"] = titles_without_punctuation.str.lower()

In [51]:
# Normalised title, step 2: collapse runs of whitespace into a single space.
# The pattern is a raw string: "\s" in a normal string literal is an invalid
# escape sequence that Python warns about.
my_books["mod_title"] = my_books["mod_title"].str.replace(r"\s+", " ", regex=True)

In [52]:
# Remove other entries for books I already like, matched by normalised title.
title_already_liked = book_recs["mod_title"].isin(my_books["mod_title"])
book_recs = book_recs[~title_already_liked]

In [53]:
# Keep only books the similar users rated 4 or higher on average.
rated_highly = book_recs["mean"] >= 4
book_recs = book_recs[rated_highly]

In [54]:
# Keep only books that more than two of the similar users have a row for.
has_enough_readers = book_recs["count"] > 2
book_recs = book_recs[has_enough_readers]

In [55]:
# Rank the remaining books by the similar users' mean rating, best first.
# NOTE(review): the "score" column computed earlier is not used for ranking
# here — confirm whether sorting by "score" was intended.
top_recs = book_recs.sort_values("mean", ascending=False)

In [56]:
# (number of recommended books, number of columns)
top_recs.shape

(25, 10)

In [57]:
def make_clickable(val):
    """Render a link target as an HTML anchor labelled "Goodreads".

    Args:
        val: The link target. It is converted with ``str()`` before use, so
            non-string values are accepted.

    Returns:
        str: An ``<a>`` element that opens in a new tab, with ``val``
        HTML-escaped inside the ``href`` attribute.
    """
    import html  # local import keeps this cell self-contained

    # Escape quotes, ampersands and angle brackets so a value coming from the
    # data file cannot break out of the attribute and inject markup.
    href = html.escape(str(val), quote=True)
    return '<a target="_blank" href="{}">Goodreads</a>'.format(href)

def show_image(val):
    """Render an image URL as a 50px-wide thumbnail that links to the image.

    Args:
        val: The image URL. It is converted with ``str()`` before use, so
            non-string values are accepted.

    Returns:
        str: An ``<a>`` element wrapping an ``<img>``; ``val`` is HTML-escaped
        and used for both the ``href`` and the ``src`` attribute.
    """
    import html  # local import keeps this cell self-contained

    # Escape once and reuse: the same value goes into both attributes, and
    # escaping stops a value from the data file from injecting markup.
    src = html.escape(str(val), quote=True)
    return '<a href="{}"><img src="{}" width=50></img></a>'.format(src, src)

# Render the recommendations with the url column shown as a link and the
# cover_image column shown as a thumbnail.
top_recs.style.format({'url': make_clickable, 'cover_image': show_image})

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title,adjusted_count,score
6735,18619684,3,5.0,The Time Traveler's Wife,526550,Goodreads,,the time travelers wife,1.7e-05,8.5e-05
1296,114345,5,5.0,"The Little House Collection (Little House, #1-9)",125070,Goodreads,,the little house collection little house 19,0.0002,0.000999
14094,38743,3,5.0,All Things Bright and Beautiful,58061,Goodreads,,all things bright and beautiful,0.000155,0.000775
20233,862041,6,4.833333,"Harry Potter Boxset (Harry Potter, #1-7)",193057,Goodreads,,harry potter boxset harry potter 17,0.000186,0.000901
2506,12967,3,4.666667,Winter's Tale,20145,Goodreads,,winters tale,0.000447,0.002085
5115,16322,3,4.666667,"The A.B.C. Murders (Hercule Poirot, #13)",51072,Goodreads,,the abc murders hercule poirot 13,0.000176,0.000822
5134,16343,3,4.666667,"The Mysterious Affair at Styles (Hercule Poirot, #1)",142922,Goodreads,,the mysterious affair at styles hercule poirot 1,6.3e-05,0.000294
19028,77767,5,4.6,"Little House on the Prairie (Little House, #2)",200877,Goodreads,,little house on the prairie little house 2,0.000124,0.000572
14506,420282,4,4.5,The Polar Express,175318,Goodreads,,the polar express,9.1e-05,0.000411
4631,15931,3,4.333333,"The Notebook (The Notebook, #1)",1064723,Goodreads,,the notebook the notebook 1,8e-06,3.7e-05
