In [1]:
# Download these files

# books_titles.json
# https://drive.google.com/file/d/1Iqv9TROqNgYbUDijSaDegv4EPpxO97t3/view?usp=sharing

# goodreads_interactions.csv
# https://drive.google.com/open?id=1zmylV7XW2dfQVCLeg1LbllfQtHD2KUon

# book_id_map.csv
# https://drive.google.com/uc?id=1CHTAaNwyzvbi1TR08MJrJ03BxA266Yxr

# liked_books.csv
# https://drive.google.com/file/d/1dhPhfD5hAOJjrdf8JhvbOPxDpF4qWYnb/view?usp=sharing

# Full code is at https://github.com/dataquestio/project-walkthroughs/tree/master/books

import pandas as pd

my_books = pd.read_csv("liked_books_full.csv", index_col=0)
my_books["book_id"] = my_books["book_id"].astype(str)

In [2]:
my_books #Books that are present in My Library and which I have Rated

Unnamed: 0,user_id,book_id,rating,title
0,-1,1841798,3,The Secret
1,-1,19060872,4,The Monk Who Sold his Ferrari
2,-1,23654228,5,the power of your subconcious Mind
3,-1,69571,4,"Rich Dad, Poor Dad"
4,-1,28257707,5,The Subtle Art of Not Giving a F*ck: A Counter...


In [3]:
csv_book_mapping = {}

with open("book_id_map.csv", "r") as f:
    while True:
        line = f.readline()
        if not line:
            break
        csv_id, book_id = line.strip().split(",")
        csv_book_mapping[csv_id] = book_id

In [4]:
book_set = set(my_books["book_id"])

In [5]:
overlap_users = {}

with open("goodreads_interactions.csv", 'r') as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id, csv_id, _, rating, _ = line.split(",")
        
        book_id = csv_book_mapping.get(csv_id)
        
        if book_id in book_set:
            if user_id not in overlap_users:
                overlap_users[user_id] = 1
            else:
                overlap_users[user_id] += 1

In [6]:
len(overlap_users)

25960

In [7]:
#Users who have read atleast 20% of books we have read 
filtered_overlap_users = set([k for k in overlap_users if overlap_users[k] > my_books.shape[0]/5])

In [8]:
len(filtered_overlap_users)

1434

In [9]:
interactions_list = []

with open("goodreads_interactions.csv", 'r') as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id, csv_id, _, rating, _ = line.split(",")
        
        if user_id in filtered_overlap_users:
            book_id = csv_book_mapping[csv_id]
            interactions_list.append([user_id, book_id, rating])

In [10]:
len(interactions_list)

1396083

In [11]:
interactions_list[0]

['119', '3388', '5']

In [12]:
interactions = pd.DataFrame(interactions_list, columns=["user_id", "book_id", "rating"])

In [13]:
interactions = pd.concat([my_books[["user_id", "book_id", "rating"]], interactions])

In [14]:
interactions

Unnamed: 0,user_id,book_id,rating
0,-1,1841798,3
1,-1,19060872,4
2,-1,23654228,5
3,-1,69571,4
4,-1,28257707,5
...,...,...,...
1396078,874099,25899336,0
1396079,874099,10884,0
1396080,874099,22543496,0
1396081,874099,12609433,5


In [15]:
interactions["book_id"] = interactions["book_id"].astype(str)
interactions["user_id"] = interactions["user_id"].astype(str)
interactions["rating"] = pd.to_numeric(interactions["rating"])

In [16]:
interactions["user_index"] = interactions["user_id"].astype("category").cat.codes

In [17]:
interactions["book_index"] = interactions["book_id"].astype("category").cat.codes

In [18]:
from scipy.sparse import coo_matrix

ratings_mat_coo = coo_matrix((interactions["rating"], (interactions["user_index"], interactions["book_index"])))

In [19]:
ratings_mat_coo.shape

(1435, 442834)

In [20]:
ratings_mat = ratings_mat_coo.tocsr()

In [21]:
interactions[interactions["user_id"] == "-1"]

Unnamed: 0,user_id,book_id,rating,user_index,book_index
0,-1,1841798,3,0,135618
1,-1,19060872,4,0,149746
2,-1,23654228,5,0,208930
3,-1,69571,4,0,388526
4,-1,28257707,5,0,265896


In [22]:
my_index = 0

In [23]:
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(ratings_mat[my_index,:], ratings_mat).flatten()

In [24]:
similarity[0]

1.0000000000000002

In [25]:
import numpy as np

indices = np.argpartition(similarity, -15)[-15:]

In [26]:
indices

array([ 918, 1081, 1120, 1103, 1214,  142,  682, 1332, 1409, 1221, 1248,
       1346,  783, 1350,    0], dtype=int64)

In [27]:
similar_users = interactions[interactions["user_index"].isin(indices)].copy()

In [28]:
similar_users = similar_users[similar_users["user_id"]!="-1"]

In [29]:
similar_users

Unnamed: 0,user_id,book_id,rating,user_index,book_index
640771,167471,5805,0,142,359800
640772,167471,23654228,4,142,208930
640773,167471,22318578,0,142,183606
640774,167471,24226153,0,142,214092
640775,167471,857333,0,142,420571
...,...,...,...,...,...
1396078,874099,25899336,0,1409,241907
1396079,874099,10884,0,1409,13143
1396080,874099,22543496,0,1409,187002
1396081,874099,12609433,5,1409,37872


In [30]:
book_recs = similar_users.groupby("book_id").rating.agg(['count', 'mean'])

In [31]:
book_recs

Unnamed: 0_level_0,count,mean
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
10009377,1,0.0
1005,1,0.0
10127019,5,1.8
10131648,1,0.0
10229138,1,0.0
...,...,...
976,1,3.0
9762,1,0.0
9969571,1,0.0
998,2,0.0


In [32]:
books_titles = pd.read_json("books_titles.json")
books_titles["book_id"] = books_titles["book_id"].astype(str)

In [33]:
book_recs = book_recs.merge(books_titles, how="inner", on="book_id")


In [34]:
#Added lines
def make_clickable(val):
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val, val)

def show_image(val):
    return '<a href="{}"><img src="{}" width=50></img></a>'.format(val, val)

General_Recommendationation = book_recs
book_recs.head(5).style.format({'url': make_clickable, 'cover_image': show_image})

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title
0,10009377,1,0.0,The 12 Week Year,834,Goodreads,,the 12 week year
1,1005,1,0.0,Think and Grow Rich,87634,Goodreads,,think and grow rich
2,10127019,5,1.8,The Lean Startup: How Today's Entrepreneurs Use Continuous Innovation to Create Radically Successful Businesses,95622,Goodreads,,the lean startup how todays entrepreneurs use continuous innovation to create radically successful businesses
3,10131648,1,0.0,The Greater Journey: Americans in Paris,14188,Goodreads,,the greater journey americans in paris
4,10229138,1,0.0,The FARC: The Longest Insurgency,46,Goodreads,,the farc the longest insurgency


In [85]:
#########################################################################################

In [86]:
#The above code is for one to many
#The remaining code is used for many to many recommendation
#

In [87]:
#########################################################################################

In [88]:
book_recs

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title
0,10009377,1,0.0,The 12 Week Year,834,https://www.goodreads.com/book/show/10009377-t...,https://s.gr-assets.com/assets/nophoto/book/11...,the 12 week year
1,1005,1,0.0,Think and Grow Rich,87634,https://www.goodreads.com/book/show/1005.Think...,https://s.gr-assets.com/assets/nophoto/book/11...,think and grow rich
2,10127019,5,1.8,The Lean Startup: How Today's Entrepreneurs Us...,95622,https://www.goodreads.com/book/show/10127019-t...,https://images.gr-assets.com/books/1333576876m...,the lean startup how todays entrepreneurs use ...
3,10131648,1,0.0,The Greater Journey: Americans in Paris,14188,https://www.goodreads.com/book/show/10131648-t...,https://s.gr-assets.com/assets/nophoto/book/11...,the greater journey americans in paris
4,10229138,1,0.0,The FARC: The Longest Insurgency,46,https://www.goodreads.com/book/show/10229138-t...,https://images.gr-assets.com/books/1315856656m...,the farc the longest insurgency
...,...,...,...,...,...,...,...,...
598,976,1,3.0,Deception Point,464544,https://www.goodreads.com/book/show/976.Decept...,https://s.gr-assets.com/assets/nophoto/book/11...,deception point
599,9762,1,0.0,Loving What Is: Four Questions That Can Change...,12880,https://www.goodreads.com/book/show/9762.Lovin...,https://s.gr-assets.com/assets/nophoto/book/11...,loving what is four questions that can change ...
600,9969571,1,0.0,Ready Player One,376328,https://www.goodreads.com/book/show/9969571-re...,https://images.gr-assets.com/books/1500930947m...,ready player one
601,998,2,0.0,The Millionaire Next Door: The Surprising Secr...,46032,https://www.goodreads.com/book/show/998.The_Mi...,https://s.gr-assets.com/assets/nophoto/book/11...,the millionaire next door the surprising secre...


In [35]:
book_recs["adjusted_count"] = book_recs["count"] * (book_recs["count"] / book_recs["ratings"])

In [36]:
book_recs["score"] = book_recs["mean"] * book_recs["adjusted_count"]

In [37]:
book_recs = book_recs[~book_recs["book_id"].isin(my_books["book_id"])]

In [38]:
my_books["mod_title"] = my_books["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True).str.lower()

In [39]:
my_books["mod_title"] = my_books["mod_title"].str.replace("\s+", " ", regex=True)

In [40]:
book_recs = book_recs[~book_recs["mod_title"].isin(my_books["mod_title"])]

In [41]:
book_recs = book_recs[book_recs["mean"] >=3]

In [42]:
book_recs = book_recs[book_recs["count"]>2]

In [43]:
top_recs = book_recs.sort_values("mean", ascending=False)

In [44]:
def make_clickable(val):
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val, val)

def show_image(val):
    return '<a href="{}"><img src="{}" width=50></img></a>'.format(val, val)

top_recs.style.format({'url': make_clickable, 'cover_image': show_image})

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title,adjusted_count,score
578,865,4,4.25,The Alchemist,1342863,Goodreads,,the alchemist,1.2e-05,5.1e-05


In [101]:
General_Recommendationation[General_Recommendationation["count"]>2]

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title,adjusted_count,score
2,10127019,5,1.8,The Lean Startup: How Today's Entrepreneurs Us...,95622,https://www.goodreads.com/book/show/10127019-t...,https://images.gr-assets.com/books/1333576876m...,the lean startup how todays entrepreneurs use ...,0.000261,0.000471
11,1052,4,2.5,The Richest Man in Babylon,52607,https://www.goodreads.com/book/show/1052.The_R...,https://images.gr-assets.com/books/1348336780m...,the richest man in babylon,0.000304,0.00076
12,10534,3,1.333333,The Art of War,174266,https://www.goodreads.com/book/show/10534.The_...,https://images.gr-assets.com/books/1453417993m...,the art of war,5.2e-05,6.9e-05
26,11084145,6,1.333333,Steve Jobs,594067,https://www.goodreads.com/book/show/11084145-s...,https://images.gr-assets.com/books/1327861368m...,steve jobs,6.1e-05,8.1e-05
36,11468377,7,0.0,"Thinking, Fast and Slow",144569,https://www.goodreads.com/book/show/11468377-t...,https://images.gr-assets.com/books/1317793965m...,thinking fast and slow,0.000339,0.0
54,12609433,6,1.666667,The Power of Habit: Why We Do What We Do in Li...,160604,https://www.goodreads.com/book/show/12609433-t...,https://images.gr-assets.com/books/1366758683m...,the power of habit why we do what we do in lif...,0.000224,0.000374
62,1303,4,1.25,The 48 Laws of Power,39588,https://www.goodreads.com/book/show/1303.The_4...,https://s.gr-assets.com/assets/nophoto/book/11...,the 48 laws of power,0.000404,0.000505
72,13530973,3,0.0,Antifragile: Things That Gain from Disorder,16843,https://www.goodreads.com/book/show/13530973-a...,https://images.gr-assets.com/books/1352422827m...,antifragile things that gain from disorder,0.000534,0.0
111,16256798,4,2.5,The One Thing: The Surprisingly Simple Truth B...,15237,https://www.goodreads.com/book/show/16256798-t...,https://images.gr-assets.com/books/1362177469m...,the one thing the surprisingly simple truth be...,0.00105,0.002625
146,18050143,3,1.333333,"Zero to One: Notes on Startups, or How to Buil...",52400,https://www.goodreads.com/book/show/18050143-z...,https://images.gr-assets.com/books/1414347376m...,zero to one notes on startups or how to build ...,0.000172,0.000229


In [102]:
General_Recommendation=General_Recommendationation

In [45]:

General_Recommendationation = General_Recommendationation[~General_Recommendationation["mod_title"].isin(my_books["mod_title"])]
general = General_Recommendationation.sort_values(by=['count','score'], ascending=False)
general.style.format({'url': make_clickable, 'cover_image': show_image})

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title,adjusted_count,score
428,36072,7,2.571429,The 7 Habits of Highly Effective People: Powerful Lessons in Personal Change,330726,Goodreads,,the 7 habits of highly effective people powerful lessons in personal change,0.000148,0.000381
36,11468377,7,0.0,"Thinking, Fast and Slow",144569,Goodreads,,thinking fast and slow,0.000339,0.0
54,12609433,6,1.666667,The Power of Habit: Why We Do What We Do in Life and Business,160604,Goodreads,,the power of habit why we do what we do in life and business,0.000224,0.000374
455,4865,6,2.333333,How to Win Friends and Influence People,296690,Goodreads,,how to win friends and influence people,0.000121,0.000283
458,4894,6,1.5,Who Moved My Cheese?,245187,Goodreads,,who moved my cheese,0.000147,0.00022
26,11084145,6,1.333333,Steve Jobs,594067,Goodreads,,steve jobs,6.1e-05,8.1e-05
2,10127019,5,1.8,The Lean Startup: How Today's Entrepreneurs Use Continuous Innovation to Create Radically Successful Businesses,95622,Goodreads,,the lean startup how todays entrepreneurs use continuous innovation to create radically successful businesses,0.000261,0.000471
441,4069,5,2.0,Man's Search for Meaning,179906,Goodreads,,mans search for meaning,0.000139,0.000278
576,8520610,5,1.0,Quiet: The Power of Introverts in a World That Can't Stop Talking,208008,Goodreads,,quiet the power of introverts in a world that cant stop talking,0.00012,0.00012
111,16256798,4,2.5,The One Thing: The Surprisingly Simple Truth Behind Extraordinary Results,15237,Goodreads,,the one thing the surprisingly simple truth behind extraordinary results,0.00105,0.002625
