In [5]:
!wc -l goodreads_books.json.gz

 7617498 goodreads_books.json.gz


In [6]:
!ls -lh | grep goodreads_books.json.gz

-rw-r--r--@ 1 clairec  staff   1.9G May  7 01:32 goodreads_books.json.gz


see how big files are. since files are so big, read in line by line to conserve memory

In [7]:
import gzip

with gzip.open("goodreads_books.json.gz") as f:
    line = f.readline()

line

b'{"isbn": "0312853122", "text_reviews_count": "1", "series": [], "country_code": "US", "language_code": "", "popular_shelves": [{"count": "3", "name": "to-read"}, {"count": "1", "name": "p"}, {"count": "1", "name": "collection"}, {"count": "1", "name": "w-c-fields"}, {"count": "1", "name": "biography"}], "asin": "", "is_ebook": "false", "average_rating": "4.00", "kindle_asin": "", "similar_books": [], "description": "", "format": "Paperback", "link": "https://www.goodreads.com/book/show/5333265-w-c-fields", "authors": [{"author_id": "604031", "role": ""}], "publisher": "St. Martin\'s Press", "num_pages": "256", "publication_day": "1", "isbn13": "9780312853129", "publication_month": "9", "edition_information": "", "publication_year": "1984", "url": "https://www.goodreads.com/book/show/5333265-w-c-fields", "image_url": "https://images.gr-assets.com/books/1310220028m/5333265.jpg", "book_id": "5333265", "ratings_count": "3", "work_id": "5400751", "title": "W.C. Fields: A Life on Film", "t

In [8]:
import json

data = json.loads(line)
data

{'isbn': '0312853122',
 'text_reviews_count': '1',
 'series': [],
 'country_code': 'US',
 'language_code': '',
 'popular_shelves': [{'count': '3', 'name': 'to-read'},
  {'count': '1', 'name': 'p'},
  {'count': '1', 'name': 'collection'},
  {'count': '1', 'name': 'w-c-fields'},
  {'count': '1', 'name': 'biography'}],
 'asin': '',
 'is_ebook': 'false',
 'average_rating': '4.00',
 'kindle_asin': '',
 'similar_books': [],
 'description': '',
 'format': 'Paperback',
 'link': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'authors': [{'author_id': '604031', 'role': ''}],
 'publisher': "St. Martin's Press",
 'num_pages': '256',
 'publication_day': '1',
 'isbn13': '9780312853129',
 'publication_month': '9',
 'edition_information': '',
 'publication_year': '1984',
 'url': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'image_url': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg',
 'book_id': '5333265',
 'ratings_count': '3',
 'work_id': '5400751',
 'title': '

In [9]:
def parse_fields(line):
    data = json.loads(line)
    return {
        "book_id": data["book_id"], 
        "title": data["title_without_series"], 
        "ratings": data["ratings_count"], 
        "url": data["url"], 
        "cover_image": data["image_url"]
    }

In [10]:
books_titles = []
with gzip.open("goodreads_books.json.gz") as f:
    while True:
        line = f.readline()
        if not line:
            break
        fields = parse_fields(line)
        try:
            ratings = int(fields["ratings"])
        except ValueError:
            continue
        if ratings > 5:
            books_titles.append(fields)

In [11]:
import pandas as pd

titles = pd.DataFrame.from_dict(books_titles)

In [12]:
titles["ratings"] = pd.to_numeric(titles["ratings"])
titles["mod_title"] = titles["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True)
titles["mod_title"] = titles["mod_title"].str.lower()
titles["mod_title"] = titles["mod_title"].str.replace("\s+", " ", regex=True)
titles = titles[titles["mod_title"].str.len() > 0]
titles.to_json("books_titles.json")
titles

  titles["mod_title"] = titles["mod_title"].str.replace("\s+", " ", regex=True)


Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,1333909,Good Harbor,10,https://www.goodreads.com/book/show/1333909.Go...,https://s.gr-assets.com/assets/nophoto/book/11...,good harbor
1,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,the unschooled wizard sun wolf and starhawk 12
2,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,best friends forever
3,287140,Runic Astrology: Starcraft and Timekeeping in ...,15,https://www.goodreads.com/book/show/287140.Run...,https://images.gr-assets.com/books/1413219371m...,runic astrology starcraft and timekeeping in t...
4,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,the aeneid for boys and girls
...,...,...,...,...,...,...
1782574,3084038,"This Sceptred Isle, Vol. 10: The Age of Victor...",12,https://www.goodreads.com/book/show/3084038-th...,https://images.gr-assets.com/books/1494763458m...,this sceptred isle vol 10 the age of victoria ...
1782575,26168430,Sherlock Holmes and the July Crisis,6,https://www.goodreads.com/book/show/26168430-s...,https://images.gr-assets.com/books/1440592011m...,sherlock holmes and the july crisis
1782576,2342551,The Children's Classic Poetry Collection,36,https://www.goodreads.com/book/show/2342551.Th...,https://s.gr-assets.com/assets/nophoto/book/11...,the childrens classic poetry collection
1782577,22017381,"101 Nights: Volume One (101 Nights, #1-3)",70,https://www.goodreads.com/book/show/22017381-1...,https://images.gr-assets.com/books/1398621236m...,101 nights volume one 101 nights 13


search engines need to be efficient so searches should be standardized, modify titles to only have standard characters, be lower cased, and not have extra spaces. save in json file for future use.

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

tfidf = vectorizer.fit_transform(titles["mod_title"])

scikit-learn/sklearn auto generates the term frequency (frequency of words in your search) and nverse document frequency matrix (make rarer words more important in searches)

In [45]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re

def make_clickable(val):
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val)

def show_image(val):
    return '<img src="{}" width=50></img>'.format(val)

def search(query, vectorizer):
    processed = re.sub("[^a-zA-Z0-9 ]", "", query.lower())
    query_vec = vectorizer.transform([processed])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -10)[-10:]
    results = titles.iloc[indices]
    results = results.sort_values("ratings", ascending=False)
    return results.head(5).style.format({'url': make_clickable, 'cover_image':show_image})

In [47]:
search("outliers", vectorizer)

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
1576805,26116473,"The Outliers (The Outliers, #1)",3905,Goodreads,,the outliers the outliers 1
671769,27406253,The Outliers,421,Goodreads,,the outliers
209722,6856680,Outliers,213,Goodreads,,outliers
1388651,7920926,Outliers,204,Goodreads,,outliers
644015,18679139,Outliers สัมฤทธิ์พิศวง,150,Goodreads,,outliers


show an example query of how we would search for a book title name using vectorizer. similarity returns most similar books and then we use indces and titles.iloc to translate those addresses to the actual book information from the dataset. however, goodreads has a lot of duplicate books so we take the book with the highest rating.
add a make_clickable function that passes basic HTML so we can check the URL and make sure we are finding the correct books.

In [58]:
liked_books = ["39660", "883438", "6534122", "6856680"]

In [50]:
!head book_id_map.csv

book_id_csv,book_id
0,34684622
1,34536488
2,34017076
3,71730
4,30422361
5,33503613
6,33517540
7,34467031
8,6383669


read in csv book mapping csv as we are going to start mapping ids between the different data sets so we can see what books are similar to the ones i like

In [61]:
csv_book_mapping = {}
with open("book_id_map.csv", "r") as f:
    while True:
        line = f.readline()
        if not line:
            break
        csv_id, book_id = line.strip().split(",")
        csv_book_mapping[csv_id] = book_id
len(csv_book_mapping)

2360651

In [57]:
!wc -l goodreads_interactions.csv

1737.66s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


OSError: [Errno 5] Input/output error

In [63]:
!ls -lh | grep goodreads_interactions.csv

2245.48s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


-rw-r--r--@ 1 clairec  staff   4.0G May  7 01:35 goodreads_interactions.csv


In [64]:
overlap_users = set()

with open("goodreads_interactions.csv", 'r') as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id, csv_id, _, rating, _ = line.split(",")

        if user_id in overlap_users:
            continue
        
        try:
            rating = int(rating)

        except ValueError:
            continue
        book_id = csv_book_mapping[csv_id]

        if book_id in liked_books and rating >= 4:
            overlap_users.add(user_id)

In [65]:
rec_lines = []
with open("goodreads_interactions.csv", 'r') as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id, csv_id, _, rating, _ = line.split(",")

        if user_id in overlap_users:
            book_id = csv_book_mapping[csv_id]
            rec_lines.append([user_id, book_id, rating])

In [None]:
import pandas as pd
recs = pd.DataFrame(rec_lines, columns=["user_id", "book_id", "rating"])
recs["book_id"] = recs["book_id"].astype(str)

In [None]:
top_recs = recs["book_id"].value_counts().head(10)
top_recs = top_recs.index.values
books_titles = pd.read_json("books_titles.json")
books_titles["book_id"] = books_titles["book_id"].astype(str)
books_titles.head()

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,1333909,Good Harbor,10,https://www.goodreads.com/book/show/1333909.Go...,https://s.gr-assets.com/assets/nophoto/book/11...,good harbor
1,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,the unschooled wizard sun wolf and starhawk 12
2,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,best friends forever
3,287140,Runic Astrology: Starcraft and Timekeeping in ...,15,https://www.goodreads.com/book/show/287140.Run...,https://images.gr-assets.com/books/1413219371m...,runic astrology starcraft and timekeeping in t...
4,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,the aeneid for boys and girls


In [69]:
books_titles[books_titles["book_id"].isin(top_recs)]

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
236353,883438,East of Eden,2336,https://www.goodreads.com/book/show/883438.Eas...,https://images.gr-assets.com/books/1503315060m...,east of eden
386663,2767052,"The Hunger Games (The Hunger Games, #1)",4899965,https://www.goodreads.com/book/show/2767052-th...,https://images.gr-assets.com/books/1447303603m...,the hunger games the hunger games 1
546297,5107,The Catcher in the Rye,2086945,https://www.goodreads.com/book/show/5107.The_C...,https://images.gr-assets.com/books/1398034300m...,the catcher in the rye
630937,4671,The Great Gatsby,2758812,https://www.goodreads.com/book/show/4671.The_G...,https://images.gr-assets.com/books/1490528560m...,the great gatsby
649821,1,Harry Potter and the Half-Blood Prince (Harry ...,1713866,https://www.goodreads.com/book/show/1.Harry_Po...,https://images.gr-assets.com/books/1361039191m...,harry potter and the halfblood prince harry po...
838525,5470,1984,2023937,https://www.goodreads.com/book/show/5470.1984,https://images.gr-assets.com/books/1348990566m...,1984
1031472,38447,The Handmaid's Tale,648783,https://www.goodreads.com/book/show/38447.The_...,https://images.gr-assets.com/books/1498057733m...,the handmaids tale
1077226,2657,To Kill a Mockingbird,3255518,https://www.goodreads.com/book/show/2657.To_Ki...,https://images.gr-assets.com/books/1361975680m...,to kill a mockingbird
1196415,3,Harry Potter and the Sorcerer's Stone (Harry P...,4765497,https://www.goodreads.com/book/show/3.Harry_Po...,https://images.gr-assets.com/books/1474154022m...,harry potter and the sorcerers stone harry pot...
1229158,136251,Harry Potter and the Deathly Hallows (Harry Po...,1784684,https://www.goodreads.com/book/show/136251.Har...,https://images.gr-assets.com/books/1474171184m...,harry potter and the deathly hallows harry pot...


we had this "books_titles[books_titles["book_id"].isin(top_recs)]" but it gaves us pretty much all popular books. we want to look for books that are popular to OUR similar users, not everybody

In [None]:
all_recs = recs["book_id"].value_counts()
all_recs

book_id
883438      239
4671        183
5470        169
2657        165
2767052     154
           ... 
18926229      1
10912443      1
7143298       1
7897846       1
6248248       1
Name: count, Length: 130662, dtype: int64

In [75]:
all_recs = all_recs.to_frame().reset_index()
all_recs.columns = ["book_id", "book_count"]
all_recs

Unnamed: 0,book_id,book_count
0,883438,239
1,4671,183
2,5470,169
3,2657,165
4,2767052,154
...,...,...
130657,18926229,1
130658,10912443,1
130659,7143298,1
130660,7897846,1


In [76]:
all_recs = all_recs.merge(books_titles, how="inner", on="book_id")
all_recs

Unnamed: 0,book_id,book_count,title,ratings,url,cover_image,mod_title
0,883438,239,East of Eden,2336,https://www.goodreads.com/book/show/883438.Eas...,https://images.gr-assets.com/books/1503315060m...,east of eden
1,4671,183,The Great Gatsby,2758812,https://www.goodreads.com/book/show/4671.The_G...,https://images.gr-assets.com/books/1490528560m...,the great gatsby
2,5470,169,1984,2023937,https://www.goodreads.com/book/show/5470.1984,https://images.gr-assets.com/books/1348990566m...,1984
3,2657,165,To Kill a Mockingbird,3255518,https://www.goodreads.com/book/show/2657.To_Ki...,https://images.gr-assets.com/books/1361975680m...,to kill a mockingbird
4,2767052,154,"The Hunger Games (The Hunger Games, #1)",4899965,https://www.goodreads.com/book/show/2767052-th...,https://images.gr-assets.com/books/1447303603m...,the hunger games the hunger games 1
...,...,...,...,...,...,...,...
127680,18926229,1,The Desert of Souls,11,https://www.goodreads.com/book/show/18926229-t...,https://images.gr-assets.com/books/1408940309m...,the desert of souls
127681,10912443,1,"Heaven's Needle (Ithelas, #2)",97,https://www.goodreads.com/book/show/10912443-h...,https://images.gr-assets.com/books/1379313640m...,heavens needle ithelas 2
127682,7143298,1,"The River Kings' Road (Ithelas, #1)",365,https://www.goodreads.com/book/show/7143298-th...,https://s.gr-assets.com/assets/nophoto/book/11...,the river kings road ithelas 1
127683,7897846,1,Shadow Spell (Seven Sorcerers #2),196,https://www.goodreads.com/book/show/7897846-sh...,https://s.gr-assets.com/assets/nophoto/book/11...,shadow spell seven sorcerers 2


In [78]:
all_recs["score"] = all_recs["book_count"] * (all_recs["book_count"] / all_recs["ratings"])
all_recs.sort_values("score", ascending=False).head(10)

Unnamed: 0,book_id,book_count,title,ratings,url,cover_image,mod_title,score
0,883438,239,East of Eden,2336,https://www.goodreads.com/book/show/883438.Eas...,https://images.gr-assets.com/books/1503315060m...,east of eden,24.452483
370,6856680,36,Outliers,213,https://www.goodreads.com/book/show/6856680-ou...,https://images.gr-assets.com/books/1359390017m...,outliers,6.084507
356,39660,37,The Shawshank Redemption,528,https://www.goodreads.com/book/show/39660.The_...,https://s.gr-assets.com/assets/nophoto/book/11...,the shawshank redemption,2.592803
9146,32802595,4,"Record of a Spaceborn Few (Wayfarers, #3)",12,https://www.goodreads.com/book/show/32802595-r...,https://images.gr-assets.com/books/1498469008m...,record of a spaceborn few wayfarers 3,1.333333
11790,25985242,3,"Untitled (The Diviners, #4)",8,https://www.goodreads.com/book/show/25985242-u...,https://s.gr-assets.com/assets/nophoto/book/11...,untitled the diviners 4,1.125
5331,26856502,6,"Vengeful (Villains, #2)",35,https://www.goodreads.com/book/show/26856502-v...,https://s.gr-assets.com/assets/nophoto/book/11...,vengeful villains 2,1.028571
14822,2082086,3,Desert in the City,12,https://www.goodreads.com/book/show/2082086.De...,https://images.gr-assets.com/books/1415544383m...,desert in the city,0.75
2214,6534122,11,Eragon,178,https://www.goodreads.com/book/show/6534122-er...,https://images.gr-assets.com/books/1290432812m...,eragon,0.679775
27877,33000498,2,Curarsi con i libri: Rimedi letterari per ogni...,6,https://www.goodreads.com/book/show/33000498-c...,https://images.gr-assets.com/books/1506629729m...,curarsi con i libri rimedi letterari per ogni ...,0.666667
20632,23571643,2,The Gift of Charms (The Land of Dragor #1),6,https://www.goodreads.com/book/show/23571643-t...,https://images.gr-assets.com/books/1416217796m...,the gift of charms the land of dragor 1,0.666667


In [84]:
popular_recs = all_recs[all_recs["book_count"] > 75].sort_values("score", ascending=False)

In [87]:
def make_clickable(val):
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val)

def show_image(val):
    return '<img src="{}" width=50></img>'.format(val)

popular_recs[~popular_recs["book_id"].isin(liked_books)].head(10).style.format({'url': make_clickable, 'cover_image': show_image})

Unnamed: 0,book_id,book_count,title,ratings,url,cover_image,mod_title,score
8,38447,137,The Handmaid's Tale,648783,Goodreads,,the handmaids tale,0.02893
47,15783514,87,The Ocean at the End of the Lane,303213,Goodreads,,the ocean at the end of the lane,0.024963
24,168668,117,"Catch-22 (Catch-22, #1)",574130,Goodreads,,catch22 catch22 1,0.023843
41,168642,96,In Cold Blood,388922,Goodreads,,in cold blood,0.023696
70,14891,79,A Tree Grows in Brooklyn,295793,Goodreads,,a tree grows in brooklyn,0.021099
77,3876,77,The Sun Also Rises,289634,Goodreads,,the sun also rises,0.020471
37,18143977,101,All the Light We Cannot See,498685,Goodreads,,all the light we cannot see,0.020456
79,6334,76,Never Let Me Go,301855,Goodreads,,never let me go,0.019135
40,2187,97,Middlesex,494274,Goodreads,,middlesex,0.019036
80,4395,76,The Grapes of Wrath,322757,Goodreads,,the grapes of wrath,0.017896
