In [1]:
import gzip

# Pull just the first record out of the compressed dump so its raw shape
# can be inspected before processing the whole file.
with gzip.open("goodreads_books.json.gz") as dump:
    line = dump.readline()

In [2]:
# Display the raw bytes of the first record read from the dump.
line

b'{"isbn": "0312853122", "text_reviews_count": "1", "series": [], "country_code": "US", "language_code": "", "popular_shelves": [{"count": "3", "name": "to-read"}, {"count": "1", "name": "p"}, {"count": "1", "name": "collection"}, {"count": "1", "name": "w-c-fields"}, {"count": "1", "name": "biography"}], "asin": "", "is_ebook": "false", "average_rating": "4.00", "kindle_asin": "", "similar_books": [], "description": "", "format": "Paperback", "link": "https://www.goodreads.com/book/show/5333265-w-c-fields", "authors": [{"author_id": "604031", "role": ""}], "publisher": "St. Martin\'s Press", "num_pages": "256", "publication_day": "1", "isbn13": "9780312853129", "publication_month": "9", "edition_information": "", "publication_year": "1984", "url": "https://www.goodreads.com/book/show/5333265-w-c-fields", "image_url": "https://images.gr-assets.com/books/1310220028m/5333265.jpg", "book_id": "5333265", "ratings_count": "3", "work_id": "5400751", "title": "W.C. Fields: A Life on Film", "t

In [3]:
import json

# Parse the sample record's JSON text into a Python object.
data = json.loads(line)

In [4]:
def parse_fields(line):
    """Pick the book attributes this notebook needs out of one JSON record.

    Parameters
    ----------
    line : str or bytes
        A single JSON-encoded book record.

    Returns
    -------
    dict
        Keys ``book_id``, ``title`` (taken from ``title_without_series``),
        ``ratings`` (taken from ``ratings_count``) and ``url``, with the
        values exactly as they appear in the record.

    Raises
    ------
    KeyError
        If the record lacks any of the source keys.
    """
    record = json.loads(line)
    # Output key -> key to read from the decoded record (order is preserved).
    source_key_for = {
        "book_id": "book_id",
        "title": "title_without_series",
        "ratings": "ratings_count",
        "url": "url",
    }
    return {out_key: record[src_key] for out_key, src_key in source_key_for.items()}

In [5]:
# Stream the compressed dump one record at a time and keep only the books
# whose rating count is greater than five.
books_titles = []
with gzip.open("goodreads_books.json.gz") as f:
    for line in f:
        fields = parse_fields(line)
        try:
            ratings = int(fields["ratings"])
        except ValueError:
            # The rating count is not an integer string: skip this record.
            continue
        if ratings > 5:
            books_titles.append(fields)

In [6]:
# Write the filtered records to disk as a single JSON array.
with open("books_titles.json", "w+") as out_file:
    json.dump(books_titles, out_file)

In [7]:
import pandas as pd

# One row per kept book; the columns are the keys of the record dicts.
titles = pd.DataFrame(books_titles)

In [8]:
# Convert the "ratings" column with pd.to_numeric so that sorting on it
# compares numbers rather than the raw values loaded from the records.
titles["ratings"] = pd.to_numeric(titles["ratings"])

In [9]:
# Build the search key: remove every character that is not an ASCII letter,
# a digit or a space from the title.
titles["mod_title"] = titles["title"].str.replace(
    pat="[^a-zA-Z0-9 ]",
    repl="",
    regex=True,
)

In [10]:
# Lower-case the search key so matching does not depend on letter case.
titles["mod_title"] = titles["mod_title"].str.lower()

In [11]:
# Preview the first rows, including the derived "mod_title" column.
titles.head()

Unnamed: 0,book_id,title,ratings,url,mod_title
0,1333909,Good Harbor,10,https://www.goodreads.com/book/show/1333909.Go...,good harbor
1,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,the unschooled wizard sun wolf and starhawk 12
2,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,best friends forever
3,287140,Runic Astrology: Starcraft and Timekeeping in ...,15,https://www.goodreads.com/book/show/287140.Run...,runic astrology starcraft and timekeeping in t...
4,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,the aeneid for boys and girls


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorizer with default settings; the same instance is later passed to
# search() to transform queries.
vectorizer = TfidfVectorizer()

# Fit on the normalised titles and keep the resulting matrix; search()
# compares query vectors against it, one row per row of `titles`.
tfidf = vectorizer.fit_transform(titles["mod_title"])

In [19]:
from sklearn.metrics.pairwise import linear_kernel
import numpy as np

def make_clickable(val):
    """Wrap ``val`` in an HTML anchor that opens in a new tab.

    ``val`` is used both as the link target and as the visible link text;
    it is inserted as-is, without any HTML escaping.
    """
    return f'<a target="_blank" href="{val}">{val}</a>'

def search(query, vectorizer):
    """Find books whose normalised title best matches a free-text query.

    Relies on two notebook-level names: ``tfidf`` (the matrix obtained by
    fitting the vectorizer on ``titles["mod_title"]``) and ``titles`` (the
    DataFrame whose row order matches the rows of ``tfidf``).

    Parameters
    ----------
    query : str
        Free-text title to look for.
    vectorizer
        The fitted vectorizer that produced ``tfidf``; only its
        ``transform`` method is used here.

    Returns
    -------
    pandas Styler
        Up to five rows of ``titles``: the (at most) ten most similar
        titles, ordered by ``ratings`` descending, with the ``url`` column
        rendered through ``make_clickable``.
    """
    # Imported locally because none of the visible notebook cells import
    # `re`; without this the function raises NameError on a fresh kernel.
    import re

    # Clean the query the same way "mod_title" was built (strip everything
    # except ASCII letters, digits and spaces, then lower-case) so that the
    # query and the indexed titles are tokenised alike.
    processed = re.sub("[^a-zA-Z0-9 ]", "", query).lower()

    # Vectorise the *cleaned* text; the original code computed `processed`
    # but then transformed the raw query.
    query_vec = vectorizer.transform([processed])
    similarity = linear_kernel(query_vec, tfidf).flatten()

    # argpartition needs kth to be within bounds, so cap the candidate
    # count at the corpus size (matters when running on a small sample).
    top_k = min(10, similarity.size)
    indices = np.argpartition(similarity, -top_k)[-top_k:]

    # Among the most similar titles, surface the most-rated ones first.
    results = titles.iloc[indices]
    results = results.sort_values("ratings", ascending=False)

    return results.head(5).style.format({'url': make_clickable})

In [28]:
# Example lookup: a loosely worded title query run against the fitted vectorizer.
search("Wheel of time The Eye of the world", vectorizer)

Unnamed: 0,book_id,title,ratings,url,mod_title
791806,8153988,"The Eye of the World (Wheel of Time, #1)",5740,https://www.goodreads.com/book/show/8153988-the-eye-of-the-world,the eye of the world wheel of time 1
537081,1111608,"The Eye of the World (Wheel of Time, #1)",1800,https://www.goodreads.com/book/show/1111608.The_Eye_of_the_World,the eye of the world wheel of time 1
1128956,11203854,"The Eye of the World (Wheel of Time, #1)",911,https://www.goodreads.com/book/show/11203854-the-eye-of-the-world,the eye of the world wheel of time 1
1333452,13513481,"The Eye of the World (Wheel of Time, #1)",555,https://www.goodreads.com/book/show/13513481-the-eye-of-the-world,the eye of the world wheel of time 1
562701,7062520,"The Eye of the World (Wheel of Time, #1)",408,https://www.goodreads.com/book/show/7062520-the-eye-of-the-world,the eye of the world wheel of time 1
