In [1]:
# Create search engine!
# Goodreads json file is too large to open entirely using a pandas Dataframe. Instead, we first read it line by line:
import gzip
# Open the compressed dump and read only its first line, so a single record can be
# inspected without loading the whole file.
with gzip.open("goodreads_books.json.gz", 'r') as f:
    line = f.readline()

In [2]:
import json

# Decode the first line that was read from the file and display it, to see which
# fields a book record carries.
json.loads(line)

{'isbn': '0312853122',
 'text_reviews_count': '1',
 'series': [],
 'country_code': 'US',
 'language_code': '',
 'popular_shelves': [{'count': '3', 'name': 'to-read'},
  {'count': '1', 'name': 'p'},
  {'count': '1', 'name': 'collection'},
  {'count': '1', 'name': 'w-c-fields'},
  {'count': '1', 'name': 'biography'}],
 'asin': '',
 'is_ebook': 'false',
 'average_rating': '4.00',
 'kindle_asin': '',
 'similar_books': [],
 'description': '',
 'format': 'Paperback',
 'link': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'authors': [{'author_id': '604031', 'role': ''}],
 'publisher': "St. Martin's Press",
 'num_pages': '256',
 'publication_day': '1',
 'isbn13': '9780312853129',
 'publication_month': '9',
 'edition_information': '',
 'publication_year': '1984',
 'url': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'image_url': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg',
 'book_id': '5333265',
 'ratings_count': '3',
 'work_id': '5400751',
 'title': '

In [3]:
# Get most important fields from the goodreads json file. These fields include the book ID, title, number of ratings, url, and cover (url).
def parse_books(line):
    """Decode one JSON line and keep only the fields the search engine uses.

    Parameters
    ----------
    line : str or bytes
        A single JSON document describing one book.

    Returns
    -------
    dict
        Keys ``book_id``, ``title``, ``ratings``, ``url`` and ``cover``, copied
        unchanged from the record's ``book_id``, ``title_without_series``,
        ``ratings_count``, ``url`` and ``image_url`` entries.

    Raises
    ------
    KeyError
        If the record lacks one of the source fields.
    """
    record = json.loads(line)
    # (output key, key in the decoded record), in output order
    wanted = (
        ("book_id", "book_id"),
        ("title", "title_without_series"),
        ("ratings", "ratings_count"),
        ("url", "url"),
        ("cover", "image_url"),
    )
    return {out_key: record[src_key] for out_key, src_key in wanted}

In [4]:
# Narrow down the search by keeping only books with MORE than RATINGS_THRESHOLD ratings
# (the comparison below is strict, so a book with exactly 20 ratings is excluded).
# Books at or below that count are not popular enough to be worth including in the
# search engine. Every book that passes is appended to the list `titles`.
RATINGS_THRESHOLD = 20

titles = []
with gzip.open("goodreads_books.json.gz", 'r') as f:
    # Iterating the file object yields one line at a time, so the whole dump is never
    # held in memory at once.
    for line in f:
        fields = parse_books(line)

        try:
            ratings = int(fields["ratings"])
        except ValueError:
            # The rating count could not be parsed as an integer: skip this book.
            continue
        if ratings > RATINGS_THRESHOLD:
            titles.append(fields)

In [5]:
# NOW we can use a pandas DataFrame
import pandas as pd

# Build the DataFrame from `titles`, the list of per-book dicts collected above.
book_titles = pd.DataFrame.from_dict(titles)

In [6]:
# The rating counts are still the strings read from the JSON; convert them to numbers
# so they can later be sorted numerically.
book_titles["ratings"] = pd.to_numeric(book_titles["ratings"])

# Build a search-friendly copy of every title: the regular expression deletes each
# character that is not an ASCII letter, a digit, or a space.
book_titles["modified_title"] = book_titles["title"].str.replace(
    "[^a-zA-Z0-9 ]", "", regex=True
)

In [7]:
# Lowercase the cleaned titles; search() lowercases the query the same way.
book_titles["modified_title"] = book_titles["modified_title"].str.lower()

In [8]:
# Replace each run of whitespace with a single space. The pattern is written as a raw
# string so the backslash reaches the regex engine untouched; in a normal string
# literal, backslash-s is an invalid escape sequence and triggers a warning.
book_titles["modified_title"] = book_titles["modified_title"].str.replace(r"\s+", " ", regex=True)

In [9]:
# Keep only the rows whose cleaned title still has at least one character.
book_titles = book_titles[book_titles["modified_title"].str.len() > 0] # removing blank titles

In [10]:
# Write the cleaned title table to book_titles.json.
book_titles.to_json("book_titles.json")

In [11]:
#book_titles

In [12]:
#def clickable(val):
   # return '<a target="_blank" href={}"> Goodreads </a>'.format(val)
    
# ^^ This function would make the link clickable so that it opens the book's page on Goodreads. We might not want our app to redirect to another book site, so it is commented out, but it is left here in case we want it later.


# To create the search engine, we're using TF-IDF (term frequency - inverse document frequency). It uses both of these to assign keyword scores and estimate the importance/relevance of each word 
# put into the search engine.

# term frequency measures the frequency of each unique word.
# inverse document frequency minimizes the importance of common words (like the, and, etc.)

#from sklearn.feature_extraction.text import TfidfVectorizer


#tfidf = vectorizer.fit_transform(book_titles["modified_title"])
#from sklearn.metrics.pairwise import cosine_similarity

import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Fit the TF-IDF vectorizer on the cleaned titles and keep the transformed titles in
# `tfidf`; search() compares each query vector against it.
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(book_titles["modified_title"])

# show cover image in search
def cover(val):
    """Return an HTML ``<img>`` tag (60 px wide) that displays the cover at ``val``.

    Parameters
    ----------
    val : object
        The cover image URL; it is converted with ``str()`` before use.

    Returns
    -------
    str
        The ``<img>`` markup. ``<img>`` is a void element, so no closing tag is
        emitted.
    """
    from html import escape

    # The URL comes from the dataset, so escape it before placing it inside the
    # quoted attribute; a stray quote could otherwise break out of src="...".
    return '<img src="{}" width=60>'.format(escape(str(val), quote=True))

# search for a specific book (by title)
def search(query, vectorizer):
    """Show the most-rated books among the titles closest to ``query``.

    The query is lowercased and stripped of every character other than ASCII
    letters, digits and spaces, vectorized with ``vectorizer``, and compared by
    cosine similarity with the module-level ``tfidf`` matrix. The (up to) ten most
    similar rows of ``book_titles`` are sorted by ``ratings`` in descending order
    and the first five are returned.

    Parameters
    ----------
    query : str
        The title text typed by the user.
    vectorizer
        The fitted vectorizer that produced ``tfidf``.

    Returns
    -------
    pandas Styler
        The top rows, with the ``cover`` column rendered through ``cover()``.
    """
    processed = re.sub("[^a-zA-Z0-9 ]", "", query.lower())
    query_vec = vectorizer.transform([processed])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # np.argpartition raises when asked for more elements than the array holds, so
    # never request more candidates than there are titles.
    n_candidates = min(10, similarity.size)
    if n_candidates:
        # Indices of the n_candidates largest similarity values (in no particular order).
        indices = np.argpartition(similarity, -n_candidates)[-n_candidates:]
    else:
        indices = np.arange(0)

    results = book_titles.iloc[indices]
    results = results.sort_values("ratings", ascending=False)
    return results.head(5).style.format({'cover': cover})

In [13]:
#liked_books = []

# All this search engine does right now is go through the dataset and show the closest matches. It doesn't save it to a 'liked books' file or anything.
def user_search():
    """Ask for one title and display its closest matches.

    Returns
    -------
    bool
        ``False`` when the user typed ``exit`` (nothing is searched); ``True`` once
        the results for the entered title have been displayed.
    """
    requested_title = input("Please enter a book to search. Enter 'exit' when finished.")
    keep_going = requested_title != 'exit'
    if keep_going:
        display(search(requested_title, vectorizer))
    return keep_going

def search_loop():
    """Run the interactive search prompt until the user declines or exits.

    Before each search the user is asked a Y/N question (case-insensitive,
    surrounding whitespace ignored):

    * ``N`` prints "Goodbye!" and ends the loop.
    * ``Y`` runs ``user_search()``; the loop ends if that returns ``False``.
    * anything else prints a reminder and asks the same question again, instead
      of being treated as "yes".
    """
    prompt = "Would you like to search for a book? Enter Y/N."
    while True:
        answer = input(prompt).strip().upper()
        if answer == 'N':
            print("Goodbye!")
            break
        if answer != 'Y':
            # Invalid answer: ask again rather than starting a search.
            print("Please enter Y or N.")
            continue
        if not user_search():
            break
        prompt = "Would you like to search for another book? Enter Y/N."
    

In [14]:
# Start the interactive prompt; it ends when the user answers N or enters 'exit'.
search_loop()

Would you like to search for a book? Enter Y/N. Y
Please enter a book to search. Enter 'exit' when finished. Crooked Kingdom


Unnamed: 0,book_id,title,ratings,url,cover,modified_title
285192,27840861,"Crooked Kingdom (Six of Crows, #2)",6254,https://www.goodreads.com/book/show/27840861-crooked-kingdom,,crooked kingdom six of crows 2
952851,28937572,"Crooked Kingdom (Six of Crows, #2)",2749,https://www.goodreads.com/book/show/28937572-crooked-kingdom,,crooked kingdom six of crows 2
909546,451301,Crooked,1239,https://www.goodreads.com/book/show/451301.Crooked,,crooked
296035,23507476,Crooked,942,https://www.goodreads.com/book/show/23507476-crooked,,crooked
1015810,683664,Crooked,231,https://www.goodreads.com/book/show/683664.Crooked,,crooked


Would you like to search for another book? Enter Y/N. Y
Please enter a book to search. Enter 'exit' when finished. Harry Potter and the Goblet of Fire


Unnamed: 0,book_id,title,ratings,url,cover,modified_title
277706,28754622,Harry Potter and the Goblet of Fire,3314,https://www.goodreads.com/book/show/28754622-harry-potter-and-the-goblet-of-fire,,harry potter and the goblet of fire
1067811,818068,"Harry Potter and the Goblet of Fire (Harry Potter, #4)",1389,https://www.goodreads.com/book/show/818068.Harry_Potter_and_the_Goblet_of_Fire,,harry potter and the goblet of fire harry potter 4
899201,7292005,Harry Potter and the Goblet of Fire,202,https://www.goodreads.com/book/show/7292005-harry-potter-and-the-goblet-of-fire,,harry potter and the goblet of fire
884637,17861465,Harry Potter and the Goblet of Fire,174,https://www.goodreads.com/book/show/17861465-harry-potter-and-the-goblet-of-fire,,harry potter and the goblet of fire
749008,1071182,Harry Potter and the Goblet of Fire,168,https://www.goodreads.com/book/show/1071182.Harry_Potter_and_the_Goblet_of_Fire,,harry potter and the goblet of fire


Would you like to search for another book? Enter Y/N. N


Goodbye!
