In [1]:
# Create search engine!
# Goodreads json file is too large to open entirely using a pandas Dataframe. Instead, we first read it line by line:
import gzip
with gzip.open("goodreads_books.json.gz", 'r') as f:
    line = f.readline()

In [2]:
import json

json.loads(line)

{'isbn': '0312853122',
 'text_reviews_count': '1',
 'series': [],
 'country_code': 'US',
 'language_code': '',
 'popular_shelves': [{'count': '3', 'name': 'to-read'},
  {'count': '1', 'name': 'p'},
  {'count': '1', 'name': 'collection'},
  {'count': '1', 'name': 'w-c-fields'},
  {'count': '1', 'name': 'biography'}],
 'asin': '',
 'is_ebook': 'false',
 'average_rating': '4.00',
 'kindle_asin': '',
 'similar_books': [],
 'description': '',
 'format': 'Paperback',
 'link': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'authors': [{'author_id': '604031', 'role': ''}],
 'publisher': "St. Martin's Press",
 'num_pages': '256',
 'publication_day': '1',
 'isbn13': '9780312853129',
 'publication_month': '9',
 'edition_information': '',
 'publication_year': '1984',
 'url': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'image_url': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg',
 'book_id': '5333265',
 'ratings_count': '3',
 'work_id': '5400751',
 'title': '

In [3]:
# Get most important fields from the goodreads json file. These fields include the book ID, title, number of ratings, url, and cover (url).
def parse_books(line):
    book = json.loads(line)
    return {
        "book_id": book["book_id"],
        "title": book["title_without_series"],
        "ratings": book["ratings_count"],
        "url": book["url"],
        "cover": book["image_url"]
    }

In [4]:
# Narrow down search by only including books for which there are at least 20 ratings. (Books with less than 20 ratings are obviously not very popular and thus are not necessary inclusions in our search engine).
# Append all of the books that match these requirements into a list called title.
titles = []
with gzip.open("goodreads_books.json.gz", 'r') as f:
    while True:
        line = f.readline()
        if not line:
            break
        fields = parse_books(line)
        
        try:
            ratings = int(fields["ratings"])
        except ValueError:
            continue
        if ratings > 20:
            titles.append(fields)

In [5]:
# NOW we can use a pandas DataFrame
import pandas as pd

book_titles = pd.DataFrame.from_dict(titles)

In [6]:
book_titles["ratings"] = pd.to_numeric(book_titles["ratings"])

book_titles["modified_title"] = book_titles["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True) #this is a regular expression that modifies titles so that any that include characters other than 
                                                                                                #those in the brackets are removed.

In [7]:
book_titles["modified_title"] = book_titles["modified_title"].str.lower()

In [8]:
book_titles["modified_title"] = book_titles["modified_title"].str.replace("\s+", " ", regex=True) # replace multiple spaces with one space

In [9]:
book_titles = book_titles[book_titles["modified_title"].str.len() > 0] # removing blank titles

In [10]:
book_titles.to_json("book_titles.json")

In [11]:
#book_titles

In [12]:
def clickable(val):
    return '<a target="_blank" href={}"> Goodreads </a>'.format(val)
    
# ^^ This function would allow you to click on the link to see it in Goodreads. Feel like we might not want our app to redirect to another book site


# To create the search engine, we're using TF-IDF (term frequency - inverse document frequency). It uses both of these to assign keyword scores and estimate the importance/relevance of each word 
# put into the search engine.

# term frequency measures the frequency of each unique word.
# inverse document frequency minimizes the importance of common words (like the, and, etc.)

#from sklearn.feature_extraction.text import TfidfVectorizer


#tfidf = vectorizer.fit_transform(book_titles["modified_title"])
#from sklearn.metrics.pairwise import cosine_similarity

import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(book_titles["modified_title"])

# show cover image in search
def cover(val):
    return '<img src="{}" width=60></image>'.format(val)

# search for a specific book (by title)
def search(query, vectorizer):
    processed = re.sub("[^a-zA-Z0-9 ]", "", query.lower())
    query_vec = vectorizer.transform([processed])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -10)[-10:] #find indices of 10 largest similarity values
    results = book_titles.iloc[indices]
    results = results.sort_values("ratings", ascending=False)
    return results.head(5)#.style.format({'cover':cover, 'url': clickable})

In [29]:
liked_books = []
liked_book = pd.DataFrame(columns= ['book_id', 'title'])
book_ids = []
liked_title = []
def user_search(liked_books, book_ids, liked_title):
    book = input("Please enter a book to search. Enter 'exit' when finished.")
    if book == 'exit':
        return False
    results = search(book, vectorizer)
    display(results.style.format({'cover':cover, 'url': clickable}))
   # results['book_id']
    validate = input("Which result is the book you want to add to your list? Enter a number between 1 and 5. Enter N if it is not on the list. \n")
    if validate == 'N':
        print("Try again.")
    else:
        validate = int(validate)
        book_ids.append(results.iloc[validate-1, 0])
        liked_books.append(results.iloc[validate-1, 0])
        liked_title.append(results.iloc[validate-1, 1])
    return True, liked_books, liked_title

def search_loop():
    userin = input("\n\n\nWould you like to search for a book? Enter Y/N.")
    if userin != 'N' and userin != 'Y' and userin != 'n' and userin != 'y':
        print("Please enter Y or N.")
        userin = input("\n\n\nWould you like to search for a book? Enter Y/N.")
    while True:
        if userin == 'N':
            print("Goodbye!")
            break
        if not user_search(liked_books, book_ids, liked_title):
            break
        userin2 = input("Enter id # of the book you want: ")
        liked_books.insert(0, userin2)
        print(liked_books)
        userin = input("Would you like to search for another book? Enter Y/N.")

        


In [30]:
search_loop()
liked_book['book_id'] = book_ids
liked_book['title'] = liked_title




Would you like to search for a book? Enter Y/N. Y
Please enter a book to search. Enter 'exit' when finished. the handmaids tale


Unnamed: 0,book_id,title,ratings,url,cover,modified_title
1142466,12961964,The Handmaid's Tale,39362,Goodreads,,the handmaids tale
482374,7439970,The Handmaid's Tale,7809,Goodreads,,the handmaids tale
1006756,33414148,The Handmaid's Tale,320,Goodreads,,the handmaids tale
289763,34211735,The Handmaid's Tale,244,Goodreads,,the handmaids tale
982168,6267821,The Handmaid's Tale,188,Goodreads,,the handmaids tale


Which result is the book you want to add to your list? Enter a number between 1 and 5. Enter N if it is not on the list. 
 1
Would you like to search for another book? Enter Y/N. y
Please enter a book to search. Enter 'exit' when finished. the secret history


Unnamed: 0,book_id,title,ratings,url,cover,modified_title
985983,18887980,The Secret History,1497,Goodreads,,the secret history
265650,653135,The Secret History,907,Goodreads,,the secret history
998304,1202712,The Secret History,501,Goodreads,,the secret history
990672,153928,The Secret History,266,Goodreads,,the secret history
474484,1133999,The Secret History,104,Goodreads,,the secret history


Which result is the book you want to add to your list? Enter a number between 1 and 5. Enter N if it is not on the list. 
 1
Would you like to search for another book? Enter Y/N. y
Please enter a book to search. Enter 'exit' when finished. the goldfinch


Unnamed: 0,book_id,title,ratings,url,cover,modified_title
750013,18902634,The Goldfinch,41942,Goodreads,,the goldfinch
1079113,17788865,The Goldfinch,5850,Goodreads,,the goldfinch
589050,18692995,The Goldfinch,2861,Goodreads,,the goldfinch
753928,18266071,The Goldfinch,868,Goodreads,,the goldfinch
989521,18405388,The Goldfinch,857,Goodreads,,the goldfinch


Which result is the book you want to add to your list? Enter a number between 1 and 5. Enter N if it is not on the list. 
 1
Would you like to search for another book? Enter Y/N. pride and prejudice
Please enter a book to search. Enter 'exit' when finished. pride and prejudice


Unnamed: 0,book_id,title,ratings,url,cover,modified_title
312902,15769088,Pride and Prejudice,17631,Goodreads,,pride and prejudice
1002235,18619998,Pride and Prejudice,11922,Goodreads,,pride and prejudice
360644,2262783,Pride and Prejudice,756,Goodreads,,pride and prejudice
4099,1555826,Pride and Prejudice,710,Goodreads,,pride and prejudice
512845,10239347,Pride and Prejudice,214,Goodreads,,pride and prejudice


Which result is the book you want to add to your list? Enter a number between 1 and 5. Enter N if it is not on the list. 
 1
Would you like to search for another book? Enter Y/N. y
Please enter a book to search. Enter 'exit' when finished. gone girl


Unnamed: 0,book_id,title,ratings,url,cover,modified_title
891266,19288043,Gone Girl,588451,Goodreads,,gone girl
621785,13261812,Gone Girl,70738,Goodreads,,gone girl
1014627,15704174,Gone Girl,7679,Goodreads,,gone girl
734784,15704161,Gone Girl,343,Goodreads,,gone girl
101807,15798097,Gone Girl,313,Goodreads,,gone girl


Which result is the book you want to add to your list? Enter a number between 1 and 5. Enter N if it is not on the list. 
 1
Would you like to search for another book? Enter Y/N. girl with a pearl earring
Please enter a book to search. Enter 'exit' when finished. girl with a pearl earring


Unnamed: 0,book_id,title,ratings,url,cover,modified_title
1039942,76847,Girl With a Pearl Earring,8245,Goodreads,,girl with a pearl earring
330903,2866,Girl With a Pearl Earring,1691,Goodreads,,girl with a pearl earring
330902,2864,Girl with a Pearl Earring,1079,Goodreads,,girl with a pearl earring
1015741,823347,Girl with a Pearl Earring,397,Goodreads,,girl with a pearl earring
1151649,4996,Girl With A Pearl Earring,361,Goodreads,,girl with a pearl earring


Which result is the book you want to add to your list? Enter a number between 1 and 5. Enter N if it is not on the list. 
 1
Would you like to search for another book? Enter Y/N. y
Please enter a book to search. Enter 'exit' when finished. remarkable creatures


Unnamed: 0,book_id,title,ratings,url,cover,modified_title
975566,6457081,Remarkable Creatures,30403,Goodreads,,remarkable creatures
830659,8201472,Remarkable Creatures,729,Goodreads,,remarkable creatures
203971,6568806,Remarkable Creatures,660,Goodreads,,remarkable creatures
388859,7992037,Remarkable Creatures,299,Goodreads,,remarkable creatures
818996,6693443,Remarkable Creatures,208,Goodreads,,remarkable creatures


Which result is the book you want to add to your list? Enter a number between 1 and 5. Enter N if it is not on the list. 
 1
Would you like to search for another book? Enter Y/N. y
Please enter a book to search. Enter 'exit' when finished. pachinko


Unnamed: 0,book_id,title,ratings,url,cover,modified_title
951151,29983711,Pachinko,8161,Goodreads,,pachinko
390601,958663,"Night Shadow (Night Trilogy, #2)",1578,Goodreads,,night shadow night trilogy 2
390600,958662,"Calypso Magic (Magic Trilogy, #2)",1475,Goodreads,,calypso magic magic trilogy 2
972551,32619967,Pachinko,1361,Goodreads,,pachinko
329867,684819,Dreaming Pachinko,283,Goodreads,,dreaming pachinko


Which result is the book you want to add to your list? Enter a number between 1 and 5. Enter N if it is not on the list. 
 1
Would you like to search for another book? Enter Y/N. n
Please enter a book to search. Enter 'exit' when finished. exit


In [37]:
display(liked_books)
def return_liked(liked_books):
    return liked_books

['12961964',
 '18887980',
 '18902634',
 '15769088',
 '19288043',
 '76847',
 '6457081',
 '29983711']

In [None]:
liked_book.to_csv("liked_books_json")