In [2]:
import os
import numpy as np
import pandas as pd
import string
import requests
import json
import time

In [3]:
DATA_DIR = "book-covers"

CATEGORIES = next(os.walk(DATA_DIR), (None, None, []))[1]
print(CATEGORIES)

['Art-Photography', 'Biography', 'Business-Finance-Law', 'Childrens-Books', 'Computing', 'Crafts-Hobbies', 'Crime-Thriller', 'Dictionaries-Languages', 'Entertainment', 'Food-Drink', 'Graphic-Novels-Anime-Manga', 'Health', 'History-Archaeology', 'Home-Garden', 'Humour', 'Medical', 'Mind-Body-Spirit', 'Natural-History', 'Personal-Development', 'Poetry-Drama', 'Reference', 'Religion', 'Romance', 'Science-Fiction-Fantasy-Horror', 'Science-Geography', 'Society-Social-Sciences', 'Sport', 'Stationery', 'Teaching-Resources-Education', 'Technology-Engineering', 'Teen-Young-Adult', 'Transport', 'Travel-Holiday-Guides']


In [4]:
def google_book_search(search_terms):
    url = 'https://www.googleapis.com/books/v1/volumes?q='
    response = requests.get(url+search_terms, verify = False)
    # load data using Python JSON module
    r = response.content
    data = json.loads(r)
    # Normalizing data
    try:
        df = pd.json_normalize(data, record_path =['items'])  
    except:
        print(data)
        temp_data = {'volumeInfo.description': [np.NaN]}
        df = pd.DataFrame(temp_data)
    #     df['volumeInfo.description'] = df['volumeInfo.description'].append(np.NaN)
        return df
    first_row = df.iloc[0]
    first_row_df = df.iloc[:1]
    first_row_df['search_term']=search_terms
    response.close()
    # Frequent requests will trigger the time limit error of Google Cloud Resources
    time.sleep(1)
    return first_row_df;

In [5]:
def add_description(INFO_DIR, CATEGORIES):
    for category in CATEGORIES:
        book_detail_path = os.path.join(INFO_DIR, category+".csv")
        print("Now running on:", category)
        # book_detail_path = "book-covers-details\\Business-Finance-Law.csv"
        book_info = pd.read_csv(book_detail_path)
        book_info["description"] = ""
        google_data = pd.DataFrame()
        for name in book_info.name:
            search_term = "intitle:"+ name
            search_result = google_book_search(search_term)
            if 'volumeInfo.description' in search_result.columns:
                search_result = search_result[['volumeInfo.description']]
            else:
                # search_result = search_result[['searchInfo.textSnippet']]
                # search_result.rename(columns={'searchInfo.textSnippet': 'volumeInfo.description'})
                temp_data = {'volumeInfo.description': [np.NaN]}
                search_result = pd.DataFrame(temp_data)
            google_data = google_data.append(search_result)
        google_data = (google_data).reset_index()
        book_info["description"] = google_data["volumeInfo.description"]
        book_info.to_csv(book_detail_path)    

Only run add description method once, it took almost 2 days to completely run it since sometimes google book api will restrict daily and hourly request limits. The returned error blocked the searching results. Add another function to recheck the missed book descriptions.

In [None]:
# INFO_DIR = "book-covers-details"
# add_description(INFO_DIR, CATEGORIES)

In [7]:
INFO_DIR = "book-covers-details"
book_detail_path = os.path.join(INFO_DIR, CATEGORIES[0]+".csv")
book_info = pd.read_csv(book_detail_path)
selected_rows = book_info[book_info['description'].isna()]
selected_rows

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,image,name,author,format,book_depository_stars,price,currency,old_price,isbn,category,img_paths,description
3,3,3,3,3,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,Moleskine Large Watercolour Album And Notebook,Moleskine,Notebook / blank book,0.0,11.86,$,19.90,9788883705625,Art-Photography,dataset/Art-Photography/0000004.jpg,
9,9,9,9,9,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,Moleskine Large Sketchbook Black,,Notebook / blank book,4.5,12.18,$,19.90,9788883701153,Art-Photography,dataset/Art-Photography/0000010.jpg,
23,23,23,23,23,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,Moleskine Pocket Watercolour Notebook,Moleskine,Notebook / blank book,5.0,9.70,$,15.90,9788883705601,Art-Photography,dataset/Art-Photography/0000024.jpg,
30,30,30,30,30,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,The Legend of Zelda,Shigeru Miyamoto,Hardback,4.5,27.56,$,40.00,9781616550417,Art-Photography,dataset/Art-Photography/0000031.jpg,
31,31,31,31,31,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,The Ocean at the End of the Lane,Neil Gaiman,Paperback,4.0,7.70,$,11.40,9781472200341,Art-Photography,dataset/Art-Photography/0000032.jpg,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
955,955,955,955,955,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,Doctor Who: Official Annual 2016,Various,Hardback,3.5,13.97,$,,9781405920018,Art-Photography,dataset/Art-Photography/0000956.jpg,
958,958,958,958,958,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,Kaffe Fassett's Bold Blooms:Quilts and Other W...,Kaffe Fassett,Hardback,4.5,28.98,$,32.49,9781419722363,Art-Photography,dataset/Art-Photography/0000959.jpg,
961,961,961,961,961,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,"Piano Exam Pieces 2019 & 2020, ABRSM Grade 7",,Sheet music,5.0,12.10,$,13.00,9781786010254,Art-Photography,dataset/Art-Photography/0000962.jpg,
964,964,964,964,964,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,Alive,Piers Paul Read,Paperback,4.0,8.47,$,,9780380003211,Art-Photography,dataset/Art-Photography/0000965.jpg,


In [20]:
selected_rows[selected_rows['name']=='Moleskine Large Watercolour Album And Notebook'].description

3    NaN
Name: description, dtype: object

In [30]:
search_term = "title:"+ 'Moleskine Large Watercolour Album And Notebook'
search_result = google_book_search(search_term)
search_result['volumeInfo.description'].values[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


nan

This method can run multiple times to ensure the book descriptions are retrieved.

In [8]:
def check_missed_descriptions(INFO_DIR, CATEGORIES):
    for category in CATEGORIES:
        book_detail_path = os.path.join(INFO_DIR, category+".csv")
        print("Now running on:", category)
        book_info = pd.read_csv(book_detail_path)
        selected_rows = book_info[book_info['description'].isna()]
        for name in selected_rows.name:
            print(name)
            search_term = "title:"+ name
            search_result = google_book_search(search_term)
            if 'volumeInfo.description' in search_result.columns:
                search_result = search_result['volumeInfo.description'].values[0]
            else:
                search_result = np.NaN
            book_info[book_info.name==name].description = search_result
        book_info.to_csv(book_detail_path)    

In [None]:
check_missed_descriptions(INFO_DIR, CATEGORIES)

In [9]:
SORTED_INFO_DIR = 'book-covers-detials-sorted'

In [39]:
temp = pd.read_csv("book-covers-details\\Transport.csv")
temp['description'].fillna(temp.name, inplace=True)
temp_sorted = pd.DataFrame(columns=["text", "label"])
# temp_sorted.text = temp.name + " " + temp.description
temp_sorted.text = temp.description
# temp_sorted.text = temp_sorted.text  + temp.description
temp_sorted.label = temp.category
temp_sorted.iloc[984,:]

text     The Scientific Design of Exhaust and Intake Sy...
label                                            Transport
Name: 984, dtype: object

In [44]:
def sort_book_description(INFO_DIR, SORTED_INFO_DIR, CATEGORIES):   
    for category in CATEGORIES:
        sorted_info_pd = pd.DataFrame(columns=['text', 'label'])
        info_pd = pd.read_csv(os.path.join(INFO_DIR, category+".csv"))
        info_pd['description'].fillna(info_pd.name, inplace=True)
        print("Now running on:", category)
        sorted_info_pd.text = info_pd['description']
        sorted_info_pd.label = info_pd['category']

        sorted_info_pd.to_csv(os.path.join(SORTED_INFO_DIR, category+".csv"))

In [45]:
sort_book_description(INFO_DIR, SORTED_INFO_DIR, CATEGORIES)

Now running on: Art-Photography
Now running on: Biography
Now running on: Business-Finance-Law
Now running on: Childrens-Books
Now running on: Computing
Now running on: Crafts-Hobbies
Now running on: Crime-Thriller
Now running on: Dictionaries-Languages
Now running on: Entertainment
Now running on: Food-Drink
Now running on: Graphic-Novels-Anime-Manga
Now running on: Health
Now running on: History-Archaeology
Now running on: Home-Garden
Now running on: Humour
Now running on: Medical
Now running on: Mind-Body-Spirit
Now running on: Natural-History
Now running on: Personal-Development
Now running on: Poetry-Drama
Now running on: Reference
Now running on: Religion
Now running on: Romance
Now running on: Science-Fiction-Fantasy-Horror
Now running on: Science-Geography
Now running on: Society-Social-Sciences
Now running on: Sport
Now running on: Stationery
Now running on: Teaching-Resources-Education
Now running on: Technology-Engineering
Now running on: Teen-Young-Adult
Now running on: Tra