## 16b_Scrape_Book_Details_Random_Samples

Author: Daniel Hui

License: MIT

This notebook takes the URLs from the previous notebook and extracts book-specific features, such as ratings, reviews, book dimensions, and page count.

In [2]:
from __future__ import print_function, division

import collections
import json
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

### Global Variables

In [3]:
#max_range is a whole number of records; keeping it an int keeps the chunk count derived from it an int too
max_range = 250  #set max records per file to be saved incrementally
location = 'random'  #set library branch (also used in the input/output CSV file names)

### Field-Level Functions

In [4]:
import ast

#Definition for finding information that is next to a text field
def find_text(textsoup, field):
    """Return the stripped text of the element that follows a matching label.

    Searches ``textsoup`` for the first text node matching the regular
    expression ``field`` and returns the text of the next element after it.

    Parameters
    ----------
    textsoup : parsed page object exposing ``find(text=...)``
    field : str
        Regular-expression pattern that identifies the label text.

    Returns
    -------
    str
        The following element's text with surrounding whitespace removed, or
        ``'N/A'`` when the label is not found or no element follows it.
    """
    info = textsoup.find(text=re.compile(field))
    if not info:
        return 'N/A'
    neighbour = info.find_next()          #None when nothing follows the label
    if neighbour is None:
        return 'N/A'
    return neighbour.text.strip()

#return mapping[key], or default when the mapping is unusable or the value is missing/None
def _value_or_default(mapping, key, default):
    if not isinstance(mapping, dict):
        return default
    value = mapping.get(key)
    return default if value is None else value

#pick the description text out of a "description" entry (plain string or list of strings)
def _description_text(description):
    if isinstance(description, (list, tuple)):
        description = description[0] if description else None
    if isinstance(description, str) and description:
        return description
    return "N/A"

#function to extract book description
def find_description(textsoup):
    """Extract rating, format, subject, image and description fields of a book.

    Looks for the ``<script>`` element whose text contains ``@graph``, parses
    its content and reads the first entry of the ``"@graph"`` list.

    Parameters
    ----------
    textsoup : parsed page object exposing ``find("script", text=...)``

    Returns
    -------
    list
        Always seven items, in this order: average rating, number of ratings,
        number of reviews, book format id, subjects (``"about"``), image URL,
        description. Missing values are ``'N/A'`` (``0`` for the two counts).
        If the script is absent or cannot be parsed, all seven are ``'N/A'``.
    """
    try:
        dictionary_string = textsoup.find("script",text=re.compile("@graph")).text
        try:
            #the payload is JSON, so true/false/null and "\/" escapes must be understood
            book_dict = json.loads(dictionary_string)
        except ValueError:
            #fallback for payloads that are Python literals rather than strict JSON
            book_dict = ast.literal_eval(dictionary_string)
        book_sub_dict = book_dict.get("@graph")[0]
        if not isinstance(book_sub_dict, dict):
            return 7*['N/A']
    except Exception:                     #best effort: any lookup/parsing problem means "no details"
        return 7*['N/A']

    ratings_dict = book_sub_dict.get("aggregateRating")
    return [
        _value_or_default(ratings_dict, "ratingValue", "N/A"),             #Avg Rating
        _value_or_default(ratings_dict, "ratingCount", 0),                 #num of Ratings
        _value_or_default(ratings_dict, "reviewCount", 0),                 #num of Reviews
        _value_or_default(book_sub_dict.get("bookFormat"), "@id", "N/A"),  #Hardcover/Softcover
        _value_or_default(book_sub_dict, "about", "N/A"),                  #Subject areas
        _value_or_default(book_sub_dict, "image", "N/A"),                  #URL to book image
        _description_text(book_sub_dict.get("description")),               #Book description
    ]

### Book-Level Function

In [5]:
def get_book_data(url_row, timeout=30):
    """Download one book page and collect its scraped fields.

    Parameters
    ----------
    url_row : str
        Base URL of the book page; ``?active_tab=bib_info`` is appended to it.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed to ``requests.get``. Without a
        timeout a silent server would block the scrape indefinitely.

    Returns
    -------
    list
        ``[url_row, characteristic text, call number]`` followed by the seven
        values returned by ``find_description``.

    Raises
    ------
    requests.exceptions.RequestException
        Network errors, including timeouts, are not caught here.
    """
    response = requests.get(f"{url_row}?active_tab=bib_info", timeout=timeout)   #take in the URL
    soup = BeautifulSoup(response.text, "lxml")

    this_book_data = [
        url_row,
        find_text(soup,'Characteristic'),          #Number of Pages, Book Size
        find_text(soup,'Branch Call Number'),      #Library Call Number
    ]
    this_book_data.extend(find_description(soup))  #concat two lists

    return this_book_data

### Data Cleaning Functions

In [6]:
#Extract the page count
def get_page_count(row):
    """Pull the page count out of a physical-description string.

    Parameters
    ----------
    row : str
        Text such as ``"xii, 320 pages : illustrations ; 24 cm"``.

    Returns
    -------
    str
        The text before ``" page"``: its last token when it has exactly two
        space-separated tokens, otherwise the whole stripped text. ``'N/A'``
        when ``'pages'`` does not occur or ``row`` is not string-like.
    """
    try:
        row = row.replace(" unnumbered","")    #handle cases where there are unnumbered pages
        if 'pages' not in row:
            return 'N/A'
        before_pages = row.split(' page')[0].strip()
        tokens = before_pages.split(" ")
        return tokens[-1] if len(tokens) == 2 else before_pages
    except Exception:                          #non-string input (None, NaN, ...); lets KeyboardInterrupt through
        return 'N/A'

#extract the book dimensions
def get_book_dims(row):
    """Pull the dimension value out of a physical-description string.

    Parameters
    ----------
    row : str
        Text such as ``"320 pages ; 24 cm"``.

    Returns
    -------
    str
        The last space-separated token of the text before ``" cm"``, or
        ``'N/A'`` when ``'cm'`` does not occur or ``row`` is not string-like.
    """
    try:
        if 'cm' not in row:
            return 'N/A'
        #a single-token prefix is its own last token, so one expression covers both cases
        return row.split(' cm')[0].strip().split(" ")[-1]
    except Exception:                          #non-string input (None, NaN, ...); lets KeyboardInterrupt through
        return 'N/A'

### Load URLS, Divide into DataFrame Chunks

In [7]:
#load the URL table for this branch; the first CSV column becomes the index
url_csv_path = F"../01_Data/04_Scraped/isbn_url_{location}.csv"
url_df = pd.read_csv(url_csv_path, index_col=0)    #read_csv already returns a DataFrame

In [8]:
#remove lines with no URL
has_link = url_df['link'].notna()
url_df = url_df.loc[has_link]

In [9]:
#split the URL List into chunks so you can incrementally save
#Chunk sizes follow the same rule np.array_split uses (the first `extra_rows` chunks get one
#more row), but the slicing is done with .iloc so it does not rely on DataFrame.swapaxes.
total_loops = int(len(url_df) // max_range) + 1            #whole number of chunks, always >= 1
base_size, extra_rows = divmod(len(url_df), total_loops)
chunk_sizes = [base_size + 1] * extra_rows + [base_size] * (total_loops - extra_rows)

url_dframes = []
chunk_start = 0
for chunk_size in chunk_sizes:
    url_dframes.append(url_df.iloc[chunk_start:chunk_start + chunk_size])
    chunk_start += chunk_size

### Loop Scrape

In [13]:
#column names for the list returned by get_book_data, in positional order
scraped_columns = ['url', 'page_dim', 'callno', 'avg_rating', 'tot_ratings',
                   'tot_reviews', 'type', 'subjects', 'image', 'desc']

first_chunk = 0                                                  #adjust the lower number if the scrape stalled
for i in range(first_chunk, len(url_dframes)):
    dframe = url_dframes[i].reset_index()                        #reset index so the ISBN below can match
    scraped_rows = dframe["link"].apply(get_book_data).tolist()  #one list of fields per URL

    #naming the columns at construction keeps them present even when a chunk has no rows
    book_df = pd.DataFrame(scraped_rows, columns=scraped_columns)

    #Clean Data
    book_df["page"] = book_df["page_dim"].apply(get_page_count)   #Extract Page Number
    book_df["dim"] = book_df["page_dim"].apply(get_book_dims)     #Extract book dimensions
    book_df["isbn"] = dframe["isbn"]                              #aligned on the fresh 0..n-1 index

    #Keep useful columns
    book_df = book_df[["isbn","url","page","dim","avg_rating","tot_ratings","tot_reviews",
                       "type","callno","subjects","desc","image"]]

    #save each chunk separately so finished chunks survive a later failure
    book_df.to_csv(f'../01_Data/04_Scraped/book_data_{location}_{i}.csv')

### Combine Files Together into Combined Branch CSV

In [14]:
#read every per-chunk CSV and stack them with one concat (avoids re-copying the growing frame each loop)
chunk_paths = [f'../01_Data/04_Scraped/book_data_{location}_{i}.csv'
               for i in range(len(url_dframes))]

#each chunk keeps its own saved index, so index values repeat across chunks in the combined frame
book_data_df = pd.concat([pd.read_csv(chunk_path, index_col=0) for chunk_path in chunk_paths])

In [15]:
#write the combined table for this branch to a single CSV
combined_csv_path = f'../01_Data/04_Scraped/book_data_{location}.csv'
book_data_df.to_csv(combined_csv_path)