In [1]:
#Import the necessary libraries for web scraping and saving the data

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import pandas as pd
import time
import json


In [2]:
#This block of code initializes an options container for a Selenium session
#Uncomment the headless option to run the web scrape without a browser

# Options container that the scraping loop passes to webdriver.Chrome(..., options=options).
options = webdriver.ChromeOptions()
# A second, separate Options instance; no other cell shown in this notebook reads it.
# NOTE(review): looks redundant with `options` above — confirm before removing.
chrome_options = Options()
# Uncomment the next line to run the scrape without a visible browser window.
#options.headless = True

In [1]:
#This block of code converts the goodreads rating system from strings of text
#to an integer between one and five

# Lookup table from the Goodreads rating phrase to its star value (5 = best).
rating_dict = {
    'it was amazing': 5,
    'really liked it': 4,
    'liked it': 3,
    'it was ok': 2,
    'did not like it': 1,
}


def convert_text_to_number_rating(rating_str):
    """Translate a Goodreads rating phrase into its star count.

    rating_str must be one of the keys of ``rating_dict``; any other value
    raises KeyError. The result is always an int between 1 and 5.
    """
    stars = rating_dict[rating_str]
    return int(stars)
    

In [5]:
#The location of the ISBN on Goodreads is not uniform. This function extracts the
#first three rows of the book information table on the current page and searches
#for the key 'ISBN'. If one is not found, the function returns 'empty'.

def get_first_three_info():
    """Return the ISBN shown in the book information table of the current page.

    Only the first three rows of the table are inspected. Relies on the
    module-level ``browser`` already displaying a book page.

    Returns:
        str: the text of the row whose title is 'ISBN', or the string 'empty'
        when none of the first three rows has that title.
    """
    max_rows = 3

    # Query the DOM once per column instead of twice per row inside a loop.
    titles = browser.find_elements(By.CLASS_NAME, "infoBoxRowTitle")[:max_rows]
    items = browser.find_elements(By.CLASS_NAME, "infoBoxRowItem")[:max_rows]

    # zip() stops at the shorter list, so a title without a matching value
    # cell is skipped instead of raising IndexError.
    info_dict = {title.text: item.text for title, item in zip(titles, items)}

    return info_dict.get('ISBN', 'empty')

In [6]:
#This function takes in a url for a specific book and (1) opens a browser page
#(2) accesses the book information table (3) calls the ISBN locator function
#(4) calls the function that obtains the reviews for the book (5) quits and closes
#the browser window, and (6) returns a count of the reviews

def book_page(book_url):
    """Scrape all review pages of one book, then shut the browser down.

    Uses the module-level ``browser``, which the caller must have created
    before calling this function. The browser is always shut down before this
    function returns or raises, so it cannot be reused afterwards.

    Args:
        book_url: URL of the book page to open.

    Returns:
        The review count reported by ``get_multiple_reviews_by_page``.
    """
    try:
        browser.get(book_url)

        book_title = browser.find_element(By.ID, 'bookTitle').text

        # Click the "more info" toggle before reading the information table.
        browser.find_element(By.ID, "bookDataBoxShow").click()

        # Keep only the first space-separated token of the ISBN cell.
        isbn = get_first_three_info().split(' ')[0]

        return get_multiple_reviews_by_page(book_title, isbn)
    finally:
        # quit() closes every window and ends the driver session. Doing it in
        # `finally` keeps a failed scrape from leaving a Chrome process behind.
        browser.quit()
    
        
    

In [7]:
#This function takes in the title and ISBN of a book and (1) searches for the list of
#reviews by class name (2) keeps the review elements of one page in a variable
#(there can be multiple pages of reviews per book) (3) uses a for loop to extract
#the review information from each element into a dictionary
#(4) appends each review dictionary to a text file that updates continuously,
# and (5) returns a count of reviews

def _review_likes(review):
    """Return the like count shown on one review element, or 0 if none can be read."""
    locators = (
        (By.XPATH, './/span[@class = "likesCount"]'),
        (By.CLASS_NAME, 'likesCount'),
    )
    for by, value in locators:
        try:
            likes_text = review.find_element(by, value).text.split(' ')[0]
            # Drop thousands separators so a value such as "1,234" still parses.
            return int(likes_text.replace(',', ''))
        except Exception:
            # Best effort: fall through to the next locator, then to 0.
            continue
    return 0


def get_reviews_for_book(book_title, isbn):
    """Save every rated review on the current review page and count them.

    Each rated review is appended to ``book_reviews.txt`` as the ``str()`` of
    a dict followed by ', '. Relies on the module-level ``browser``.

    Args:
        book_title: title stored with every review record.
        isbn: ISBN stored with every review record.

    Returns:
        int: number of reviews written; 0 when the review elements could not
        be fetched after five attempts.

    Raises:
        KeyError: if a review's rating phrase is not a key of ``rating_dict``.
    """
    max_attempts = 5
    reviews_list = None

    for attempt in range(1, max_attempts + 1):
        try:
            # Fixed pause before each lookup; presumably gives the reviews time to load.
            time.sleep(2)
            reviews_list = browser.find_elements(By.CLASS_NAME, 'friendReviews')
            break
        except Exception:
            # `Exception` rather than a bare except, so a kernel interrupt
            # (KeyboardInterrupt) still stops the scrape.
            print("This is attempt number: {}".format(attempt))

    if reviews_list is None:
        # Always return an int: the caller adds this value to a running total.
        return 0

    count = 0

    # Open the output file once per page instead of once per review.
    with open("book_reviews.txt", "a") as write_file:
        for review in reviews_list:
            lines = review.text.split('\n')

            # The second line must contain 'rated' and the third holds the
            # rating phrase; anything shorter cannot be a rated review.
            if len(lines) < 3 or 'rated' not in lines[1]:
                continue

            rating = convert_text_to_number_rating(lines[2])

            user = review.find_element(By.CLASS_NAME, 'user')
            user_name = user.text
            # Last path segment of the profile link, up to the first '-'.
            user_id = user.get_attribute("href").split('/')[-1].split('-')[0]

            likes = _review_likes(review)

            ind_book_dict = {'user_id': user_id, 'user_name': user_name, 'rating': rating, 'isbn': isbn, 'title': book_title, 'likes': int(likes)}

            write_file.write(str(ind_book_dict) + ', ')
            count += 1

    return count

In [8]:
#This function takes in the title and ISBN of a book and (1) counts the total pages of
#reviews per book (2) updates the counter from the function pulling and saving each review
#(3) prints the counter, and goes to the next page of reviews

def get_multiple_reviews_by_page(book_title, isbn):
    """Walk through every review page of the current book and total the reviews.

    The number of pages is derived from the text of the pagination element.
    After each page is scraped the running total is printed, and on every
    page except the last the link to the following page is clicked.

    Args:
        book_title: title passed through to ``get_reviews_for_book``.
        isbn: ISBN passed through to ``get_reviews_for_book``.

    Returns:
        The accumulated count of reviews saved across all pages.
    """
    pagination_text = browser.find_element_by_xpath('//*[@id="reviews"]/div[4]/div').text
    page_total = len(pagination_text.split(' ')[2:-2])

    running_total = 0

    for page_index in range(page_total):
        running_total += get_reviews_for_book(book_title, isbn)

        print("Total reviews added for {}: {}".format(book_title, running_total))

        on_last_page = page_index >= page_total - 1
        if on_last_page:
            continue

        # page_index is 0-based and pages are 1-based, so the page that
        # follows the one just scraped is page_index + 2.
        time.sleep(2)
        next_page(str(page_index + 2))
        time.sleep(2)
        browser.execute_script("window.scrollTo(0, 12000);")

    return running_total
        
        

In [9]:
#This function takes in the page number and clicks the next page of the reviews

def next_page(page_number, max_attempts=15):
    """Click the pagination link whose visible text equals ``page_number``.

    After every failed attempt the function reports the attempt, waits two
    seconds, scrolls the window to y=200 and tries again. Relies on the
    module-level ``browser``.

    Args:
        page_number (str): link text of the page to open, e.g. '2'.
        max_attempts (int): how many times to try before giving up.

    Returns:
        bool: True once the link has been clicked, False when every attempt
        failed.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            browser.find_element(By.LINK_TEXT, page_number).click()
            return True
        except Exception:
            # `Exception` rather than a bare except, so a kernel interrupt
            # (KeyboardInterrupt) is not swallowed by the retry loop.
            print("This is attempt to reach page number {}: {} time(s)".format(page_number, attempt))
            time.sleep(2)
            browser.execute_script("window.scrollTo(0, 200);")

    return False

In [10]:
#This block of code reads in the list of book urls

# Load the table of books to scrape; the scraping loop reads its `url` column.
book_df = pd.read_csv('book_info.csv')
# Number of URLs from row position 103 onward (last expression, so the notebook displays it).
# NOTE(review): the scraping loop starts at 452 rather than 103 — presumably
# resume offsets from separate runs; confirm which one is intended.
len(book_df[103:].url)

417

In [19]:
#This for loop calls the book_page function that runs all the necessary functions in order
#to obtain the book data for each book's url

# Chrome only applies a custom user agent when it is given as a launch
# argument. Assigning an attribute on the driver object after start-up has no
# effect on the requests the browser sends.
user_agent_arg = '--user-agent=' + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'

# Guard so that re-running this cell does not stack duplicate arguments.
if user_agent_arg not in options.arguments:
    options.add_argument(user_agent_arg)

for book_url in book_df[452:].url.unique():

    # A fresh browser per book: book_page() shuts its browser down before returning.
    browser = webdriver.Chrome('./chromedriver', options=options)
    bookreviews = book_page(book_url)
    

Total reviews added for The Fates Divide: 24
Total reviews added for The Fates Divide: 50
Total reviews added for The Fates Divide: 79
Total reviews added for The Fates Divide: 108
Total reviews added for The Fates Divide: 137
Total reviews added for The Fates Divide: 167
Total reviews added for The Fates Divide: 196
Total reviews added for The Fates Divide: 225
Total reviews added for The Fates Divide: 255
Total reviews added for The Fates Divide: 284
Total reviews added for Sky in the Deep: 30
Total reviews added for Sky in the Deep: 58
Total reviews added for Sky in the Deep: 85
Total reviews added for Sky in the Deep: 113
Total reviews added for Sky in the Deep: 141
Total reviews added for Sky in the Deep: 170
Total reviews added for Sky in the Deep: 200
Total reviews added for Sky in the Deep: 227
Total reviews added for Sky in the Deep: 256
Total reviews added for Sky in the Deep: 286
Total reviews added for Muse of Nightmares: 26
Total reviews added for Muse of Nightmares: 50
Th

Total reviews added for Ghost Boys: 176
Total reviews added for Ghost Boys: 206
Total reviews added for Ghost Boys: 235
Total reviews added for Ghost Boys: 265
Total reviews added for Ghost Boys: 295
Total reviews added for The Penderwicks at Last: 25
Total reviews added for The Penderwicks at Last: 52
Total reviews added for The Penderwicks at Last: 82
Total reviews added for The Penderwicks at Last: 110
Total reviews added for The Penderwicks at Last: 138
Total reviews added for The Penderwicks at Last: 168
Total reviews added for The Penderwicks at Last: 198
Total reviews added for The Penderwicks at Last: 228
Total reviews added for The Penderwicks at Last: 258
Total reviews added for The Penderwicks at Last: 288
Total reviews added for Willa of the Wood: 28
Total reviews added for Willa of the Wood: 57
Total reviews added for Willa of the Wood: 85
Total reviews added for Willa of the Wood: 115
Total reviews added for Willa of the Wood: 145
Total reviews added for Willa of the Wood

This is attempt to reach page number 11: 13 time(s)
This is attempt to reach page number 11: 14 time(s)
This is attempt to reach page number 11: 15 time(s)
Total reviews added for The Unforgettable Guinevere St. Clair: 329
This is attempt to reach page number 12: 1 time(s)
This is attempt to reach page number 12: 2 time(s)
This is attempt to reach page number 12: 3 time(s)
This is attempt to reach page number 12: 4 time(s)
This is attempt to reach page number 12: 5 time(s)
This is attempt to reach page number 12: 6 time(s)
This is attempt to reach page number 12: 7 time(s)
This is attempt to reach page number 12: 8 time(s)
This is attempt to reach page number 12: 9 time(s)
This is attempt to reach page number 12: 10 time(s)
This is attempt to reach page number 12: 11 time(s)
This is attempt to reach page number 12: 12 time(s)
This is attempt to reach page number 12: 13 time(s)
This is attempt to reach page number 12: 14 time(s)
This is attempt to reach page number 12: 15 time(s)
Total 

This is attempt to reach page number 8: 13 time(s)
This is attempt to reach page number 8: 14 time(s)
This is attempt to reach page number 8: 15 time(s)
Total reviews added for Hello Lighthouse: 59
This is attempt to reach page number 9: 1 time(s)
This is attempt to reach page number 9: 2 time(s)
This is attempt to reach page number 9: 3 time(s)
This is attempt to reach page number 9: 4 time(s)
This is attempt to reach page number 9: 5 time(s)
This is attempt to reach page number 9: 6 time(s)
This is attempt to reach page number 9: 7 time(s)
This is attempt to reach page number 9: 8 time(s)
This is attempt to reach page number 9: 9 time(s)
This is attempt to reach page number 9: 10 time(s)
This is attempt to reach page number 9: 11 time(s)
This is attempt to reach page number 9: 12 time(s)
This is attempt to reach page number 9: 13 time(s)
This is attempt to reach page number 9: 14 time(s)
This is attempt to reach page number 9: 15 time(s)
Total reviews added for Hello Lighthouse: 59
T