In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver 
from selenium.webdriver.chrome.options import Options
import re 
import urllib
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

In [2]:
# Create the shared Chrome webdriver object used by every scraping helper below.
# The chromedriver binary is looked up at ./chromedriver.exe, i.e. relative to the
# working directory the notebook is run from.
options = Options()
options.headless = True  # headless browsing: no visible browser window
driver = webdriver.Chrome("./chromedriver.exe",options=options)

In [4]:
def get_js_soup(url,driver):
    """Fetch `url` with the webdriver and parse the rendered page body.

    The HTML is read back through JavaScript (`document.body.innerHTML`), so
    content that the page loads dynamically is included.

    Args:
        url: address handed to `driver.get`.
        driver: webdriver object used to load the page and run the script.

    Returns:
        A BeautifulSoup object built from the body HTML with 'html.parser'.
    """
    driver.get(url)
    body_html = driver.execute_script('return document.body.innerHTML')
    return BeautifulSoup(body_html,'html.parser')

# click load more button on index pages
def get_js_soup_loading(url,driver, max_books=500):
    """Load an index page, expand it with its 'Show more books' button, and parse it.

    The button is clicked repeatedly until it disappears from the page or at
    least `max_books` book entries (elements matching `.BookListItem__body`)
    are present. Progress is printed as "loading" followed by one dot per
    successful click, and the final number of entries is printed at the end.

    Args:
        url: address of the index page, handed to `driver.get`.
        driver: webdriver object used to load and interact with the page.
        max_books: stop expanding once this many book entries are on the page.

    Returns:
        A BeautifulSoup object built from the final page source with 'html.parser'.

    Raises:
        Whatever `WebDriverWait` raises when one of the 30-second waits expires
        (selenium's TimeoutException).
    """
    show_more_xpath = "//span[contains(text(), 'Show more books')]"
    loading_xpath = "//*[contains(text(), 'Loading more books')]"
    book_css = ".BookListItem__body"

    driver.get(url)
    if driver.find_elements(By.XPATH, show_more_xpath):
        print("loading", end='')
        wait = WebDriverWait(driver,30)
        loadingButton = wait.until(EC.element_to_be_clickable((By.XPATH, show_more_xpath)))
        while loadingButton:
            try:
                loadingButton.click()
                print(".", end='')
            except Exception as e:
                # The click can fail (e.g. the element went stale or is covered);
                # report it, pause, re-locate the button and try again.
                print(e)
                time.sleep(2)
                loadingButton = wait.until(EC.visibility_of_element_located((By.XPATH, show_more_xpath)))
                continue
            time.sleep(2)
            # Wait until the 'Loading more books' indicator is gone before inspecting the page.
            wait.until_not(EC.presence_of_element_located((By.XPATH, loading_xpath)))
            if not driver.find_elements(By.XPATH, show_more_xpath):
                # No button left: the whole list has been loaded.
                break
            loadingButton = wait.until(EC.element_to_be_clickable((By.XPATH, show_more_xpath)))
            if len(driver.find_elements(By.CSS_SELECTOR, book_css)) >= max_books:
                break
    res_html = driver.page_source.encode('utf-8')
    soup = BeautifulSoup(res_html,'html.parser') #beautiful soup object to be used for parsing html content
    final_length = len(driver.find_elements(By.CSS_SELECTOR, book_css))
    print(f"retrieved {final_length} books")
    return soup

#tidies extracted text
def process_content(content):
    """Return `content` with non-ASCII characters dropped and whitespace collapsed.

    Args:
        content: text to tidy (a `str`).

    Returns:
        The text with every non-ASCII character removed and every run of
        whitespace (spaces, tabs, newlines, ...) replaced by a single space.
        Leading and trailing whitespace is collapsed to one space, not stripped.
    """
    content = content.encode('ascii',errors='ignore').decode('utf-8')       #removes non-ascii characters
    # Raw string: '\s' in a plain literal is an invalid escape sequence and triggers a warning.
    content = re.sub(r'\s+',' ',content)       #replaces repeated whitespace characters with single space
    return content

def remove_script(soup):
    """More tidying: strip <script> and <style> elements from parsed HTML.

    Text extracted from an HTML page may otherwise contain JavaScript code
    and style rules. Every matching element is removed from `soup` in place.

    Args:
        soup: parsed document; calling it with a list of tag names must
            return the matching elements, each offering `decompose()`.

    Returns:
        The same `soup` object, after the elements have been removed.
    """
    unwanted = soup(["script", "style"])
    for element in unwanted:
        element.decompose()
    return soup

In [5]:
def scrape_chars(url):
    """Collect the character names linked from a book page.

    The page is fetched with `get_js_soup` using the notebook-level `driver`.
    Every <a> whose href starts with '/characters/' contributes its stripped
    link text.

    Args:
        url: address of the book page.

    Returns:
        The names joined with ", ", or 'N/A' when nothing was found.
    """
    soup = get_js_soup(url,driver)
    if not soup:
        return 'N/A'
    char_links = soup.find_all('a', {'href': re.compile('^/characters/.*')})
    joined = ", ".join(link.get_text().strip() for link in char_links)
    if joined:
        return joined
    return 'N/A'

def scrape_dir_page(dir_url,driver, i):
    """Scrape every book listed on one yearly directory page.

    The page is fully expanded with `get_js_soup_loading`; each
    `div.BookListItem__body` then yields one record. For books with a URL,
    the character list is fetched from the book page via `scrape_chars`.

    Args:
        dir_url: address of the directory (index) page.
        driver: webdriver object used to load the directory page.
        i: 1-based index of the year being scraped (1 -> 1921); only used
            for the progress message printed at the end.

    Returns:
        A dict with the keys 'url', 'title', 'story', 'writers' and
        'characters'. Each value is a list with one string per book, and all
        five lists have the same length. Missing fields are empty strings.
    """
    soup = get_js_soup_loading(dir_url,driver)
    batch = {'url':[], 'title':[], 'story':[], 'writers': [], 'characters':[]}

    for book_holder in soup.find_all('div',class_='BookListItem__body'):
        new_url = new_title =  ""
        new_writers = []
        new_characters = ""
        new_story = ""

        # title & url
        h3 = book_holder.find('h3', class_='Text Text__title3 Text__umber')
        if h3:
            strong = h3.find('strong')
            if strong:
                title_n_url = strong.find('a')
                if title_n_url:
                    new_title = title_n_url.get_text().strip()
                    # NOTE(review): the href is used as-is as the book URL; if the site
                    # emits relative links this needs a base URL - confirm.
                    new_url = title_n_url['href']

        # writers
        for writer_span in book_holder.find_all('span', class_='ContributorLink__name'):
            new_writers.append(writer_span.get_text().strip())
        new_writers = ", ".join(new_writers)

        # story: when several 'Formatted' spans exist, the last one wins
        for story_span in book_holder.find_all('span', class_="Formatted"):
            new_story = story_span.get_text().strip()

        # characters
        if new_url:
            new_characters = scrape_chars(new_url)

        batch['url'].append(new_url)
        batch['title'].append(new_title)
        batch['story'].append(new_story)
        batch['writers'].append(new_writers)
        batch['characters'].append(new_characters)

    # Derive the year from the argument, not from the notebook-level `last_i` counter,
    # so the function does not depend on hidden global state.
    year = 1921 + i - 1
    print ('-'*20,'Finish scraping books published in {}'.format(str(year)),'-'*20)
    return batch


In [6]:
# Parallel accumulators: index k of every list describes the same book.
book_urls, book_titles, book_stories, book_writers, book_characters = (
    [] for _ in range(5)
)

In [16]:
# crawl books published from 1921 to 2021, popularity descending order
print ('-'*20,'Scraping directory page','-'*20)

# i maps to the year 1921 + i - 1, so i should start from 1 and end at 101 (year 2021).
# last_i is the resume checkpoint: it always holds the index of the next year to scrape.
# It is set to 15 here, so this run starts at 1935; use 1 for a full crawl.
last_i = 15
for i in range(last_i, 102):
    year = 1921 + i - 1
    dir_url = 'https://www.goodreads.com/book/popular_by_date/' + str(year)

    # One retry after a short pause; a second failure is re-raised so that
    # last_i still points at the year that has to be redone.
    for attempt in range(2):
        try:
            batch = scrape_dir_page(dir_url,driver, i)
            break
        except Exception:
            if attempt == 1:
                raise
            time.sleep(1)

    book_urls.extend(batch['url'])
    book_titles.extend(batch['title'])
    book_stories.extend(batch['story'])
    book_writers.extend(batch['writers'])
    book_characters.extend(batch['characters'])
    last_i = i + 1

-------------------- Scraping directory page --------------------
loading.............retrieved 200 books
-------------------- Finish scraping books published in 1935 --------------------
loading.............retrieved 200 books
-------------------- Finish scraping books published in 1936 --------------------
loading.............retrieved 200 books
-------------------- Finish scraping books published in 1937 --------------------
loading.............retrieved 200 books
-------------------- Finish scraping books published in 1938 --------------------
loading.............retrieved 200 books
-------------------- Finish scraping books published in 1939 --------------------
loading.............retrieved 200 books
-------------------- Finish scraping books published in 1940 --------------------
loading.............retrieved 200 books
-------------------- Finish scraping books published in 1941 --------------------
loading.............retrieved 200 books
-------------------- Finish scraping boo

loading.............retrieved 200 books
-------------------- Finish scraping books published in 2002 --------------------
loading.............retrieved 200 books
-------------------- Finish scraping books published in 2003 --------------------
loading.............retrieved 200 books
-------------------- Finish scraping books published in 2004 --------------------
loading.............retrieved 200 books
-------------------- Finish scraping books published in 2005 --------------------
loading.............retrieved 200 books
-------------------- Finish scraping books published in 2006 --------------------
loading.............retrieved 200 books
-------------------- Finish scraping books published in 2007 --------------------
loading.............retrieved 200 books
-------------------- Finish scraping books published in 2008 --------------------
loading.............retrieved 200 books
-------------------- Finish scraping books published in 2009 --------------------
loading.............retr

In [17]:
# Sanity check: only the last expression of a cell is echoed, so the bare name
# displays nothing and the cell output is the number of collected URLs.
book_urls
len(book_urls)

20199

In [18]:
# Sanity check: only the last expression of a cell is echoed, so the bare name
# displays nothing and the cell output is the number of collected titles.
book_titles
len(book_titles)

20199

In [19]:
# Sanity check: only the last expression of a cell is echoed, so the bare name
# displays nothing and the cell output is the number of collected stories.
book_stories
len(book_stories)

20199

In [20]:
# Sanity check: only the last expression of a cell is echoed, so the bare name
# displays nothing and the cell output is the number of collected writer strings.
book_writers
len(book_writers)

20199

In [21]:
# Sanity check: only the last expression of a cell is echoed, so the bare name
# displays nothing and the cell output is the number of collected character strings.
book_characters
len(book_characters)

20199

In [22]:
# Resume checkpoint after the crawl; 102 is one past the last index (101, year 2021),
# i.e. the loop over range(last_i, 102) ran to completion.
last_i

102

In [23]:
def write_lst(lst,file_):
    """Write every item of `lst` to the text file `file_`, one per line.

    Each item is tidied with `process_content` before it is written, and the
    file is opened in 'w' mode, so existing content is overwritten.

    Args:
        lst: iterable of strings to write.
        file_: path of the output file.
    """
    with open(file_,'w') as out:
        for entry in lst:
            out.write(process_content(entry) + '\n')

In [24]:
# Output locations, one text file per scraped field.
book_urls_file = '../IMDB_goodreads_data/book_urls.txt'
book_writers_file = '../IMDB_goodreads_data/book_writers.txt'
book_stories_file = '../IMDB_goodreads_data/book_stories.txt'
book_titles_file = '../IMDB_goodreads_data/book_titles.txt'
book_characters_file = '../IMDB_goodreads_data/book_characters.txt'

# Dump each list to its file; line k of every file describes the same book.
for _values, _target in (
    (book_urls, book_urls_file),
    (book_writers, book_writers_file),
    (book_stories, book_stories_file),
    (book_titles, book_titles_file),
    (book_characters, book_characters_file),
):
    write_lst(_values, _target)
