In [1]:
# standard packages
import numpy as np
import pandas as pd
import scipy as sp 

# Web scraping tools
import lxml.html as lx
import requests
import requests_cache

# create cache for requested html code to limit repeat requests
requests_cache.install_cache("Audible")

In [2]:
# top scifi/fantasy books url with page size set to 50 per page
start_page = "https://www.audible.com/search?pf_rd_p=073d8370-97e5-4b7b-be04-aa06cf22d7dd&pf_rd_r=E0S46R88NAP7ESKNPGD5&ref=a_search_c1_sort_2&node=2226658011&sort=popularity-rank&pageSize=50"

In [3]:
# define function to gather all book page links from the list page
def get_page_links(url):
    """ 
    Retreives the urls for all the books on an audible book list webpage from the url of an audble book list page. 
    """
    # get response from url
    response = requests.get(url)
    response.raise_for_status()

    # parse response to html
    html = lx.fromstring(response.text)
    html.make_links_absolute(url)

    # get book urls
    book_urls = html.xpath("//li[contains(@class, 'bc-list-item')]/div/div[1]/div/div[2]/div/div/span/ul/li/h3/a/@href")
    
    return book_urls

In [4]:
# define function to find next page link
def next_page_link(url):
    """
    Finds link to next page in book list. If there is no next page the function returns NAN
    """
    # get response from url
    response = requests.get(url)
    response.raise_for_status()

    # parse response to html
    html = lx.fromstring(response.text)
    html.make_links_absolute(url)    

    # see if the next page link is disabled indicating that this is the last page in the list
    next_page_disabled = html.xpath("//*[contains(@class, 'button-disabled') and contains(@class, 'nextButton')]")

    # if there is a next page, grab the url, if not, tell me
    if len(next_page_disabled) == 0:
        next_page = html.xpath("//*[contains(@data-name, 'page')]/@href")[-1]
        return str(next_page)
    else:
        print('no next page')
        return np.NAN
        

In [5]:
# define function to loop through new pages to and gather all book links
def get_all_links(start_page, max_pages):
    """
    Wrapper function to get all the book urls in the book list. 
    Given the first page of the book list, the function will gather all the book urls from the current page, 
    find the url of the next page in the list, then repeat on the next page of the list.
    The function will continue until it has scraped up to the max_pages, or the next page link is disabled indicating the end of the list.
    Returns a single list containing the urls of all the books in the list.
    """
    # set starting values for loop
    i = 1
    next_page = start_page
    links = []

    # add book links to list until there is no next page or max_pages is reached 
    while type(next_page) == str:
        # get links on current page
        new_links = get_page_links(next_page)

        # add links to link list
        links = links + new_links

        # find next page if it exists 
        next_page = next_page_link(next_page)

        # if we have reached max pages break loop
        if i == max_pages:
            break

        # increment i
        i += 1

    return links

In [16]:
# define function to scrape each book page for book information
def book_info(book_url):
    """
    Function takes the url for an audible book page. 
    Using Xpath it scrapes the page for the book title, author, narrator, series name, runtime, 
    star rating, number of raters, subgenres, original publishing year, and the publishers summary.
    If any of this information is not found or in an unusual form, NAN is returned.
    Returns a dictionary containing all book information.
    """
    # get response from url
    response = requests.get(book_url)
    response.raise_for_status()

    # parse response to html
    html = lx.fromstring(response.text)
    html.make_links_absolute(book_url)

    # get book title
    title = html.xpath("//h1[contains(@class, 'bc-heading')]")[0]
    title = title.text

    # get author
    try:
        author = html.xpath("//*[contains(@class, 'authorLabel')]")[1].text_content()
        author = author.replace('\n', '')
        author = author.split(":")[-1]
    except:
        print('weird author')
        author = np.nan

    # get narrator
    try:
        narrator = html.xpath("//*[contains(@class, 'narratorLabel')]")[1].text_content()
        narrator = narrator.replace('\n', '')
        narrator = narrator.split(":")[-1]
    except:
        narrator = np.nan

    # get series
    try:
        series = html.xpath("//*[contains(@class, 'seriesLabel')]")[0].text_content()
        series = series.replace('\n', '')
        series = series.split(':')[-1].strip()
    except: 
        series = np.nan

    # get runtime
    try: 
        runtime = html.xpath("//*[contains(@class, 'runtimeLabel')]")[0].text_content()
        runtime = runtime.replace('\n','')
        runtime = runtime.split(':')[-1].strip()
    except:
        runtime = np.nan
    
    # try to convert to minutes for comparison
    try:
        hours = np.int(runtime.split(' hr')[0])
        mins = runtime.split(' min')[0]
        mins = np.int(mins.split(' ')[-1])
        total_mins = hours*60 + mins
    except:
        print('weird time')
        total_mins = np.nan

    try:
        # get overall rating
        rating = html.xpath("//*[contains(@class, 'ratingsLabel')]")[0].text_content()
        rating = rating.replace('\n', '')

        # extract star value from rating
        stars = rating.split('stars')[-1]
        stars = np.float(stars.split('(')[0].strip())

        # get number of ratings
        ratings_number = rating.split('(')[-1]
        ratings_number = np.int(ratings_number.split(' rat')[0].replace(',', ''))
    except:
        print('not rated yet')
        stars = np.nan
        ratings_number = np.nan
          
    # get categories 
    try:
        categories = html.xpath("//*[contains(@*, 'categoriesLabel')]")[0].text_content()
        categories = categories.replace('\n','')
        categories = categories.split(':')[-1].strip()
    except:
        print('weird category')
        categories = np.nan
    
    # get publisher's book summary
    try:
        summary = html.xpath("//*[contains(@*, 'productPublisherSummary')]")[0].text_content()
        summary = summary.replace('\n', '')
        summary = summary.replace('\t', '').strip()
    except:
        summary = np.nan

    try:
        # get publishing year from summary
        year = summary.split('©')[-1]
        year = np.int(year.split(' ')[0])
    except:
        print('weird year')
        year = np.nan

    # return dictionary
    return {"url": book_url, 
            "title": title,  
            "author": author, 
            "year": year,
            "narrator": narrator,
            "series": series, 
            "runtime": runtime,
            "total_mins": total_mins,
            "stars": stars,
            "number_of_ratings": ratings_number,
            "categories": categories,
            "summary": summary}

In [11]:
# get urls to the book page of scifi/fantasy best sellers
book_links = get_all_links(start_page = start_page, max_pages = 30)

no next page


In [12]:
# how many books?
len(book_links)

1150

In [17]:
# loop through book links to scrape information from each book page
scifi_books = [book_info(link) for link in book_links]

weird year
weird year
weird time
not rated yet
not rated yet
weird time
not rated yet
weird year
weird time
weird time
weird time
weird year
weird year
weird year
not rated yet
weird year
weird year
weird time
not rated yet
weird time
not rated yet
weird year
weird year
weird year
weird year
weird year
weird year
weird year
weird time
not rated yet
weird year
weird year
weird time
not rated yet
weird time
weird year
weird time
not rated yet
weird year
weird year
weird year
weird year
not rated yet
weird time
weird time
weird time
weird year
weird year
not rated yet
weird year
weird author
weird time
not rated yet
weird time
weird year
weird time
weird year
not rated yet
weird year
weird year
weird time
not rated yet
weird year
weird year
weird year
weird time
weird year
weird year
weird time
weird year
weird year
weird time
weird year
weird year
weird year
weird year
weird time
weird time
weird time
weird year
weird author
weird time
not rated yet
weird author
weird time
not rated yet


In [18]:
# transform to pandas dataframe
scifi_df = pd.DataFrame(scifi_books)
scifi_df.head(10)

Unnamed: 0,author,categories,narrator,number_of_ratings,runtime,series,stars,summary,title,total_mins,url,year
0,Faith Hunter,Contemporary,Khristine Hvam,4639.0,5 hrs and 2 mins,,4.2,,Junkyard Cats,302.0,https://www.audible.com/pd/Junkyard-Cats-Audio...,
1,Andrzej Sapkowski,Epic,Peter Kenny,18649.0,10 hrs and 17 mins,The Witcher Saga,4.7,Publisher's Summary Geralt of...,The Last Wish,617.0,https://www.audible.com/pd/The-Last-Wish-Audio...,2008.0
2,Michael J. Sullivan,Epic,Tim Gerard Reynolds,201.0,1 hr and 45 mins,"Legends of the First Empire, Book 0.5",4.3,Publisher's Summary Pile of ...,Pile of Bones - FREE,105.0,https://www.audible.com/pd/Pile-of-Bones-FREE-...,2020.0
3,Andrzej Sapkowski,Epic,Peter Kenny,11503.0,10 hrs and 55 mins,"The Witcher Saga, Book 1",4.6,Publisher's Summary Watch for...,Blood of Elves,655.0,https://www.audible.com/pd/Blood-of-Elves-Audi...,2015.0
4,Andrzej Sapkowski,Epic,Peter Kenny,8807.0,12 hrs and 48 mins,The Witcher Saga,4.8,Publisher's Summary The New Y...,Sword of Destiny,768.0,https://www.audible.com/pd/Sword-of-Destiny-Au...,2015.0
5,Andy Weir,Contemporary,Wil Wheaton,1040.0,10 hrs and 59 mins,,4.8,Publisher's Summary A brand-n...,The Martian,659.0,https://www.audible.com/pd/The-Martian-Audiobo...,
6,Andrzej Sapkowski,Epic,Peter Kenny,7272.0,11 hrs and 55 mins,"The Witcher Saga, Book 2",4.7,Publisher's Summary The Witch...,The Time of Contempt,715.0,https://www.audible.com/pd/The-Time-of-Contemp...,2013.0
7,Brandon Sanderson,Contemporary,Suzy Jackson,5848.0,14 hrs and 30 mins,"Skyward, Book 2",4.8,Publisher's Summary The seque...,Starsight,870.0,https://www.audible.com/pd/Starsight-Audiobook...,2019.0
8,Amy Harmon,"Romance, ...",Rob Shapiro,716.0,14 hrs and 32 mins,,4.7,Publisher's Summary From the ...,The First Girl Child,872.0,https://www.audible.com/pd/The-First-Girl-Chil...,2019.0
9,Andrzej Sapkowski,Epic,Peter Kenny,5844.0,11 hrs and 59 mins,"The Witcher Saga, Book 3",4.8,Publisher's Summary The New Y...,Baptism of Fire,719.0,https://www.audible.com/pd/Baptism-of-Fire-Aud...,2014.0


In [19]:
# save to file
scifi_df.to_csv('audible_scifi.csv', index=False)