<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#About" data-toc-modified-id="About-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>About</a></span></li><li><span><a href="#Setup" data-toc-modified-id="Setup-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Setup</a></span><ul class="toc-item"><li><span><a href="#Import" data-toc-modified-id="Import-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Import</a></span></li><li><span><a href="#Initialization" data-toc-modified-id="Initialization-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Initialization</a></span></li></ul></li><li><span><a href="#Get-Good-Book-Links" data-toc-modified-id="Get-Good-Book-Links-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Get Good Book Links</a></span><ul class="toc-item"><li><span><a href="#Save-Book-Links" data-toc-modified-id="Save-Book-Links-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Save Book Links</a></span></li></ul></li><li><span><a href="#Extract-Download-Links" data-toc-modified-id="Extract-Download-Links-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Extract Download Links</a></span><ul class="toc-item"><li><span><a href="#Save-Download-Links" data-toc-modified-id="Save-Download-Links-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Save Download Links</a></span></li></ul></li></ul></div>

# About

In the following, LibriVox is scraped for good data points. 

A good data point is defined as being a complete, solo project. Additionally, only one example per reader is desired.

The result is a list of download links, with at most one link per reader.

# Setup
## Import

In [1]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import pickle as pkl

## Initialization

In [2]:
# Location of the ChromeDriver executable handed to Selenium.
path_chromedriver = '/anaconda3/chromedriver'

# Advanced-search URL template; `{}` is filled in with the result page number.
# The query restricts results to status=complete and project_type=solo.
search_url = (
    "https://librivox.org/search"
    "?title=&author=&reader=&keywords=&genre_id=0"
    "&status=complete&project_type=solo"
    "&recorded_language=&sort_order=alpha"
    "&search_page={}&search_form=advanced"
)

# Number of search-result pages to walk (as of 10/06/2019).
no_pages = 322

# Get Good Book Links

Scrape LibriVox for audiobooks that are complete and recorded by a single reader.

In [3]:
# Walk every search-result page and collect the links of complete books.
# Produces `book_links`: unique book-page URLs in first-seen order.
driver = webdriver.Chrome(path_chromedriver)
book_links = []     # List of books to download
seen_links = set()  # Same links as a set, so the duplicate check is O(1)

try:
    for page in range(1, 1 + no_pages):
        if page % 10 == 0:
            print('{} of {}'.format(page, no_pages))

        # Load page
        driver.get(search_url.format(page))

        # Wait (up to 100 s) until search results have been loaded
        results_loaded = EC.presence_of_element_located((By.CLASS_NAME, "catalog-result"))
        WebDriverWait(driver, 100).until(results_loaded)

        # Soupify HTML
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Get results
        results_list = soup.find('ul', {'class': 'browse-list'})
        results_links = results_list.find_all('li', {'class': 'catalog-result'})

        # Extract relevant book links
        for result in results_links:
            # Extract relevant result info
            result_data = result.find('div', {'class': 'result-data'})
            book_meta = result_data.find('p', {'class': 'book-meta'})
            link = result_data.a["href"]

            # Conditions for good datum.
            # Use `in`, not str.find(): find() returns -1 (truthy) when the
            # text is missing and 0 (falsy) when it is at the very start, so
            # it cannot be used directly as a boolean.
            is_complete = "Complete" in str(book_meta)
            is_new = link not in seen_links

            if is_complete and is_new:
                seen_links.add(link)
                book_links.append(link)
finally:
    # Always release the browser and its driver process, even if a page fails.
    driver.quit()

10 of 322
20 of 322
30 of 322
40 of 322
50 of 322
60 of 322
70 of 322
80 of 322
90 of 322
100 of 322
110 of 322
120 of 322
130 of 322
140 of 322
150 of 322
160 of 322
170 of 322
180 of 322
190 of 322
200 of 322
210 of 322
220 of 322
230 of 322
240 of 322
250 of 322
260 of 322
270 of 322
280 of 322
290 of 322
300 of 322
310 of 322
320 of 322


## Save Book Links

In [4]:
# Persist the collected book-page links so later stages can reuse them
# without repeating the search-page scrape.
with open('book_links.pkl','wb') as book_links_file:
    pkl.dump(book_links, book_links_file)

# Extract Download Links

In [5]:
# Accumulators filled in while visiting each book page.
# readers / download_links / sizes grow together, one entry per kept book;
# bad_links collects book pages that had no download button.
readers, download_links = [], []
sizes, bad_links = [], []

In [None]:
# Visit every book page and keep one download link per reader.
# Results are appended to the lists created in the previous cell:
#   readers / sizes / download_links -- parallel lists, one entry per kept book
#   bad_links -- pages that could not be used (no download button, or
#                product details missing / too short to read)
driver = webdriver.Chrome(path_chromedriver)

try:
    for i, link in enumerate(book_links):
        if i % 10 == 0:
            print('{} of {}'.format(i, len(book_links)))
        # Used to avoid connections being blocked from LibriVox.org.
        # Skipped at i == 0 because the driver has only just been started.
        if i > 0 and i % 100 == 0:
            print('Refreshing Driver')
            driver.quit()
            driver = webdriver.Chrome(path_chromedriver)

        driver.get(link)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        download_button = soup.find('a', {'class': 'book-download-btn'})
        product_details = soup.find('dl', {'class': 'product-details clearfix'})
        details = product_details.find_all('dd') if product_details is not None else []

        # The reader is read from the 4th <dd> and the size from the 2nd, so a
        # usable page needs the download button and at least four <dd> entries.
        # Unusable pages are recorded and skipped; falling through would reuse
        # `reader`/`size_mb`/`download_link` from an earlier iteration.
        if download_button is None or len(details) < 4:
            if link not in bad_links:
                bad_links.append(link)
            continue

        download_link = download_button['href']
        reader = details[3].get_text()
        size_mb = details[1].get_text()
        try:
            # Drop the two-character unit suffix before converting to a number.
            size_mb = float(size_mb[:-2])
        except ValueError:
            # Best effort: keep the raw text when it is not "<number><unit>".
            pass

        # Only the first book encountered for each reader is kept.
        if reader not in readers:
            readers.append(reader)
            sizes.append(size_mb)
            download_links.append(download_link)
finally:
    # Always release the browser and its driver process, even on error.
    driver.quit()

0 of 6510
Refreshing Driver
10 of 6510
20 of 6510
30 of 6510
40 of 6510
50 of 6510
60 of 6510
70 of 6510
80 of 6510
90 of 6510
100 of 6510
Refreshing Driver
110 of 6510
120 of 6510
130 of 6510
140 of 6510
150 of 6510
160 of 6510
170 of 6510
180 of 6510
190 of 6510
200 of 6510
Refreshing Driver
210 of 6510
220 of 6510
230 of 6510
240 of 6510
250 of 6510
260 of 6510
270 of 6510
280 of 6510
290 of 6510
300 of 6510
Refreshing Driver
310 of 6510
320 of 6510
330 of 6510
340 of 6510
350 of 6510
360 of 6510
370 of 6510
380 of 6510
390 of 6510
400 of 6510
Refreshing Driver
410 of 6510
420 of 6510
430 of 6510
440 of 6510
450 of 6510
460 of 6510
470 of 6510
480 of 6510
490 of 6510
500 of 6510
Refreshing Driver
510 of 6510
520 of 6510
530 of 6510
540 of 6510
550 of 6510
560 of 6510
570 of 6510
580 of 6510
590 of 6510
600 of 6510
Refreshing Driver
610 of 6510
620 of 6510
630 of 6510
640 of 6510
650 of 6510
660 of 6510
670 of 6510
680 of 6510
690 of 6510
700 of 6510
Refreshing Driver
710 of 6510
720

5610 of 6510
5620 of 6510
5630 of 6510
5640 of 6510
5650 of 6510
5660 of 6510
5670 of 6510
5680 of 6510
5690 of 6510
5700 of 6510
Refreshing Driver
5710 of 6510
5720 of 6510
5730 of 6510
5740 of 6510
5750 of 6510
5760 of 6510
5770 of 6510
5780 of 6510
5790 of 6510
5800 of 6510
Refreshing Driver
5810 of 6510
5820 of 6510
5830 of 6510
5840 of 6510
5850 of 6510
5860 of 6510
5870 of 6510
5880 of 6510
5890 of 6510
5900 of 6510
Refreshing Driver
5910 of 6510
5920 of 6510
5930 of 6510
5940 of 6510
5950 of 6510
5960 of 6510
5970 of 6510
5980 of 6510
5990 of 6510
6000 of 6510
Refreshing Driver
6010 of 6510
6020 of 6510
6030 of 6510
6040 of 6510
6050 of 6510
6060 of 6510
6070 of 6510
6080 of 6510
6090 of 6510


## Save Download Links

In [None]:
# Persist the per-reader download links gathered from the book pages.
with open('download_links.pkl','wb') as download_links_file:
    pkl.dump(download_links, download_links_file)
    

In [9]:
# Number of book-page links collected by the search-page scrape.
len(book_links)

6510

In [11]:
# Sanity check: show the second collected book-page link.
book_links[1]

'https://librivox.org/the-30000-bequest-and-other-stories-by-mark-twain/'