In [None]:
## UIUX:

import html
import time

import ipywidgets as widgets
from IPython.display import display
from ipywidgets import Layout, Button, Box
from ipywidgets import IntProgress
from selenium.common.exceptions import NoSuchElementException, WebDriverException

In [None]:
## Will be parsing JSON data, so import JSON
import json

In [None]:
## We will need to URL Encode the following search queries:
## GoodReads.com
## Stitcher.com 
 
import urllib.parse

In [None]:
## Add in the Selenium framework to automate Chrome:
import selenium
import selenium.webdriver as webdriver


In [None]:
## Import the selection tools from Selenium:

from selenium.webdriver.common.by import By 

In [None]:
## Load tools for waiting and testing for elements inside DOM

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


In [None]:
## Create and add in our custom options:
## The 'headless' argument asks Chrome to run without opening a visible window.

custom_options = webdriver.ChromeOptions()
custom_options.add_argument('headless')

In [None]:
## Edgecase:
## Needed for the Mybinder.org Google Cloud deployment, where browser data
## has to be written under /tmp/ instead of the shared-memory device.

custom_options.add_argument('--disable-dev-shm-usage')

In [None]:
## Create the browser:
## Starts a Chrome session through Selenium using `custom_options`.
## Every later scraping cell drives this single `browser` instance.

browser = webdriver.Chrome(options=custom_options)

In [None]:
## Edgecase:
## - NewYorkTimes Fiction Years are consistent beyond 1940.
## - Non-Fiction starts around the year 2000.

## UIUX:
## Create the slider and the text entry used to pick a valid year.
## Both widgets share the same bounds so neither can hold an unsupported year.

year_slider = widgets.IntSlider(
    min=1941,
    max=2019,
    step=1,
    value=2010,
    description='Year:',
    disabled=False,
    orientation='horizontal',
)

## BoundedIntText is the integer text box that enforces min/max.
## Plain IntText has no `min`, `max` or `orientation` traits, so those
## arguments were not constraining what the user could type.
year_textbox = widgets.BoundedIntText(
    min=year_slider.min,
    max=year_slider.max,
    value=year_slider.value,
    description='1941-2019:',
    disabled=False
)


## Mirror the two widgets' values in the browser front end.
widgets.jslink((year_textbox, 'value'), (year_slider, 'value'))

def on_change(value_to_watch):
    """Point the notebook-level ``nytimes_url`` at the newly selected year.

    Intended to be registered as an observer of ``year_slider``'s ``value``.

    Args:
        value_to_watch: change dictionary supplied by the widget observer;
            only its ``'new'`` entry (the newly selected year) is read.
    """
    # Without this declaration the assignment below only creates a local
    # variable that is discarded on return, so the callback had no effect.
    global nytimes_url
    # The slider already holds value_to_watch['new'] when its observer fires,
    # so it is not written back to the slider here.
    nytimes_url = 'https://en.wikipedia.org/wiki/The_New_York_Times_Fiction_Best_Sellers_of_{}'.format(value_to_watch['new'])

## Call on_change every time the slider's `value` trait changes.
year_slider.observe(on_change, names='value')

## Render both controls in this cell's output.
display(year_slider, year_textbox)


In [None]:
## Create the New York Times Best Seller Historical List:
## Build the Wikipedia URL for the year currently shown on the slider,
## load that page in the browser, and echo the URL for reference.
## NOTE: re-run this cell after changing the year, otherwise the browser
## still holds the previously loaded page.

nytimes_url = 'https://en.wikipedia.org/wiki/The_New_York_Times_Fiction_Best_Sellers_of_{}'.format(year_slider.value)
browser.get(nytimes_url)
print( nytimes_url )

In [None]:
## Table selection of row data via CSS_SELECTOR and child selection via nth-child()
## Titles:  the 2nd <td> of a row, when an earlier sibling <td> is third-from-last.
## Authors: the 3rd <td> of a row, when an earlier sibling <td> is third-from-last.
## NOTE(review): the two selectors are not symmetric. In a 4-cell row the
## author selector can match while the title selector cannot, which would
## shift the title/author pairing made later. Confirm against the page layout.

element_titles = browser.find_elements(By.CSS_SELECTOR, 'td:nth-last-child(3) ~ td:nth-child(2)' )
element_authors = browser.find_elements(By.CSS_SELECTOR, 'td:nth-last-child(3) ~ td:nth-child(3)' )


In [None]:
## Build one {'title', 'author'} dictionary per scraped table row by pairing
## each title cell with the author cell at the same position.
## zip() stops at the shorter of the two element lists.

unsorted_books = []

for title_cell, author_cell in zip(element_titles, element_authors):
    unsorted_books.append({
        'title': title_cell.get_attribute('innerText'),
        'author': author_cell.get_attribute('innerText'),
    })

In [None]:
## Notes:

## Dictionaries are not hashable, so they cannot be put in a set directly.
## Each book is therefore serialised with json.dumps into a (hashable) string,
## the strings are collected in a set to drop exact duplicates,
## and json.loads turns every surviving string back into a dictionary.

## The de-duplicated books are finally sorted by their 'title' value.

books = sorted(
    (
        json.loads(encoded_book)
        for encoded_book in {json.dumps(book) for book in unsorted_books}
    ),
    key=lambda book: book['title'],
)

## Debug:
## confirm that books are now sorted by alphanum via title, 
## print( books )

In [None]:
## UI widget for book selection
## One "title : author" label per book, in exactly the same order as `books`.
##
## Do NOT sort this list separately: the selection widget reports *positions*
## in this list, and those positions are used to index `books`. Sorting the
## labels can order them differently from `books` (e.g. "Foo 2 : B" sorts
## before "Foo : A", although the title "Foo" sorts before "Foo 2"), which
## would make a selection resolve to the wrong book.

ui_list = ['{} : {}'.format(item['title'], item['author']) for item in books]

In [None]:
## ui_html = widgets.HTML

## Multi-select list box offering every label in `ui_list`.
## Its `.index` attribute is read by later cells as positions into `books`.
ui_book_selection = widgets.SelectMultiple(
    options=ui_list,
    rows=5,
    layout={'width': '80%'},
    description='Book Selection:',
    disabled=False
)

## Render the list box; make a selection before running the next cell.
display( ui_book_selection )

In [None]:
## Snapshot the titles of the books currently selected in the widget,
## resolving each selected position against `books`.
book_covers_to_fetch = [
    books[selected]['title'] for selected in ui_book_selection.index
]

## Debug:
## - Confirm that user has not deselected any items
print(book_covers_to_fetch)

In [None]:
## Edgecase:
## Wikipedia is unreliable for finding consistent book covers!
## Use GoodReads search URLs instead, one per selected book.
## Shape of each URL (the query is percent-encoded "AUTHOR TITLE"):
## https://www.goodreads.com/search?utf8=%E2%9C%93&q={AUTHOR}%20{TITLE}&search_type=books

book_queries = []

for position in range(len(book_covers_to_fetch)):
    book = books[ui_book_selection.index[position]]

    ## Edgecase:
    ## Titles that are only two letters long, or that are common words,
    ## return no useful results unless the author name is part of the query.
    book_queries.append(
        'https://www.goodreads.com/search?utf8=%E2%9C%93&q='
        + urllib.parse.quote(book['author'] + ' ' + book['title'])
        + '&search_type=books'
    )


In [None]:
## UIUX:
## Progress bar and status line shown while the book queries are loading.
## TODO: could be a dynamic function that calls len(list_to_measure)
## NOTE:
## increment with the following:
## ui_progress_bar.value += 1
## ui_progress_count += 1

## Shared flexbox layout: children stacked in a column and centred.
flex_layout = Layout(
    display='flex',
    flex_direction='column',
    flex_wrap='wrap',
    justify_content='center',
    align_items='center',
    align_content='center',
    border='none',
    height='100%',
    width='auto'
)

## `ui_text` is the widget the scraping loops write status messages to.
ui_text = widgets.Text(value='status: none')

## `ui_status` is the read-only (disabled) text box that is displayed.
ui_status = widgets.Text(
    value = ui_text.value,
    disabled = True,
    layout = flex_layout
)


## The bar's maximum is fixed here from the number of book queries, so this
## cell must run after `book_queries` has been built.
ui_progress_count = 0
ui_progress_bar = widgets.IntProgress(
    min=0,
    max=len(book_queries),
    layout={'width':'100%'}
)

## Mirror ui_text's value into ui_status in the browser front end.
widgets.jslink((ui_text, 'value'), (ui_status, 'value'))

In [None]:
## Reset lists:
## Clears the state used by the cover-scraping loop, so that re-running the
## loop does not append to results from an earlier run.
book_images = []
book_covers = []
book_urls = []
url_match = None
# Timeout handed to WebDriverWait when waiting for page elements.
timeout_delay = 5

In [None]:
## UIUX:
display(ui_progress_bar, ui_status)


def _css_string(text):
    """Return ``text`` made safe for a double-quoted CSS attribute value.

    Backslashes and double quotes are escaped, and line breaks (which cannot
    appear inside a CSS string) are replaced by single spaces.
    """
    escaped = text.replace('\\', '\\\\').replace('"', '\\"')
    return ' '.join(escaped.splitlines())


def _find_goodreads_book_url(driver, wanted_title):
    """Pick the book URL that best matches ``wanted_title`` on a results page.

    Tried in order, returning the first hit:
      1. a result row whose first text line equals the title exactly;
      2. a link whose ``title`` attribute contains the title;
      3. the first link of the first result row.

    Args:
        driver: Selenium driver currently showing a GoodReads search page.
        wanted_title: title of the book being looked up.

    Returns:
        The ``href`` of the chosen link, or ``None`` when the page offers no
        usable link at all.
    """
    wanted = (wanted_title or '').strip()

    if wanted:
        ## Edgecase:
        ## Image alt-texts are sometimes truncated, so compare against the
        ## first text line of each table row instead.
        for row in driver.find_elements(By.TAG_NAME, 'tr'):
            row_lines = (row.get_attribute('innerText') or '').splitlines()
            if not row_lines or row_lines[0].strip() != wanted:
                continue
            row_links = row.find_elements(By.TAG_NAME, 'a')
            if row_links:
                # Stop at the first exact match so later rows cannot replace it.
                return row_links[0].get_attribute('href')

        ## Edgecase:
        ## Some results truncate the title or append a subtitle.
        ## Fall back to a "title attribute contains" selector.
        partial_matches = driver.find_elements(
            By.CSS_SELECTOR, 'a[title*="{}"]'.format(_css_string(wanted))
        )
        if partial_matches:
            return partial_matches[0].get_attribute('href')

    ## Edgecase:
    ## Nothing resembles the title: settle for the first search result.
    first_results = driver.find_elements(By.CSS_SELECTOR, 'tbody > tr:first-child > td > a')
    if first_results:
        return first_results[0].get_attribute('href')
    return None


## Loop through each book query.
## `book_covers` receives exactly one entry per query: the cover image URL,
## or '' when none could be found. That keeps it index-aligned with the
## selection even when a lookup fails.
for item in range(len(book_queries)):
    ui_progress_bar.value += 1

    selected_title = book_covers_to_fetch[item]

    # Debug:
    ui_text.value = 'now searching: {}'.format(book_queries[item])

    browser.get( book_queries[item] )
    browser.implicitly_wait(1)

    ## Edgecase:
    ## An email-signup popup may appear when loading several URLs in a row.
    ## Dismissing it is best effort: a failed click must not abort the scrape.
    try:
        for icon in browser.find_elements(By.CSS_SELECTOR, 'img[alt^="Dismiss"]'):
            icon.click()
    except WebDriverException:
        ui_text.value = 'goodreads: could not dismiss popup'

    try:
        element_present = EC.presence_of_element_located((By.CLASS_NAME, 'bookCover'))
        WebDriverWait(browser, timeout_delay).until(element_present)
    except TimeoutException:
        ui_text.value = 'goodreads: no book cover found'

    # Resolved afresh for every book so that a failed lookup can never reuse
    # the URL found for the previous book.
    url_match = _find_goodreads_book_url(browser, selected_title)

    if url_match is None:
        ui_text.value = 'goodreads: no search result found'
        book_covers.append('')
        ui_progress_count += 1
        continue

    ui_text.value = 'status: {}'.format(url_match)
    browser.get(url_match)

    try:
        element_present = EC.presence_of_element_located((By.CLASS_NAME, 'coverImage'))
        WebDriverWait(browser, timeout_delay).until(element_present)
    except TimeoutException:
        ui_text.value = 'goodreads: no cover image found'

    # find_elements returns an empty list instead of raising when the book
    # page has no cover, so one missing cover does not stop the whole loop.
    cover_elements = browser.find_elements(By.CSS_SELECTOR, '.editionCover > img')
    cover_image = (cover_elements[0].get_attribute('src') or '') if cover_elements else ''
    if cover_image:
        ui_text.value = 'status: {}'.format(cover_image)
    book_covers.append( cover_image )
    ui_progress_count += 1

ui_text.value = 'status: complete'

In [None]:
## Build one Stitcher episode-search URL per selected book.
## The query is the percent-encoded "TITLE AUTHOR" of the book.
stitcher_queries = []

for selected in ui_book_selection.index:
    book = books[selected]
    stitcher_queries.append(
        'https://www.stitcher.com/search?q='
        + urllib.parse.quote(book['title'] + ' ' + book['author'])
        + '#episodes'
    )

In [None]:
## Note:
## Get the first Stitcher episode result for each query.
## Every result list below receives exactly one entry per query, in query
## order. A value that could not be scraped is stored as '' so the lists stay
## the same length and stay aligned with the selected books.
## TODO: could refactor this into an object to call properties

stitcher_results = []
podcast_covers = []
podcast_descriptions = []
podcast_urls = []
podcast_episodes = []
podcast_publishers = []

## UIUX:
## Progress bar for loading the podcast queries
## TODO: could be a dynamic function that calls len(list_to_measure)
## reset the progress_count and text_status

ui_progress_count = 0
ui_progress_bar.value = 0
ui_progress_bar.max = len(stitcher_queries)
ui_text = widgets.Text(value='status: none')

widgets.jslink((ui_text, 'value'), (ui_status, 'value'))

display(ui_progress_bar,ui_status)


## Fetch the Podcast Queries:

timeout_delay = 5
# Selector prefix for the first entry of the episode results list.
first_episode_link = 'ul[id^="episodeResultsList"] > li:first-child > a'

for query in stitcher_queries:
    # Count each query exactly once, whether or not its scrape succeeds.
    ui_progress_bar.value += 1
    ui_progress_count += 1

    ui_text.value = 'status: {}'.format(query)
    browser.get( query )
    browser.implicitly_wait(1)

    # Placeholders, overwritten field by field as the scrape progresses.
    pod_episode_text = ''
    pod_cover_src = ''
    pod_description_text = ''
    pod_producer_text = ''
    pod_audio = ''

    try:
        element_present = EC.presence_of_element_located((By.CLASS_NAME, 'play'))
        WebDriverWait(browser, timeout_delay).until(element_present)

        pod_result = browser.find_element(By.CSS_SELECTOR, first_episode_link)

        pod_cover = browser.find_element(By.CSS_SELECTOR, first_episode_link + ' > img')
        pod_description = browser.find_element(By.CSS_SELECTOR, first_episode_link + ' > p')
        pod_episode = browser.find_element(By.CSS_SELECTOR, first_episode_link + ' > h4')
        pod_producer = browser.find_element(By.CSS_SELECTOR, first_episode_link + ' > div[class^="info"]')

        ## Read the podcast data before clicking to retrieve the audio URL
        pod_episode_text = pod_episode.get_attribute('innerText') or ''
        pod_cover_src = pod_cover.get_attribute('src') or ''
        pod_description_text = pod_description.get_attribute('innerText') or ''
        pod_producer_text = pod_producer.get_attribute('innerText') or ''

        pod_result.click()
        pod_audio = browser.find_element(By.TAG_NAME, 'audio').get_attribute('src') or ''

        ui_text.value = 'status: {}'.format(pod_audio)

    except TimeoutException:
        ui_text.value = 'Error: Timed Out Waiting For Element Presence'
    except NoSuchElementException:
        ui_text.value = 'Error: Podcast Element Not Found'

    # Append to every list together so a partial failure cannot leave the
    # lists with different lengths.
    podcast_episodes.append(pod_episode_text)
    podcast_covers.append(pod_cover_src)
    podcast_descriptions.append(pod_description_text)
    podcast_publishers.append(pod_producer_text)
    stitcher_results.append(pod_audio)

ui_text.value = 'status: complete'

In [None]:
## Render one result card per book: book cover, podcast cover, podcast
## details and an audio player.


def _html_text(value):
    """Return ``value`` as text that is safe to place inside HTML.

    The values rendered below were scraped from third-party pages, so they are
    escaped (including quotes, since some land inside attributes) to prevent
    them from injecting markup. ``None`` is rendered as an empty string.
    """
    return html.escape('' if value is None else str(value), quote=True)


html_data = []

# zip() pairs the lists position by position and stops at the shortest one,
# so lists of unequal length cannot raise an IndexError here.
for card_values in zip(
    book_covers,
    podcast_covers,
    podcast_publishers,
    podcast_episodes,
    podcast_descriptions,
    stitcher_results,
):
    html_data.append('''
    <div class="container" style="display: flex; flex-direction: row; flex-wrap: wrap; width: 100%;">
        
        <img style="height: 150px; width:120px;" src="{}">
        <img style="height: 150px; width: 140px;" src="{}">
        
        <div style="flex-direction: column; flex-basis: 100%; flex: 1; justify-content: center; padding-left:10px;">
            <p style="flex-basis: 50%;">
                <b id="publisher">{}</b>
                <h5 id="episode">Episode: {}</h5>
                <h5 style="font-style:italic;" id="description">Description: {}</h5>
            </p>
            <audio style="flex-basis: 50%;" controls src="{}"></audio>
        </div>
    </div>
    '''.format(*[_html_text(value) for value in card_values])
)


display( widgets.HTML( ''.join(html_data) , layout=flex_layout) )