In [1]:
from ipywidgets import IntProgress
from IPython.display import display
import time

In [2]:
## Add in the Selenium framework to automate Chrome:

import json
import selenium
import selenium.webdriver as webdriver


In [3]:
## Create and add in our custom options:

custom_options = webdriver.ChromeOptions()
custom_options.add_argument('headless')

In [4]:
## Edgecase:
## Needed for Mybinder.org Google Cloud deployment when saving browser data to cache and forcing /tmp/

custom_options.add_argument('--disable-dev-shm-usage')

In [5]:
## Create the browser:

browser = webdriver.Chrome(options=custom_options)

In [6]:
## Edgecase:
## - NewYorkTimes Fiction Years are consistent beyond 1940.
## - Non-Fiction starts around the year 2000.

year_slider = widgets.IntSlider(
    min=1941,
    max=2019,
    step=1,
    value=2010,
    description='Year:',
    disabled=False,
    orientation='horizontal',
)

year_textbox = widgets.IntText(
    min=1941,
    max=2019,
    value=year_slider.value,
    description='1941-2019:',
    orientation='horizontal',
    disabled=False
)


widgets.jslink((year_textbox, 'value'), (year_slider, 'value'))


display(year_slider, year_textbox)


IntSlider(value=2010, description='Year:', max=2019, min=1941)

IntText(value=2010, description='1941-2019:')

In [7]:
## Create the New York Times Best Seller Historical List:

nytimes_url = 'https://en.wikipedia.org/wiki/The_New_York_Times_Fiction_Best_Sellers_of_{}'.format(year_slider.value)

In [8]:
print( nytimes_url )

https://en.wikipedia.org/wiki/The_New_York_Times_Fiction_Best_Sellers_of_2010


In [9]:
browser.get(nytimes_url)

In [10]:
## Import the selection tools from Selenium:

from selenium.webdriver.common.by import By 

In [11]:
## Table selection of row data via CSS_SELECTOR and child selection via nth-child()

element_titles = browser.find_elements(By.CSS_SELECTOR, 'td:nth-last-child(3) ~ td:nth-child(2)' )
element_authors = browser.find_elements(By.CSS_SELECTOR, 'td:nth-last-child(3) ~ td:nth-child(3)' )



In [12]:
## Create JSON structure via a Python Dictionary

unsorted_books = [{
    
    'title': title.get_attribute('innerText'),
    'author': author.get_attribute('innerText')}
    
    for title, author in zip( element_titles,element_authors )
]

## Debug:
# print( unsorted_books[0] )

In [13]:
## Debug:
## confirm that all results are the same length, and scraped properly

# print('json:' , len(unsorted_books) )
# print('tites:', len(element_titles) )
# print('authors:', len(element_authors) )

In [14]:
## Notes:

## Map the json.dumps,
## to encode the dictionaries as json objects which are immutable.

## Set can then be used to produce an iterable of unique immutables.

## Finally, we convert back to our dictionary representation using json.loads
## Note that initially, one must sort by keys to arrange the dictionaries in a unique form.

books = sorted(list(
    map(
        json.loads, set( map(json.dumps, unsorted_books) )
    )
),key=lambda key_value: key_value['title']) 

## Debug:
## confirm that books are now sorted by alphanum via title, 
## print( books )

In [15]:
## UI widget for book selection

ui_list = []

for item in books:
    ui_list.append('{} : {}'.format(item['title'],item['author'])) 

ui_list.sort()

In [16]:
## Note:
## Deprecated in favor of multi selection

# ui_index = widgets.IntText(
#     min=0,
#     max=(len(ui_list)-1),
#     value=0,
#     step=1,
#     description='index:',
#     disabled=False
# )


# ui_dropdown = widgets.Dropdown(
#     options=ui_list,
#     description='books:',
#     disabled=False,
# )


# widgets.jslink( (ui_index, 'value'), (ui_dropdown, 'index'))

# display(ui_dropdown)

## Debug:
## confirm that book selection and ui widget are the correct output
# print(books[ui_dropdown.index]['title'])
# print(books[ui_dropdown.index]['author'])

In [17]:
# ui_html = widgets.HTML

ui_book_selection = widgets.SelectMultiple(
    options=ui_list,
    rows=5,
    layout={'width': '80%'},
    description='Book Selection:',
    disabled=False
)

display( ui_book_selection )

SelectMultiple(description='Book Selection:', layout=Layout(width='80%'), options=('61 Hours : Lee Child', 'Am…

In [18]:
book_covers_to_fetch = []

for item in range(len(ui_book_selection.index)):
    selected = ui_book_selection.index[item]
    book_covers_to_fetch.append(books[selected]['title'])
#     book_covers_to_fetch.append(
#         str(
#             '{} {}'.format(books[selected]['title'],books[selected]['author'])
#         )
#     )


print(book_covers_to_fetch)

['The Help', 'The Postcard Killers', 'Towers of Midnight', 'Worth Dying For']


In [19]:
## Documentation:
## Complex css selector, to avoid using loops and itterators with javascript or python
## Example:
## browser.find_element(By.CSS_SELECTOR, 'a[title*="{}"]'.format( books[ui_dropdown.index]['title']) )

In [20]:
## We will need to URL Encode the following search queries:
## GoodReads.com
## ListenNotes.com 
 
import urllib.parse

In [21]:
## Edgecase:
## Wikipedia is unreliable for finding consitent book covers!
## Use GoodReads search urls to select the first result
## Example: 
## https://www.goodreads.com/search?utf8=%E2%9C%93&q={TITLE}+{AUTHOR}&search_type=books

book_queries = []

for item in range(len(book_covers_to_fetch)):
    selected = ui_book_selection.index[item]
    
    ## Edgecase:
    ## Searching for book titles that are two letters, or common words
    ## Must include the author name to generate results
    
    book_search_query = [
        books[selected]['author'],
        ' ',
        books[selected]['title']
    ]
    
    goodreads_url = [
        'https://www.goodreads.com/search?utf8=%E2%9C%93&q=',
        urllib.parse.quote(''.join(book_search_query)),
        '&search_type=books'
    ]
    
    book_queries.append(''.join(goodreads_url))

        
#     listnotes_url = 'https://www.goodreads.com/search?utf8=%E2%9C%93&q=',        
#     book_queries.append(
#         'https://www.goodreads.com/search?utf8=%E2%9C%93&q={}&search_type=books'.format(urllib.parse.quote(item))
#     )

print( book_queries[0] )

https://www.goodreads.com/search?utf8=%E2%9C%93&q=Kathryn%20Stockett%20The%20Help&search_type=books


In [23]:
book_images = []
book_covers = []
book_urls = []
url_match = None
progress_bar = widgets.IntProgress(
    min=0,
    max=len(book_queries),
    layout={'width': '100%'},
) # instantiate the bar

progress_count = 0

In [24]:
display(progress_bar)

## Loop throught each Book Query:
for item in range(len(book_queries)):
    progress_bar.value += 1
    selected_item = ui_book_selection.index[item]
    selected_title = book_covers_to_fetch[item]
    # Debug:
    print('now searching: ', book_queries[item] )
    browser.get( book_queries[item] )
    ## Edgecase:
    ## Some titles are truncated and shortend when selecting the image text
    ## Use the span selection text for more reliable results.
    result_elements = browser.find_elements(By.TAG_NAME, 'tr')
    
    ## Edgecase:
    ## Search the book results from the query for the EXACT match of the selected title
    ## Select the correct result and scrape the URL
    ## Test currect selected title against found table row results from seach query
    for result in range(len(result_elements)):
        title_result = result_elements[result].get_attribute('innerText').splitlines()[0].strip()
        ## Debug:
        # print(title_result)
        if len(title_result) is len(selected_title):
            ## Debug:
            # print('match: ', result)
            url_match = result_elements[result].find_element(By.TAG_NAME, 'a').get_attribute('href')
        else:
        ## Edgecase:
        ## There are books that truncate or add in extra sub titles to the match
        ## In that event, use a CSS_SELECTOR regex for nearest match.
            url_match = browser.find_element(By.CSS_SELECTOR, 'a[title*="{}"]'.format( selected_title ) ).get_attribute('href')

        
            

    ## Debug:
    # print('url_match: ', url_match )
    browser.get(url_match)
    cover_image = browser.find_element(By.CSS_SELECTOR, '.editionCover > img').get_attribute('src')
    book_covers.append( cover_image )
    progress_count += 1

print('complete')

IntProgress(value=0, layout=Layout(width='100%'), max=4)

now searching:  https://www.goodreads.com/search?utf8=%E2%9C%93&q=Kathryn%20Stockett%20The%20Help&search_type=books
now searching:  https://www.goodreads.com/search?utf8=%E2%9C%93&q=James%20Patterson%20and%20Liza%20Marklund%20The%20Postcard%20Killers&search_type=books
now searching:  https://www.goodreads.com/search?utf8=%E2%9C%93&q=Robert%20Jordan%20and%20Brandon%20Sanderson%20Towers%20of%20Midnight&search_type=books
now searching:  https://www.goodreads.com/search?utf8=%E2%9C%93&q=Lee%20Child%20Worth%20Dying%20For&search_type=books
complete


In [25]:
html_data = []
for image in range(len(book_covers)):
     html_data.append('<img sytle="display: inline-block" width="150" src="{}">'
        .format( book_covers[image] )
    )


ui_html = widgets.HTML(
    layout={'width': '100%'},
    value = str(html_data)
)

display( ui_html )

HTML(value='[\'<img sytle="display: inline-block" width="150" src="https://i.gr-assets.com/images/S/compressed…

In [26]:


# for item in range(len(ui_book_selection.index)):
#     selected = ui_book_selection.index[item]
#     print(books[selected]['title'], ' : ' ,books[selected]['author'])

# for item in book_covers_to_fetch:
#     book_queries.append(
#         'https://www.goodreads.com/search?utf8=%E2%9C%93&q={}&search_type=books'.format(urllib.parse.quote(item))
#     )

# print( book_queries )

In [45]:
stitcher_queries = []
listennotes_queries = []

## Run the code to create the podcast search query URLS:

for item in range(len(ui_book_selection.index)):
    selected = ui_book_selection.index[item]
    selected_book = [ books[selected]['title'],' ',books[selected]['author'] ]
    
    stitcher_query_url = [
        'https://www.stitcher.com/search?q=',
        urllib.parse.quote(''.join(selected_book)),
        '#episodes'
    ]
    
    listennotes_url = [
        'https://www.listennotes.com/search/?q=',
        urllib.parse.quote(''.join(selected_book)),
        '&sort_by_date=0&scope=episode&offset=0&language=Any%20language&len_min=0'
    ]

    stitcher_queries.append(''.join(stitcher_query_url))
    listennotes_queries.append(''.join(listennotes_url))
    
#     selected_book = [ str(books[selected]['title']), ' ', str(books[selected]['author'])]
#     selected_query = urllib.parse.quote(str(selected_book))
#     book_query_url = ['https://www.listennotes.com/search/?q=', selected_query, '&sort_by_date=0&scope=episode&offset=0&language=Any%20language&len_min=0']
#     podcast_queries.append(str(book_query_url))

In [63]:
## Edgecase:
## There are times when there will be ZERO results
## Example:
## https://www.stitcher.com/search?q=Random%20Harvest%20James%20Hilton#episodes
## Possible Solution:
## More work, pulling in both Sticher and ListenNotes results, take the one that has an option

for result in range(len(stitcher_queries)):
    print(
        str(result+1)+':',
        '\n',
        'stitcher: ',
        '\n ‣ ', stitcher_queries[result],
        '\n',
        'listennotes: ',
        '\n ‣', listennotes_queries[result]
    )



# for query in sticher_queries:
#     print('sticher:', query )
    
# for query in listennotes_queries:
#     print('listennotes:', query)

1: 
 stitcher:  
 ‣  https://www.stitcher.com/search?q=The%20Help%20Kathryn%20Stockett#episodes 
 listennotes:  
 ‣ https://www.listennotes.com/search/?q=The%20Help%20Kathryn%20Stockett&sort_by_date=0&scope=episode&offset=0&language=Any%20language&len_min=0
2: 
 stitcher:  
 ‣  https://www.stitcher.com/search?q=The%20Postcard%20Killers%20James%20Patterson%20and%20Liza%20Marklund#episodes 
 listennotes:  
 ‣ https://www.listennotes.com/search/?q=The%20Postcard%20Killers%20James%20Patterson%20and%20Liza%20Marklund&sort_by_date=0&scope=episode&offset=0&language=Any%20language&len_min=0
3: 
 stitcher:  
 ‣  https://www.stitcher.com/search?q=Towers%20of%20Midnight%20Robert%20Jordan%20and%20Brandon%20Sanderson#episodes 
 listennotes:  
 ‣ https://www.listennotes.com/search/?q=Towers%20of%20Midnight%20Robert%20Jordan%20and%20Brandon%20Sanderson&sort_by_date=0&scope=episode&offset=0&language=Any%20language&len_min=0
4: 
 stitcher:  
 ‣  https://www.stitcher.com/search?q=Worth%20Dying%20For%20L

In [22]:
## Load tools for waiting and testing for elements inside DOM

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


In [127]:
## Note:
## Get Stitch 3 podcast results for each query:
stitcher_results = []
podcast_covers = []
podcast_descriptions = []
podcast_urls = []

## stitcher_query_url

for query in stitcher_queries:
    print( query )
    browser.get( query )
    browser.implicitly_wait(1)
    timeout_delay = 5
    try:
        element_present = EC.presence_of_element_located((By.CLASS_NAME, 'play'))
        WebDriverWait(browser, timeout_delay).until(element_present)
        pod_cover = browser.find_element(By.CSS_SELECTOR, 'ul[id^="episodeResultsList"] > li:first-child > a > img')
        pod_description = browser.find_element(By.CSS_SELECTOR, 'ul[id^="episodeResultsList"] > li:first-child > a > p')
        
        podcast_covers.append(pod_cover.get_attribute('src'))
        podcast_descriptions.append(pod_description.get_attribute('innerText'))
        
        pod_result = browser.find_element(By.CSS_SELECTOR, 'ul[id^="episodeResultsList"] > li:first-child > a')
        pod_result.click()
        
        pod_audio = browser.find_element(By.TAG_NAME, 'audio').get_attribute('src')
        
        stitcher_results.append( pod_audio )
        
    except TimeoutException:
        print('Error: Timed Out Waiting For Element Presence')
        stitcher_results.append(None)
        podcast_covers.append(None)
        podcast_descriptions.append(None)

https://www.stitcher.com/search?q=The%20Help%20Kathryn%20Stockett#episodes
https://www.stitcher.com/search?q=The%20Postcard%20Killers%20James%20Patterson%20and%20Liza%20Marklund#episodes
https://www.stitcher.com/search?q=Towers%20of%20Midnight%20Robert%20Jordan%20and%20Brandon%20Sanderson#episodes
https://www.stitcher.com/search?q=Worth%20Dying%20For%20Lee%20Child#episodes


In [125]:
for match in stitcher_results:
    print( match )

https://cdn.simplecast.com/audio/0f24be/0f24bed7-a97a-44f9-acad-ae53dc40c90a/825d7505-62fe-4556-b9a7-736acbb73a9b/1680fe0e_tc.mp3?aid=rss_feed
https://sverigesradio.se/topsy/ljudfil/podrss/5409292.mp3
https://rss.art19.com/episodes/8ea8f676-0a20-429a-9c52-83dcd72851e0.mp3
https://play.podtrac.com/npr-510298/edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/ted/2018/09/20180906_ted_tedpod-ec6a526b-eea4-4020-b496-082b1c4cd4b2.mp3?awCollectionId=510298&awEpisodeId=643774281&orgId=1&d=3180&p=510298&story=643774281&t=podcast&e=643774281&ft=pod&f=510298


In [130]:
html_data = []
for item in range(len(book_covers)):
     html_data.append('''
     
     <p>
         <img sytle="display: inline-block" width="150" src="{}">
         <li sytle="display: inline-block">
            <img sytle="display: inline-block" width="90" src="{}">
            <p sytle="display: inline-block">{}</p>
            <audio controls src="{}"></audio>
         </li>
     </p>
     '''
        .format(
            book_covers[item],
            podcast_covers[item],
            podcast_descriptions[item],
            stitcher_results[item]
            )
    )


ui_html = widgets.HTML(
    layout={'width': '100%'},
    value = str(html_data)
)

display( ui_html )

HTML(value='[\'\\n     \\n     <p>\\n         <img sytle="display: inline-block" width="150" src="https://i.gr…

In [None]:
## Notes:


# First List Element:  ul > li > a > div[class^="play"].click()

# https://www.stitcher.com/podcast/the-joe-rogan-experience/e/46639950?autoplay=false
# document.getElementsByTagName('audio')[0].getAttribute('src')




In [None]:
# for item in range(len(book_queries)):
#     progress_bar.value += 1
#     # selected_item = ui_book_selection.index[item]
#     selected_title = book_covers_to_fetch[item]
#     browser.get( book_queries[item] )
#     ## Edgecase:
#     ## Some titles are truncated and shortend when selecting the image text
#     ## Use the span selection text for more reliable results.
#     result_elements = browser.find_elements(By.TAG_NAME, 'tr')
    
#     ## Edgecase:
#     ## Search the book results from the query for the EXACT match of the selected title
#     ## Select the correct result and scrape the URL
#     ## Test currect selected title against found table row results from seach query
#     for result in range(len(result_elements)):
#         title_result = result_elements[result].get_attribute('innerText').splitlines()[0].strip()
#         ## Debug:
#         # print(title_result)
#         if len(title_result) is len(selected_title):
#             ## Debug:
#             # print('match: ', result)
#             url_match = result_elements[result].find_element(By.TAG_NAME, 'a').get_attribute('href')
#         else:
#         ## Edgecase:
#         ## There are books that truncate or add in extra sub titles to the match
#         ## In that event, use a CSS_SELECTOR regex for nearest match.
#             url_match = browser.find_element(By.CSS_SELECTOR, 'a[title*="{}"]'.format( selected_title ) ).get_attribute('href')

#     ## Debug:
#     # print('url_match: ', url_match )
#     browser.get(url_match)
#     cover_image = browser.find_element(By.CSS_SELECTOR, '.editionCover > img').get_attribute('src')
#     book_covers.append( cover_image )
#     progress_count += 1

# print('complete')