## Imports & Declarations

In [60]:
# import numpy as np
import os
import re
import time
from datetime import datetime, timedelta
from time import time as timer

import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import clear_output
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support import wait
from selenium.webdriver.support.ui import WebDriverWait

## For abandoned approaches
# import sys
# import time
# from PyQt5.QtWidgets import QApplication
# from PyQt5.QtCore import QUrl
# from PyQt5.QtWebEngineWidgets import QWebEngineView


In [40]:
# Default minimum gap, in seconds, that `courtesy_wait` enforces between requests
REQUEST_WAIT_SECS = 3

# NOTE: type paths by hand instead of copying them from Finder. A copied version of
# the test_mp3 path carried invisible non-printing characters (e.g. U+2068), which
# produced "[Errno 2] No such file or directory" for a path that looked correct.
    # only discovered b/c google's search for this page https://www.reddit.com/r/learnpython/comments/aafp0x/how_should_i_open_a_txt_file_that_downloaded/
    # had the npc's encoded in the search result (https://www.google.com/search?q=%5BErrno+2%5D+No+such+file+or+directory%3A+%27..%2F%5Cu2068&rlz=1C5CHFA_enUS845US845&oq=%5BErrno+2%5D+No+such+file+or+directory%3A+%27..%2F%5Cu2068&aqs=chrome..69i57.4788j0j7&sourceid=chrome&ie=UTF-8)

DATA_PATH = '../data/'
TEST_MP3_OUT_PATH = DATA_PATH + 'test_data/test_mp3/'

TEST_FEED_ID = '18812'

# Credentials are read from the environment when available so they do not have to
# live in the notebook.
# FIXME: the literal fallbacks only keep existing runs working; remove them (and
# rotate the password) once BROADCASTIFY_USERNAME / BROADCASTIFY_PASSWORD are set.
USERNAME = os.environ.get('BROADCASTIFY_USERNAME', 'cwchiu')
PASSWORD = os.environ.get('BROADCASTIFY_PASSWORD', 'datascientists')

## Notebook-level functions

In [3]:
def time_to_hhmm(s):
    """
    Convert a 12-hour clock time string to a 24-hour `HHMM` string.

    Parameters
    ----------
    s : str
        A time in `HH:MM AM/PM` format, e.g. '5:25 PM'. Leading and trailing
        whitespace is ignored.

    Returns
    -------
    str
        The same time on a 24-hour clock with no separator, e.g. '1725'.

    Raises
    ------
    ValueError
        If `s` (after stripping) does not match the `%I:%M %p` format.
    """
    # strptime parses the 12-hour string and strftime emits zero-padded HHMM; see
    # https://docs.python.org/2/library/datetime.html#strftime-strptime-behavior
    # and https://stackoverflow.com/questions/19229190/convert-12-hour-into-24-hour-times
    # The strip matters because strptime rejects surrounding whitespace, which
    # text pulled out of an HTML table cell can easily carry.
    return datetime.strptime(s.strip(), '%I:%M %p').strftime('%H%M')

In [4]:
def parse_archive_times_table(archive_page_soup):
    """
    Generates a list of Broadcastify archive file information from
    the `archiveTimes` table on a feed's archive page. Each entry in the
    list has two elements:
        - Date and end time of the transmission in the format YYYYMMDD-HHMM,
          on a 24-hour clock
        - The unique ID for the file, which can be used to find the file's
          individual download page

    Parameters
    ----------
    archive_page_soup : bs4.BeautifulSoup
        A BeautifulSoup object containing the feed archive page source code,
        e.g. from https://m.broadcastify.com/archives/feed/[feed_id]

    Returns
    -------
    list of [str, str]
        One `[end_date_time, file_id]` pair per `<tr>` in the table body, in
        the order the rows appear on the page.
    """
    # Set up a blank list to return
    files_to_download = []

    # Isolate the `archiveTimes` table body
    archive_times = archive_page_soup.find('table', attrs={'id': 'archiveTimes'}).find('tbody')

    # Find the date of transmission of the archived files (same for every row)
    archive_date = get_archive_date(archive_page_soup)

    # Loop through all rows of the table
    for row in archive_times.find_all('tr'):
        # Grab the end time, contained in the row's second <td> tag
        file_end_time = time_to_hhmm(row.find_all('td')[1].text)

        # Represent the date & end time of the file as YYYYMMDD-HHMM
        file_end_date_time = '-'.join([archive_date, file_end_time])

        # Grab the file ID: the last path segment of the row's first link
        file_id = row.find('a')['href'].split('/')[-1]

        # Put the file date/time and URL leaf (as a list) into the list
        files_to_download.append([file_end_date_time, file_id])

    return files_to_download

In [5]:
def get_archive_date(archive_page_soup):
    """
    Read the date currently selected on a feed archive page.

    Parameters
    ----------
    archive_page_soup : bs4.BeautifulSoup
        Parsed archive page. It must contain a `<td class="active day">` cell
        holding the day number and a `<th class="datepicker-switch">` header
        whose text is "<Month name> <year>".

    Returns
    -------
    str
        The date formatted as YYYYMMDD.
    """
    # Position in this tuple doubles as the month number, hence the empty slot 0
    month_names = ('', 'January', 'February', 'March',
                   'April', 'May', 'June',
                   'July', 'August', 'September',
                   'October', 'November', 'December')

    # Pull the raw day and "Month YYYY" strings out of the date picker
    day_text = archive_page_soup.find('td', {'class': 'active day'}).text
    switch_text = archive_page_soup.find('th', {'class': 'datepicker-switch'}).text
    month_name, year = switch_text.split()

    # Zero-pad month and day so the result is always eight characters
    month_number = month_names.index(month_name)
    return f'{year}{month_number:02d}{day_text.zfill(2)}'

In [6]:
def get_mp3_path(download_page_soup):
    """
    Return the URL of the .mp3 archive linked from a file's download page.

    Parameters
    ----------
    download_page_soup : bs4.BeautifulSoup
        Parsed source of an individual archive download page.

    Returns
    -------
    str
        The `href` of the first `<a>` tag whose link contains ".mp3".
    """
    # The dot is escaped so it matches a literal "."; unescaped, the pattern
    # would match any character followed by "mp3" (e.g. ".../xmp3/...").
    return download_page_soup.find('a', {'href': re.compile(r'\.mp3')}).attrs['href']

In [57]:
def courtesy_wait(last_hit, wait_time=REQUEST_WAIT_SECS):
    """
    Report whether enough time has passed since the previous request.

    Parameters
    ----------
    last_hit : float
        Timestamp of the previous request, in seconds since the epoch.
    wait_time : float, optional
        Minimum number of seconds required between requests.

    Returns
    -------
    bool
        True once at least `wait_time` seconds have elapsed since `last_hit`
        (i.e. it is OK to send the next request), False otherwise.
    """
    # `timer` is `time.time`, imported as `from time import time as timer`;
    # the `time` module itself is not imported, so `time.time()` would fail.
    return timer() - last_hit >= wait_time

----

## WORKING VERSION

### Scrape feed archive page with `selenium`

Browser emulation is needed because the page elements we want are rendered by JavaScript after the page loads.

In [7]:
# Instantiate a browser
browser = webdriver.Chrome('../assets/chromedriver')

# try/finally guarantees the browser is closed even if the page never renders
# and the wait below raises a timeout
try:
    # Open the feed archive page
    browser.get('https://m.broadcastify.com/archives/feed/' + TEST_FEED_ID)

    ## Wait (up to 10 seconds) for an element with class "cursor-link" to appear
    WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "cursor-link")))

    # Capture page content as a BSoup
    archive_page_soup = BeautifulSoup(browser.page_source, 'lxml')
finally:
    # Quit the browser
    browser.quit()

### Parse the page's `archiveTimes` table

In [8]:
# Parse the rendered archive page into [YYYYMMDD-HHMM, file ID] pairs
archive_info = parse_archive_times_table(archive_page_soup)
archive_info

[['20190725-1725', '773188378'],
 ['20190725-1655', '773180493'],
 ['20190725-1625', '773175133'],
 ['20190725-1555', '773168047'],
 ['20190725-1525', '773162347'],
 ['20190725-1455', '773153902'],
 ['20190725-1426', '773147529'],
 ['20190725-1356', '773142063'],
 ['20190725-1326', '773135498'],
 ['20190725-1256', '773127936'],
 ['20190725-1226', '773121041'],
 ['20190725-1156', '773113998'],
 ['20190725-1127', '773107706'],
 ['20190725-1057', '773100451'],
 ['20190725-1027', '773093817'],
 ['20190725-0957', '773088542'],
 ['20190725-0927', '773081457'],
 ['20190725-0857', '773074135'],
 ['20190725-0828', '773066956'],
 ['20190725-0758', '773062167'],
 ['20190725-0728', '773054298'],
 ['20190725-0658', '773047854'],
 ['20190725-0628', '773040908'],
 ['20190725-0558', '773033921'],
 ['20190725-0529', '773026897'],
 ['20190725-0459', '773020840'],
 ['20190725-0429', '773014535'],
 ['20190725-0359', '773007881'],
 ['20190725-0329', '773001398'],
 ['20190725-0259', '772994266'],
 ['2019072

### Get `mp3` paths for archived files

Login is required to access pages under `/archives/id/`.

**TODO:** Consider re-implementing this with a `requests.Session`; it is not yet clear whether that would be faster, or by how much.

In [9]:
# Instantiate a browser
browser = webdriver.Chrome('../assets/chromedriver')

# try/finally guarantees the browser is closed even if login or a lookup fails
try:
    # Get the first page
    browser.get('https://m.broadcastify.com/archives/id/' + archive_info[0][1])
    last_page_request_time = timer()

    # Log in so we can download files
    ## Store the fields for username + password
    username = browser.find_element_by_id("signinSrEmail")
    password = browser.find_element_by_id("signinSrPassword")

    ## Type username + password, and hit "enter"
    username.send_keys(USERNAME)
    password.send_keys(PASSWORD)
    password.send_keys(Keys.RETURN)

    ## Wait for login to complete.
    ## `implicitly_wait` only sets the timeout for element lookups and does not
    ## pause, so explicitly wait until the page shows a link to an .mp3 file.
    WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'a[href*=".mp3"]')))

    # Get the filepath for the mp3 archive.
    # Slice assignment (rather than append) keeps the URL at index 2 even if
    # this cell is run more than once.
    archive_info[0][2:] = [get_mp3_path(BeautifulSoup(browser.page_source, 'lxml'))]

    for row in archive_info[1:]:
        # Wait until some time has passed, out of courtesy.
        # Sleep between checks instead of spinning so the CPU is not pegged.
        while not courtesy_wait(last_page_request_time):
            time.sleep(0.1)

        # Get the next archive page, recording the time
        browser.get('https://m.broadcastify.com/archives/id/' + row[1])
        last_page_request_time = timer()

        # Get the filepath for the mp3 archive
        row[2:] = [get_mp3_path(BeautifulSoup(browser.page_source, 'lxml'))]
finally:
    # Quit the browser
    browser.quit()

In [10]:
# Preview the first five entries, each now extended with its .mp3 URL
archive_info[:5]

[['20190725-1725',
  '773188378',
  'http://garchives1.broadcastify.com/18812/20190725/201907251555-210863-18812.mp3'],
 ['20190725-1655',
  '773180493',
  'http://garchives1.broadcastify.com/18812/20190725/201907251555-210863-18812.mp3'],
 ['20190725-1625',
  '773175133',
  'http://garchives1.broadcastify.com/18812/20190725/201907251525-847590-18812.mp3'],
 ['20190725-1555',
  '773168047',
  'http://garchives1.broadcastify.com/18812/20190725/201907251455-596093-18812.mp3'],
 ['20190725-1525',
  '773162347',
  'http://garchives1.broadcastify.com/18812/20190725/201907251425-507163-18812.mp3']]

### Download `.mp3` files

In [63]:
# https://markhneedham.com/blog/2018/07/15/python-parallel-download-files-requests/

def fetch_mp3(entry, timeout=30):
    """
    Download one archived .mp3 to disk, unless the file is already there.

    Parameters
    ----------
    entry : sequence of (str, str)
        `[path, uri]`: the local destination file and the URL to download.
    timeout : float, optional
        Seconds to wait on the server before `requests` gives up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    str
        `path`, whether or not anything was downloaded. Nothing is written if
        the file already exists or the server does not answer with HTTP 200.
    """
    path, uri = entry

    # Skip files fetched on a previous run
    if os.path.exists(path):
        return path

    # Download under a temporary name and rename only on success; otherwise an
    # interrupted download would leave a truncated file at `path` that the
    # existence check above would treat as complete from then on.
    partial_path = path + '.part'

    # The context manager releases the streamed connection when we are done
    with requests.get(uri, stream=True, timeout=timeout) as r:
        if r.status_code == 200:
            with open(partial_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
            os.replace(partial_path, path)
    return path

In [None]:
start = timer()

# Back-date the "last request" so the first download starts immediately
last_page_request_time = timer() - REQUEST_WAIT_SECS

# Make sure the output folder exists before trying to write into it
os.makedirs(TEST_MP3_OUT_PATH, exist_ok=True)

# enumerate supplies the progress counter directly; looking the row up with
# list.index() rescans the list each time and miscounts duplicate rows
for position, file in enumerate(archive_info, start=1):
    file_date = file[0]
    url = file[2]

    # Build the filename we'll store the downloaded .mp3 under
    file_name = TEST_MP3_OUT_PATH + '-'.join([TEST_FEED_ID, file_date]) + '.mp3'

    print(f'Downloading {position} of {len(archive_info)}')
    print(f'\tfrom {url}')
    print(f'\tto {file_name}')

    # Wait until some time has passed, out of courtesy.
    # Sleep between checks instead of spinning so the CPU is not pegged.
    while not courtesy_wait(last_page_request_time):
        time.sleep(0.1)

    fetch_mp3([file_name, url])
    last_page_request_time = timer()
    clear_output(wait=True)
        # h/t @schmitty

print('**** Downloads complete! ****')
print(f'Elapsed Time: {round(timer() - start, 4)} seconds.')

----

<br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br>

##  

## Previous attempts

### Scrape `.mp3` download page with `requests`

In [None]:
# Form fields posted to the login page. The credentials come from the
# notebook-level USERNAME / PASSWORD constants so they are defined in one place.
login_data = {
    'username': USERNAME,
    'password': PASSWORD,
    'action': 'auth',
    'redirect': '/'
}

# Send a desktop-Chrome user-agent string with the login request
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) ' +
                  'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/' +
                  '75.0.3770.142 Safari/537.36'
}

login_url = 'https://m.broadcastify.com/login/'

In [None]:
# h/t to Indian Pythonista for the info about request.Session
    # https://www.youtube.com/watch?v=fmf_y8zpOgA

# The session keeps the login cookies for the follow-up request
with requests.Session() as s:
    r = s.post(login_url, data=login_data, headers=headers)
    r = s.get('https://m.broadcastify.com/archives/id/772540078')
    soup = BeautifulSoup(r.text, 'lxml')
    # Escaped dot: match a literal ".mp3", not any character followed by "mp3".
    # Note that despite its name, `a_list` holds a single tag (find, not find_all).
    a_list = soup.find('a', {'href': re.compile(r'\.mp3')})

In [None]:
# Show the link target of the matched .mp3 anchor
a_list.attrs['href']

In [None]:
# Show the raw body of the last response, for inspection
r.text

### Scrape feed archive page with `requests`

**Atlanta Police Zone 5 and Fire Dispatch**

Broadcasting Atlanta PD Zone 5 dispatch and Atlanta Fire and Rescue dispatch from Uniden Home Patrol and Windows 10 laptop. Limited to dispatch only. Adding allowed tac sub-channels will talk-over the dispatch channels.

https://www.broadcastify.com/listen/feed/18812

Feed archives = https://m.broadcastify.com/archives/feed/18812

In [None]:
# h/t to Indian Pythonista for the info about request.Session
    # https://www.youtube.com/watch?v=fmf_y8zpOgA

with requests.Session() as s:
    r = s.post(login_url, data=login_data, headers=headers)
    start_time = timer()
    r = s.get('https://m.broadcastify.com/archives/feed/18812')
    # Hold until 5 seconds have passed since the request was sent. Sleeping for
    # the remainder replaces a spin loop that pegged the CPU.
    # NOTE(review): `r.text` is already fixed once the response arrives, so this
    # pause presumably cannot make JavaScript-rendered elements appear - confirm.
    time.sleep(max(0, 5 - (timer() - start_time)))
    soup = BeautifulSoup(r.text, 'html5lib')
    a_list = soup.find_all('a', {'title': 'Download audio'})

In [None]:
# Show the anchors titled "Download audio" that were found in the response
a_list

In [None]:
# Show the parsed page, to check what the response actually contained
soup

### Scrape feed archive page with `QWebEngineView` custom class

In [None]:
class Render(QWebEngineView):
        """
        Load a URL in a QWebEngineView and keep the resulting HTML in `self.html`.

        Constructing an instance starts the Qt application loop; the loop is
        stopped from `_callable` once the HTML has been stored.

        NOTE: the imports this class needs (sys, QApplication, QUrl,
        QWebEngineView) are commented out in the imports cell under
        "For abandoned approaches", so this cell fails unless they are restored.
        """

        def __init__(self, url):
            """Start the Qt app, begin loading `url`, and run the event loop."""
            # Stays None until `_callable` stores the page HTML
            self.html = None
            self.app = QApplication(sys.argv)
            QWebEngineView.__init__(self)
            # Have `_loadFinished` called when the view's loadFinished signal fires
            self.loadFinished.connect(self._loadFinished)
            #self.setHtml(html)
            self.load(QUrl(url))
            # Runs the event loop; `_callable` calls quit() on this app
            self.app.exec_()

        def _loadFinished(self, result):
            """Request the page HTML; `result` is not used."""
            # This is an async call, you need to wait for this
            # to be called before closing the app
            self.page().toHtml(self._callable)

        def _callable(self, data):
            """Store the HTML passed in as `data` and stop the Qt application."""
            self.html = data
            # Data has been stored, it's safe to quit the app
            self.app.quit()

In [None]:
def render(source_url):
    """
    Return the HTML of `source_url` as captured by a `Render` instance.

    Parameters
    ----------
    source_url : str
        Address of the page to load.

    Returns
    -------
    The `html` attribute of the `Render` object built for `source_url`.
    """
    rendered_page = Render(source_url)
    return rendered_page.html

In [None]:
# Render the test feed's archive page and print the resulting HTML
url="https://m.broadcastify.com/archives/feed/18812"
print(render(url))

In [None]:
# import os

# for file in archive_info[0]:
#     file_id = file[1]
#     url = file[2]
    
#     print(f'Downloading {file_id}')
#     os.system(f'wget {url}')
    
    
#     'wget --http-user={USERNAME} --http-password={PASSWORD} {url}'
#     '--user-agent'