## Imports & Declarations

In [69]:
import os
import re
import sys
import time
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.support import wait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
from PyQt5.QtWebEngineWidgets import QWebEngineView


## Scrape `.mp3` download page with `requests`

In [3]:
# Form fields posted to the login endpoint.
# The credentials are read from the environment so they are never saved in the
# notebook: export BROADCASTIFY_USERNAME and BROADCASTIFY_PASSWORD before
# starting the kernel.
login_data = {
    'username': os.environ.get('BROADCASTIFY_USERNAME', ''),
    'password': os.environ.get('BROADCASTIFY_PASSWORD', ''),
    'action': 'auth',
    'redirect': '/'
}

if not (login_data['username'] and login_data['password']):
    # Fail visibly instead of silently posting an empty login form.
    warnings.warn('BROADCASTIFY_USERNAME / BROADCASTIFY_PASSWORD are not set; '
                  'login requests will be sent without credentials.')

# Desktop Chrome user-agent string sent with the login request.
headers = {
    'user-agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
                   '75.0.3770.142 Safari/537.36')
}

login_url = 'https://m.broadcastify.com/login/'

In [7]:
# h/t to Indian Pythonista for the info about requests.Session
    # https://www.youtube.com/watch?v=fmf_y8zpOgA

# Log in and fetch the archive page inside one Session so both requests share
# the same session object.
with requests.Session() as s:
    r = s.post(login_url, data=login_data, headers=headers)
    r = s.get('https://m.broadcastify.com/archives/id/772540078')
    soup = BeautifulSoup(r.text, 'lxml')
    # The dot is escaped so the pattern matches a literal ".mp3"; an unescaped
    # "." would match any character followed by "mp3".
    # NOTE: despite its name, a_list is a single tag (or None) because find()
    # returns only the first match.
    a_list = soup.find('a', {'href': re.compile(r'\.mp3')})

In [8]:
# URL of the .mp3 file referenced by the link found above.
a_list['href']

'http://garchives1.broadcastify.com/18812/20190723/201907231540-907553-18812.mp3'

In [9]:
# Preview only the start of the response body; displaying the whole HTML
# document floods the notebook output.
r.text[:1000]



## Scrape feed archive page with `requests`

**Atlanta Police Zone 5 and Fire Dispatch**

Broadcasting Atlanta PD Zone 5 dispatch and Atlanta Fire and Rescue dispatch from Uniden Home Patrol and Windows 10 laptop. Limited to dispatch only. Adding allowed tac sub-channels will talk-over the dispatch channels.

https://www.broadcastify.com/listen/feed/18812

Feed archives = https://m.broadcastify.com/archives/feed/18812

In [5]:
# h/t to Indian Pythonista for the info about requests.Session
    # https://www.youtube.com/watch?v=fmf_y8zpOgA

with requests.Session() as s:
    r = s.post(login_url, data=login_data, headers=headers)
    r = s.get('https://m.broadcastify.com/archives/feed/18812')
    # Pause before parsing, as the original experiment did. time.sleep() idles
    # instead of spinning a CPU core in a `while ...: pass` loop.
    # NOTE(review): r.text is already fully downloaded at this point, so the
    # pause presumably cannot change what gets parsed - confirm and drop it.
    time.sleep(5)
    soup = BeautifulSoup(r.text, 'html5lib')
    a_list = soup.find_all('a', {'title': 'Download audio'})

In [6]:
# Show the "Download audio" links matched in the previous cell.
a_list

[]

In [7]:
# Display the parsed page that the link search above was run against.
soup

<!DOCTYPE html>
<html lang="en"><head>
    <!-- Title -->
    <title>Atlanta Police Zone 5 and Fire Dispatch Audio Archives</title>

    <!-- Required Meta Tags Always Come First -->
    <meta charset="utf-8"/>
    <meta content="width=device-width, initial-scale=1, shrink-to-fit=no" name="viewport"/>

    <!-- Favicon -->
    <link href="https://s.broadcastify.com/i/favicon.ico" rel="shortcut icon"/>
    <link href="https://s.broadcastify.com/i/apple-touch-icon-v2.png" rel="apple-touch-icon"/>

    <!-- Google Fonts -->
    <link href="//fonts.googleapis.com/css?family=Poppins:300,400,500,600,700" rel="stylesheet"/>

    <!-- CSS Implementing Plugins -->
    <link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.9.0/css/all.min.css" rel="stylesheet"/>
    <link href="https://s.broadcastify.com/front-v2.8.0/assets/vendor/animate.css/animate.min.css" rel="stylesheet"/>
    <link href="https://s.broadcastify.com/front-v2.8.0/assets/vendor/hs-megamenu/src/hs.megamenu.css" rel="

## Scrape feed archive page with `QWebEngineView` custom class

In [2]:
class Render(QWebEngineView):
    """Load a URL in a QWebEngineView and capture the resulting page HTML.

    Constructing an instance blocks: it runs the Qt event loop until the page
    has finished loading and its HTML has been stored in ``self.html``.

    Attributes:
        html: The page HTML as a string, or None until it has been captured.
        app: The QApplication whose event loop drives the load.
    """

    def __init__(self, url):
        """Start loading ``url`` and run the event loop until the HTML is stored."""
        self.html = None
        # Only one QApplication may exist per process. Reuse the current one so
        # the class can be instantiated more than once in the same kernel.
        # It must exist before the view itself is initialised.
        self.app = QApplication.instance() or QApplication(sys.argv)
        QWebEngineView.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.load(QUrl(url))
        # Blocks here until _callable() calls quit().
        self.app.exec_()

    def _loadFinished(self, result):
        """Request the page HTML once the load-finished signal fires.

        ``result`` is the value emitted by the signal; it is not inspected.
        """
        # toHtml() is asynchronous: the HTML is delivered to the callback, so
        # the application must stay alive until _callable() has run.
        self.page().toHtml(self._callable)

    def _callable(self, data):
        """Store the delivered HTML and stop the event loop."""
        self.html = data
        # Data has been stored, it's safe to quit the app
        self.app.quit()

In [3]:
def render(source_url):
    """Return the HTML captured by ``Render`` after loading ``source_url``."""
    rendered_page = Render(source_url)
    return rendered_page.html

In [4]:
# Render the feed archive page through the web view and print its HTML.
url = "https://m.broadcastify.com/archives/feed/18812"
rendered_html = render(url)
print(rendered_html)

<!DOCTYPE html><html lang="en"><head>
    <!-- Title -->
    <title>Atlanta Police Zone 5 and Fire Dispatch Audio Archives</title>

    <!-- Required Meta Tags Always Come First -->
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">

    <!-- Favicon -->
    <link rel="shortcut icon" href="https://s.broadcastify.com/i/favicon.ico">
    <link rel="apple-touch-icon" href="https://s.broadcastify.com/i/apple-touch-icon-v2.png">

    <!-- Google Fonts -->
    <link href="//fonts.googleapis.com/css?family=Poppins:300,400,500,600,700" rel="stylesheet">

    <!-- CSS Implementing Plugins -->
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.9.0/css/all.min.css">
    <link rel="stylesheet" href="https://s.broadcastify.com/front-v2.8.0/assets/vendor/animate.css/animate.min.css">
    <link rel="stylesheet" href="https://s.broadcastify.com/front-v2.8.0/assets/vendor/hs-megamenu/src/hs.megamenu.c

## Scrape feed archive page with `selenium`

In [35]:
# Instantiate a browser for all activity
browser = webdriver.Chrome('assets/chromedriver')

try:
    # Open the feed archive page
    browser.get('https://m.broadcastify.com/archives/feed/18812')

    # One explicit wait object (10 s timeout) reused for every step below.
    page_wait = WebDriverWait(browser, 10)

    # Log in so that we can download files
    ## Open the login sidebar
    page_wait.until(
        EC.element_to_be_clickable((By.ID, 'sidebarNavToggler'))).click()

    ## Wait for the login sidebar to render, then store the username +
    ## password fields. An explicit wait is used because implicitly_wait()
    ## only affects element lookups made after it is called.
    username = page_wait.until(
        EC.visibility_of_element_located((By.ID, 'signinUsername')))
    password = browser.find_element(By.ID, 'signinPassword')

    ## Type username + password, and hit "enter".
    ## Credentials are taken from login_data (defined in the requests section)
    ## instead of being repeated here as literals.
    username.send_keys(login_data['username'])
    password.send_keys(login_data['password'])
    password.send_keys(Keys.RETURN)

    ## Wait for page to render
    element = page_wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, "cursor-link")))

    # Capture page content as a BSoup
    soup = BeautifulSoup(browser.page_source, 'lxml')
finally:
    # Quit the browser even when a step above raises, so no Chrome or
    # chromedriver process is left running.
    browser.quit()

In [64]:
# Month names indexed by month number; index 0 is an empty placeholder so that
# months.index('January') == 1.
months = [''] + ('January February March April May June '
                 'July August September October November December').split()

In [65]:
# Cells of the date picker: the highlighted day and the "<Month> <Year>" header.
active_day_cell = soup.find('td', {'class': 'active day'})
month_year_header = soup.find('th', {'class': 'datepicker-switch'})

day = active_day_cell.text
month, year = month_year_header.text.split()

# Human-readable date, plus a zero-padded YYYYMMDD form.
current_date = '{} {} {}'.format(day, month, year)
formatted_date = '{}{:02d}{}'.format(year, months.index(month), day.zfill(2))
print(current_date, formatted_date)

24 July 2019 20190724


In [36]:
# Body of the table with id "archiveTimes"; each row is one archived recording.
archive_table = soup.find('table', id='archiveTimes')
archive_times = archive_table.find('tbody')
archive_times

<tbody>
<tr class="cursor-link text-monospace odd" role="row"><td>08:01 PM</td><td>08:31 PM<a class="text-dark fa fa-download float-right" download="" href="/archives/download/772907459" title="Download audio"></a></td></tr><tr class="cursor-link text-monospace even" role="row"><td>07:32 PM</td><td>08:02 PM<a class="text-dark fa fa-download float-right" download="" href="/archives/download/772901789" title="Download audio"></a></td></tr><tr class="cursor-link text-monospace odd" role="row"><td>07:02 PM</td><td>07:32 PM<a class="text-dark fa fa-download float-right" download="" href="/archives/download/772895113" title="Download audio"></a></td></tr><tr class="cursor-link text-monospace even" role="row"><td>06:32 PM</td><td>07:02 PM<a class="text-dark fa fa-download float-right" download="" href="/archives/download/772887091" title="Download audio"></a></td></tr><tr class="cursor-link text-monospace odd" role="row"><td>06:02 PM</td><td>06:32 PM<a class="text-dark fa fa-download float-ri

In [142]:
# Build (start timestamp, download URL leaf) pairs, one per archive row.
# NOTE: time_to_hhmm() is defined in a later cell of this notebook; that cell
# must be run before this one.
files_to_download = []

for row in archive_times.find_all('tr'):
    cells = row.find_all('td')
    link = row.find('a', href=True)
    if not cells or link is None:
        # Not a recording row (no cells or no download link), e.g. a
        # placeholder row - nothing to download.
        continue

    # TODO: check the date if the file started before midnight and ends after
    # midnight. The end time (second cell) is not used yet.
    start_time = time_to_hhmm(cells[0].text)

    # Transmission start as YYYYMMDD-HHMM
    archive_start_date_time = '-'.join([formatted_date, start_time])

    files_to_download.append((archive_start_date_time, link['href']))

files_to_download

[('20190724-2001', '/archives/download/772907459'),
 ('20190724-1932', '/archives/download/772901789'),
 ('20190724-1902', '/archives/download/772895113'),
 ('20190724-1832', '/archives/download/772887091'),
 ('20190724-1802', '/archives/download/772880918'),
 ('20190724-1732', '/archives/download/772874547'),
 ('20190724-1702', '/archives/download/772866777'),
 ('20190724-1633', '/archives/download/772860749'),
 ('20190724-1603', '/archives/download/772853150'),
 ('20190724-1533', '/archives/download/772848456'),
 ('20190724-1503', '/archives/download/772840463'),
 ('20190724-1433', '/archives/download/772833350'),
 ('20190724-1403', '/archives/download/772828031'),
 ('20190724-1334', '/archives/download/772819909'),
 ('20190724-1304', '/archives/download/772813950'),
 ('20190724-1234', '/archives/download/772808237'),
 ('20190724-1204', '/archives/download/772801558'),
 ('20190724-1134', '/archives/download/772794221'),
 ('20190724-1104', '/archives/download/772787835'),
 ('20190724-

In [132]:
def time_to_hhmm(s):
    """Convert a 12-hour clock string to a 24-hour ``HHMM`` string.

    Args:
        s: Time in ``HH:MM AM/PM`` format, e.g. ``'07:32 PM'``. Leading and
            trailing whitespace is ignored.

    Returns:
        The same time as a four-character 24-hour string, e.g. ``'1932'``.

    Raises:
        ValueError: If ``s`` does not match the ``HH:MM AM/PM`` format.
    """
    # strptime parses the 12-hour time; strftime emits zero-padded 24-hour
    # hours and minutes directly.
    # see https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
    return datetime.strptime(s.strip(), '%I:%M %p').strftime('%H%M')

In [133]:
# Spot-check the helper on a PM time (the recorded output is '1932').
time_to_hhmm('07:32 PM')

'1932'