# Library def

In [606]:
import collections
import os
import pandas as pd
import re
import requests
# import weakref

from datetime import date
from datetime import timedelta
from IPython.display import clear_output
from time import time as timer

from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.support import wait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

In [583]:
# Constants
FEED_URL_STEM = 'https://www.broadcastify.com/listen/feed/'
ARCHIVE_URL_STEM = 'https://m.broadcastify.com/archives/feed/'
LOGIN_URL = 'https://www.broadcastify.com/login/'
MAIN_URL = 'https://www.broadcastify.com/'

WEBDRIVER_PATH = '../assets/chromedriver'

FILE_REQUEST_WAIT = 3 # seconds
PAGE_REQUEST_WAIT = 1 # seconds

MONTHS = ['','January', 'February', 'March',
      'April', 'May', 'June',
      'July', 'August', 'September',
      'October', 'November', 'December']

# Library-level variables
ArchiveEntry = collections.namedtuple('ArchiveEntry',
                                     'file_uri file_end_date_time mp3_url')
"""
file_uri : str
    The unique ID for an individual archive file, which corresponds to a feed's 
    transmission over a ~30-minute period on a given date. Can be used to find 
    the file's individual download page
file_end_date_time : str
    Date and end time of the transmission in the format YYYYMMDD-HHMM, on a 
    24-hour clock
mp3_url : str
    The URL of the corresponding mp3 file
""";

In [584]:
class BroadcastifyFeed:
    def __init__(self, feed_id, username=None, password=None):
        # Attributes
        self.id = feed_id
        self.url = FEED_URL_STEM + feed_id
        self.archive_url = ARCHIVE_URL_STEM + feed_id
        self.username = username
        self.password = password
        self.is_logged_in = False
        self.last_page_request_time = timer()
        self.last_file_request_time = timer()
        self.browser = webdriver.Chrome('../assets/chromedriver')
#         self.archive = Archive(self)
        self.archive = None
        
        # Properties
        @property
        def username(self):
            """Username for Broadcastify premium account."""
            print('Inside property construct.')
            return self.username
        @username.setter
        def username(self, value):
            self.username = value
            print('Inside property construct.')

        @property
        def password(self):
            """Password for Broadcastify premium account."""
            if self.__password:
                print('Inside property construct.')
                return True
            else:
                print('Inside property construct.')
                return False
        @password.setter
        def password(self, value):
            self.__password = value
            print('Inside property construct.')

#        # For future implementation
#         self.genre = None
#         self.listeners = None
#         self.status = None
#         self.description = None
#         self.notes = None

    def build_archive(self, days_back=-1):
        archive_builder = Archive()
        
    def courtesy_wait(self, request_type='Page'):
        if type == 'File':
            while timer() - self.last_file_request_time <= FILE_REQUEST_WAIT:
                pass
            last_file_request_time = timer()
        else:
            while timer() - self.last_page_request_time <= PAGE_REQUEST_WAIT:
                pass
            last_page_request_time = timer()
        
    def login(self):
        self.last_page_request_time = timer()
        self.browser.get(LOGIN_URL)

        ## Store the fields for username + password
        username_field = self.browser.find_element_by_id("uname") 
        password_field = self.browser.find_element_by_name("password")

        ## Type username + password, and hit "enter"
        username_field.send_keys(USERNAME)
        password_field.send_keys(PASSWORD)
        password_field.send_keys(Keys.RETURN)

        ## Wait for login to complete
        self.browser.implicitly_wait(2)
        
        ## Check that the login was successful
        if self.browser.current_url == MAIN_URL:
            is_logged_in = True
        else:
            is_logged_in = False
            raise ConnectionError('Login failed: please check username and password.')
    
    def __del__(self):
        print("Delete method called.")
        self.browser.quit()

In [585]:
class Archive:
    def __init__(self, parent):
        self.parent = parent
        self.archive_times_table = None # to interact with archivetimes table        
        self.archive_times_navigator = None # to navigate the "archive times" over multiple dates
        self.url = ARCHIVE_URL_STEM + parent.id
        self.entries = [] # list of ArchiveEntry objects
        self.earliest_date = None 
        self.latest_date = None
        
        self.archive_page_soup = None # make this private

    def build(self, days_back=0): # 0 days back means the active day
        
        all_timestable_entries = []
        all_mp3_paths = []
        
#         # Open the archive's navigation page
#         parent.courtesy_wait()
#         parent.browser.get(self.url)
        
#         # Wait for page to render
#         element = WebDriverWait(parent.browser, 10).until(
#                   EC.presence_of_element_located((By.CLASS_NAME, "cursor-link")))

#         # Capture page content as a BSoup
#         self.archive_page_soup = BeautifulSoup(parent.browser.page_source, 'lxml')
        
        # Populate the CalendarNavigator and the ArchiveTimesTable
#         self.archive_times_table = ArchiveTimesTable(self, self.archive_page_soup)
        self.calendar = ArchiveTimesNavigator()
        

        # Adjust days back, since loops start at 1
        days_back += 1
        
        # Ensure the "active day" is selected on the calendar
        
        # For each day requested...
        for day in range(days_back):
            pass
        
        # Get the mp3 file URLs
        
        # Put uri, end date, & URL into an ArchiveEntry, and make a list of all of them

    def get_mp3_files(self, start_date=-1, end_date=-1, time_of_day=-1):
        pass
    
    def __parse_timestable(self):
        """
        Generates a list of Broadcastify archive file information from
        the `archiveTimes` table on a feed's archive page. Each item in
        the list is a list of two elements:
            - The unique ID for the file, which can be used to find the file's
              individual download page
            - Date and end time of the transmission in the format YYYYMMDD-HHMM,
              on a 24-hour clock

        Parameters
        ----------
        self.soup : bs4.BeautifulSoup
            A BeautifulSoup object containing the feed archive page source code, 
            e.g. from https://m.broadcastify.com/archives/feed/[feed_id]


        """
        # Set up a blank list to return
        timestable_entries = []

        # Isolate the `archive_times` table body
        archive_times = self.soup.find('table', attrs={'id': 'archiveTimes'}).find('tbody')

        # Find the date of transmission of the archived files
        archive_date = self.__format_archive_date()

        # Loop through all rows of the table
        for row in archive_times.find_all('tr'):

            # Grab the end time, contained in the row's second <td> tag
            file_end_time = self.__time_to_hhmm(row.find_all('td')[1].text) 

            # Represent the date & end time of the file as YYYYMMDD-HHMM
            file_end_date_time = '-'.join([archive_date, file_end_time])

            # Grab the file ID
            file_uri = row.find('a')['href'].split('/')[-1]

            # Put the file date/time and URL leaf (as a list) into the list
            timestable_entries.append([file_uri, file_end_date_time])
        
        return timestable_entries
    
    def parse_mp3_path(download_page_soup):
        # Get the filepath for the mp3 archive
        return download_page_soup.find('a', {'href': re.compile('.mp3')}).attrs['href']

    def __repr__(self):
        return self.entries

In [682]:
class ArchiveTimesNavigator:
    def __init__(self, url):
        self.url = url
        self.calendar_soup = None
        self.archive_times_soup = None

        self.active_date = None # currently displayed date
        self.month_max_date = None # latest day in displayed month with archive entries
        self.month_min_date = None # earliest day in displayed month with archive entries
        
        self.browser = webdriver.Chrome('../assets/chromedriver')

        # Get initial page scrape & parse the calendar
        self.__load_nav_page()
        self.__scrape_nav_page()
        self.__parse_calendar()
        
        self.archive_max_date = self.active_date
        
        self.archive_min_date = self.archive_max_date - timedelta(days=181)
            # https://www.saltycrane.com/blog/2010/10/how-get-date-n-days-ago-python/    
        
    def click_prior_day(self):
        # calculate the prior day
        prior_day = self.active_date - timedelta(days=1)
        
        # would this take us past the archive? if so, stop.
        if prior_day < self.archive_min_date:
            return False
        
        # is the prior day in the previous month? set the class appropriately
        if prior_day < self.month_min_date:
            xpath_class = 'old day'
        else:
            xpath_class = 'day'
        
        xpath_day = prior_day.day
        
        # click the day before the currently displayed day
        calendar_day = self.browser.find_element_by_xpath(
                        f"//td[@class='{xpath_class}' "
                        f"and contains(text(), '{xpath_day}')]")
            # https://stackoverflow.com/questions/2009268/how-to-write-an-xpath-query-to-match-two-attributes
        calendar_day.click()

        # refresh soup & re-parse calendar
        self.__scrape_nav_page()
        self.__parse_calendar()
        
        return True
    
    def __load_nav_page(self):
        # Browse to feed archive page
        self.browser.get(self.url)   
    
    def __scrape_nav_page(self):
        # Wait for page to render
        element = WebDriverWait(self.browser, 10).until(
                  EC.presence_of_element_located((By.CLASS_NAME, 
                                                  "cursor-link")))

        # Scrape page content
        soup = BeautifulSoup(self.browser.page_source, 'lxml')

        # Isolate the calendar and the archiveTimes table
        self.calendar_soup = soup.find('table', 
                                       {'class': 'table-condensed'})
        self.archive_times_soup = soup.find('table', 
                                            attrs={'id': 'archiveTimes'}
                                           ).find('tbody')
        

    def __parse_calendar(self):
        """
        Uses a bs4 ResultSet of the <td> tags representing days currently displayed
        on the calendar to set calendarattributes. Items have the format of 
        `<td class="[class]">[D]</td>` where 
         - [D] is the one- or two-digit day (as a string) and
         - [class] is one of
             "old day"          = a day with archives but in a prior month (clicking
                                  will refresh the calendar)
             "day"              = a past day in the current month
             "active day"       = the day currently displayed in the archiveTimes 
                                  table
             "disabled day"     = a day for which no archive is available in a month
                                  (past or future) that has other days with archives. 
                                  For example, if today is July 27, July 28-31 will 
                                  be disabled days, as will January 1-26 (since the 
                                  archive goes back only 180 days). December 31 would
                                  be an "old disabled day".
                                  past month for which archives are no longer available
             "new disabled day" = a day in a future month
             "old disabled day" = see explanation in "disabled day"
         
        """
        # Get the tags representing the days currently displayed on the calendar
        days_on_calendar = self.calendar_soup.find_all('td')
        
        # Get the month & year currently displayed
        month, year = self.calendar_soup.find('th', 
                                              {'class': 'datepicker-switch'}
                                              ).text.split(' ')
        
        displayed_month = MONTHS.index(month)
        displayed_year = int(year)
        
        # Parse the various calendar attributes
        active_day = int([day.text for day in days_on_calendar
                           if (day['class'][0] == 'active')][0])
        
        month_max_day = int([day.text for day in days_on_calendar
                              if (day['class'][0] == 'day') or
                                 (day['class'][0] == 'active')][::-1][0])
        
        month_min_day = int(self.__parse_month_min_day(days_on_calendar))
        
        # Set class attributes
        self.active_date = date(displayed_year, displayed_month, active_day)        
        self.month_min_date = date(displayed_year, displayed_month, month_min_day)
        self.month_max_date = date(displayed_year, displayed_month, month_max_day)
        
    def __parse_month_min_day(self, days_on_calendar):
        """Parse the lowest valid day in the displayed month"""
        disabled_found = False
        for day in days_on_calendar:
            if day['class'][0] == 'disabled':
                disabled_found = True
            elif day['class'][0] in 'day active'.split():
                return day.text
            elif day['class'][0] != 'old' and disabled_found:
                return day.text
        
        return None
    
    def __repr__(self):
        return(f'ArchiveTimesNavigator(URL: {self.url}, '
               f'Currently Displayed: {str(self.active_date)}, '
               f'Max Day: {str(self.archive_max_date)}, '
               f'Min Day: {str(self.archive_min_date)}, ')

In [587]:
class ArchiveTimesTable:
    def __init__(self, parent, archive_page_soup):
        self.parent = parent
        self.soup = archive_page_soup
        self.table_entries = self.__parse_entries()
        
        # Properties
        @property
        def table_entries(self):
            """Username for Broadcastify premium account."""
            print('Inside property construct.')
            return self.table_entries
        @table_entries.setter
        def table_entries(self, value):
            self.table_entries = value
            print('Inside property construct.')
        
#     def __parse_entries(self):
#         """
#         Generates a list of Broadcastify archive file information from
#         the `archiveTimes` table on a feed's archive page. Each item in
#         the list is a list of two elements:
#             - The unique ID for the file, which can be used to find the file's
#               individual download page
#             - Date and end time of the transmission in the format YYYYMMDD-HHMM,
#               on a 24-hour clock

#         Parameters
#         ----------
#         self.soup : bs4.BeautifulSoup
#             A BeautifulSoup object containing the feed archive page source code, 
#             e.g. from https://m.broadcastify.com/archives/feed/[feed_id]


#         """
#         # Set up a blank list to return
#         table_entry_builder = []

#         # Isolate the `archive_times` table body
#         archive_times = self.soup.find('table', attrs={'id': 'archiveTimes'}).find('tbody')

#         # Find the date of transmission of the archived files
#         archive_date = self.__format_archive_date()

#         # Loop through all rows of the table
#         for row in archive_times.find_all('tr'):

#             # Grab the end time, contained in the row's second <td> tag
#             file_end_time = self.__time_to_hhmm(row.find_all('td')[1].text) 

#             # Represent the date & end time of the file as YYYYMMDD-HHMM
#             file_end_date_time = '-'.join([archive_date, file_end_time])

#             # Grab the file ID
#             file_uri = row.find('a')['href'].split('/')[-1]

#             # Put the file date/time and URL leaf (as a list) into the list
#             table_entry_builder.append([file_uri, file_end_date_time])
        
#         return table_entry_builder

    def __get_mp3_urls(self):
        # Get the first page
        parent.last_page_request_time = timer()
        browser.get('https://m.broadcastify.com/archives/id/' + self.table_entries[0][1])


#         # Log in so we can download files
#         ## Store the fields for username + password
#         username_field = browser.find_element_by_id("signinSrEmail") 
#         password_field = browser.find_element_by_id("signinSrPassword")

#         ## Type username + password, and hit "enter"
#         username_field.send_keys(USERNAME)
#         password_field.send_keys(PASSWORD)
#         password_field.send_keys(Keys.RETURN)

#         ## Wait for login to complete
#         browser.implicitly_wait(2)

        # Get the filepath for the mp3 archive
        self.table_entries[0].append(get_mp3_path(BeautifulSoup(browser.page_source, 'lxml')))

        for row in self.table_entries[1:11]:
           # Wait until some time has passed, out of courtesy
            while not courtesy_wait(parent.last_page_request_time): pass

            # Get the next archive page, recording the time
            browser.get('https://m.broadcastify.com/archives/id/' + row[1])
            parent.last_page_request_time = timer()

            # Get the filepath for the mp3 archive
            row.append(get_mp3_path(BeautifulSoup(browser.page_source, 'lxml')))
        
    def __format_archive_date(self):

        # Extract the day, month, and year of the data displayed on the page
        day = self.soup.find('td', {'class': 'active day'}).text
        month, year = self.soup.find('th', {'class': 'datepicker-switch'}).text.split()

        # Format the date as YYYYMMDD
        formatted_date = str(year) + str(MONTHS.index(month)).zfill(2) + day.zfill(2)

        return formatted_date
    
    def __time_to_hhmm(self, s):
        # More details, since it's a one-line method and this isn't freaking codewars:
            # strptime converts the string to datetime 
                # see https://docs.python.org/2/library/datetime.html#strftime-strptime-behavior
                # and https://stackoverflow.com/questions/19229190/convert-12-hour-into-24-hour-times
            # first split separates YYYY-MM-DD from HH:MM
            # second split gets rid of the colon between HH & MM
            # join puts HHMM together
        # Converts a string representing a time in HH:MM AM/PM format to a string in 24-hr HHMM
        return ''.join(str(datetime.strptime(s, '%I:%M %p')).split(' ')[-1].split(':')[:2])
            
    def __repr__(self):
        return (f'ArchiveTimesTable({len(self.table_entries)} entries)')

# Test code

In [588]:
DATA_PATH = '../data/'
TEST_MP3_OUT_PATH = DATA_PATH + 'test_data/test_mp3/'

TEST_FEED_ID = '18812'

USERNAME = 'cwchiu'
PASSWORD = 'datascientists'

In [620]:
datetime.today() - timedelta(days=181)

datetime.datetime(2019, 1, 27, 23, 44, 40, 970724)

In [675]:
atn = ArchiveTimesNavigator(ARCHIVE_URL_STEM + TEST_FEED_ID)

In [681]:
atn.click_prior_day()
print(atn)

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//td[@class='old day' and contains(text(), '21')]"}
  (Session info: chrome=75.0.3770.142)


In [619]:
feed.__del__()

NameError: name 'feed' is not defined

In [474]:
del feed

In [457]:
feed = BroadcastifyFeed(TEST_FEED_ID, USERNAME, PASSWORD)

In [458]:
feed.archive.get_entries()

Delete method called.


In [472]:
print(feed.archive.archive_times_table.table_entries)

[['773831530', '20190727-1709'], ['773823986', '20190727-1639'], ['773817481', '20190727-1609'], ['773810895', '20190727-1539'], ['773805010', '20190727-1510'], ['773798449', '20190727-1440'], ['773790308', '20190727-1410'], ['773784180', '20190727-1340'], ['773777042', '20190727-1310'], ['773771190', '20190727-1240'], ['773765214', '20190727-1210'], ['773757101', '20190727-1141'], ['773750826', '20190727-1111'], ['773744097', '20190727-1041'], ['773737164', '20190727-1011'], ['773730937', '20190727-0941'], ['773723651', '20190727-0911'], ['773717272', '20190727-0842'], ['773710499', '20190727-0812'], ['773704908', '20190727-0742'], ['773697273', '20190727-0712'], ['773690608', '20190727-0642'], ['773682878', '20190727-0612'], ['773676590', '20190727-0543'], ['773669644', '20190727-0513'], ['773663813', '20190727-0443'], ['773657360', '20190727-0413'], ['773650877', '20190727-0343'], ['773643565', '20190727-0313'], ['773637411', '20190727-0244'], ['773629449', '20190727-0214'], ['77362

In [465]:
feed.last_file_request_time

1564263563.226176

In [463]:
feed.archive.last_file_request_time = 'foobar'

In [464]:
feed.archive.last_file_request_time

'foobar'