First, import the Python modules necessary for this notebook to run. These can be installed using, for instance, `pip` or `conda`. 

In [1]:
import numpy as np # Handles maths
import pandas as pd # Good for tables of data
import matplotlib.pyplot as plt # Handles graphing
import xarray as xr # Helpful for spatial data
import requests # Downloads webpages
from bs4 import BeautifulSoup # For parsing webpages
from selenium import webdriver 
from selenium.webdriver.common.action_chains import ActionChains
import os, sys
import time
import subprocess
import re

In [2]:
url = "http://epbcnotices.environment.gov.au/publicnoticesreferrals"

In [3]:
options = webdriver.ChromeOptions()
options.add_argument('--headless') # Comment out to see the actions on website
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument("--start-maximized")

base_dir = '/home/student.unimelb.edu.au/shorte1/Documents/ACF_consulting/files'

options.add_experimental_option("prefs", {
  "download.default_directory": base_dir,
  "download.prompt_for_download": False,
  "download.directory_upgrade": True,
  "safebrowsing.enabled": True,
  "plugins.always_open_pdf_externally": True
})

driver = webdriver.Chrome('/usr/bin/chromedriver', options=options)
driver.get(url);
time.sleep(4)

def clean_columns(table):
    name_dict = {}
    clean_str = '  . Activate to sort in descending order'
    for col in range(len(table.columns)): 
        name_dict[table.columns[col]] = table.columns[col].replace(clean_str, '')
    return table.rename(name_dict, axis='columns')

In [7]:
def scrape_page(driver, page_number, table, stored_table, exist):
    
    # Two modes - download or update mode.
    # If in download mode, search forward, skipping files that already exist 
    # in table.
    # If in update mode, search forward, but assume new entries to website
    # appear first, so stop when a certain number of matches have occured. 

    xpath = '//a[@class="btn btn-default btn-xs" '
    xpath += 'and @href="#" and @data-toggle="dropdown"]'
    details_buttons = driver.find_elements_by_xpath(xpath)

    xpath = '//a[@class="details-link launch-modal" '
    xpath += 'and @href="#" and @title="View Details"]'
    details_links = driver.find_elements_by_xpath(xpath)

    next_button = driver.find_elements_by_xpath(
        '//a[@href="#" and @data-page="' + str(page_number+1) + '"]'
    )[1]

    # These will record the number of files and filenames for each submission
    num_files = []
    file_names = []

    # Iterate over the 30 entries in the table on current page checking for files
    for i in range(30):

        # If already downloaded, skip this row
        if exist[i]:
            continue

        if i < 29: 
            # Move to element i+1, as i may be blocked by Chrome download bar! 
            ActionChains(driver).move_to_element(details_buttons[i+1]).perform()
            details_buttons[i].click()
            time.sleep(1)
        else:
            # Move to navigation bar, as i may be blocked by Chrome download bar!
            ActionChains(driver).move_to_element(next_button).perform()
            details_buttons[i].click()
            time.sleep(1)

        ActionChains(driver).move_to_element(details_links[i]).perform()
        details_links[i].click()
        time.sleep(2)

        iframe = driver.find_elements_by_xpath(
            '//section[@class="modal fade modal-form modal-form-details in"]'
            + '/div/div/div/iframe'
        )
        driver.switch_to.frame(iframe[0])

        file_links = driver.find_elements_by_xpath(
            "//a[contains(@href, '/_entity/annotation/')]"
        )

        subprocess.run('rm ' + base_dir +'/*.pdf', shell=True)
        ref_num = table['Reference Number'].iloc[i].replace('/','')
        date = table['Date of notice'].iloc[i].strftime('%d%m%Y')
        
        # If no files, skip this row    
        if not file_links:
            num_files.append(0)
            file_names.append('')

            driver.switch_to.default_content()
            xpath = '//section[@class="modal fade modal-form '
            xpath += 'modal-form-details in"]/div/div/div/button'
            close_button = driver.find_elements_by_xpath(xpath)
            close_button[0].click()
            time.sleep(1)
            continue

        num_files.append(len(file_links))

        # Check if folder name already exists, if so append count
        folder_name = ref_num + '_' + date
        shell_cmd = 'find ' + base_dir + '/*' + folder_name + '* -maxdepth 1 '
        shell_cmd += '-type d | wc -l > ' + base_dir + '/folder_count.txt'
        subprocess.run(shell_cmd, shell=True)
        folder_count = int(np.loadtxt(base_dir + '/folder_count.txt'))
        subprocess.run('rm ' + base_dir + '/folder_count.txt', shell=True)

        if folder_count > 0:
            if folder_count == 1:
                # Append '_1' to existing folder
                shell_cmd = 'mv ' + base_dir + "/" + folder_name + ' '
                shell_cmd += base_dir + "/" + folder_name + '_1'
                subprocess.run(shell_cmd, shell=True)
            # Appead folder_count + 1 to new folder
            folder_name += '_' + str(folder_count+1)
        folder_path = base_dir + '/' + folder_name
        
        import pdb; pdb.set_trace()
        
        successful = False
        attempts = 0
        while not successful:
            if attempts > 5:
                raise RuntimeError('Download timed out too many times.')
            try:
                for j in range(len(file_links)):
                    file_links[j].click()
                    time.sleep(1)

                # Wait for files to download
                file_count = 0
                iterations = 0
                while file_count < len(file_links):
                    if iterations > 600:
                        raise RuntimeError('Download timed out.')
                    time.sleep(1)
                    shell_cmd = '''find ''' + base_dir + '''/*.PDF -maxdepth 1 -exec sh -c 'mv "$1" "${1%.PDF}.pdf"' _ {} \;'''
                    subprocess.run(shell_cmd, shell=True)
                    shell_cmd = 'find ' + base_dir + '/*.pdf '
                    shell_cmd += '-type f -print | wc -l > ' 
                    shell_cmd += base_dir + '/num_files.txt'
                    subprocess.run(shell_cmd, shell=True)
                    file_count = int(np.loadtxt(base_dir + '/num_files.txt'))
                    iterations += 1
                subprocess.run('rm ' + base_dir + '/num_files.txt', shell=True)
                successful = True
                time.sleep(1)
            except:
                attempts += 1
        

        # After files downloaded, move them to appropriate folder
        subprocess.run(['rm', '-r', folder_path])
        subprocess.run(['mkdir', folder_path])
        shell_cmd = 'mv ' + base_dir + '/*.pdf ' + folder_path 
        subprocess.run(shell_cmd, shell=True)

        # Record the filenames
        shell_cmd = 'find ' + folder_path + '/*.pdf -maxdepth 1 -type f '
        shell_cmd += '-printf "%f\n" > ' + folder_path + '/file_names.txt'
        subprocess.run(shell_cmd, shell=True)
        with open(folder_path + '/file_names.txt') as f:
            lines = f.readlines()
        file_names.append(', '.join(lines).replace('\n',''))
        subprocess.run('rm ' + folder_path + '/file_names.txt', shell=True)

        shell_cmd = 'pdfunite ' + folder_path + '/*.pdf ' + folder_path 
        shell_cmd += '/' + folder_name + '_combined.pdf'
        subprocess.run(shell_cmd, shell=True)

        driver.switch_to.default_content()
        xpath = '//section[@class="modal fade modal-form '
        xpath += 'modal-form-details in"]/div/div/div/button'
        close_button = driver.find_elements_by_xpath(xpath)
        close_button[0].click()
        time.sleep(1)
        
        # Append the downloaded row to the stored table and save
        row = table.iloc[i]
        stored_table = stored_table.append(row, ignore_index=True)
        stored_table = stored_table.sort_values(
            by='Date of notice', axis = 0, 
            ascending=False
        )
        stored_table = stored_table.reset_index(drop=True)
        stored_table['Date of notice'] = stored_table['Date of notice'].apply(lambda x: x.strftime('%d/%m/%Y'))
        stored_table.to_csv('/home/student.unimelb.edu.au/shorte1/Documents/ACF_consulting/EPBC_notices.csv', index=False, header=True)
        stored_table['Date of notice'] = pd.to_datetime(stored_table['Date of notice'], dayfirst=True)

In [None]:
for i in range(1,167):
    
    loading = True
    attempts = 0
    while loading:
        if attempts > 30:
            raise RuntimeError('Could not load website')
        try:
            time.sleep(2)
            page_source = driver.page_source
            soup = BeautifulSoup(page_source)
            table = pd.read_html(soup.prettify())[0]            
            if len(table) == 30:
                loading = False
        except:
            attempts += 1

    table = clean_columns(table)
    table['Date of notice'] = pd.to_datetime(table['Date of notice'], dayfirst=True)
    table.drop(labels='Actions', axis=1, inplace=True)  
    
    try:
        stored_table = pd.read_csv('/home/student.unimelb.edu.au/shorte1/Documents/ACF_consulting/EPBC_notices.csv')
        stored_table['Date of notice'] = pd.to_datetime(stored_table['Date of notice'], dayfirst=True)
        shared = pd.merge(table, stored_table, how='left', indicator='Exist')
        shared['Exist'] = np.where(shared.Exist == 'both', True, False)
        exist = shared['Exist']
        del shared
    except:
        stored_table = table.iloc[0:0]
        stored_table['Date of notice'] = pd.to_datetime(stored_table['Date of notice'], dayfirst=True)
        exist = [False]*30
        exist = pd.Series(exist,name='Exist')
     
    if np.any(~exist):
        scrape_page(driver, i, table, stored_table, exist)
    
    next_button = driver.find_elements_by_xpath(
        '//a[@href="#" and @data-page="' + str(i+1) + '"]'
    )[1]
    ActionChains(driver).move_to_element(next_button).perform()
    next_button.click()
    time.sleep(3)

    del table, stored_table
    
driver.quit()

> <ipython-input-7-b5c85f237f97>(96)scrape_page()
-> successful = False
(Pdb) l
 91  	            folder_name += '_' + str(folder_count+1)
 92  	        folder_path = base_dir + '/' + folder_name
 93  	
 94  	        import pdb; pdb.set_trace()
 95  	
 96  ->	        successful = False
 97  	        attempts = 0
 98  	        while not successful:
 99  	            if attempts > 5:
100  	                raise RuntimeError('Download timed out too many times.')
101  	            try:
(Pdb) page_number
23
(Pdb) n
> <ipython-input-7-b5c85f237f97>(97)scrape_page()
-> attempts = 0
(Pdb) n
> <ipython-input-7-b5c85f237f97>(98)scrape_page()
-> while not successful:
(Pdb) n
> <ipython-input-7-b5c85f237f97>(99)scrape_page()
-> if attempts > 5:
(Pdb) n
> <ipython-input-7-b5c85f237f97>(101)scrape_page()
-> try:
(Pdb) n
> <ipython-input-7-b5c85f237f97>(102)scrape_page()
-> for j in range(len(file_links)):
(Pdb) n
> <ipython-input-7-b5c85f237f97>(103)scrape_page()
-> file_links[j].click()
(Pdb) n
> <