First, import the Python modules necessary for this notebook to run. These can be installed using, for instance, `pip` or `conda`. 

In [2]:
import numpy as np # Handles maths
import pandas as pd # Good for tables of data
import matplotlib.pyplot as plt # Handles graphing
import xarray as xr # Helpful for spatial data
import requests # Downloads webpages
from bs4 import BeautifulSoup # For parsing webpages
from selenium import webdriver 
import os, sys
import time
import subprocess
import re

In [3]:
# Address of the EPBC public notices/referrals listing that the browser is
# pointed at when the driver is started.
url = "http://epbcnotices.environment.gov.au/publicnoticesreferrals"

In [14]:
# Load the notices table previously written to EPBC_notices.csv.
# NOTE(review): absolute, user-specific path - this cell only runs on the
# original author's machine as written.
stored_table = pd.read_csv('/home/student.unimelb.edu.au/shorte1/Documents/ACF_consulting/EPBC_notices.csv')

In [13]:
# Write the current notices table to EPBC_notices.csv, with a header row and
# without the DataFrame index.
table.to_csv('/home/student.unimelb.edu.au/shorte1/Documents/ACF_consulting/EPBC_notices.csv', index=False, header=True)

In [41]:
# Scratch step: store the row at position 15 of stored_table under index
# label 31 of table.
table.loc[31] = stored_table.iloc[15]

In [38]:
# Scratch step: overwrite the title of the row with index label 15 with a
# placeholder value (presumably so that this row no longer matches its
# counterpart in stored_table - confirm intent).
# A single .loc[row, column] assignment is used instead of chained indexing
# (table[col].loc[row] = ...), which may write to a temporary copy and leave
# table unchanged.
table.loc[15, 'Title of referral'] = 'bogus'

In [46]:
# Left-merge the scraped table onto the stored one; the 'Exist' indicator
# column reports, for each row, whether it was found in both frames.
shared = pd.merge(table, stored_table, how='left', indicator='Exist')
# Reduce to a boolean Series: True where the row is present in both tables.
shared = shared['Exist'] == 'both'
shared

0      True
1      True
2      True
3      True
4      True
5      True
6      True
7      True
8      True
9      True
10     True
11     True
12     True
13     True
14     True
15    False
16     True
17     True
18     True
19     True
20     True
21     True
22     True
23     True
24     True
25     True
26     True
27     True
28     True
29     True
30    False
31     True
Name: Exist, dtype: bool

In [44]:
# Display the table loaded from EPBC_notices.csv.
stored_table

Unnamed: 0,Reference Number,Title of referral,Notification from EPBC Act,Date of notice
0,2021/8900,APA Northern Goldfields Interconnect Pty Ltd/E...,Invitation for Public Comment on Referral,1/03/2021
1,2021/8888,Snowy Hydro Limited/Energy Generation and Supp...,Invitation for Public Comment on Referral,1/03/2021
2,2021/8889,AGL SA Generation Pty Limited/Energy Generatio...,Invitation for Public Comment on Referral,27/02/2021
3,2021/8887,Josko Garbin/Residential Development/Lot 877 P...,Notification of Proposal Withdrawn,26/02/2021
4,2018/8343,LA TROBE UNIVERSITY/Private/Lot- 1/PS444016/Vi...,Decision on approval of action: Approved with ...,25/02/2021
5,2020/8841,Sylvanvale Foundation/Residential Development/...,Suspension of Referral Decision Timeframe Period,25/02/2021
6,2009/5173,Gladstone Area Water Board and SunWater Limite...,Correction notice,25/02/2021
7,2021/8874,Department of Transport/Transport - Land/Betwe...,Decision whether action needs approval/Approva...,24/02/2021
8,2021/8874,Department of Transport/Transport - Land/Betwe...,Decision on Assessment Approach: Accredited As...,24/02/2021
9,2021/8883,Western Australian Land Authority trading as D...,Invitation for Public Comment on Referral,24/02/2021


In [20]:
# Display the first row of the stored table as a Series.
stored_table.iloc[0]

Reference Number                                                      2021/8900
Title of referral             APA Northern Goldfields Interconnect Pty Ltd/E...
Notification from EPBC Act            Invitation for Public Comment on Referral
Date of notice                                                        1/03/2021
Name: 0, dtype: object

In [16]:
# Display the table loaded from EPBC_notices.csv.
stored_table

Unnamed: 0,Reference Number,Title of referral,Notification from EPBC Act,Date of notice
0,2021/8900,APA Northern Goldfields Interconnect Pty Ltd/E...,Invitation for Public Comment on Referral,1/03/2021
1,2021/8888,Snowy Hydro Limited/Energy Generation and Supp...,Invitation for Public Comment on Referral,1/03/2021
2,2021/8889,AGL SA Generation Pty Limited/Energy Generatio...,Invitation for Public Comment on Referral,27/02/2021
3,2021/8887,Josko Garbin/Residential Development/Lot 877 P...,Notification of Proposal Withdrawn,26/02/2021
4,2018/8343,LA TROBE UNIVERSITY/Private/Lot- 1/PS444016/Vi...,Decision on approval of action: Approved with ...,25/02/2021
5,2020/8841,Sylvanvale Foundation/Residential Development/...,Suspension of Referral Decision Timeframe Period,25/02/2021
6,2009/5173,Gladstone Area Water Board and SunWater Limite...,Correction notice,25/02/2021
7,2021/8874,Department of Transport/Transport - Land/Betwe...,Decision whether action needs approval/Approva...,24/02/2021
8,2021/8874,Department of Transport/Transport - Land/Betwe...,Decision on Assessment Approach: Accredited As...,24/02/2021
9,2021/8883,Western Australian Land Authority trading as D...,Invitation for Public Comment on Referral,24/02/2021


In [4]:
# Directory that Chrome saves downloaded files into.
base_dir = '/home/student.unimelb.edu.au/shorte1/Documents/ACF_consulting/files'

# Chrome launch flags. '--headless' is intentionally not in this list so the
# browser window stays visible; add it to run without a window.
options = webdriver.ChromeOptions()
for chrome_flag in (
    '--ignore-certificate-errors',
    '--incognito',
    '--start-maximized',
):
    options.add_argument(chrome_flag)

# Download preferences: save straight into base_dir without prompting, and
# download PDFs rather than opening them in the built-in viewer.
download_prefs = {
    "download.default_directory": base_dir,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "safebrowsing.enabled": True,
    "plugins.always_open_pdf_externally": True,
}
options.add_experimental_option("prefs", download_prefs)

# Start the browser, open the listing page and pause so the page's data has
# time to load before the HTML is captured.
driver = webdriver.Chrome('/usr/bin/chromedriver', options=options)
driver.get(url)
time.sleep(5)
page_source = driver.page_source
soup = BeautifulSoup(page_source)

# The first HTML table found in the prettified page source is the notices table.
table = pd.read_html(soup.prettify())[0]

def clean_columns(table):
    """Return a new DataFrame whose column names have the sort hint removed.

    Every column label has the text
    '  . Activate to sort in descending order' stripped from it; labels that
    do not contain that text are left as they are. The input frame is not
    modified.

    Parameters
    ----------
    table : pandas.DataFrame
        Frame whose column labels are strings.

    Returns
    -------
    pandas.DataFrame
        The renamed frame.
    """
    sort_hint = '  . Activate to sort in descending order'
    renamed = {label: label.replace(sort_hint, '') for label in table.columns}
    return table.rename(columns=renamed)
    
# Tidy the headers, discard the 'Actions' column, then save the result as
# the notices CSV (without the DataFrame index).
table = clean_columns(table).drop(columns='Actions')
table.to_csv(
    '/home/student.unimelb.edu.au/shorte1/Documents/ACF_consulting/EPBC_notices.csv',
    index=False,
)

In [4]:
# Download the files attached to each notice on the current page and sort
# them into one folder per notice under base_dir.
#
# Two modes - download or update mode.
# If in download mode, search forward, skipping files that already exist
# in table.
# If in update mode, search forward, but assume new entries to the website
# appear first, so stop when a certain number of matches have occurred.
# NOTE(review): no mode switch exists in this cell yet; as written it always
# processes the first 10 rows of the current page.

# One "actions" dropdown button per table row.
xpath = '//a[@class="btn btn-default btn-xs" '
xpath += 'and @href="#" and @data-toggle="dropdown"]'
details_buttons = driver.find_elements_by_xpath(xpath)

# One "View Details" link per table row (inside the dropdown).
xpath = '//a[@class="details-link launch-modal" '
xpath += 'and @href="#" and @title="View Details"]'
details_links = driver.find_elements_by_xpath(xpath)

# These will record the number of files and filenames for each submission
num_files = []
file_names = []

# Iterate over the first 10 entries in the table on the current page,
# checking each for files
for i in range(10):


    # Open the row's dropdown, then its details modal; the sleeps give the
    # page time to react to each click.
    details_buttons[i].click()
    time.sleep(1)
    details_links[i].click()
    time.sleep(2)

    # The modal's content is inside an iframe; switch into it so the file
    # links can be located.
    iframe = driver.find_elements_by_xpath(
        '//section[@class="modal fade modal-form modal-form-details in"]'
        + '/div/div/div/iframe'
    )
    driver.switch_to.frame(iframe[0])

    file_links = driver.find_elements_by_xpath(
        "//a[contains(@href, '/_entity/annotation/')]"
    )

    # Clear any PDFs left directly in base_dir so that the download count
    # below only sees this entry's files.
    subprocess.run('rm ' + base_dir +'/*.pdf', shell=True)
    # Build the folder-name parts: reference number without '/', and the
    # notice date with a leading zero added to a single-digit day and the
    # slashes removed (e.g. '1/03/2021' -> '01032021').
    ref_num = table['Reference Number'].iloc[i].replace('/','')
    date = table['Date of notice'].iloc[i]
    date = re.sub(r'(^\d\/)',r'0\1',date)
    date = date.replace('/','')

    # If no files, record an empty result, close the modal and skip this entry
    if not file_links:
        num_files.append(0)
        file_names.append('')

        driver.switch_to.default_content()
        xpath = '//section[@class="modal fade modal-form '
        xpath += 'modal-form-details in"]/div/div/div/button'
        close_button = driver.find_elements_by_xpath(xpath)
        close_button[0].click()
        time.sleep(1)
        continue

    num_files.append(len(file_links))

    # Check if folder name already exists, if so append count.
    # The count is obtained by running `find ... | wc -l` in the shell and
    # reading the result back through a temporary text file.
    folder_name = ref_num + '_' + date
    shell_cmd = 'find ' + base_dir + '/*' + folder_name + '* -maxdepth 1 '
    shell_cmd += '-type d | wc -l > ' + base_dir + '/folder_count.txt'
    subprocess.run(shell_cmd, shell=True)
    folder_count = int(np.loadtxt(base_dir + '/folder_count.txt'))
    subprocess.run('rm ' + base_dir + '/folder_count.txt', shell=True)

    if folder_count > 0:
        if folder_count == 1:
            # Append '_1' to existing folder
            shell_cmd = 'mv ' + base_dir + "/" + folder_name + ' '
            shell_cmd += base_dir + "/" + folder_name + '_1'
            subprocess.run(shell_cmd, shell=True)
        # Append folder_count + 1 to new folder
        folder_name += '_' + str(folder_count+1)
    folder_path = base_dir + '/' + folder_name

    # Click every file link to start its download, pausing briefly between
    # clicks.
    for j in range(len(file_links)):
        file_links[j].click()
        time.sleep(0.5)

    # Wait for files to download: poll the number of *.pdf files in base_dir
    # every 0.5 s until it reaches the number of links clicked, giving up
    # with a RuntimeError after more than 1800 polls.
    file_count = 0
    iterations = 0
    while file_count < len(file_links):
        if iterations > 1800:
            raise RuntimeError('Download timed out.')
        time.sleep(.5)
        shell_cmd = 'find ' + base_dir + '/*.pdf '
        shell_cmd += '-type f -print | wc -l > '
        shell_cmd += base_dir + '/num_files.txt'
        subprocess.run(shell_cmd, shell=True)
        file_count = int(np.loadtxt(base_dir + '/num_files.txt'))
        iterations += 1
    subprocess.run('rm ' + base_dir + '/num_files.txt', shell=True)
    time.sleep(1)

    # After files downloaded, move them to appropriate folder.
    # Any existing folder at folder_path is removed first; the return codes
    # of these commands are not checked.
    subprocess.run(['rm', '-r', folder_path])
    subprocess.run(['mkdir', folder_path])
    shell_cmd = 'mv ' + base_dir + '/*.pdf ' + folder_path
    subprocess.run(shell_cmd, shell=True)

    # Record the filenames as a single comma-separated string, again going
    # through a temporary text file written by `find`.
    shell_cmd = 'find ' + folder_path + '/*.pdf -maxdepth 1 -type f '
    shell_cmd += '-printf "%f\n" > ' + folder_path + '/file_names.txt'
    subprocess.run(shell_cmd, shell=True)
    with open(folder_path + '/file_names.txt') as f:
        lines = f.readlines()
    file_names.append(', '.join(lines).replace('\n',''))
    subprocess.run('rm ' + folder_path + '/file_names.txt', shell=True)

    # Create merged pdf (only when there is more than one file to merge)
    if len(file_links) > 1:
        shell_cmd = 'pdfunite ' + folder_path + '/*.pdf ' + folder_path
        shell_cmd += '/' + folder_name + '_combined.pdf'
        subprocess.run(shell_cmd, shell=True)

    # Leave the iframe and close the modal before moving to the next row.
    driver.switch_to.default_content()
    xpath = '//section[@class="modal fade modal-form '
    xpath += 'modal-form-details in"]/div/div/div/button'
    close_button = driver.find_elements_by_xpath(xpath)
    close_button[0].click()
    time.sleep(1)

In [None]:
# Visit further result pages (currently only page 2) and add their rows to
# the notices table.
for i in range(2,3):

    # The XPath matches more than one pagination link for a given page
    # number; the second match is the one clicked.
    next_button = driver.find_elements_by_xpath(
        '//a[@href="#" and @data-page="' + str(i) + '"]'
    )[1]
    next_button.click()
    # Wait a random 5-15 seconds for the new page of results to load.
    time.sleep(10+np.random.rand()*10-5)
    page_source = driver.page_source
    new_soup = BeautifulSoup(page_source)
    new_table = pd.read_html(new_soup.prettify())[0]
    # Give the new page the same schema as the first page: tidy the headers
    # and drop the 'Actions' column, which the first-page table no longer has.
    new_table = clean_columns(new_table).drop(columns='Actions', errors='ignore')
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # way to stack the frames and renumber the index.
    table = pd.concat([table, new_table], ignore_index=True)

driver.quit()