First, import the Python modules necessary for this notebook to run. These can be installed using, for instance, `pip` or `conda`. 

In [1]:
import numpy as np # Handles maths
import pandas as pd # Good for tables of data
import matplotlib.pyplot as plt # Handles graphing
import xarray as xr # Helpful for spatial data
import requests # Downloads webpages
from bs4 import BeautifulSoup # For parsing webpages
from selenium import webdriver 
import os, sys
import time
import subprocess

In [2]:
# Address of the public notices / referrals listing page that is opened in the
# browser and scraped below.
url = "http://epbcnotices.environment.gov.au/publicnoticesreferrals"

In [3]:
# Chrome launch flags.
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument("--start-maximized")
# Uncomment the next line to run Chrome without a visible window.
# options.add_argument('--headless')

# Directory Chrome downloads into. The original machine-specific location is
# kept as the default; set the EPBC_DOWNLOAD_DIR environment variable to use a
# different directory without editing the notebook.
base_dir = os.environ.get(
    'EPBC_DOWNLOAD_DIR',
    '/home/student.unimelb.edu.au/shorte1/Documents/ACF_consulting/files',
)

# Download straight to base_dir without prompting, and save PDFs to disk
# instead of opening them in Chrome's built-in viewer.
options.add_experimental_option("prefs", {
  "download.default_directory": base_dir,
  "download.prompt_for_download": False,
  "download.directory_upgrade": True,
  "safebrowsing.enabled": True,
  "plugins.always_open_pdf_externally": True
})

driver = webdriver.Chrome(executable_path='/usr/bin/chromedriver', options=options)
driver.get(url)
# Wait a random 5-15 seconds so the page's table has time to load.
time.sleep(10 + np.random.rand()*10-5)
page_source = driver.page_source
# Name the parser explicitly: without it BeautifulSoup guesses from whatever
# parsers are installed, which warns and can differ between machines.
soup = BeautifulSoup(page_source, 'html.parser')

# The first <table> found in the rendered page is taken as the results table.
table = pd.read_html(soup.prettify())[0]

def clean_columns(table):
    """Return ``table`` with the sort-hint text removed from its column names.

    Every occurrence of the exact text
    ``'  . Activate to sort in descending order'`` is removed from each
    string column label. Labels that are not strings (for example the
    integer labels of a header-less table) are left unchanged instead of
    raising ``AttributeError``.

    Parameters
    ----------
    table : pandas.DataFrame
        Frame whose column labels should be cleaned. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``table.rename`` with the cleaned labels.
    """
    clean_str = '  . Activate to sort in descending order'
    name_dict = {
        col: col.replace(clean_str, '')
        for col in table.columns
        if isinstance(col, str)  # only text labels can carry the hint
    }
    return table.rename(name_dict, axis='columns')
    
# Replace the scraped table with the version returned by clean_columns
# (column names with the sort-hint text removed).
table = clean_columns(table)

In [4]:
# Per-row dropdown toggles and the "View Details" links on the current page.
details_buttons = driver.find_elements_by_xpath(
    '//a[@class="btn btn-default btn-xs" and @href="#" and @data-toggle="dropdown"]'
)
details_links = driver.find_elements_by_xpath(
    '//a[@class="details-link launch-modal" and @href="#" and @title="View Details"]'
)

# Only the first table row is processed for now (range(1)); the original note
# says the page lists 30 entries.
for row in range(1):
    # Open the row's dropdown, then its "View Details" modal.
    details_buttons[row].click()
    time.sleep(2)
    details_links[row].click()
    time.sleep(2)

    # The modal content lives in an iframe; switch into it to reach the links.
    iframe = driver.find_elements_by_xpath(
        '//section[@class="modal fade modal-form modal-form-details in"]'
        + '/div/div/div/iframe'
    )
    driver.switch_to.frame(iframe[0])

    file_links = driver.find_elements_by_xpath(
        "//a[contains(@href, '/_entity/annotation/')]"
    )

    # Click every attachment link, pausing 2 seconds after each click.
    # A separate loop variable is used so the row index is not overwritten.
    for link in file_links:
        link.click()
        time.sleep(2)

    # After files downloaded, move them to a folder named after this row's
    # reference number (with any '/' removed).
    folder_name = table['Reference Number'].iloc[row].replace('/','')
    if folder_name in ('', '.', '..'):
        # Such a name would make the removal below target the download
        # directory itself (or its parent) rather than a per-row folder.
        raise ValueError(
            'Unusable reference number for row %d: %r' % (row, folder_name)
        )
    folder_path = os.path.join(base_dir, folder_name)
    # Start from an empty folder; -f keeps rm quiet when it does not exist yet.
    subprocess.run(['rm', '-rf', folder_path])
    os.makedirs(folder_path, exist_ok=True)
    # Move the PDFs without a shell so scraped text is never interpreted as a
    # command. Hidden files are skipped, as the shell pattern *.pdf did.
    for file_name in os.listdir(base_dir):
        if file_name.endswith('.pdf') and not file_name.startswith('.'):
            os.replace(
                os.path.join(base_dir, file_name),
                os.path.join(folder_path, file_name),
            )

# NOTE: the driver is still switched into the modal's iframe at this point.
# Processing more than one row would first need to switch back, e.g.:
# driver.switch_to.default_content()

In [None]:
# Collect the table from each further results page, then combine them once.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
page_tables = [table]
for page in range(2, 3):

    # Of the elements matching this XPath, the second one (index 1) is clicked.
    next_button = driver.find_elements_by_xpath(
        '//a[@href="#" and @data-page="' + str(page) + '"]'
    )[1]
    next_button.click()
    # Wait a random 5-15 seconds for the next page of results to load.
    time.sleep(10+np.random.rand()*10-5)
    page_source = driver.page_source
    # Explicit parser: avoids BeautifulSoup guessing one per machine.
    new_soup = BeautifulSoup(page_source, 'html.parser')
    new_table = pd.read_html(new_soup.prettify())[0]
    new_table = clean_columns(new_table)
    page_tables.append(new_table)

table = pd.concat(page_tables, ignore_index=True)

driver.quit()

In [5]:
# Scratch: click the second element of file_links (raises IndexError when
# fewer than two links were found).
file_links[1].click()

In [None]:
# Scratch: open the first row's dropdown, wait 2 seconds, then click its
# "View Details" link.
details_buttons[0].click()
time.sleep(2)
details_links[0].click()

In [None]:
# Scratch: locate the iframe nested three <div> levels inside the open
# details modal; the result is a list of matching elements.
iframe = driver.find_elements_by_xpath('//section[@class="modal fade modal-form modal-form-details in"]/div/div/div/iframe')

In [None]:
# Switch the driver into the first element of `iframe`, the list returned by
# the modal iframe lookup. The name `iframetest` used here before is not
# defined anywhere in the visible notebook and raised NameError.
driver.switch_to.frame(iframe[0])

In [None]:
# Reference markup of the details modal, kept as a comment so that this cell
# is valid Python:
# <section aria-hidden="false" class="modal fade modal-form modal-form-details in" role="dialog" tabindex="-1" style="display: block; padding-right: 15px;"><div class="modal-lg modal-dialog"><div class="modal-content"><div class="modal-header"><button class="close" data-dismiss="modal" type="button"><span aria-hidden="true">×</span><span class="sr-only">Close</span></button><h1 class="modal-title h4">Decision Notice Details</h1></div><div class="modal-body"><div class="form-loading" style="display: none;"><span class="fa fa-spinner fa-spin fa-4x" aria-hidden="true"></span></div><iframe data-page="/_portal/modal-form-template-path/a71d58ad-4cba-48b6-8dab-f3091fc31cd5" src="/_portal/modal-form-template-path/a71d58ad-4cba-48b6-8dab-f3091fc31cd5?id=bd343a19-e377-eb11-80c2-00505684c137&amp;entityformid=bd49f92c-14e8-431d-bd40-e6fdc206cddb&amp;languagecode=1033"></iframe></div></div></div></section>

In [None]:
# List the iframes that carry a data-page attribute. The earlier expression
# tested `@data_page` (underscore) with contains(..., ''), which is true for
# every iframe, so it did not filter on the attribute at all.
driver.find_elements_by_xpath("//iframe[@data-page]")

In [None]:
# Reference markup of the details modal, kept as a comment so that this cell
# is valid Python:
# <section aria-hidden="false" class="modal fade modal-form modal-form-details in" role="dialog" tabindex="-1" style="display: block; padding-right: 15px;"><div class="modal-lg modal-dialog"><div class="modal-content"><div class="modal-header"><button class="close" data-dismiss="modal" type="button"><span aria-hidden="true">×</span><span class="sr-only">Close</span></button><h1 class="modal-title h4">Decision Notice Details</h1></div><div class="modal-body"><div class="form-loading" style="display: none;"><span class="fa fa-spinner fa-spin fa-4x" aria-hidden="true"></span></div><iframe data-page="/_portal/modal-form-template-path/a71d58ad-4cba-48b6-8dab-f3091fc31cd5" src="/_portal/modal-form-template-path/a71d58ad-4cba-48b6-8dab-f3091fc31cd5?id=bd343a19-e377-eb11-80c2-00505684c137&amp;entityformid=bd49f92c-14e8-431d-bd40-e6fdc206cddb&amp;languagecode=1033"></iframe></div></div></div></section>

In [None]:
# Reference markup of the modal iframe with an empty src, kept as a comment
# so that this cell is valid Python:
# <iframe data-page="/_portal/modal-form-template-path/a71d58ad-4cba-48b6-8dab-f3091fc31cd5" src=""></iframe>

In [None]:
# Scratch: collect every <a> element whose href contains
# '/_entity/annotation/' in the driver's current frame.
file_links = driver.find_elements_by_xpath(
        "//a[contains(@href, '/_entity/annotation/')]"
)

In [None]:
# Display the list of matched link elements.
file_links

In [None]:
# Click each collected link in order, pausing 2 seconds after every click.
for link in file_links:
    link.click()
    time.sleep(2)

In [6]:
# NOTE(review): download_headless is not defined in the visible part of this
# notebook - confirm where it comes from; on a fresh kernel this call would
# raise NameError unless it is defined elsewhere.
download_headless(driver, base_dir)

In [None]:
# Scratch: click the first element of file_links.
file_links[0].click()

In [None]:
# Scratch: list <section> elements with the open details-modal class string
# and role="dialog".
driver.find_elements_by_xpath('//section[@class="modal fade modal-form modal-form-details in" and @role="dialog"]')

In [None]:
# Reference markup of the details modal, kept as a comment so that this cell
# is valid Python:
# <section aria-hidden="false" class="modal fade modal-form modal-form-details in" role="dialog" tabindex="-1" style="display: block; padding-right: 15px;"><div class="modal-lg modal-dialog"><div class="modal-content"><div class="modal-header"><button class="close" data-dismiss="modal" type="button"><span aria-hidden="true">×</span><span class="sr-only">Close</span></button><h1 class="modal-title h4">Decision Notice Details</h1></div><div class="modal-body"><div class="form-loading" style="display: none;"><span class="fa fa-spinner fa-spin fa-4x" aria-hidden="true"></span></div><iframe data-page="/_portal/modal-form-template-path/a71d58ad-4cba-48b6-8dab-f3091fc31cd5" src="/_portal/modal-form-template-path/a71d58ad-4cba-48b6-8dab-f3091fc31cd5?id=bd343a19-e377-eb11-80c2-00505684c137&amp;entityformid=bd49f92c-14e8-431d-bd40-e6fdc206cddb&amp;languagecode=1033"></iframe></div></div></div></section>

In [None]:
# Reference markup of the modal iframe, kept as a comment so that this cell
# is valid Python:
# <iframe data-page="/_portal/modal-form-template-path/a71d58ad-4cba-48b6-8dab-f3091fc31cd5" src="/_portal/modal-form-template-path/a71d58ad-4cba-48b6-8dab-f3091fc31cd5?id=bd343a19-e377-eb11-80c2-00505684c137&amp;entityformid=bd49f92c-14e8-431d-bd40-e6fdc206cddb&amp;languagecode=1033"></iframe>

In [None]:

# End the WebDriver session; `driver` cannot be used for further page actions
# after this call.
driver.quit()

In [None]:
# Reference markup of one attachment link, kept as a comment so that this
# cell is valid Python:
# <a href="/_entity/annotation/dabd9519-e477-eb11-80c2-00505684c137/a71d58ad-4cba-48b6-8dab-f3091fc31cd5?t=1614398517390" target="_blank">2021-8889 - referral.pdf (333.70 KB)</a>

In [None]:
# Number of "View Details" link elements in details_links; the markup of one
# such link is kept below for reference.
len(details_links)
# <a class="details-link launch-modal" role="menuitem" tabindex="-1" href="#" title="View Details" data-entityformid="bd49f92c-14e8-431d-bd40-e6fdc206cddb"><span class="fa fa-info-circle fa-lg fa-fw" aria-hidden="true"></span> View Details</a>

In [None]:
table