# Import

First, import the Python modules necessary for this notebook to run. These can be installed using, for instance, `pip` or `conda`. 

In [1]:
import numpy as np # Handles maths
import pandas as pd # Good for tables of data
import matplotlib.pyplot as plt # Handles graphing
import requests # Downloads webpages
from bs4 import BeautifulSoup # For parsing webpages
from selenium import webdriver 
from selenium.webdriver.common.action_chains import ActionChains
import os, sys
import time
import subprocess
import re
import string

import scrape_EPBC
%load_ext autoreload
%autoreload 2

# Initialise

In [2]:
# Disabled one-off clean-up of the stored notices CSV: load it, look up the
# row(s) with reference number 2019/8527, drop row 1539 (re-numbering the
# index), and write the table back without the index column.
# Uncomment the lines below to run it.
# stored_table = pd.read_csv('/home/student.unimelb.edu.au/shorte1/Documents/ACF_consulting/EPBC_notices_test.csv')
# stored_table[stored_table['Reference Number'] == '2019/8527']
# stored_table = stored_table.drop(1539, axis=0).reset_index(drop=True)
# stored_table.to_csv('/home/student.unimelb.edu.au/shorte1/Documents/ACF_consulting/EPBC_notices_test.csv', index=False, header=True)

In [3]:
# Address of the public notices page that the browser driver opens.
url = "http://epbcnotices.environment.gov.au/publicnoticesreferrals"

In [4]:
# Build the Chrome options for the scraping session.
options = webdriver.ChromeOptions()
# Add '--headless' to this tuple to run without a visible browser window;
# leave it out to watch the actions on the website.
for chrome_flag in (
        '--disable-gpu',
        '--no-sandbox',
        '--ignore-certificate-errors',
        '--incognito',
        "--start-maximized"):
    options.add_argument(chrome_flag)

# base_dir is the working folder; files_dir is the download folder inside it.
base_dir = '/home/student.unimelb.edu.au/shorte1/Documents/ACF_consulting/'
sub_dir = 'files_test'
files_dir = f'{base_dir}{sub_dir}'

# Chrome preferences: downloads go to files_dir without a prompt, and PDFs
# are opened externally rather than in the browser.
download_prefs = {
    "download.default_directory": files_dir,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "safebrowsing.enabled": True,
    "plugins.always_open_pdf_externally": True,
}
options.add_experimental_option("prefs", download_prefs)

# Start the browser and open the notices page.
driver = webdriver.Chrome('/usr/bin/chromedriver', options=options)
driver.get(url)
# Fixed pause after opening the page (presumably to let it finish loading).
time.sleep(4)

def clean_columns(table):
    """Return ``table`` with the sort-hint text removed from its column labels.

    Every occurrence of '  . Activate to sort in descending order' in a
    column label is deleted; labels without that text are left unchanged.
    The input frame is not modified: the renamed frame is returned.
    """
    sort_hint = '  . Activate to sort in descending order'
    tidy_names = {
        label: label.replace(sort_hint, '') for label in table.columns}
    return table.rename(columns=tidy_names)

In [5]:
%pdb

Automatic pdb calling has been turned ON


# Scrape Website

In [None]:
# Walk the paginated notices table one page at a time. For each page: wait
# until the table is fully rendered, work out which rows are already in the
# stored CSV, pass the page to scrape_EPBC.scrape_page if any row is new, and
# then click through to the next page.
ROWS_PER_PAGE = 30       # row count used to decide a page has finished loading
MAX_LOAD_ATTEMPTS = 30   # polls (2 s apart) allowed before giving up on a page
status_columns = [
    'PDF Attachments', 'Non PDF Attachments',
    'Download', 'Download Folder', 'PDFs Combined']

for i in range(1,50):

    # Poll the rendered page until the table has ROWS_PER_PAGE rows.
    # Every poll counts as an attempt, whether parsing raised or the table
    # was merely incomplete, so the RuntimeError below is always reachable.
    # NOTE(review): a table still showing the previous page also has
    # ROWS_PER_PAGE rows, so a slow page turn is not detected here.
    attempts = 0
    while True:
        if attempts > MAX_LOAD_ATTEMPTS:
            raise RuntimeError('Could not load website')
        attempts += 1
        time.sleep(2)
        try:
            soup = BeautifulSoup(driver.page_source)
            table = pd.read_html(soup.prettify())[0]
        except Exception:
            # Page not parseable yet (e.g. no table found); poll again.
            continue
        if len(table) == ROWS_PER_PAGE:
            break

    table = clean_columns(table)
    # Placeholder status columns; 'TBD' is broadcast to every row.
    for column in status_columns:
        table[column] = 'TBD'

    table['Date of notice'] = pd.to_datetime(
        table['Date of notice'], dayfirst=True)
    table = table.drop(columns='Actions')

    try:
        stored_table = pd.read_csv(base_dir + '/EPBC_notices_test.csv')
        stored_table['Date of notice'] = pd.to_datetime(
            stored_table['Date of notice'], dayfirst=True)
        # Left-merge the page against the stored rows on their shared
        # columns; a row flagged 'both' is already stored. The stored side
        # is de-duplicated first so that a repeated stored row cannot
        # multiply page rows and misalign `exist` with `table`.
        shared = pd.merge(
            table.drop(labels=status_columns, axis=1),
            stored_table.drop(labels=status_columns, axis=1).drop_duplicates(),
            how='left', indicator='Exist')
        exist = shared['Exist'] == 'both'
        del shared
    except Exception:
        # Best-effort fallback (e.g. the CSV does not exist yet): start from
        # an empty stored table and treat every row on the page as new.
        # .copy() gives an independent frame rather than a slice of `table`.
        stored_table = table.iloc[0:0].copy()
        exist = pd.Series([False]*len(table), name='Exist')

    if np.any(~exist):
        scrape_EPBC.scrape_page(
            driver, i, table, stored_table, exist, base_dir, files_dir)

    try:
        # The second link matching the next page number is the one clicked.
        next_button = driver.find_elements_by_xpath(
            '//a[@href="#" and @data-page="' + str(i+1) + '"]'
        )[1]
        ActionChains(driver).move_to_element(next_button).perform()
        next_button.click()
    except Exception:
        # No usable next-page link: stop instead of re-reading this page.
        print('Quitting.')
        break

    del table, stored_table

driver.quit()

Still Downloading. Please Wait.
Still Downloading. Please Wait.
Still Downloading. Please Wait.
Still Downloading. Please Wait.
Still Downloading. Please Wait.
Still Downloading. Please Wait.
Still Downloading. Please Wait.
Still Downloading. Please Wait.
Still Downloading. Please Wait.
Still Downloading. Please Wait.
Still Downloading. Please Wait.
Still Downloading. Please Wait.
Still Downloading. Please Wait.
Still Downloading. Please Wait.
Still Downloading. Please Wait.
Still Downloading. Please Wait.
Still Downloading. Please Wait.
Still Downloading. Please Wait.
Still Downloading. Please Wait.
Still Downloading. Please Wait.
Still Downloading. Please Wait.
Still Downloading. Please Wait.
Still Downloading. Please Wait.
Still Downloading. Please Wait.
Still Downloading. Please Wait.
Still Downloading. Please Wait.
Still Downloading. Please Wait.
Still Downloading. Please Wait.
Still Downloading. Please Wait.
Still Downloading. Please Wait.
Still Downloading. Please Wait.
Still Do

In [2]:
# Load the stored notices CSV for inspection.
stored_table = pd.read_csv('/home/student.unimelb.edu.au/shorte1/Documents/ACF_consulting/EPBC_notices_test.csv')

In [6]:
# Show the rows whose 'Download' column is 'Fail'.
stored_table.loc[stored_table['Download'] == 'Fail']

Unnamed: 0,Reference Number,Title of referral,Notification from EPBC Act,Date of notice,PDF Attachments,Non PDF Attachments,Download,Download Folder,PDFs Combined


In [4]:
# Display the stored table.
stored_table

Unnamed: 0,Reference Number,Title of referral,Notification from EPBC Act,Date of notice,PDF Attachments,Non PDF Attachments,Download,Download Folder,PDFs Combined
0,2021/8963,Western Region Water Corporation/Water Managem...,Invitation for Public Comment on Referral,07/06/2021,Yes,No,Success,20218963_07062021_western_region_water_corpora...,Yes
1,2021/8966,VIC Offshore Windfarm Pty Ltd/Energy Generatio...,Invitation for Public Comment on Referral,07/06/2021,Yes,No,Success,20218966_07062021_vic_offshore_windfarm_pty_lt...,Yes
2,2021/8969,Byron Shire Council/Waste Management (sewerage...,Invitation for Public Comment on Referral,07/06/2021,Yes,No,Success,20218969_07062021_byron_shire_council_invitati...,Yes
3,2021/8909,MMG Australia Limited/Mining/1km west of Roseb...,Suspension of Referral Decision Timeframe Period,04/06/2021,Yes,No,Success,20218909_04062021_mmg_australia_limited_suspen...,Yes
4,2018/8286,Transport for NSW/Transport - Land/M7 Motorway...,Decision on approval of action: Approved with ...,03/06/2021,Yes,No,Success,20188286_03062021_transport_for_nsw_decision_o...,Yes
...,...,...,...,...,...,...,...,...,...
145,2021/8895,Fraser Coast Regional Council/Waste Management...,Decision on Assessment Approach: Preliminary D...,19/04/2021,Yes,No,Success,20218895_19042021_fraser_coast_regional_counci...,Yes
146,2011/6179,Australian Zircon NL (AZC) /Mining/Approx 20km...,Notification of Proposal Withdrawn,18/04/2021,No,No,Not Applicable,Not Applicable,Not Applicable
147,2021/8898,Department of Finance/Commonwealth/1 Kelliher ...,Decision whether action needs approval/Approva...,16/04/2021,Yes,No,Success,20218898_16042021_department_of_finance_decisi...,Yes
148,2021/8902,Rawling Road Pty Ltd/Commercial Development/So...,Decision whether action needs approval/Approva...,16/04/2021,Yes,No,Success,20218902_16042021_rawling_road_pty_ltd_decisio...,Yes
