# Import

First, import the Python modules necessary for this notebook to run. These can be installed using, for instance, `pip` or `conda`. 

In [1]:
import numpy as np # Handles maths
import pandas as pd # Good for tables of data
import matplotlib.pyplot as plt # Handles graphing
import requests # Downloads webpages
from bs4 import BeautifulSoup # For parsing webpages
from selenium import webdriver 
from selenium.webdriver.common.action_chains import ActionChains
import os, sys
import time
import subprocess
import re
import string

import scrape_EPBC
%load_ext autoreload
%autoreload 2

# Initialise

In [2]:
# stored_table = pd.read_csv('/home/student.unimelb.edu.au/shorte1/Documents/ACF_consulting/EPBC_notices_test.csv')
# stored_table[stored_table['Reference Number'] == '2019/8527']
# stored_table = stored_table.drop(1539, axis=0).reset_index(drop=True)
# stored_table.to_csv('/home/student.unimelb.edu.au/shorte1/Documents/ACF_consulting/EPBC_notices_test.csv', index=False, header=True)

In [3]:
url = "http://epbcnotices.environment.gov.au/publicnoticesreferrals"

In [4]:
options = webdriver.ChromeOptions()
# options.add_argument('--headless') # Comment out to see the actions on website
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument("--start-maximized")

base_dir = '/home/student.unimelb.edu.au/shorte1/Documents/ACF_consulting/' 
sub_dir = 'files_test'
files_dir = base_dir + sub_dir 

options.add_experimental_option("prefs", {
  "download.default_directory": files_dir,
  "download.prompt_for_download": False,
  "download.directory_upgrade": True,
  "safebrowsing.enabled": True,
  "plugins.always_open_pdf_externally": True
})

driver = webdriver.Chrome('/usr/bin/chromedriver', options=options)
driver.get(url);
time.sleep(4)

def clean_columns(table):
    name_dict = {}
    clean_str = '  . Activate to sort in descending order'
    for col in range(len(table.columns)): 
        name_dict[table.columns[col]] = table.columns[col].replace(clean_str, '')
    return table.rename(name_dict, axis='columns')

# Define Page Scraper


Two modes - download or update mode.
If in download mode, search forward, skipping files that already exist 
in table.
If in update mode, search forward, but assume new entries to website
appear first, so stop when a certain number of matches have occured. 

In [5]:
%pdb

Automatic pdb calling has been turned ON


# Scrape Website

In [6]:
for i in range(1,4):
    
    loading = True
    attempts = 0
    while loading:
        if attempts > 30:
            raise RuntimeError('Could not load website')
        try:
            time.sleep(2)
            page_source = driver.page_source
            soup = BeautifulSoup(page_source)
            table = pd.read_html(soup.prettify())[0]
            if len(table) == 30:
                loading = False
        except:
            attempts += 1

    table = clean_columns(table)
    table['Attachments'] = ['TBD']*30
    table['Download'] = ['TBD']*30
    table['Download Folder'] = ['TBD']*30    
    table['Date of notice'] = pd.to_datetime(
        table['Date of notice'], dayfirst=True)
    table.drop(labels='Actions', axis=1, inplace=True)  
    
    try:
        stored_table = pd.read_csv(base_dir + '/EPBC_notices_test.csv')
        stored_table['Date of notice'] = pd.to_datetime(
            stored_table['Date of notice'], dayfirst=True)
        label_list = ['Attachments', 'Download', 'Download Folder']
        shared = pd.merge(
            table.drop(labels=label_list, axis=1), 
            stored_table.drop(labels=label_list, axis=1), 
            how='left', indicator='Exist')
        shared['Exist'] = np.where(shared.Exist == 'both', True, False)
        exist = shared['Exist']
        del shared
    except:
        stored_table = table.iloc[0:0]
        stored_table['Date of notice'] = pd.to_datetime(
            stored_table['Date of notice'], dayfirst=True)
        exist = [False]*30
        exist = pd.Series(exist,name='Exist')
     
    if np.any(~exist):
        scrape_EPBC.scrape_page(
            driver, i, table, stored_table, exist, base_dir, files_dir)
    
    try:
        next_button = driver.find_elements_by_xpath(
            '//a[@href="#" and @data-page="' + str(i+1) + '"]'
        )[1]
        ActionChains(driver).move_to_element(next_button).perform()
        next_button.click()
    except:
        print('Quitting.')
    
    del table, stored_table
    
driver.quit()

> [0;32m<ipython-input-6-e3ed8c7f91ec>[0m(28)[0;36m<module>[0;34m()[0m
[0;32m     26 [0;31m    [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     27 [0;31m[0;34m[0m[0m
[0m[0;32m---> 28 [0;31m    [0;32mtry[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     29 [0;31m        [0mstored_table[0m [0;34m=[0m [0mpd[0m[0;34m.[0m[0mread_csv[0m[0;34m([0m[0mbase_dir[0m [0;34m+[0m [0;34m'/EPBC_notices_test.csv'[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     30 [0;31m        stored_table['Date of notice'] = pd.to_datetime(
[0m
ipdb> n
> [0;32m<ipython-input-6-e3ed8c7f91ec>[0m(29)[0;36m<module>[0;34m()[0m
[0;32m     27 [0;31m[0;34m[0m[0m
[0m[0;32m     28 [0;31m    [0;32mtry[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 29 [0;31m        [0mstored_table[0m [0;34m=[0m [0mpd[0m[0;34m.[0m[0mread_csv[0m[0;34m([0m[0mbas

ipdb> n
> [0;32m<ipython-input-6-e3ed8c7f91ec>[0m(33)[0;36m<module>[0;34m()[0m
[0;32m     31 [0;31m            stored_table['Date of notice'], dayfirst=True)
[0m[0;32m     32 [0;31m        [0mlabel_list[0m [0;34m=[0m [0;34m[[0m[0;34m'Attachments'[0m[0;34m,[0m [0;34m'Download'[0m[0;34m,[0m [0;34m'Download Folder'[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 33 [0;31m        shared = pd.merge(
[0m[0;32m     34 [0;31m            [0mtable[0m[0;34m.[0m[0mdrop[0m[0;34m([0m[0mlabels[0m[0;34m=[0m[0mlabel_list[0m[0;34m,[0m [0maxis[0m[0;34m=[0m[0;36m1[0m[0;34m)[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     35 [0;31m            [0mstored_table[0m[0;34m.[0m[0mdrop[0m[0;34m([0m[0mlabels[0m[0;34m=[0m[0mlabel_list[0m[0;34m,[0m [0maxis[0m[0;34m=[0m[0;36m1[0m[0;34m)[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> n
> [0;32m<ipython-input-6-e3ed8c7f91ec>[0m(37)[0;36m<module>[0;34m()[0m
[0;32m 

ipdb> n
> [0;32m/home/student.unimelb.edu.au/shorte1/Documents/ACF_consulting/scrape_EPBC.py[0m(150)[0;36mscrape_page[0;34m()[0m
[0;32m    148 [0;31m    [0;31m# Iterate over the 30 entries in the table on current page checking for files[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    149 [0;31m    [0;32mfor[0m [0mi[0m [0;32min[0m [0mrange[0m[0;34m([0m[0;36m30[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 150 [0;31m        [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    151 [0;31m        [0;32mif[0m [0mexist[0m[0;34m[[0m[0mi[0m[0;34m][0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    152 [0;31m            [0;32mcontinue[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> n
> [0;32m/home/student.unimelb.edu.au/shorte1/Documents/ACF_consulting/scrape_EPBC.py[0m(151)[0;36mscrape_page[0;34m()[0m
[0;32m    149 [0;31m    [0;32m

BdbQuit: 

> [0;32m/home/student.unimelb.edu.au/shorte1/anaconda3/envs/acf/lib/python3.9/bdb.py[0m(113)[0;36mdispatch_line[0;34m()[0m
[0;32m    111 [0;31m        [0;32mif[0m [0mself[0m[0;34m.[0m[0mstop_here[0m[0;34m([0m[0mframe[0m[0;34m)[0m [0;32mor[0m [0mself[0m[0;34m.[0m[0mbreak_here[0m[0;34m([0m[0mframe[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    112 [0;31m            [0mself[0m[0;34m.[0m[0muser_line[0m[0;34m([0m[0mframe[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 113 [0;31m            [0;32mif[0m [0mself[0m[0;34m.[0m[0mquitting[0m[0;34m:[0m [0;32mraise[0m [0mBdbQuit[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    114 [0;31m        [0;32mreturn[0m [0mself[0m[0;34m.[0m[0mtrace_dispatch[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    115 [0;31m[0;34m[0m[0m
[0m
ipdb> q
