In [2]:
# Using webscraping env
import datetime, os
import re, time, requests
import pandas as pd
from parsel import Selector
from selenium import webdriver
from selenium.webdriver import *
from selenium.webdriver.chrome.options import Options

## Scrapting data

In [2]:
def get_driver(url):
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(
        executable_path="/home/danph/Repos/run/drivers/chromedriver",
        options=chrome_options)
    driver.get(url)
    return driver

In [3]:
url = "https://www.coronavirus.vic.gov.au/exposure-sites"
driver = get_driver(url)
all_sites = [] # Store info of all sites

### Click to different pages

In [4]:
def get_site_info(site):
    """This function get info for each site"""
    site_name = site.xpath("td[2]/div/span[1]/text()").get()
    address = site.xpath("td[2]/div/span[2]/text()").get()
    suburb = site.xpath("td[2]/div/span[3]/text()").get()
    postcode = site.xpath("td[2]/div/span[5]/text()").get()
    period = site.xpath("td[3]/span[2]/text()").get()
    note = site.xpath("td[4]/span[2]/text()").get()
    date_added = site.xpath("td[5]/span[2]/text()").get()
    advice = site.xpath("td[6]/div/button/span/text()").get()
    return (site_name, address, suburb, postcode, period, note, date_added, advice)

def get_sites(driver):
    """This function get sites in one page"""
    sites = Selector(driver.page_source)
    sites = sites.xpath('//div/table[@class="rpl-search-results-table"]/tbody/tr')
    
    for site in sites:
        all_sites.append(get_site_info(site))

In [5]:
%%time
# Get data in all pages
## Click each page and get data in each table
while True:
    try:
        get_sites(driver)
        next_page = driver.find_element_by_xpath("//nav[@class='rpl-pagination']/div/button[last()]")
        next_page.click()        
    except Exception:
        print("Finished")
        break

Finished
CPU times: user 129 ms, sys: 5.58 ms, total: 135 ms
Wall time: 1.78 s


### Save into a dataframe

In [6]:
sites_df = pd.DataFrame(all_sites, columns=['site_name','address','suburb', 'postcode', 'period', 'note', 'date_added','advice'])
sites_df.shape

(117, 8)

In [7]:
sites_df.head()

Unnamed: 0,site_name,address,suburb,postcode,period,note,date_added,advice
0,Dukes Gym Abbotsford,571-573 Victoria Street,Abbotsford,3067,24/05/2021 6:15pm - 8:15pm,Case attended venue,26/05/2021,Tier 1 - Get tested immediately and quarantin...
1,Chelsea Heights Hotel,Springvale Road & Wells Road,Aspendale Gardens,3196,23/05/2021 12:10pm - 1:30pm,Case dined at venue,27/05/2021,Tier 1 - Get tested immediately and quarantin...
2,Axedale Tavern,105 High Street,Axedale,3551,23/05/2021 11:45am - 1:30pm,Case attended function,26/05/2021,Tier 1 - Get tested immediately and quarantin...
3,Endless Lifestyle Studio\r\n\r,Shop 3/188 Whitehorse Road\r\n\r,Balwyn,3103,23/05/2021 9:45am - 12:40pm,Case attended venue,26/05/2021,Tier 1 - Get tested immediately and quarantin...
4,Mcdonalds Bayswater North,296 Canterbury Road,Bayswater North,3153,21/05/2021 11:00pm - 12:00am,Case dined in restaurant,26/05/2021,Tier 1 - Get tested immediately and quarantin...


In [8]:
sites_df.isna().sum()

site_name     0
address       0
suburb        0
postcode      0
period        0
note          0
date_added    0
advice        0
dtype: int64

### Basic data processing and save into a csv file

In [4]:
def get_tier(text):
    if text.find("Tier 1") > -1:
        res = "Tier 1"
    elif text.find("Tier 2") > -1:
        res = "Tier 2"
    else:
        res = "Tier 3"
    return res

In [5]:
sites_df['tier'] = sites_df.advice.apply(lambda x: get_tier(x))

In [6]:
# TODO: Clean address

In [7]:
sites_df['full_address'] = sites_df['address'] + ", " + sites_df['suburb'] + ", Victoria, " + sites_df['postcode'].astype(str) + ", Australia"

In [8]:
sites_df.to_json("../data/covid_sites.json")