# Queensland workplace incidents

Goal: Scrape [a paginated list of Queensland workplace incidents](https://www.worksafe.qld.gov.au/news-and-events/alerts) into a CSV file.

In [None]:
import csv
import time
from urllib.parse import parse_qs, urlsplit

import requests
from bs4 import BeautifulSoup

In [None]:
# column names for the output CSV, in the order each row is built
csv_headers = ['title', 'link', 'description', 'date']

# address of the alerts listing; reused for every page request
base_url = 'https://www.worksafe.qld.gov.au/news-and-events/alerts'

In [None]:
# first, we need to figure out how many items are on each page,
# and how many pages there are, in our initial request

# request the page; requests waits forever by default, so set a timeout
# (in seconds) to keep a stalled server from hanging the notebook
req = requests.get(base_url, timeout=30)

# check for HTTP errors -- raises on a 4xx/5xx response instead of
# letting an error page get parsed as if it were results
req.raise_for_status()

In [None]:
# parse the response body into a navigable tree using the built-in HTML parser
soup = BeautifulSoup(markup=req.text, features='html.parser')

In [None]:
# bare expression: shows the parsed document as this cell's output
soup

In [None]:
# collect every anchor tagged with the pagination class, take the final one
# in document order ([-1]) and read the URL stored in its `href` attribute
pagination = soup.find_all(
    'a',
    class_='pagination__link',
)[-1]['href']

In [None]:
# bare expression: shows the `href` pulled from the last pagination link
pagination

In [None]:
# get the final `start_rank` number -- the offset -- from the link's query
# string and coerce to a number. Parsing the query string (instead of taking
# whatever follows the last `=`) keeps this working if the link carries other
# parameters after `start_rank` or ends in a `#fragment`.
query_params = parse_qs(urlsplit(pagination).query)

if 'start_rank' in query_params:
    last_start_rank = int(query_params['start_rank'][-1])
else:
    # no explicit `start_rank` parameter: fall back to the number after
    # the last `=` in the URL
    last_start_rank = int(pagination.split('=')[-1])

In [None]:
# bare expression: shows the offset parsed from the last pagination link
last_start_rank

In [None]:
# the page size is however many result entries appear on this first page
items = soup.find_all('li', class_='search-results__item')
per_page = len(items)

In [None]:
# bare expression: shows how many result items were found on the first page
per_page

In [None]:
# a tracking list to hold the parsed data
data = []

# the `start_rank` param is basically, which item number should I start with
# on this search results page? so we use the `range()` function to build
# a range of numbers from 1 to `last_start_rank` (plus one, because the top end
# of the range is always exclusive), counting by the number of items per page

# one Session for the whole crawl reuses the underlying connection instead of
# opening a fresh one for every page
with requests.Session() as session:

    for start_rank in range(1, last_start_rank+1, per_page):

        # grab the page; requests has no default timeout, so set one (in
        # seconds) to keep a stalled server from hanging the whole scrape
        req = session.get(
            base_url,
            params={
                'start_rank': start_rank
            },
            timeout=30
        )

        # check for HTTP errors
        req.raise_for_status()

        print(f'Grabbing items starting at {start_rank} ...')

        # turn the HTML into soup
        soup = BeautifulSoup(req.text, 'html.parser')

        # grab a list of items on the page
        items = soup.find_all('li', {'class': 'search-results__item'})

        # loop over the list of items
        for item in items:
            # the headline holds both the title text and the link
            hed = item.find('h4')
            title = hed.text.strip()
            link = hed.find('a')['href']

            # first <p> and first <span> inside the item
            description = item.find('p').text.strip()
            date = item.find('span').text.strip()

            # build a list of data in the same order as the headers
            row_data = [
                title,
                link,
                description,
                date
            ]

            data.append(row_data)

        # pause between requests to go easy on the server
        time.sleep(0.5)

In [None]:
# bare expression: shows the accumulated list of scraped rows
data

In [None]:
# dump everything to disk: the header row first, then one row per scraped item
with open('qld-incidents.csv', mode='w', newline='', encoding='utf-8') as csv_file:
    csv.writer(csv_file).writerows([csv_headers, *data])