In [51]:
# Install the two third-party packages the scraping cell imports (bs4 and requests).
# `%pip` installs into the environment of the running kernel.
%pip install BeautifulSoup4
%pip install requests

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [144]:
import json
import requests
import re
from bs4 import BeautifulSoup
from typing import List, Dict


def build_soup(url='https://c21ch.newcastle.edu.au/colonialmassacres/timeline.php', timeout=30):
    """Download the timeline page and return the rows of its ``#timeline`` table.

    Args:
        url: Page to fetch. Defaults to the colonial massacres timeline.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The result of ``find_all('tr')`` on the table whose id is ``timeline``
        (header row included).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page contains no table with id ``timeline``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    table = BeautifulSoup(response.content, 'html.parser').find("table", {"id": "timeline"})
    if table is None:
        raise ValueError(f"No table with id 'timeline' found at {url}")

    return table.find_all('tr')


def convert_people(row: List[str]) -> Dict:
    """Parse a ``"<count> (<group>)"`` cell into a dict.

    Args:
        row: The fragments of one table cell (as produced by ``cell.contents``).
            A plain string is accepted as well and is parsed as a single
            fragment rather than being split into characters.

    Returns:
        ``{'count': <int>, 'group': <str>}`` for the first ``N (group)`` match
        found in the space-joined fragments.

    Raises:
        ValueError: If no ``N (group)`` pattern is present.
    """
    fragments = [row] if isinstance(row, str) else row
    # str() lets non-string fragments (e.g. nested tags) take part in the join
    # instead of making it raise TypeError.
    text = " ".join(str(fragment) for fragment in fragments)

    result = re.search(r'(?P<count>\d+) \((?P<group>[^)]+)\)', text)
    if result:
        peoples_count = result.groupdict()
        peoples_count['count'] = int(peoples_count['count'])
        return peoples_count

    raise ValueError(f"Could not parse row '{row}'")
    
def parse_location(location_cell):
    """Return the location held by the first fragment of a location cell.

    Args:
        location_cell: Either a list or a string (both returned unchanged), or
            an object exposing ``.contents`` whose first item is returned.

    Returns:
        The list/string itself, or ``location_cell.contents[0]``.
    """
    # Plain-text fragments have no ``.contents``; hand them back as they are
    # rather than failing with AttributeError.
    if isinstance(location_cell, (list, str)):
        return location_cell
    return location_cell.contents[0]

    
def convert_row_to_document(row_contents: List[str]):
    """Build one record from the per-cell contents of a table row.

    ``row_contents`` is indexed as ``row_contents[column][fragment]``; columns
    0-6 are read, in order, as date, colony, location, language group, victims
    killed, attackers killed and sources.

    Returns:
        A dict with the keys ``estimated_date``, ``colony``, ``location``,
        ``language_group``, ``estimated_victims_killed``,
        ``estimated_attackers_killed`` and ``sources``.
    """
    document = {}
    # Only the first fragment of the date, colony and location cells is used.
    document['estimated_date'] = "".join(row_contents[0][0])
    document['colony'] = row_contents[1][0]
    document['location'] = parse_location(row_contents[2][0])
    document['language_group'] = " ".join(row_contents[3])
    # Both tallies come out as {'count': ..., 'group': ...}.
    document['estimated_victims_killed'] = convert_people(row_contents[4])
    document['estimated_attackers_killed'] = convert_people(row_contents[5])
    # Source fragments are stringified one by one before being joined.
    source_fragments = [str(fragment) for fragment in row_contents[6]]
    document['sources'] = " ".join(source_fragments)
    return document


def extract_tds(raw_row):
    """Collect the ``.contents`` of every ``td`` found in ``raw_row``.

    Returns:
        A list with one entry per cell, in the order ``find_all("td")``
        yields them.
    """
    cell_contents = []
    for td in raw_row.find_all("td"):
        cell_contents.append(td.contents)
    return cell_contents


def build_groups():
    """Lazily yield one record per data row of the timeline table.

    The table rows are fetched when iteration starts; the first row is
    skipped and every remaining row is converted with
    ``convert_row_to_document``.
    """
    table_rows = build_soup()
    yield from (
        convert_row_to_document(extract_tds(table_row))
        for table_row in table_rows[1:]
    )
    

# Run the scrape once and keep every record in memory for the cleaning cell.
massacre_data = [document for document in build_groups()]

# Save the records exactly as scraped.
with open('aus-colonial-massacres.json', 'w') as raw_output:
    json.dump(massacre_data, raw_output, indent=4)

In [154]:
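# Clean the scraped records for plotting: ISO-format the dates, strip the
# bracketed numbers from locations and flatten the kill counts into columns.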

import datetime

def clean_date(date_str):
    """Convert a ``"<day> <month abbreviation> <year>"`` date to ISO 8601.

    Args:
        date_str: A date such as ``"1 Sep 1794"``. Leading, trailing and
            repeated whitespace is tolerated.

    Returns:
        The ``isoformat()`` string of the parsed date, e.g.
        ``"1794-09-01T00:00:00"``.

    Raises:
        ValueError: If the text does not match ``%d %b %Y``.
    """
    # split() with no argument drops surrounding whitespace and collapses
    # runs of it, so stray spaces from the scrape cannot break parsing.
    # No zero-padding is needed: strptime accepts a single-digit %d.
    normalised = " ".join(date_str.split())
    return datetime.datetime.strptime(normalised, '%d %b %Y').isoformat()

def parse_estimated_date(estimated_date: str):
    """Split an estimated date into its ``from`` and ``to`` parts.

    A value containing ``' to '`` is treated as a range and its first two
    pieces are used; any other value is used for both ends.

    Returns:
        A dict with the keys ``'from'`` and ``'to'``.
    """
    if ' to ' not in estimated_date:
        return {'from': estimated_date, 'to': estimated_date}

    start, end = estimated_date.split(' to ')[:2]
    return {'from': start, 'to': end}

def parse_cleaned_location(location):
    return re.sub(r'\s+?\(\d+\)', '', location)

def parse_victims_killed(killed: Dict):
    """Spread a victims tally over one zero-initialised column per group.

    Args:
        killed: A dict with ``'group'`` and ``'count'`` entries.

    Returns:
        A dict with ``killed_victims_aboriginal_people``,
        ``killed_victims_colonisers`` and ``killed_victims_other``; the entry
        matching ``killed['group']`` holds ``killed['count']``, the rest are 0.

    Raises:
        Exception: If the group is not one of the three known labels.
    """
    columns = (
        ('Aboriginal People', 'killed_victims_aboriginal_people'),
        ('Colonisers', 'killed_victims_colonisers'),
        ('Other', 'killed_victims_other'),
    )
    tallies = {column: 0 for _, column in columns}

    group = killed['group']
    for label, column in columns:
        if group == label:
            tallies[column] = killed['count']
            return tallies

    raise Exception(f"Unexpected group {killed['group']}")
    
def parse_attackers_killed(killed: Dict):
    """Spread an attackers tally over one zero-initialised column per group.

    Args:
        killed: A dict with ``'group'`` and ``'count'`` entries.

    Returns:
        A dict with ``killed_attackers_aborginal_people`` and
        ``killed_attackers_colonisers``; the entry matching
        ``killed['group']`` holds ``killed['count']``, the other is 0.

    Raises:
        Exception: If the group is neither of the two known labels.
    """
    # NOTE(review): "aborginal" is misspelt, but this key ends up in the
    # returned record, so it is left unchanged here — renaming it would
    # change the output schema.
    columns = (
        ('Aboriginal People', 'killed_attackers_aborginal_people'),
        ('Colonisers', 'killed_attackers_colonisers'),
    )
    tallies = {column: 0 for _, column in columns}

    group = killed['group']
    for label, column in columns:
        if group == label:
            tallies[column] = killed['count']
            return tallies

    raise Exception(f"Unexpected group {killed['group']}")

    
def clean_for_plotting(massacre: Dict):
    """Flatten one scraped record into a single-level dict.

    The estimated date is split into ISO-formatted ``from``/``to`` values, the
    location is passed through ``parse_cleaned_location``, and the attacker
    and victim tallies are expanded into one column per group.

    Returns:
        A new dict; ``massacre`` itself is not modified.
    """
    date_range = parse_estimated_date(massacre['estimated_date'])

    record = {
        "estimated_date_from": clean_date(date_range['from']),
        "estimated_date_to": clean_date(date_range['to']),
        "colony": massacre['colony'],
        "location": parse_cleaned_location(massacre['location']),
        "language_group": massacre['language_group'],
    }
    # Attackers are merged before victims so the column order stays the same.
    record.update(parse_attackers_killed(massacre['estimated_attackers_killed']))
    record.update(parse_victims_killed(massacre["estimated_victims_killed"]))
    record["sources"] = massacre['sources']
    return record

# Flatten every scraped record into its plotting-friendly form.
massacre_data_cleaned = list(map(clean_for_plotting, massacre_data))

# Save the cleaned records next to the raw export.
with open('aus-colonial-massacres-cleaned.json', 'w') as cleaned_output:
    json.dump(massacre_data_cleaned, cleaned_output, indent=4)


[
    {
        "estimated_date_from": "1794-09-01T00:00:00",
        "estimated_date_to": "1794-09-01T00:00:00",
        "colony": "NSW",
        "location": "Hawkesbury",
        "language_group": "Bediagal",
        "killed_attackers_aborginal_people": 0,
        "killed_attackers_colonisers": 0,
        "killed_victims_aboriginal_people": 7,
        "killed_victims_colonisers": 0,
        "killed_victims_other": 0,
        "sources": "Fletcher 1975: vol.1: 326; Turbet 2011: 81"
    },
    {
        "estimated_date_from": "1795-06-07T00:00:00",
        "estimated_date_to": "1795-06-07T00:00:00",
        "colony": "NSW",
        "location": "Hawkesbury",
        "language_group": "Bediagal",
        "killed_attackers_aborginal_people": 0,
        "killed_attackers_colonisers": 0,
        "killed_victims_aboriginal_people": 7,
        "killed_victims_colonisers": 0,
        "killed_victims_other": 0,
        "sources": "Fletcher 1975:  348-9;  <i>HRA, I,ii</i> : 416;  <i>HRNSW,  II</i