In [51]:
%pip install BeautifulSoup4
%pip install requests

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [124]:
import json
import requests
import re
from bs4 import BeautifulSoup
from typing import List, Dict


def build_soup(url='https://c21ch.newcastle.edu.au/colonialmassacres/timeline.php', timeout=30):
    """Download the massacre timeline page and return the rows of its table.

    Args:
        url: Address of the page to scrape; defaults to the timeline page.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        Every ``<tr>`` element found inside the ``<table id="timeline">``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page contains no table with id ``timeline``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    timeline = BeautifulSoup(response.content, 'html.parser').find("table", {"id": "timeline"})
    if timeline is None:
        # find() returns None when nothing matches; report that explicitly
        # rather than crashing with an AttributeError on None below.
        raise ValueError(f"No table with id 'timeline' found at {url}")

    return timeline.find_all('tr')


def convert_people(row: List[str]) -> Dict:
    """Parse a "<count> (<group>)" table cell into a dictionary.

    Args:
        row: The content pieces of one table cell; they are joined with
            single spaces before matching. A plain string is also accepted
            and matched as-is.

    Returns:
        ``{'count': <int>, 'group': <str>}`` taken from the first match.

    Raises:
        ValueError: If no "<count> (<group>)" pattern is present.
    """
    if isinstance(row, str):
        # Joining a bare string would insert a space between every character.
        text = row
    else:
        # str() each piece so a non-string element cannot break the join.
        text = " ".join(str(piece) for piece in row)

    result = re.search(r'(?P<count>\d+) \((?P<group>[^)]+)\)', text)
    if result is None:
        # ValueError is still an Exception, so existing handlers keep working.
        raise ValueError(f"Could not parse row '{row}'")

    peoples_count = result.groupdict()
    peoples_count['count'] = int(peoples_count['count'])
    return peoples_count
    
def parse_location(location_cell):
    """Return the location value held by the first element of a location cell.

    Args:
        location_cell: A list or a plain string (both returned unchanged), or
            an object exposing ``contents``, whose first child is returned.

    Returns:
        The list or string itself, or ``location_cell.contents[0]``.
    """
    # Text-only cells have no ``contents`` to unwrap, so hand them back as-is
    # instead of failing with an AttributeError.
    if isinstance(location_cell, (list, str)):
        return location_cell
    return location_cell.contents[0]

    
def convert_row_to_document(row_contents: List[str]):
    """Assemble one massacre record from the cells of a timeline row.

    ``row_contents`` is read positionally: entries 0-6 supply the estimated
    date, colony, location, language group, victims killed, attackers killed
    and sources, in that order.
    """
    document = {}

    # The first three columns only use the first element of their cell.
    document['estimated_date'] = "".join(row_contents[0][0])
    document['colony'] = row_contents[1][0]
    document['location'] = parse_location(row_contents[2][0])

    # The remaining columns consume every element of their cell.
    document['language_group'] = " ".join(row_contents[3])
    document['estimated_victims_killed'] = convert_people(row_contents[4])
    document['estimated_attackers_killed'] = convert_people(row_contents[5])

    source_parts = [str(part) for part in row_contents[6]]
    document['sources'] = " ".join(source_parts)

    return document


def extract_tds(raw_row):
    """Collect the ``contents`` of every ``td`` found in ``raw_row``."""
    cell_contents = []
    for td in raw_row.find_all("td"):
        cell_contents.append(td.contents)
    return cell_contents


def build_groups():
    """Lazily yield one document per timeline row, skipping the first row."""
    table_rows = build_soup()
    yield from (
        convert_row_to_document(extract_tds(table_row))
        for table_row in table_rows[1:]
    )
    

# Run the scrape eagerly and persist the resulting documents as JSON.
massacre_data = [*build_groups()]

with open('aus-colonial-massacres.json', mode='w') as raw_file:
    json.dump(massacre_data, raw_file, indent=4)

In [125]:
# Convert the scraped date strings to ISO 8601 date-time strings

import datetime

def clean_date(date_str):
    """Convert a "<day> <month abbreviation> <year>" date to an ISO 8601 string.

    Args:
        date_str: A date such as ``"1 Sep 1794"``. Leading, trailing and
            repeated whitespace is tolerated.

    Returns:
        The date at midnight as produced by ``datetime.isoformat()``,
        e.g. ``"1794-09-01T00:00:00"``.

    Raises:
        ValueError: If the text does not match the ``'%d %b %Y'`` format.
    """
    # split() with no argument discards stray whitespace around and between
    # the parts, which scraped text frequently carries.
    normalised = " ".join(date_str.split())
    # %d already accepts a single-digit day, so no zero padding is required.
    return datetime.datetime.strptime(normalised, '%d %b %Y').isoformat()

def parse_estimated_date(estimated_date: str):
    """Split an estimated date into its 'from' and 'to' parts.

    Text containing ' to ' is treated as a range whose first two pieces become
    'from' and 'to'; any other text is used for both ends.
    """
    pieces = estimated_date.split(' to ')
    if len(pieces) == 1:
        # No separator present: a single date bounds both ends.
        return {'from': estimated_date, 'to': estimated_date}
    return {'from': pieces[0], 'to': pieces[1]}

def clean_for_plotting(massacre: Dict):
    """Reduce a massacre document to its ISO-formatted estimated date range.

    Only the 'estimated_date' key of ``massacre`` is read; the result holds a
    single 'estimated_date' entry with cleaned 'from' and 'to' values.
    """
    date_range = parse_estimated_date(massacre['estimated_date'])
    iso_range = {
        bound: clean_date(date_range[bound])
        for bound in ('from', 'to')
    }
    return {'estimated_date': iso_range}

# Reduce every scraped record to its plotting-friendly form.
massacre_data_cleaned = list(map(clean_for_plotting, massacre_data))

with open('aus-colonial-massacres-cleaned.json', mode='w') as cleaned_file:
    json.dump(massacre_data_cleaned, cleaned_file, indent=4)

# Echo the same JSON in the cell output.
print(json.dumps(massacre_data_cleaned, indent=4))

[
    {
        "estimated_date": {
            "from": "1794-09-01T00:00:00",
            "to": "1794-09-01T00:00:00"
        }
    },
    {
        "estimated_date": {
            "from": "1795-06-07T00:00:00",
            "to": "1795-06-07T00:00:00"
        }
    },
    {
        "estimated_date": {
            "from": "1804-05-03T00:00:00",
            "to": "1804-05-03T00:00:00"
        }
    },
    {
        "estimated_date": {
            "from": "1805-04-27T00:00:00",
            "to": "1805-04-27T00:00:00"
        }
    },
    {
        "estimated_date": {
            "from": "1806-03-01T00:00:00",
            "to": "1806-03-01T00:00:00"
        }
    },
    {
        "estimated_date": {
            "from": "1815-11-01T00:00:00",
            "to": "1815-11-30T00:00:00"
        }
    },
    {
        "estimated_date": {
            "from": "1816-04-17T00:00:00",
            "to": "1816-04-17T00:00:00"
        }
    },
    {
        "estimated_date": {
            "from": "1818-