In [None]:
# Objective

To produce a machine-readable dataset describing the State Records Office's (SRO) collection of scanned maps and
survey plans. Each object should carry its full set of metadata, including a rough spatial location, so that
end-users can choose objects of local interest to georectify.

In [None]:
# For want of an API, let's scrape the digital objects collection from SRO.

https://github.com/geogeeks-au/maps-for-lost-towns/issues/8

In [None]:
from bs4 import BeautifulSoup
import requests
import time
import json
import os.path
from pprintpp import pprint as pp

# --- Scraper configuration ----------------------------------------------------
SRO_CATALOGUE_URL = "https://archive.sro.wa.gov.au"
# Browse listing, 30 items per page, sorted by identifier; %s is the page number.
BROWSE_PATH = "/index.php/digitalobject/browse?page=%s&limit=30&sort=identifier"
# Seconds to wait on a request before giving up. Without a timeout,
# requests.get() can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30
# @TODO Determine what a nice wait period is to stop SRO's server from falling over
PAGE_DELAY = 1     # seconds slept before each browse-page request
ITEM_DELAY = 0.5   # seconds slept after each item-page request

page = 225         # first browse page to scrape
last_page = 225    # last browse page to scrape; None = continue until an empty page
scraping = True
json_file = "sro_digital_objects_collection.json"
digital_objects = []


def _fetch_soup(url):
    """Download ``url`` and return the response body parsed with lxml.

    The HTTP status is deliberately not checked: the main loop treats any
    listing page that yields no preview anchors as the end of the collection.
    Raises ``requests.exceptions.RequestException`` on timeout or connection
    failure.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    return BeautifulSoup(response.text, "lxml")


def _sanitise_field_name(label):
    """Turn a field heading into a key: trimmed, lower-cased, with spaces,
    hyphens and parentheses each replaced by an underscore."""
    name = label.strip().lower()
    for char in (" ", "-", "(", ")"):
        name = name.replace(char, "_")
    return name


def _extract_fields(item_soup):
    """Collect the non-empty metadata fields of an item page into a dict.

    ``name_of_creator`` is always a list (possibly empty) because that field
    name reoccurs on a page. Any other field name that reoccurs keeps only the
    last non-empty value seen.
    """
    fields_sanitised = {"name_of_creator": []}
    for field in item_soup.select("div#content > section > div.field"):
        heading = field.find("h3")
        body = field.find("div")
        # Skip malformed fields instead of crashing on None.text.
        if heading is None or body is None:
            continue

        field_name = _sanitise_field_name(heading.text)
        field_value = body.text.replace("\n", "").strip()

        # Ignore the many empty fields hidden in the page.
        if field_value == "":
            continue

        if isinstance(fields_sanitised.get(field_name), list):
            fields_sanitised[field_name].append(field_value)
        else:
            fields_sanitised[field_name] = field_value
    return fields_sanitised


def _scrape_digital_object(anchor):
    """Fetch the item page behind a browse-page preview anchor and return a
    dict with its URL, thumbnail URL, series and metadata fields."""
    object_url = SRO_CATALOGUE_URL + anchor["href"]
    item_soup = _fetch_soup(object_url)
    breadcrumbs = item_soup.select("section.breadcrumb > ul > li > a")
    return {
        "object_url": object_url,
        # Assumes the anchor's first child is the thumbnail <img> - TODO confirm.
        "thumbnail_url": SRO_CATALOGUE_URL + anchor.contents[0]["src"],
        # The first breadcrumb link is stored as the series.
        "series": breadcrumbs[0].text,
        "fields": _extract_fields(item_soup),
    }


def _append_to_json_file(path, new_objects):
    """Rewrite ``path`` as its existing JSON list (if any) plus ``new_objects``.

    Nothing is de-duplicated, so re-scraping a page appends its objects again.
    """
    if os.path.isfile(path):
        with open(path) as infile:
            existing_objects = json.load(infile)
    else:
        existing_objects = []

    with open(path, "w") as outfile:
        json.dump(existing_objects + new_objects, outfile)


# --- Main scraping loop -------------------------------------------------------
# print(...) with a single argument behaves the same on Python 2 and Python 3.
while scraping is True:
    time.sleep(PAGE_DELAY)

    soup = _fetch_soup(SRO_CATALOGUE_URL + (BROWSE_PATH % page))
    anchors = soup.select("div.preview > a")

    if len(anchors) == 0:
        # An empty listing page means we have run off the end of the collection.
        scraping = False
    else:
        print("# Page #%s, #%s Items" % (page, len(anchors)))

        digital_objects = []
        for anchor in anchors:
            print(SRO_CATALOGUE_URL + anchor["href"])
            digital_objects.append(_scrape_digital_object(anchor))
            time.sleep(ITEM_DELAY)

        # Persist after every page so an interrupted run keeps its progress.
        _append_to_json_file(json_file, digital_objects)

        page += 1
        print("")

    if last_page is not None and page > last_page:
        print("Fin")
        scraping = False

In [None]:
# Load our looted objects into PostgreSQL

import psycopg2

import os

# Connection string for the target database. Override it with the
# LOST_TOWNS_DSN environment variable. No password is embedded here: libpq
# picks it up from PGPASSWORD or ~/.pgpass when the DSN omits it.
connection = os.environ.get(
    "LOST_TOWNS_DSN",
    "host='pg01.geogeeks.org' dbname='lost_towns' user='keith'",
)

# Parameterised insert; psycopg2 does the quoting of each value.
query = ("INSERT INTO sro_digital_objects_collection (object_url, thumbnail_url, series, fields) "
         "VALUES (%s, %s, %s, %s);")

# Read the scraped objects first so a missing or corrupt file fails before
# a database connection is opened.
with open(json_file) as infile:
    digital_objects_json = json.load(infile)

conn = psycopg2.connect(connection)
try:
    cursor = conn.cursor()

    for index, dobject in enumerate(digital_objects_json):
        print(index)

        # The nested fields dict is serialised to a JSON string for its column.
        data = (dobject["object_url"], dobject["thumbnail_url"], dobject["series"], json.dumps(dobject["fields"]))
        cursor.execute(query, data)

    # All rows go in as a single transaction.
    print("Committing...")
    conn.commit()
    print("Fin")
except Exception:
    # Discard the partial transaction so no half-loaded batch is left pending.
    conn.rollback()
    raise
finally:
    # Always release the connection, whether the load succeeded or not.
    conn.close()

In [None]:
# Loot our local cache of images and extract all of the meaningful information we can about them

https://github.com/geogeeks-au/maps-for-lost-towns/issues/10
    
All done! See https://github.com/geogeeks-au/maps-for-lost-towns/blob/master/scrapers/

In [None]:
# Now let's stitch the two datasets (the scraped catalogue metadata and our local image cache) together so we can see whether the files we have can be associated with SRO's digital objects collection

https://github.com/geogeeks-au/maps-for-lost-towns/issues/11

In [None]:
# OK, now finally let's try to assign a spatial location to these

https://github.com/geogeeks-au/maps-for-lost-towns/issues/9