# Installing dependencies

In [None]:
%%capture
!pip install sparqlwrapper
!sudo apt-get install parallel

# Data gathering

The `execute_query` function adds the needed prefixes and handles errors.

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON

def execute_query(query:str):
    sparql = SPARQLWrapper("https://stad.gent/sparql")
    sparql.setReturnFormat(JSON)
    sparql.setMethod('POST')

    query = """
            PREFIX cidoc: <http://www.cidoc-crm.org/cidoc-crm/>
            PREFIX foaf: <http://xmlns.com/foaf/0.1/>
            PREFIX person: <https://data.vlaanderen.be/ns/persoon#>
            PREFIX w3: <http://www.w3.org/ns/adms#>
            
            """ + query

    sparql.setQuery(query)
    try:
        ret = sparql.queryAndConvert()

        #for r in ret["results"]["bindings"]:
        #    print(r)

        return ret
    except Exception as e:
        print(e)


This query gets the first 10000 iiif manifest links from the HVA-dataset.

In [None]:
res = execute_query("""SELECT DISTINCT ?y FROM <http://stad.gent/ldes/hva>{
  ?x cidoc:P129i_is_subject_of ?y
}
""") # LIMIT 1000

list_of_manifests= [f'''wget -q -O manifests/{x['y']['value'].split('/')[-1]}.json  {x['y']['value']}''' for x in res['results']['bindings']]
print(list_of_manifests)

['wget -q -O manifests/hva:2022-006-111.json  https://api.collectie.gent/iiif/presentation/v2/manifest/hva:2022-006-111', 'wget -q -O manifests/hva:2022-006-019.json  https://api.collectie.gent/iiif/presentation/v2/manifest/hva:2022-006-019', 'wget -q -O manifests/hva:2022-006-020.json  https://api.collectie.gent/iiif/presentation/v2/manifest/hva:2022-006-020', 'wget -q -O manifests/hva:2022-006-021.json  https://api.collectie.gent/iiif/presentation/v2/manifest/hva:2022-006-021', 'wget -q -O manifests/hva:2022-006-022.json  https://api.collectie.gent/iiif/presentation/v2/manifest/hva:2022-006-022', 'wget -q -O manifests/hva:2022-006-015.json  https://api.collectie.gent/iiif/presentation/v2/manifest/hva:2022-006-015', 'wget -q -O manifests/hva:2022-006-016.json  https://api.collectie.gent/iiif/presentation/v2/manifest/hva:2022-006-016', 'wget -q -O manifests/hva:2022-006-017.json  https://api.collectie.gent/iiif/presentation/v2/manifest/hva:2022-006-017', 'wget -q -O manifests/hva:2022-

All the retrieved manifest links are downloaded into the `manifests` directory.

In [None]:
import os

if not os.path.exists('manifests'):
    os.mkdir('manifests')
manifest_string = '\n'.join(list_of_manifests)
with open('manifest_list.txt', 'w+') as manifest_list_file:
    manifest_list_file.write(manifest_string)

!parallel -j4 --eta --progress -a manifest_list.txt


Computers / CPU cores / Max jobs to run
1:local / 2 / 4

Computer:jobs running/jobs completed/%of started jobs/Average seconds to complete
ETA: 0s Left: 0 AVG: 0.46s  local:0/10000/100%/0.5s 


The `get_image` function looks for an image url in the provided manifest and attempts to download it. Note: this currently assumes that there is only 1 image in the manifest, which isn't neccessarily the case.

In [None]:
import json

if not os.path.exists('images'):
    os.mkdir('images')

def get_image(manifest_path: str):
    with open(os.path.join('manifests', manifest_path), 'r') as json_file:
        json_data = json.load(json_file)
    image_url = json_data['sequences'][0]['canvases'][0]['images'][0]['resource']['@id']
    os.system(f'wget -P images {str(image_url)}')

for i in os.listdir('manifests'):
    get_image(i)

In [None]:
print(os.listdir('manifests')[:10])

['hva:2009-046-282.json', 'hva:2003-246-017.json', 'hva:2020-013-028.json', 'hva:1979-109-012.json', 'hva:1978-022-023.json', 'hva:2007-165-046.json', 'hva:2007-016-012.json', 'hva:1980-223-107.json', 'hva:2001-038-237.json', 'hva:FO-0106-0013.json']


In [None]:
import json

def get_image(manifest_path: str):
    print(manifest_path)
    with open(os.path.join('manifests', manifest_path), 'r') as json_file:
        json_data = json.load(json_file)
    print(len(json_data['sequences'][0]['canvases'][0]['images']))

for i in os.listdir('manifests'):
    get_image(i)

hva:2009-046-282.json
1
hva:2003-246-017.json


JSONDecodeError: ignored

Zip the downloaded images to make them ready for download. An alternative could be to put them on a google drive.

In [None]:
!zip -r -qq images.zip images

  adding: images/ (stored 0%)
  adding: images/default.jpg.116 (deflated 19%)
  adding: images/default.jpg.714 (deflated 6%)
  adding: images/default.jpg.393 (deflated 5%)
  adding: images/default.jpg.739 (deflated 1%)
  adding: images/default.jpg.513 (deflated 1%)
  adding: images/default.jpg.387 (deflated 0%)
  adding: images/default.jpg.282 (deflated 0%)
  adding: images/default.jpg.727 (deflated 0%)
  adding: images/default.jpg.76 (deflated 0%)
  adding: images/default.jpg.726 (deflated 95%)
  adding: images/default.jpg.142 (deflated 13%)
  adding: images/default.jpg.46 (deflated 0%)
  adding: images/default.jpg.92 (deflated 0%)
  adding: images/default.jpg.609 (deflated 0%)
  adding: images/default.jpg.355 (deflated 0%)
  adding: images/default.jpg.452 (deflated 1%)
  adding: images/default.jpg.234 (deflated 0%)
  adding: images/default.jpg.635 (deflated 0%)
  adding: images/default.jpg.205 (deflated 0%)
  adding: images/default.jpg.474 (deflated 1%)
  adding: images/default.jpg.2