In [1]:
import os

# File name suffixes (compared case-insensitively) that identify a JPEG image.
JPEG_SUFFIXES = ('.jpg', '.jpeg')


def collect_jpegs(directory):
    """Recursively collect all JPEG files below ``directory``.

    A file counts as a JPEG when its name ends in ``.jpg`` or ``.jpeg``,
    ignoring case (so ``.JPG``, ``.Jpeg`` etc. are included). The leading dot
    is required: a file named ``fooJPEG`` is not an image.

    Returns a list of ``(file_name, absolute_path, None)`` tuples. The third
    slot is a placeholder that the batch-processing cell later replaces with
    the resolved Arachne image URL.
    """
    found = []
    for root, _dirs, files in os.walk(directory):
        for file in files:
            # str.endswith accepts a tuple, so one call covers every suffix.
            if file.lower().endswith(JPEG_SUFFIXES):
                found.append((file, os.path.abspath(os.path.join(root, file)), None))
    return found


image_directory = "/mnt/idai_cloud"

print(f'Collecting JPEGs in directory "{image_directory}".')
data = collect_jpegs(image_directory)
print(f'{len(data)} images found.')

Collecting JPEGs in directory "/mnt/idai_cloud".
1427486 images found.


In [2]:
from random import seed, shuffle

# Seed the RNG before shuffling so that, for the same collected file list,
# the order (and therefore the subsample drawn from it in the next cell) is
# the same on every run instead of changing with each kernel restart.
seed(42)
shuffle(data)


In [3]:
# Thin out the image list: keep the entries at positions 0, 100, 200, ...
# so that only one image in every hundred is processed further.
data_filtered = [entry for position, entry in enumerate(data) if position % 100 == 0]
print(f'{len(data_filtered)} images used.')


14275 images used.


In [4]:
import db.mariadb as mariadb
import json
import MySQLdb._exceptions as mysql_exceptions
from requests_futures.sessions import FuturesSession

# Resolve an Arachne image URL for every sampled file and store the result in
# MariaDB, working through `data_filtered` in batches of `batch_size` entries.
batch_size = 1000
arachne_path = 'https://arachne.dainst.org/data'
total_images = len(data_filtered)

print('Processing batches.')
print('  Trying to resolve Arachne URLs, this may take some time.')

# One session (and its worker pool) is shared by all batches; leaving the
# `with` block closes it once every request has been handled.
with FuturesSession(max_workers=10) as session:
    for batch_counter in range(0, total_images, batch_size):
        # Clamp the upper bound so the last (short) batch is reported correctly.
        batch_end = min(batch_counter + batch_size, total_images)
        print(f"  Processing batch {batch_counter} to {batch_end} of {total_images}.")

        # Slice the *filtered* list; slicing the full `data` list here would
        # ignore the subsampling and process images that were never selected.
        current_batch = data_filtered[batch_counter:batch_end]

        # Fire all search requests of this batch concurrently. Passing the file
        # name via `params` lets requests URL-encode it, so names containing
        # spaces, '&', '#' or '+' no longer corrupt the query string.
        futures = [
            session.get(f'{arachne_path}/search', params={'q': name})
            for (name, _path, _url) in current_batch
        ]

        for idx, future in enumerate(futures):
            response = future.result()
            try:
                json_value = response.json()
            except ValueError as e:
                # json.JSONDecodeError is a ValueError subclass; catching the
                # base class also covers the simplejson-based decode error that
                # requests may raise. Log the HTTP response and skip the entry.
                print(e)
                print(response)
                continue

            # Only an unambiguous match (exactly one hit) is accepted.
            if json_value['size'] == 1:
                entity_id = json_value['entities'][0]['entityId']
                if entity_id is not None:
                    name, path, _url = current_batch[idx]
                    current_batch[idx] = (name, path, f'{arachne_path}/image/{entity_id}')

        # NOTE(review): connection settings and credentials are hard-coded here;
        # consider loading them from the environment or a config file.
        con = mariadb.get_connection("127.0.0.1", 3308, "image_processing", "user", "user_pw")
        try:
            mariadb.write_files_data(current_batch, con)
        except mysql_exceptions.DataError as e:
            print(e)
        finally:
            # Always release the connection, even if an unexpected error escapes.
            con.close()

print('Done.')


Processing batches.
  Trying to resolve Arachne URLs, this may take some time.
  Processing batch 0 to 1000 of 14275.
  Processing batch 1000 to 2000 of 14275.
  Processing batch 2000 to 3000 of 14275.
  Processing batch 3000 to 4000 of 14275.
  Processing batch 4000 to 5000 of 14275.
  Processing batch 5000 to 6000 of 14275.
  Processing batch 6000 to 7000 of 14275.
  Processing batch 7000 to 8000 of 14275.
  Processing batch 8000 to 9000 of 14275.
  Processing batch 9000 to 10000 of 14275.
  Processing batch 10000 to 11000 of 14275.
  Processing batch 11000 to 12000 of 14275.
  Processing batch 12000 to 13000 of 14275.
  Processing batch 13000 to 14000 of 14275.
  Processing batch 14000 to 15000 of 14275.
Done.
