## Preliminaries

*We'll set up a working directory and download a small sample of images to ingest.*

In [50]:
import os

# We create a working directory for this example.
os.makedirs("uscg-storis", exist_ok=True)

# We create a "data" subdirectory for the images and the metadata tag files.
os.makedirs("uscg-storis/data", exist_ok=True)
data = "uscg-storis/data"

# We create an "out" subdirectory for the processed images and the metadata catalog.
os.makedirs("uscg-storis/out", exist_ok=True)
out  = "uscg-storis/out"  

Let's download a sample collection of 20 images from the logbooks of the USCG Storis.

In [51]:
import requests
import logging
import http.client

# To set up logging.
# https://stackoverflow.com/questions/16337511/
http.client.HTTPConnection.debuglevel = 1
logging.basicConfig()
logging.getLogger().setLevel(logging.DEBUG)
requests_log = logging.getLogger("requests.packages.urllib3")
requests_log.setLevel(logging.DEBUG)
requests_log.propagate = True

# To access the NARA API for images of the USCG Storis' 1957 logbook.
nara_id = "38547962"
api_base = 'https://catalog.archives.gov/api/v1/'
api_url = '{0}?naIds={1}'.format(api_base, nara_id)
res = requests.get(api_url)

# To parse the NARA API output for metadata.
entry_img_array = res.json().get('opaResponse').get('results').get('result')[0].get('objects').get('object')
digital_directory = entry_img_array[0].get('file').get('@path').split("/")[-2]

# To write the NARA API output to file for reference.
api_output = "{0}/nara_id_{1}.json".format(data, digital_directory, nara_id)
if res.status_code == 200:
    with open(api_output, 'wb') as f:
        f.write(res.content)

# To download images of 40 pages of the Storis' logbooks.
for img_info in entry_img_array: 

    # We test for mimetype "image/jpeg"---we don't want to download any files
    # with mimetype "application/pdf".
    if img_info.get('file').get('@mime') == "image/jpeg":

        img_name = img_info.get('file').get('@name')
        img_url = img_info.get('file').get('@url')
        img_res = requests.get(img_url)

        # To write a single image to file.
        local_img_name = "{0}/{1}".format(data, img_name)
        if img_res.status_code == 200:
            with open(local_img_name, 'wb') as img_f:
                img_f.write(img_res.content)

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /api/v1/?naIds=38547962 HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /api/v1/?naIds=38547962 HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa04.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa04
header: Cache-Control: no-store, no-cache
header: Content-Type: application/json;charset=UTF-8
header: Date: Thu, 19 Dec 2019 20:28:49 GMT
header: hnweb: pw04
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0126.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0126.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa01.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa01
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Thu, 19 Dec 2019 20:28:51 GMT
header: hnweb: pw04
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0127.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0127.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa04.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa04
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Thu, 19 Dec 2019 20:28:53 GMT
header: hnweb: pw01
header: Pragma: no-cache
header: Server: Apache/2.4.6 (Red Hat)
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0128.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0128.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa04.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa04
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Thu, 19 Dec 2019 20:28:55 GMT
header: hnweb: pw04
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0129.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0129.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa04.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa04
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Thu, 19 Dec 2019 20:28:56 GMT
header: hnweb: pw01
header: Pragma: no-cache
header: Server: Apache/2.4.6 (Red Hat)
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0130.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0130.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa02.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa02
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Thu, 19 Dec 2019 20:28:58 GMT
header: hnweb: pw04
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0131.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0131.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa02.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa02
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Thu, 19 Dec 2019 20:28:59 GMT
header: hnweb: pw03
header: Pragma: no-cache
header: Server: Apache/2.4.6 (Red Hat)
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0132.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0132.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa04.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa04
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Thu, 19 Dec 2019 20:29:01 GMT
header: hnweb: pw04
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0133.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0133.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa02.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa02
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Thu, 19 Dec 2019 20:29:03 GMT
header: hnweb: pw01
header: Pragma: no-cache
header: Server: Apache/2.4.6 (Red Hat)
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0134.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0134.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa02.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa02
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Thu, 19 Dec 2019 20:29:04 GMT
header: hnweb: pw03
header: Pragma: no-cache
header: Server: Apache/2.4.6 (Red Hat)
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0135.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0135.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa02.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa02
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Thu, 19 Dec 2019 20:29:05 GMT
header: hnweb: pw02
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0136.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0136.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa02.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa02
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Thu, 19 Dec 2019 20:29:07 GMT
header: hnweb: pw04
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0137.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'
reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa01.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa01
header: Cache-Control: no-store, no-cache
header: Content-Type:

DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0137.JPG HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


 image/jpeg;charset=UTF-8
header: Date: Thu, 19 Dec 2019 20:29:08 GMT
header: hnweb: pw04
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive
send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0138.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0138.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa01.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa01
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Thu, 19 Dec 2019 20:29:10 GMT
header: hnweb: pw04
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0139.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0139.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa03.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa03
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Thu, 19 Dec 2019 20:29:12 GMT
header: hnweb: pw03
header: Pragma: no-cache
header: Server: Apache/2.4.6 (Red Hat)
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0140.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0140.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa04.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa04
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Thu, 19 Dec 2019 20:29:13 GMT
header: hnweb: pw01
header: Pragma: no-cache
header: Server: Apache/2.4.6 (Red Hat)
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0141.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0141.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa04.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa04
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Thu, 19 Dec 2019 20:29:15 GMT
header: hnweb: pw01
header: Pragma: no-cache
header: Server: Apache/2.4.6 (Red Hat)
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0142.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0142.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa04.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa04
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Thu, 19 Dec 2019 20:29:17 GMT
header: hnweb: pw01
header: Pragma: no-cache
header: Server: Apache/2.4.6 (Red Hat)
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0143.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0143.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa04.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa04
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Thu, 19 Dec 2019 20:29:18 GMT
header: hnweb: pw02
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0144.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0144.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa01.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa01
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Thu, 19 Dec 2019 20:29:20 GMT
header: hnweb: pw04
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0145.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0145.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa04.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa04
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Thu, 19 Dec 2019 20:29:22 GMT
header: hnweb: pw04
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


Let's write a metadata tagfile in the "data" subdirectory with the minimal required metadata for the sample of images.

In [52]:
import csv

with open(os.path.join(data, 'metadata.csv'), mode='w') as metadata_file:
    metadata_writer = csv.writer(metadata_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    
    metadata_writer.writerow(['archive.host_country', 'USA'])
    metadata_writer.writerow(['document.contact_person', 'Kevin Wood'])
    metadata_writer.writerow(['archive.notes', 'Images available via API at https://catalog.archives.gov/api/v1/38547962'])
    metadata_writer.writerow(['platform.name', 'USCG Storis'])
    metadata_writer.writerow(['document.id_within_archive', '38547962'])
    metadata_writer.writerow(['document.id_within_archive_type', 'NARA ID'])
    metadata_writer.writerow(['document.record_type', "ships' logs"])
    metadata_writer.writerow(['document.accession_to_archive_date', '2016-08-19'])
    metadata_writer.writerow(['document.standardized_region_list', 'north_atlantic'])
    metadata_writer.writerow(['document.start_date', '1957-06-09'])
    metadata_writer.writerow(['document.start_date', '1957-09-30'])
    metadata_writer.writerow(['document.rights_statement', 'CC0 Public Domain'])
    metadata_writer.writerow(['document.notes', ''])

During ingest we'll associate the above metadata to the 20 samples images. In practice, any `.csv` file in the `data` subdirectory will be parsed as a metadata tagefile. For example, the tagfile `metadata.csv` provides metadata for images in the same directory `uscg-storis/data` as itself and in all subdirectory below itself. 

To enable users to provide "hierarchical" metadata, the information in a tagfile from a subdirectory has precendence over any tagfiles from parent directories. (The idea is to provide the *most specific metadata* for images in the same directory as the images themselves, while parent directories might provide *general metadata* for a whole collection of images.)

Here's what the tagfile we created looks like.

In [53]:
import pandas as pd
df = pd.read_csv(os.path.join(data, "metadata.csv"), header=None, names=["field", "value"])
df

Unnamed: 0,field,value
0,archive.host_country,USA
1,document.contact_person,Kevin Wood
2,archive.notes,Images available via API at https://catalog.ar...
3,platform.name,USCG Storis
4,document.id_within_archive,38547962
5,document.id_within_archive_type,NARA ID
6,document.record_type,ships' logs
7,document.accession_to_archive_date,2016-08-19
8,document.standardized_region_list,north_atlantic
9,document.start_date,1957-06-09


## Ingesting images

First, we'll interactively load "helper" functions as defined in the `rdai` module. 

In [57]:
%run -i rdai

Now, we'll define the global variable `fixed_seq` in order to call `mint_uuid` for each image file.

In [55]:
# We generate a fixed sequence for uuids.
get_fixed_seq()

If we're on casper, then we'll need to load python-magic from `rdadata`. Else, we assume the python-magic package has already been installed, e.g., with `pip3 install python-magic --user`.

In [58]:
import sys
sys.path.append('/glade/u/home/rdadata/lib/python/site-packages')

In [66]:
# get_exiftool()
import subprocess
repo_dir = subprocess.Popen(['git', 'rev-parse', '--show-toplevel'], stdout=subprocess.PIPE).communicate()[0].rstrip().decode('utf-8')
import sys
os.path.join(repo_dir, "dependencies/pyexiftool")
import exiftool

In [67]:
normalized_catalog = get_normalized_catalog(data)
# We generate a metadata catalog (unnormalized) from the data directory.

catalog = unnormalize_catalog(normalized_catalog)
# We flatten the normalized catalog. 
# Each file in the data directory "has its own entry" in this catalog.
# We'll eventually ignore non-image files.

write_timestamped_catalog(catalog, out)
# We write this version of the metadata catalog to the output directory.

RDAI: Tag EXIF:ImageUniqueID=07e153ae22a311eab39ff33270d5ee9b already exists in file uscg-storis/data/storis-wmec-38-1957-logbooks_0126.JPG.
RDAI: Tag EXIF:ImageUniqueID=080e771a22a311eab39ff33270d5ee9b already exists in file uscg-storis/data/storis-wmec-38-1957-logbooks_0127.JPG.
RDAI: Tag EXIF:ImageUniqueID=083d8de822a311eab39ff33270d5ee9b already exists in file uscg-storis/data/storis-wmec-38-1957-logbooks_0128.JPG.
RDAI: Tag EXIF:ImageUniqueID=086d96a622a311eab39ff33270d5ee9b already exists in file uscg-storis/data/storis-wmec-38-1957-logbooks_0129.JPG.
RDAI: Tag EXIF:ImageUniqueID=089ce00a22a311eab39ff33270d5ee9b already exists in file uscg-storis/data/storis-wmec-38-1957-logbooks_0130.JPG.
RDAI: Tag EXIF:ImageUniqueID=08cab56222a311eab39ff33270d5ee9b already exists in file uscg-storis/data/storis-wmec-38-1957-logbooks_0131.JPG.
RDAI: Tag EXIF:ImageUniqueID=08f9289222a311eab39ff33270d5ee9b already exists in file uscg-storis/data/storis-wmec-38-1957-logbooks_0132.JPG.
RDAI: Tag EXI

In [68]:
catalog = read_timestamped_catalog(out)
# We read in the most recent version of the metadata catalog from the out directory.

elementary_family = [c for c in catalog if c['media_type'].startswith("image")]
# We create a list of all the entries in the catalog that are image files.

In [69]:
import os
# We'll perform some file renames between the data directory and the out directory.

# We move all the images in the catalog to the output directory.
for member in elementary_family:
    os.rename(member['file_path'], os.path.join(out, member['uuid']))

In [74]:
os.listdir(out)

FileNotFoundError: [Errno 2] No such file or directory: 'uscg-storis/out'

In [71]:
# Conversely, we move all the images in the catalog back to the data directory.
for member in elementary_family:
    os.rename(os.path.join(out, member['uuid']), member['file_path'])

In [72]:
!rm -rf uscg-storis/