# Metadata Application Profile Assignment

In [1]:
import csv
import json
import requests

# for later, when working with local files
import glob
import os
from os.path import join

In [2]:
endpoint = 'https://www.loc.gov/free-to-use'
parameters = {
    'fo' : 'json'
}

In [3]:
collection = 'travel-posters'

In [4]:
collection_list_response = requests.get(endpoint + '/' + collection, params=parameters)

In [5]:
collection_list_response.url

'https://www.loc.gov/free-to-use/travel-posters?fo=json'

In [6]:
collection_json = collection_list_response.json()

In [7]:
# .keys() is a helpful function to see what the data elements are
collection_json.keys()

dict_keys(['breadcrumbs', 'content', 'content_is_post', 'description', 'expert_resources', 'next', 'next_sibling', 'options', 'pages', 'portal', 'previous', 'previous_sibling', 'site_type', 'timestamp', 'title', 'type'])

In [8]:
for k in collection_json['content']['set']['items']:
    print(k)

{'image': '/static/portals/free-to-use/public-domain/travel-posters/13399u.jpg', 'link': '/resource/ppmsca.13399/', 'title': 'Yellowstone National Park, Ranger Naturalist Service'}
{'image': '/static/portals/free-to-use/public-domain/travel-posters/06845u.jpg', 'link': '/resource/ds.06845/', 'title': 'Fly TWA New York / David.'}
{'image': '/static/portals/free-to-use/public-domain/travel-posters/13397u.jpg', 'link': '/resource/ppmsca.13397/', 'title': 'Grand Canyon National Park'}
{'image': '/static/portals/free-to-use/public-domain/travel-posters/3g05158u.jpg', 'link': '/resource/cph.3g05158/', 'title': "Buckingham Fountain on Chicago's lake front, world's largest and most beautiful illuminated fountain"}
{'image': '/static/portals/free-to-use/public-domain/travel-posters/13396u.jpg', 'link': '/resource/ppmsca.13396/', 'title': 'Fort Marion National Monument, St. Augustine, Florida'}
{'image': '/static/portals/free-to-use/public-domain/travel-posters/3b48750u.jpg', 'link': '/resource/

In [9]:
len(collection_json['content']['set']['items'])

21

In [10]:
collection_json['content']['set']['items'][0].keys()

dict_keys(['image', 'link', 'title'])

In [11]:
travel_set_list = os.path.join('data','travel_set_list.csv')
headers = ['image','link','title']

with open(travel_set_list, 'w', encoding='utf-8', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=headers)
    writer.writeheader()
    for item in collection_json['content']['set']['items']:
        
        # clean up errant spaces in the title fields
        item['title'] = item['title'].rstrip()
        writer.writerow(item)
    print('wrote',travel_set_list)

wrote data/travel_set_list.csv


# Get metadata for individual items 

In [12]:
# update endpoint info
endpoint = 'https://www.loc.gov'
parameters = {
    'fo' : 'json'
}

In [13]:
# run this cell to confirm that you have a location for the JSON files
item_metadata_directory = os.path.join('data','ftu_travel_metadata')

if os.path.isdir(item_metadata_directory):
    print(item_metadata_directory,'exists')
else:
    os.mkdir(item_metadata_directory)
    print('created',item_metadata_directory)

created data/ftu_travel_metadata


In [14]:
item_count = 0
error_count = 0
file_count = 0

data_directory = 'data'
item_metadata_directory = 'ftu_travel_metadata'
item_metadata_file_start = 'item_metadata'
json_suffix = '.json'

travel_set_list = os.path.join('data','travel_set_list.csv')

with open(travel_set_list, 'r', encoding='utf-8', newline='') as f:
    reader = csv.DictReader(f, fieldnames=headers)
    for item in reader:
        if item['link'] == 'link':
            continue
        # these resource links could redirect to item pages, but currently don't work
        if '?' in item['link']:
            resource_ID = item['link']
            short_ID = item['link'].split('/')[2]
            item_metadata = requests.get(endpoint + resource_ID + '&fo=json')
            print('requested',item_metadata.url,item_metadata.status_code)
            if item_metadata.status_code != 200:
                print('requested',item_metadata.url,item_metadata.status_code)
                error_count += 1
                continue
            try:
                item_metadata.json()
            except: #basically this catches all of the highsmith photos with hhh in the ID
                error_count += 1
                print('no json found')
                continue
            fout = os.path.join(data_directory, item_metadata_directory, str(item_metadata_file_start + '-' + short_ID + json_suffix))
            with open(fout, 'w', encoding='utf-8') as json_file:
                json_file.write(json.dumps(item_metadata.json()['item']))
                file_count += 1
                print('wrote', fout)
            item_count += 1
        else:
            resource_ID = item['link']
            short_ID = item['link'].split('/')[2]
            item_metadata = requests.get(endpoint + resource_ID, params=parameters)
            print('requested',item_metadata.url,item_metadata.status_code)
            if item_metadata.status_code != 200:
                print('requested',item_metadata.url,item_metadata.status_code)
                error_count += 1
                continue
            try:
                item_metadata.json()
            except:
                error_count += 1
                print('no json found')
                continue
            fout = os.path.join(data_directory, item_metadata_directory, str(item_metadata_file_start + '-' + short_ID + json_suffix))
            with open(fout, 'w', encoding='utf-8') as json_file:
                json_file.write(json.dumps(item_metadata.json()['item']))
                file_count += 1
                print('wrote', fout)
            item_count += 1

print('--- mini LOG ---')
print('items requested:',item_count)
print('errors:',error_count)
print('files written:',file_count)

requested https://www.loc.gov/resource/ppmsca.13399/?fo=json 200
wrote data/ftu_travel_metadata/item_metadata-ppmsca.13399.json
requested https://www.loc.gov/resource/ds.06845/?fo=json 200
wrote data/ftu_travel_metadata/item_metadata-ds.06845.json
requested https://www.loc.gov/resource/ppmsca.13397/?fo=json 200
wrote data/ftu_travel_metadata/item_metadata-ppmsca.13397.json
requested https://www.loc.gov/resource/cph.3g05158/?fo=json 200
wrote data/ftu_travel_metadata/item_metadata-cph.3g05158.json
requested https://www.loc.gov/resource/ppmsca.13396/?fo=json 200
wrote data/ftu_travel_metadata/item_metadata-ppmsca.13396.json
requested https://www.loc.gov/resource/cph.3b48750/?fo=json 200
wrote data/ftu_travel_metadata/item_metadata-cph.3b48750.json
requested https://www.loc.gov/resource/cph.3b48864/?fo=json 200
wrote data/ftu_travel_metadata/item_metadata-cph.3b48864.json
requested https://www.loc.gov/resource/cph.3b48718/?fo=json 200
wrote data/ftu_travel_metadata/item_metadata-cph.3b487

# Transformation Part 1: Testing

In [15]:
current_loc = os.getcwd()

print(current_loc)

/Users/emmaheck/Documents/umich/courses/SI676/networked-services-labs-2023-main


In [16]:
metadata_file_path = os.path.join('data','ftu_travel_metadata')

print(metadata_file_path)

data/ftu_travel_metadata


The next cell uses the `glob` library, which supports the use of file path expanders
to look for patterns in file paths. In this case, the previous item metadata exraction
wrote files that had the pattern `item_metadata-[item-identifier].json`. 
So, to match any pattern for the `item-identifier` section, `glob` allows
the use of the `*` (asterisk) character to match any pattern:

In [17]:
file_count = 0

for file in glob.glob('data/ftu_travel_metadata/item_metadata-*.json'):
    file_count += 1
    print(file)
    
print('found',file_count)

data/ftu_travel_metadata/item_metadata-ppmsca.13400.json
data/ftu_travel_metadata/item_metadata-ppmsca.13396.json
data/ftu_travel_metadata/item_metadata-cph.3b48718.json
data/ftu_travel_metadata/item_metadata-ppmsca.43496.json
data/ftu_travel_metadata/item_metadata-ds.08068.json
data/ftu_travel_metadata/item_metadata-cph.3g04243.json
data/ftu_travel_metadata/item_metadata-cph.3b49082.json
data/ftu_travel_metadata/item_metadata-ppmsca.13397.json
data/ftu_travel_metadata/item_metadata-cph.3f05643.json
data/ftu_travel_metadata/item_metadata-cph.3b48860.json
data/ftu_travel_metadata/item_metadata-cph.3b48733.json
data/ftu_travel_metadata/item_metadata-cph.3g05158.json
data/ftu_travel_metadata/item_metadata-cph.3b48750.json
data/ftu_travel_metadata/item_metadata-cph.3b48731.json
data/ftu_travel_metadata/item_metadata-ppmsca.04892.json
data/ftu_travel_metadata/item_metadata-ppmsca.13398.json
data/ftu_travel_metadata/item_metadata-cph.3b48864.json
data/ftu_travel_metadata/item_metadata-cph.3g

In [18]:
list_of_item_metadata_files = list() 
for file in glob.glob('data/ftu_travel_metadata/item_metadata-*.json'):
    list_of_item_metadata_files.append(file)

In [19]:
len(list_of_item_metadata_files)

21

In [20]:
# quick duplicate check
list_of_item_metadata_files.sort()

for file in list_of_item_metadata_files:
    print(file)

data/ftu_travel_metadata/item_metadata-cph.3b48718.json
data/ftu_travel_metadata/item_metadata-cph.3b48731.json
data/ftu_travel_metadata/item_metadata-cph.3b48733.json
data/ftu_travel_metadata/item_metadata-cph.3b48750.json
data/ftu_travel_metadata/item_metadata-cph.3b48860.json
data/ftu_travel_metadata/item_metadata-cph.3b48864.json
data/ftu_travel_metadata/item_metadata-cph.3b49082.json
data/ftu_travel_metadata/item_metadata-cph.3b51486.json
data/ftu_travel_metadata/item_metadata-cph.3f05643.json
data/ftu_travel_metadata/item_metadata-cph.3g02947.json
data/ftu_travel_metadata/item_metadata-cph.3g04243.json
data/ftu_travel_metadata/item_metadata-cph.3g05158.json
data/ftu_travel_metadata/item_metadata-ds.06845.json
data/ftu_travel_metadata/item_metadata-ds.08068.json
data/ftu_travel_metadata/item_metadata-ppmsca.04892.json
data/ftu_travel_metadata/item_metadata-ppmsca.13396.json
data/ftu_travel_metadata/item_metadata-ppmsca.13397.json
data/ftu_travel_metadata/item_metadata-ppmsca.13398

In [21]:
# try first with one file, can you open the json, can you see what elements are in the json?
with open(list_of_item_metadata_files[0], 'r', encoding='utf-8') as item:
    # what are we looking at?
    print('file:',list_of_item_metadata_files[0],'\n')
    
    # load the item data
    item_data = json.load(item)
    
    for element in item_data.keys():
        print(element,':',item_data[element])

file: data/ftu_travel_metadata/item_metadata-cph.3b48718.json 

_version_ : 1754109026501656576
access_restricted : False
aka : ['http://www.loc.gov/pictures/collection/wpapos/item/98507272/', 'https://hdl.loc.gov/loc.pnp/cph.3b48718', 'http://www.loc.gov/item/98507272/', 'http://www.loc.gov/pictures/item/98507272/', 'https://hdl.loc.gov/loc.pnp/cph.3f05191', 'http://www.loc.gov/resource/cph.3f05191/', 'http://www.loc.gov/resource/cph.3b48718/', 'http://lccn.loc.gov/98507272']
call_number : POS - WPA - ILL .G36, no. 2 (B size) [P&P]
campaigns : []
contributor_names : ["Federal Writers' Project, sponsor"]
contributors : [{"federal writers' project": 'https://www.loc.gov/search/?fa=contributor:federal+writers%27+project&fo=json'}]
control_number : 
created : 2016-04-20 08:08:56
created_published : ['Chi[cago], Ill., : WPA Federal Art Project, [between 1936 and 1940]']
created_published_date : [between 1936 and 1940]
date : 1936-01-01
description : ['1 print on board (poster) : silkscreen

Look around in the dictionary a bit more:

In [22]:
item_data.keys()

dict_keys(['_version_', 'access_restricted', 'aka', 'call_number', 'campaigns', 'contributor_names', 'contributors', 'control_number', 'created', 'created_published', 'created_published_date', 'date', 'description', 'digital_id', 'digitized', 'display_offsite', 'extract_timestamp', 'extract_urls', 'format', 'format_headings', 'genre', 'group', 'hassegments', 'id', 'image_url', 'index', 'item', 'language', 'languages', 'latlong', 'library_of_congress_control_number', 'link', 'location', 'location_country', 'location_state', 'location_str', 'locations', 'locations_country', 'locations_state', 'locations_str', 'lonlat', 'marc', 'medium', 'medium_brief', 'mime_type', 'modified', 'notes', 'number', 'number_former_id', 'number_lccn', 'number_source_modified', 'online_format', 'original_format', 'other_formats', 'other_title', 'partof', 'place', 'related', 'repository', 'reproduction_number', 'reproductions', 'resource_links', 'resources', 'rights', 'rights_advisory', 'rights_information', 's

In [23]:
    # can you get the date?
    print('\ndate:',item_data['date'], type(item_data['date']))
    # can you get the format?
    print('\nformat:',item_data['format'][0], type(item_data['format']))


date: 1936-01-01 <class 'str'>

format: {'photo, print, drawing': 'https://www.loc.gov/search/?fa=original_format:photo,+print,+drawing&fo=json'} <class 'list'>


In [24]:
# set up the containers to create the csv of all the item fields
# file for csv to read out
travel_info_csv = 'travel_items_data.csv'

# set up a list for the columns in your csv; 
# your goal should be to automate this, but . . . 
# it works for demonstration as you set up the crosswalk
headers = ['source_file', 'item_id', 'title', 'date', 'source_url', 'phys_format', 'dig_format', 'rights']

# try first with one file
with open(list_of_item_metadata_files[0], 'r', encoding='utf-8') as data:
    # load the item data
    item_data = json.load(data)
    
    # extract the data you want
    # for checking purposes, add in the source of the info
    source_file = str(file)
    # make sure there's some unique and stable identifier
    try:
        item_id = item_data['library_of_congress_control_number']
    except:
        item_id = item_data['url'].split('/')[-2]
    title = item_data['title']
    date = item_data['date']
    source_url = item_data['url']
    try:
        phys_format = item_data['format'][0]
    except:
        phys_format = 'Not found'
    try:
        dig_format = item_data['online_format'][0]
    except:
        dig_format = 'Not found'
    mime_type = item_data['mime_type']
    try:
        rights = item_data['rights_information']
    except:
        rights = 'Undetermined'


    # dictionary for the rows
    row_dict = dict()
    
    # look for the item metadata, assign it to the dictionary; 
    # start with some basic elements likely (already enumerated in the headers list) :
    # source file
    row_dict['source_file'] = source_file
    # identifier
    row_dict['item_id'] = item_id
    # title
    row_dict['title'] = title
    # date
    row_dict['date'] = date
    # link
    row_dict['source_url'] = source_url
    # format
    row_dict['phys_format'] = phys_format
    # digital format
    row_dict['dig_format'] = dig_format
    #rights
    row_dict['rights'] = rights 
    print('created row dictionary:',row_dict)

    # write to the csv
    with open(travel_info_csv, 'w', encoding='utf-8') as fout:
        writer = csv.DictWriter(fout, fieldnames=headers)
        writer.writeheader()
        writer.writerow(row_dict)
        print('wrote',travel_info_csv)

created row dictionary: {'source_file': 'data/ftu_travel_metadata/item_metadata-ppmsca.43496.json', 'item_id': '98507272', 'title': 'Illinois: A descriptive and historical guide / galic.', 'date': '1936-01-01', 'source_url': 'https://www.loc.gov/item/98507272/', 'phys_format': {'photo, print, drawing': 'https://www.loc.gov/search/?fa=original_format:photo,+print,+drawing&fo=json'}, 'dig_format': 'image', 'rights': 'No known restrictions on publication.'}
wrote travel_items_data.csv


You're now developing the structure of the CSV file that will import items into your Omeka S site. The CSV import module supports the loading of item files via a URL. This provides the location of a file (in this case, an image), which Omeka will copy into its database and attach to your item. This means that it isn't necessary to upload individual files after or during metadata creation. 

To allow this, you need to find a direct url to a good image file for the item. There are multiple options, and the code below demonstrates looking for the url to a medium-sized image of an item:

In [25]:
travel_info_csv = 'travel_items_data.csv'

# set up a list for the columns in your csv; in future, this should be more automated but this works for now as you set up the crosswalk
headers = ['source_file', 'item_id', 'title', 'date', 'source_url', 'phys_format', 'dig_format', 'rights']

# try first with one file
with open(list_of_item_metadata_files[0], 'r', encoding='utf-8') as data:
    # load the item data
    item_data = json.load(data)
    
    print(item_data['image_url'][3])

https://tile.loc.gov/storage-services/service/pnp/cph/3f00000/3f05000/3f05100/3f05191v.jpg#h=1024&w=711


# Transformation Part 2: Write your CSV

The goal of this final step is to create a CSV file, which will be possible to import into your Omeka site. It may seem like it's taken a long time to get to this point... but remember, when this works you will be importing around 60 items into the site at one time, so if you can get all of this to work for an even larger set of materials, you will be saving quite a lot of time in the future when you need to import items. Even if you were to collect the items piecemeal, which would need a different workflow than illustrated here, you can accomplish similar goals by recording metadata for each item consistently and in a spreadsheet, which you can then use to import the items in batch.

So now that your transformation script is tested, the goal is to extend this to the whole set by looping through each of the desired JSON files:

In [26]:
# for purposes of demonstration, use this block to make sure there isn't already a list file:

items_data_file = os.path.join(data_directory, 'travel_items_data.csv')

if os.path.isfile(items_data_file):
    os.unlink(items_data_file)
    print('removed',items_data_file)

# clear row_dict
row_dict = ()

In [27]:
from datetime import date

date_string_for_today = date.today().strftime('%Y-%m-%d') # see https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior

print(date_string_for_today)

2023-12-11


In [30]:
# set up the containers to create the csv & counters 
# file for csv to read out
travel_info_csv = os.path.join('data','travel_items_data.csv')
file_count = 0
items_written = 0
error_count = 0

# add in a couple of extras for Omeka, including item type and date uploaded

# set up a list for the columns in your csv; in future, this should be more automated but this works for now as you set up the crosswalk
headers = ['item_type', 'date_uploaded', 'source_file', 'item_id','phys_format', 'dig_format', 'rights', 'image_url', 'title', 'date_created', 'contributor_names']

# now, adapt the previous loop to open each file:
for file in list_of_item_metadata_files:
    file_count += 1
    print('opening',file)
    with open(file, 'r', encoding='utf-8') as item:
        # load the item data
        try:
            item_data = json.load(item)
        except:
            print('error loading',file)
            error_count += 1
            continue

        # extract/name the data you want
        # item type
        item_type = 'Item'
        # date uplaoded
        date_uploaded = date_string_for_today
        # for checking purposes, add in the source of the info
        source_file = str(file)
        # make sure there's some unique and stable identifier
        try:
            item_id = item_data['library_of_congress_control_number']
        except:
            item_id = 'Not found'
        try:
            phys_format = item_data['format'][0]
        except:
            phys_format = 'Not found'
        try:
            dig_format = item_data['online_format'][0]
        except:
            dig_format = 'Not found'
        mime_type = item_data['mime_type']
        try:
            rights = item_data['rights_information']
        except:
            rights = 'Undetermined'
        try:
            image_url = item_data['image_url'][3]
        except:
            image_url = item_data['image_url'][2]
        try: 
            title = item_data['title']
        except:
            title = 'Not found'
        try:
            date_created = item_data['date']
        except:
            date_created = 'Not found'
        try:
            contributor_names = item_data['contributor_names']
        except:
            contributor_names = 'Not found'

        # dictionary for the rows
        row_dict = dict()

        # look for the item metadata, assign it to the dictionary; 
        # start with some basic elements likely (already enumerated in the headers list) :
        # item type
        row_dict['item_type'] = item_type
        # date uploaded
        row_dict['date_uploaded'] = date_uploaded
        # source filename
        row_dict['source_file'] = source_file
        # identifier
        row_dict['item_id'] = item_id
        row_dict['phys_format'] = phys_format
        # digital format
        row_dict['dig_format'] = dig_format.capitalize()
        #rights
        row_dict['rights'] = rights
        #image
        row_dict['image_url'] = image_url
        row_dict['title'] = title
        row_dict['date_created'] = date_created
        row_dict['contributor_names'] = contributor_names

        # write to the csv
        with open(travel_info_csv, 'a', encoding='utf-8') as fout:
            writer = csv.DictWriter(fout, fieldnames=headers)
            if items_written == 0:
                writer.writeheader()
            writer.writerow(row_dict)
            items_written += 1
            print('adding',item_id)

print('\n\n--- LOG ---')
print('wrote',travel_info_csv)
print('with',items_written,'items')
print(error_count,'errors (info not written)')

opening data/ftu_travel_metadata/item_metadata-cph.3b48718.json
adding 98507272
opening data/ftu_travel_metadata/item_metadata-cph.3b48731.json
adding 93505613
opening data/ftu_travel_metadata/item_metadata-cph.3b48733.json
adding 98518589
opening data/ftu_travel_metadata/item_metadata-cph.3b48750.json
adding 98518516
opening data/ftu_travel_metadata/item_metadata-cph.3b48860.json
adding 96525136
opening data/ftu_travel_metadata/item_metadata-cph.3b48864.json
adding 98516742
opening data/ftu_travel_metadata/item_metadata-cph.3b49082.json
adding 98518752
opening data/ftu_travel_metadata/item_metadata-cph.3b51486.json
adding 98518753
opening data/ftu_travel_metadata/item_metadata-cph.3f05643.json
adding 98518608
opening data/ftu_travel_metadata/item_metadata-cph.3g02947.json
adding 94504463
opening data/ftu_travel_metadata/item_metadata-cph.3g04243.json
adding 96503125
opening data/ftu_travel_metadata/item_metadata-cph.3g05158.json
adding 97502889
opening data/ftu_travel_metadata/item_me