# Get & Save JSON Data from Met Museum's API

In [47]:
# Import libraries

import requests
import json
import time
import math
import os
import re
import pandas as pd

In [48]:
# Return last file path in folder of JSON files 
# with previously saved API data

def get_last_file(folder_path):
    """Return the path of the highest-numbered ``met_objects_<N>.json`` file.

    Args:
        folder_path: Folder holding previously saved API data files.

    Returns:
        The path (``folder_path`` joined with the file name) of the file
        whose numeric suffix is largest, or ``None`` when the folder is
        missing, unreadable, empty, or holds no matching file.
    """
    try:
        files = os.listdir(folder_path)
    except (FileNotFoundError, NotADirectoryError, PermissionError):
        # Callers treat None as "nothing saved yet", so a missing or
        # unreadable folder is reported the same way as an empty one.
        return None

    # fullmatch() rejects look-alikes such as 'met_objects_3.json.bak'
    file_pattern = re.compile(r'met_objects_(\d+)\.json')
    matches = [m for m in map(file_pattern.fullmatch, files) if m]

    if not matches:
        return None

    # Compare by numeric value (so 10 sorts after 9) but keep the real file
    # name, so zero-padded names still resolve to a file that exists.
    last_match = max(matches, key=lambda m: int(m.group(1)))

    return os.path.join(folder_path, last_match.group(0))


In [49]:
# Get objectID of the last object stored in a saved JSON file
# (i.e. the final element of the file's top-level list)

def get_last_object(file_path):
    """Return the ``objectID`` of the last object stored in a saved JSON file.

    Args:
        file_path: Path to a JSON file expected to hold a list of objects.

    Returns:
        The ``objectID`` value of the final list element, or ``None`` when
        the file is not valid JSON, is not a non-empty list, or its last
        element is not a dict with an ``objectID`` key.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(file_path, "r") as file:
        try:
            data = json.load(file)
        except json.JSONDecodeError:
            # e.g. a file left half-written by an interrupted download
            return None

    # Only a non-empty list has a meaningful "last" element; indexing a
    # dict or an empty list with -1 would raise instead of returning None.
    if not isinstance(data, list) or not data:
        return None

    last_object = data[-1]

    if not isinstance(last_object, dict):
        return None

    # .get() yields None when the key is absent
    return last_object.get("objectID")


In [50]:
# Return list of object IDs

def get_ids(url_base):
    """Return the list of object IDs published at the API's objects endpoint.

    Args:
        url_base: URL whose JSON response carries an ``objectIDs`` entry.

    Returns:
        The value of ``objectIDs`` from the response, or an empty list when
        that value is null/empty.

    Raises:
        requests.exceptions.RequestException: On connection failure,
            timeout, or a non-2xx HTTP status.
        KeyError: If the response has no ``objectIDs`` key.
    """
    # Without a timeout, requests can block forever on a stalled connection
    response = requests.get(url_base, timeout=30)

    # Fail loudly on an HTTP error rather than trying to parse an error page
    response.raise_for_status()

    # Get dictionary from JSON response
    id_dict = response.json()

    # Callers take len() of and slice the result, so never hand back None
    return id_dict['objectIDs'] or []

In [51]:
# Return JSON data for single URL

def get_json(url, sleep=0.02, retries=3):
    """Fetch one URL and return its decoded JSON body.

    Args:
        url: URL to request.
        sleep: Base delay in seconds before each attempt; attempt ``i``
            (1-based) waits ``sleep * i``.
        retries: Maximum number of attempts.

    Returns:
        The decoded JSON data, or ``None`` if the resource was not found or
        every attempt failed.
    """
    json_data = None

    for attempt in range(1, retries + 1):
        # Sleep to avoid overloading server (Met rules specify <80 API calls/second)
        time.sleep(sleep * attempt)

        try:
            # Without a timeout a stalled connection would hang the whole run
            response = requests.get(url, timeout=30)

            # A missing object will not appear on retry; give up at once so
            # the error body is never mistaken for object data.
            if response.status_code == 404:
                return None

            # Treat other non-2xx answers as failures worth retrying
            response.raise_for_status()
            json_data = response.json()

        # ValueError covers JSON decoding failures on requests versions
        # whose JSONDecodeError is not a RequestException.
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"Error on {url}:\n\n{e}")
            continue
        else:
            break

    return json_data

In [52]:
# Call API, retrieve JSON data for each URL, save data to a JSON file

def call_api(url_base, obj_ids, save_path):
    """Download each object and save them all as one JSON list.

    Each object is fetched from ``{url_base}/{obj_id}`` via ``get_json``;
    IDs for which nothing is returned are skipped. The output has the form
    ``[obj, obj, ...]`` and is written incrementally.

    Args:
        url_base: Base URL of the objects endpoint.
        obj_ids: Iterable of object IDs to fetch.
        save_path: File to write. An existing file is overwritten, because
            appending a second list to it would make it invalid JSON.

    Returns:
        None. No file is created when no object could be fetched.
    """
    out_file = None

    try:
        for obj_id in obj_ids:
            # Concatenate base url and object ID
            url = f'{url_base}/{obj_id}'

            json_data = get_json(url)

            # Skip if JSON not found
            if not json_data:
                continue

            # Delimiters depend on what has actually been written, not on
            # the ID's position, so skipped IDs cannot corrupt the list.
            if out_file is None:
                out_file = open(save_path, "w")
                out_file.write("[")
            else:
                out_file.write(", ")

            out_file.write(json.dumps(json_data))

            # Push each object to disk so progress survives a crash
            out_file.flush()

            # Confirm object's data was written
            print(obj_id)
    finally:
        # Always close the list, even when interrupted, so the file parses
        if out_file is not None:
            out_file.write("]")
            out_file.close()

In [53]:
# Save all API objects to files in 'data' folder
def create_new_collection(api_url, chunk_size=100000, save_dir="../data/met/api"):
    """Download every object listed by the API into numbered JSON files.

    The ID list from ``get_ids`` is split into consecutive chunks and each
    chunk is saved by ``call_api`` to ``met_objects_<n>.json`` (n from 0).

    Args:
        api_url: Base URL of the objects endpoint.
        chunk_size: Maximum number of object IDs per output file.
        save_dir: Folder that receives the files; created if missing.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    met_ids = get_ids(api_url)

    # Opening a file inside a missing folder would fail on the first write
    os.makedirs(save_dir, exist_ok=True)

    # Step through the ID list one chunk at a time
    for file_num, start in enumerate(range(0, len(met_ids), chunk_size)):
        met_ids_slice = met_ids[start:start + chunk_size]

        # Assign JSON file where data will be saved
        met_save_path = f"{save_dir}/met_objects_{file_num}.json"

        # Get and save object data from API
        call_api(api_url, met_ids_slice, met_save_path)

In [54]:
# Save latest API objects to new file in 'data' folder
def _offer_fresh_start(api_url, prompt):
    """Ask the user whether to download the whole collection instead."""
    start_fresh = input(prompt)

    if start_fresh.lower() == 'y':
        create_new_collection(api_url)
    else:
        print("Update canceled")


def update_collection(api_url):
    """Save objects added to the API since the last saved file.

    Finds the last object ID in the newest ``met_objects_<N>.json`` file,
    locates it in the API's current ID list, and saves every object listed
    after it to ``met_objects_<N+1>.json``. When the saved data cannot be
    used, the user is asked whether to download the full collection.

    Args:
        api_url: Base URL of the objects endpoint.
    """
    # Get path for most recently created json file in data folder
    try:
        last_file_path = get_last_file('../data/met/api')
    except OSError:
        # A missing or unreadable folder means there is nothing to update
        last_file_path = None

    if not last_file_path:
        _offer_fresh_start(api_url, "No files found. Get all API objects? Y/N")
        return

    # Get objectID of the last object in that file; any failure to read or
    # interpret the file is handled the same way as "no usable ID".
    try:
        last_object_id = get_last_object(last_file_path)
    except (OSError, ValueError, LookupError, TypeError):
        last_object_id = None

    if not last_object_id:
        _offer_fresh_start(api_url, "File could not be parsed. Get all API objects? Y/N")
        return

    # Get list of objectID attributes from Met API
    met_ids = get_ids(api_url)

    # Locate the last saved objectID in the API's list; it may be absent if
    # the object is no longer listed.
    try:
        last_object_index = met_ids.index(last_object_id)
    except ValueError:
        _offer_fresh_start(
            api_url,
            "Last saved object is no longer listed by the API. Get all API objects? Y/N",
        )
        return

    # Get the slice of 'met_ids' whose objects are not yet on file
    met_ids_slice = met_ids[last_object_index + 1:]

    if not met_ids_slice:
        print("No new objects found")
        return

    # Generate a new file name: take the number just before the extension
    last_file_num = re.search(r'(\d+)(?=\.\w+$)', last_file_path)

    if last_file_num:
        # Increment the last file's number by 1
        new_file_num = int(last_file_num.group(1)) + 1
        met_save_path = f"../data/met/api/met_objects_{new_file_num}.json"
    else:
        # If no number is found, use 'latest' in place of incremented number
        met_save_path = "../data/met/api/met_objects_latest.json"

    # Call API to save new objects to file
    call_api(api_url, met_ids_slice, met_save_path)
    


In [56]:
# Main function

def main():
    """Ask whether to download the full collection or only new additions.

    'N' downloads everything via ``create_new_collection``; 'U' downloads
    only the latest additions via ``update_collection``. The answer is
    case-insensitive; any other answer downloads nothing.
    """
    # Base URL for Met /objects API endpoint
    met_url = "https://collectionapi.metmuseum.org/public/collection/v1/objects"

    choice = input("Enter 'N' for all data or 'U' for the latest additions: ")

    # Ignore surrounding whitespace and letter case in the answer
    choice = choice.strip().lower()

    # Get entire dataset from API
    if choice == 'n':
        create_new_collection(met_url)

    # Get just the latest additions to the API data
    elif choice == 'u':
        update_collection(met_url)

    # Tell the user why nothing happened instead of exiting silently
    else:
        print(f"Unrecognized choice {choice!r}; nothing was downloaded.")
        

In [57]:
# Run program
# Entry-point guard: main() is called only when this code is executed
# directly (__name__ is '__main__'), not when it is imported as a module.

if __name__ == '__main__':
    main()

904055
904081
904082
904083
904084
904085
904086
904087
904088
904089
904090
904091
904108
904150
904157
904161
904167
904171
904174
904321
904354
904393
904394
904395
904396
904397
904398
904399
904400
904401
904402
904403
904404
904405
904406
904407
904408
904409
904410
904411
904412
904413
904414
904415
904416
904417
904418
904419
904420
904421
904422
904423
904425
904440
904442
904443
904444
904447
904449
904457
904460
904465
904469
904486
904493
904494
904495
904497
904499
904501
904502
904503
904505
904507
904509
904512
904513
904515
904516
904517
904518
904519
904520
904521
904523
904529
904530
904533
904534
904535
904536
904537
904538
904539
904540
904543
904544
904546
904547
904548
904549
904550
904568
904602
904625
904628
904630
904632
904634
904636
896141
896143
896144
896145
896147
896149
896151
896152
896153
896155
896161
896162
896163
896164
896169
896178
896183
896184
896185
896187
896188
896189
896191
896193
896196
896198
896201
896202
896204
896208
896209
896212
896213

## TEST CASE

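Below is a dry-run test of the 'update' option. It repeats the steps used by `update_collection` and prints each intermediate result, to check that they retrieve the list of objectID values added to the API since the last update. It does not download any objects or write any files.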

In [46]:

# Dry run of the 'update' steps: print each intermediate value without
# downloading any objects or writing any files.

# Get & print last file name in data folder
last_file_path = get_last_file('../data/met/api')
print(last_file_path)

if last_file_path:

    # Get & print last objectID in last file
    last_object_id = get_last_object(last_file_path)
    print(last_object_id)

    # Get list of objectID attributes from Met API
    api_url = "https://collectionapi.metmuseum.org/public/collection/v1/objects"
    met_ids = get_ids(api_url)

    # From the list of 'met_ids':
    # Get & print the index of the 'last_object_id'
    last_object_index = met_ids.index(last_object_id)
    print(last_object_index)

    # Get & print the slice of the 'met_ids'
    # whose objects are not yet on file in the 'data' folder
    met_ids_slice = met_ids[last_object_index + 1:]
    print(met_ids_slice)

    # Generate a new file name for saving new data:
    last_file_num = re.search(r'(\d+)(?=\.\w+$)', last_file_path)

    if last_file_num:

        # Increment the last file's number by 1
        new_file_num = int(last_file_num.group(1)) + 1

        # Create & print new file name, using the same 'met_objects_' prefix
        # that update_collection writes and get_last_file looks for
        met_save_path = f"../data/met/api/met_objects_{new_file_num}.json"
        print(met_save_path)

../data/met/api/met_objects_3.json
904013
484905
[904055, 904081, 904082, 904083, 904084, 904085, 904086, 904087, 904088, 904089, 904090, 904091, 904108, 904150, 904157, 904161, 904167, 904171, 904174, 904321, 904354, 904393, 904394, 904395, 904396, 904397, 904398, 904399, 904400, 904401, 904402, 904403, 904404, 904405, 904406, 904407, 904408, 904409, 904410, 904411, 904412, 904413, 904414, 904415, 904416, 904417, 904418, 904419, 904420, 904421, 904422, 904423, 904425, 904440, 904442, 904443, 904444, 904447, 904449, 904457, 904460, 904465, 904469, 904486, 904493, 904494, 904495, 904497, 904499, 904501, 904502, 904503, 904505, 904507, 904509, 904512, 904513, 904515, 904516, 904517, 904518, 904519, 904520, 904521, 904523, 904529, 904530, 904533, 904534, 904535, 904536, 904537, 904538, 904539, 904540, 904543, 904544, 904546, 904547, 904548, 904549, 904550, 904568, 904602, 904625, 904628, 904630, 904632, 904634, 904636, 894709, 894710, 894711, 894712, 894713, 894714, 894715, 894716, 894717