# Download IIIF Manifests

This notebook is for downloading and working with IIIF manifests from the Wellcome Collection.

In [None]:
import os
import json
from pprint import pprint
import sys

from wc_simd.iiif_manifests import download_iiif_manifests

## Download IIIF Manifests

Run the function to download IIIF manifests if needed.
The default directory is the current directory, but we'll specify to save in '../data/iiif_manifests'.

In [None]:
# Uncomment and run this cell to download manifests
# download_iiif_manifests(
#     subset_fraction=0.01,  # Use a small subset for testing
#     download_dir='../data/iiif_manifests'
# )

## Find Empty JSON Files

The download process creates empty JSON files (`{}`) when a download fails after multiple attempts.
Let's identify these files to check which manifests had download issues.

In [4]:
def find_empty_json_files(directory='../data/iiif_manifests'):
    """Find all JSON files in the directory that are empty or very small in size."""
    empty_files = []
    total_files = 0

    # Ensure directory exists
    if not os.path.exists(directory):
        print(f"Directory {directory} does not exist.")
        return empty_files, total_files

    # Get all JSON files
    json_files = [f for f in os.listdir(directory) if f.endswith('.json')]
    total_files = len(json_files)

    # Check file sizes
    for filename in json_files:
        filepath = os.path.join(directory, filename)
        try:
            if os.path.getsize(filepath) <= 2:  # File size <= 2 bytes (e.g., "{}")
                empty_files.append(filename)
        except Exception as e:
            print(f"Error checking size of {filename}: {e}")

    return empty_files, total_files

# Run the function
empty_files, total_files = find_empty_json_files()

# Print results
print(f"Found {len(empty_files)} empty files out of {total_files} total JSON files ({(len(empty_files)/total_files)*100:.2f}% failure rate)")
if empty_files:
    print("First 10 empty files:")
    for file in empty_files[:10]:
        print(f"- {file}")
    
    if len(empty_files) > 10:
        print(f"... and {len(empty_files) - 10} more")

Found 1609 empty files out of 339927 total JSON files (0.47% failure rate)
First 10 empty files:
- ffhjnz93_b11144890.json
- k3ebsey3_b1011144x.json
- u4hzwpnq_b1040031x.json
- gnc2sg5t_b13146993.json
- x8sd5g3u_b11486375.json
- eszejfpv_b12974390.json
- s26cpfe5_b11144130.json
- nw6gp87v_b12177362.json
- knpccfgg_b11512817.json
- yfe5hvm8_b13147201.json
... and 1599 more


## Retry Downloading Empty Files

If you want to retry downloading the empty files, you can use the function below.

In [None]:
import requests
from tqdm import tqdm
import csv

def retry_download_empty_files(empty_files, directory='../data/iiif_manifests'):
    """Retry downloading the empty JSON files"""
    success_count = 0
    fail_count = 0
    not_found_records = []

    for filename in tqdm(empty_files, desc="Retrying downloads"):
        # If filename includes error message (small content), clean it up
        if " (small content:" in filename:
            filename = filename.split(" (small content:")[0]

        filepath = os.path.join(directory, filename)

        # Parse the work ID and manifest ID from the filename
        # Format is typically: a222zvge_b18659135.json
        parts = filename.rsplit('_', 1)
        if len(parts) != 2 or not parts[1].endswith('.json'):
            print(f"Skipping {filename}: unable to parse URL")
            fail_count += 1
            continue

        work_id = parts[0]
        manifest_id = parts[1].rsplit('.', 1)[0]

        # Recreate the URL
        url = f"https://iiif.wellcomecollection.org/presentation/{manifest_id}"

        # Try downloading
        success = False
        for attempt in range(3):
            try:
                resp = requests.get(url, timeout=15)
                if resp.status_code == 404:
                    print(f"404 Not Found: {url}")
                    not_found_records.append({"filename": filename, "url": url})
                    break  # Exit retry loop on 404
                resp.raise_for_status()
                with open(filepath, "w") as f:
                    f.write(resp.text)
                success = True
                break
            except Exception as e:
                if attempt < 2:  # Don't sleep after the last attempt
                    import time
                    time.sleep(1)  # Wait a bit before retrying

        if success:
            success_count += 1
        else:
            fail_count += 1

    # Write 404 records to a CSV file
    csv_path = os.path.join(directory, b"404_not_found.csv")
    with open(csv_path, mode="w", newline="") as csvfile:
        fieldnames = ["filename", "url"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(not_found_records)

    print(f"Retry results: {success_count} successful, {fail_count} failed")
    print(f"404 records saved to {csv_path}")
    return success_count, fail_count

# Uncomment to retry downloading empty files
success_count, fail_count = retry_download_empty_files(empty_files)