In [16]:
import pandas as pd
import requests
from io import BytesIO
import re  # Import regular expressions module

# Google Sheets export URL for the catalogue spreadsheet, requested in xlsx format
url = 'https://docs.google.com/spreadsheets/d/1KQZOmXHrjXuQCttNGuAxFUr72SlXcBCYbs8c2Cy3fDE/export?format=xlsx&gid=175388937'

# Inclusive bounds of the Letter_ID values whose image file names are collected
MIN_LETTER_ID = 0
MAX_LETTER_ID = 500

# Seconds to wait on the server before giving up; without a timeout a stalled
# connection would block the notebook kernel indefinitely
REQUEST_TIMEOUT = 60


def is_within_range(letter_id):
    """
    Tell whether a Letter_ID cell holds an integer inside the configured range.

    Parameters:
        letter_id: Raw cell value from the 'Letter_ID' column (any type).

    Returns:
        True if int(letter_id) succeeds and lies in
        [MIN_LETTER_ID, MAX_LETTER_ID]; False for anything that cannot be
        converted (None, NaN, pd.NA, infinities, non-numeric text).
    """
    try:
        numeric_id = int(letter_id)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / pd.NA; ValueError: NaN or non-numeric text;
        # OverflowError: infinite floats. None of these is a usable ID.
        return False
    return MIN_LETTER_ID <= numeric_id <= MAX_LETTER_ID


# Fetch the file
response = requests.get(url, timeout=REQUEST_TIMEOUT)

# Flat list of every cleaned file name found in the selected rows;
# stays empty when the download fails
all_file_names = []

# Check if the request was successful
if response.status_code == 200:
    # Load the content into a pandas DataFrame
    df = pd.read_excel(BytesIO(response.content))

    # Keep only the rows whose Letter_ID falls inside the configured range
    filtered_df = df[df['Letter_ID'].apply(is_within_range)]

    # Each cell may list several file names separated by commas
    for cell_value in filtered_df['I Tatti file name(s)']:
        if pd.isna(cell_value):
            continue

        # str() guards against cells that pandas parsed as a number rather than text.
        # Newlines are dropped before splitting on commas (with any surrounding spaces).
        file_names = re.split(r'\s*,\s*', str(cell_value).replace('\n', ''))

        # Collapse internal whitespace runs and discard empty fragments
        cleaned_file_names = [re.sub(r'\s+', ' ', name).strip() for name in file_names if name.strip()]

        all_file_names.extend(cleaned_file_names)

    print(all_file_names)

else:
    print(f"Failed to fetch the file: {response.status_code}")

# Count the number of file names
n = len(all_file_names)
print(n)


['32044150446383_001', '32044150446383_002', '32044150446383_003', '32044150446383_004', '32044150446383_005', '32044150446383_006', '32044150446383_007', '32044150446383_008', '32044150446383_009', '32044150446383_010', '32044150446383_011', '32044150446383_012', '32044150446383_013', '32044150446383_014', '32044150446383_015', '32044150446383_016', '32044150446383_017', '32044150446383_018', '32044150446383_019', '32044150446383_020', '32044150446383_021', '32044150446383_022', '32044150446383_023', '32044150446383_024', '32044150446383_025', '32044150446383_026', '32044150446383_027', '32044150446383_028', '32044150446375_001', '32044150446375_002', '32044150446375_003', '32044150446375_004', '32044150446375_005', '32044150446375_006', '32044150446375_007', '32044150446375_008', '32044150446375_009', '32044150446375_010', '32044150446375_011', '32044150446375_046', '32044150446375_048', '32044150446375_012', '32044150446375_013', '32044150446375_014', '32044150446375_015', '32044150

In [17]:
import requests
import os

def _write_file_atomically(file_path, data):
    """Write data to file_path via a temporary sibling file so a partial file never takes the final name."""
    tmp_path = f"{file_path}.part"
    try:
        with open(tmp_path, "wb") as f:
            f.write(data)
        # os.replace swaps the finished file into place in a single step
        os.replace(tmp_path, file_path)
    except BaseException:
        # Also covers KeyboardInterrupt (kernel interrupt): drop the incomplete file
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise


def download_images(base_url, image_numbers, output_dir="BGdataset/images", timeout=30):
    """
    Downloads images from the specified IIIF base URL and saves them in the given directory.
    Stores IDs of images whose metadata was not found in a list.

    For each identifier the function first requests `<base_url><id>.jpg/info.json`;
    only when that answers with HTTP 200 does it request the full-size image at
    `<base_url><id>.jpg/full/full/0/default.jpg`. Identifiers whose target file
    already exists in `output_dir` are skipped without any network request.

    Parameters:
        base_url (str): The base URL for the IIIF image service.
        image_numbers (list of str): The list of image identifiers or names to download.
        output_dir (str): The directory where images will be saved (created if missing).
        timeout (float): Seconds to wait for each HTTP request before it is
            abandoned and reported as an error for that image.

    Returns:
        List of image IDs where metadata was not found (non-200 response to the
        info.json request). Network errors and failed image downloads are
        printed but not included in this list.
    """
    # List to store image numbers where metadata was not found
    missing_metadata = []

    # Create the output directory; exist_ok avoids the check-then-create race
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for image_number in image_numbers:
        # Define the path where the image will be saved
        file_path = os.path.join(output_dir, f"{image_number}.jpg")

        # Skip downloading if the file already exists
        if os.path.exists(file_path):
            print(f"Image {image_number}.jpg already exists. Skipping download.")
            continue

        # Define the image metadata URL
        metadata_url = f"{base_url}{image_number}.jpg/info.json"

        try:
            # Fetch the metadata; a 200 response is taken to mean the image exists
            response = requests.get(metadata_url, timeout=timeout)
            if response.status_code != 200:
                print(f"Metadata for image {image_number} not found. Skipping.")
                missing_metadata.append(image_number)  # Add ID to the list
                continue

            # Define the full-size image URL for downloading
            image_url = f"{base_url}{image_number}.jpg/full/full/0/default.jpg"
            print(f"Fetching image from: {image_url}")

            img_response = requests.get(image_url, timeout=timeout)
            if img_response.status_code != 200:
                print(f"Failed to download image: {image_number}.jpg")
                continue

            image_bytes = img_response.content

        except requests.exceptions.RequestException as e:
            # Handle potential network issues (including timeouts) and move on
            print(f"Error fetching image {image_number}: {e}")
            continue

        # Written atomically because the skip-if-exists check above would
        # otherwise treat a truncated file from an interrupted run as complete
        _write_file_atomically(file_path, image_bytes)
        print(f"Downloaded image: {image_number}.jpg")

    # Return the list of image numbers with missing metadata
    return missing_metadata

# Run the download for every file name gathered from the spreadsheet
base_url = "https://iiif.itatti.harvard.edu/iiif/2/bellegreene-full!"
image_numbers = all_file_names  # identifiers collected by the spreadsheet cell
missing_metadata_list = download_images(
    base_url=base_url,
    image_numbers=image_numbers,
)

# Report the identifiers for which no metadata was found
print(f"Images with missing metadata: {missing_metadata_list}")



Image 32044150446383_001.jpg already exists. Skipping download.
Image 32044150446383_002.jpg already exists. Skipping download.
Image 32044150446383_003.jpg already exists. Skipping download.
Image 32044150446383_004.jpg already exists. Skipping download.
Image 32044150446383_005.jpg already exists. Skipping download.
Image 32044150446383_006.jpg already exists. Skipping download.
Image 32044150446383_007.jpg already exists. Skipping download.
Image 32044150446383_008.jpg already exists. Skipping download.
Image 32044150446383_009.jpg already exists. Skipping download.
Image 32044150446383_010.jpg already exists. Skipping download.
Image 32044150446383_011.jpg already exists. Skipping download.
Image 32044150446383_012.jpg already exists. Skipping download.
Image 32044150446383_013.jpg already exists. Skipping download.
Image 32044150446383_014.jpg already exists. Skipping download.
Image 32044150446383_015.jpg already exists. Skipping download.
Image 32044150446383_016.jpg already exi

In [19]:
# Print how many identifiers had no metadata, then display the identifiers
# themselves as the cell's result
missing_count = len(missing_metadata_list)
print(missing_count)
missing_metadata_list

32


['32044150448397_073',
 '32044150448454_053',
 '32044150448561_0018',
 '32044150448652_0028',
 '32044150448860_007',
 '32044150448860_008',
 '32044150448860_009',
 '32044150448860_010',
 '32044150448860_011',
 '32044150448860_012',
 '32044150448860_013',
 '32044150448860_014',
 '32044150448860_015',
 '32044150448860_016',
 '32044150448860_017',
 '32044150448860_018',
 '32044150448860_019',
 '32044150448860_020',
 '32044150448860_021',
 '32044150448860_022',
 '32044150448860_023',
 '32044150448860_024',
 '32044150448860_025',
 '32044150448860_026',
 '32044150448860_027',
 '32044150448860_028',
 '32044150448860_029',
 '32044150448860_030',
 '32044150448860_031',
 '3204450449064_001',
 '3204450449064_002',
 '32044150449130_0010']