In [9]:
import os

def rename_file(file_path, filename):
    """Rename a file by dropping everything up to its first underscore.

    The new name is built from the text between the first underscore and the
    first dot of ``filename``, with ``.txt`` appended, and the file stays in
    its current directory (e.g. ``0001_00003_1.txt`` -> ``00003_1.txt``).

    Parameters:
        file_path (str): Path of the file to rename.
        filename (str): Base name of that file (the last component of ``file_path``).

    Returns:
        None. Progress, skips and errors are reported with ``print``.
    """
    # Text before the first dot; named `stem` so the builtin `id` is not shadowed.
    stem = filename.split('.')[0]
    print(f"ID: {stem}")

    # Split at the first underscore only, so later underscores stay in the new name.
    parts = stem.split('_', 1)
    print(f"Parts: {parts}")

    # Nothing to strip when there is no underscore.
    if len(parts) < 2:
        print(f"Skipping '{filename}': does not contain an underscore.")
        return

    # Everything after the first underscore becomes the new base name.
    new_name = parts[1] + '.txt'
    new_file_path = os.path.join(os.path.dirname(file_path), new_name)

    print(f"New name: {new_file_path}")

    # On POSIX os.rename silently replaces an existing destination, so two
    # sources that collapse to the same name (0001_x.txt, 0002_x.txt) would
    # destroy one of them. Refuse instead of overwriting.
    if os.path.exists(new_file_path):
        print(f"Error renaming '{filename}': target '{new_name}' already exists.")
        return

    # Rename the file
    try:
        os.rename(file_path, new_file_path)
        print(f"Successfully renamed '{filename}' to '{new_name}'")
    except OSError as e:
        print(f"Error renaming '{filename}': {e}")

def rename_files_in_folder(folder_path):
    """Call ``rename_file`` on every regular file directly inside ``folder_path``.

    Subdirectories (and anything else that is not a regular file) are ignored.
    """
    for entry in os.listdir(folder_path):
        candidate = os.path.join(folder_path, entry)

        # Only regular files are renamed; skip subdirectories.
        if not os.path.isfile(candidate):
            continue

        rename_file(candidate, entry)

# Example usage: strip the leading "<prefix>_" from every file in the transcription folder.
# The path is relative to the current working directory; change it to your folder.
folder_path = "BGdataset/transkribus_transcription"
full_path = os.path.abspath(folder_path)
# Sanity-check the resolved location before any file is touched.
print(f"Full path: {full_path}")
print(f"Path exists: {os.path.exists(full_path)}")
# NOTE: files are renamed in place. Running this cell a second time would strip
# another "<prefix>_" from the already-renamed files (00003_1.txt -> 1.txt).
rename_files_in_folder(folder_path)


Full path: c:\Users\crosi\Documents\GitHub\InternshipITatti\BGdataset\transkribus_transcription
Path exists: True
ID: 0001_00003_1
Parts: ['0001', '00003_1']
New name: BGdataset/transkribus_transcription\00003_1.txt
Successfully renamed '0001_00003_1.txt' to '00003_1.txt'
ID: 0002_00003_3
Parts: ['0002', '00003_3']
New name: BGdataset/transkribus_transcription\00003_3.txt
Successfully renamed '0002_00003_3.txt' to '00003_3.txt'
ID: 0003_00004_1
Parts: ['0003', '00004_1']
New name: BGdataset/transkribus_transcription\00004_1.txt
Successfully renamed '0003_00004_1.txt' to '00004_1.txt'
ID: 0004_00010_7
Parts: ['0004', '00010_7']
New name: BGdataset/transkribus_transcription\00010_7.txt
Successfully renamed '0004_00010_7.txt' to '00010_7.txt'
ID: 0005_00012_13
Parts: ['0005', '00012_13']
New name: BGdataset/transkribus_transcription\00012_13.txt
Successfully renamed '0005_00012_13.txt' to '00012_13.txt'
ID: 0006_00019_15
Parts: ['0006', '00019_15']
New name: BGdataset/transkribus_transcri

In [13]:
import pandas as pd
import requests
from io import BytesIO
import re  # Import regular expressions module

# URL to download the Excel file
url = 'https://docs.google.com/spreadsheets/d/1KQZOmXHrjXuQCttNGuAxFUr72SlXcBCYbs8c2Cy3fDE/export?format=xlsx&gid=175388937'

# Fetch the file. requests waits indefinitely by default, so an explicit
# timeout (seconds) keeps the cell from hanging on an unresponsive server.
response = requests.get(url, timeout=30)

all_file_names = []

# Spreadsheet value -> value to use instead.
# NOTE(review): these look like typo corrections for the sheet; confirm with the data owner.
FILE_NAME_CORRECTIONS = {
    '32044150448397_073': '32044150448397_063',
    '32044150448561_0018': '32044150448561_018',
    '32044150448652_0028': '32044150448652_028',
    '3204450449064_001': '32044150449064_001',
    '3204450449064_002': '32044150449064_002',
    '32044150449130_0010': '32044150449130_010',
    '32044150448860_007': '32044150448850_007',
}

# Check if the request was successful
if response.status_code == 200:
    # Load the content into a pandas DataFrame
    df = pd.read_excel(BytesIO(response.content))
    # Letter_ID is compared as text; the single ID '00215a' is remapped to '00000'.
    df['Letter_ID'] = df['Letter_ID'].astype(str).replace('00215a', '00000')

    def is_within_range(letter_id):
        """Return True when ``letter_id`` parses as an integer between 0 and 500 inclusive."""
        try:
            return 0 <= int(letter_id) <= 500
        except ValueError:
            # Non-numeric IDs are excluded.
            return False

    # Keep only the rows whose Letter_ID falls in the range
    filtered_df = df[df['Letter_ID'].apply(is_within_range)]

    # Collect the individual file names from each non-empty cell
    for cell in filtered_df['I Tatti file name(s)']:
        if pd.notna(cell):  # Skip NaN cells
            # Drop newlines, then split on commas with any surrounding whitespace
            file_names = re.split(r'\s*,\s*', cell.replace('\n', ''))

            # Collapse inner whitespace runs and drop empty entries
            cleaned_file_names = [re.sub(r'\s+', ' ', name).strip() for name in file_names if name.strip()]

            all_file_names.extend(cleaned_file_names)

    # Apply the known corrections in place (same list object, so other references stay valid)
    all_file_names[:] = [FILE_NAME_CORRECTIONS.get(name, name) for name in all_file_names]

    # Print the cleaned and updated list of filenames
    print(all_file_names)

    # Count the number of file names
    n = len(all_file_names)
    print(n)

else:
    print(f"Failed to fetch the file: {response.status_code}")



['32044150446383_001', '32044150446383_002', '32044150446383_003', '32044150446383_004', '32044150446383_005', '32044150446383_006', '32044150446383_007', '32044150446383_008', '32044150446383_009', '32044150446383_010', '32044150446383_011', '32044150446383_012', '32044150446383_013', '32044150446383_014', '32044150446383_015', '32044150446383_016', '32044150446383_017', '32044150446383_018', '32044150446383_019', '32044150446383_020', '32044150446383_021', '32044150446383_022', '32044150446383_023', '32044150446383_024', '32044150446383_025', '32044150446383_026', '32044150446383_027', '32044150446383_028', '32044150446375_001', '32044150446375_002', '32044150446375_003', '32044150446375_004', '32044150446375_005', '32044150446375_006', '32044150446375_007', '32044150446375_008', '32044150446375_009', '32044150446375_010', '32044150446375_011', '32044150446375_046', '32044150446375_048', '32044150446375_012', '32044150446375_013', '32044150446375_014', '32044150446375_015', '32044150

In [17]:
import requests
import os

def download_images(base_url, image_numbers, output_dir="BGdataset/images", timeout=30):
    """
    Download images from an IIIF image service into ``output_dir``.

    For each identifier, ``<base_url><id>.jpg/info.json`` is requested first.
    Only when that returns HTTP 200 is ``<base_url><id>.jpg/full/full/0/default.jpg``
    fetched and written to ``<output_dir>/<id>.jpg``. Identifiers whose target
    file already exists are skipped without any request.

    Parameters:
        base_url (str): Prefix to which each identifier is appended.
        image_numbers (list of str): Image identifiers to download.
        output_dir (str): Directory where images are saved; created if missing.
        timeout (float or tuple): Timeout in seconds passed to every
            ``requests.get`` call. Without one, requests can block forever on
            an unresponsive server. A timeout surfaces as a
            ``RequestException``, is reported, and the loop continues.

    Returns:
        list: Identifiers whose ``info.json`` request returned a non-200 status.
        Identifiers that failed with a network error or a failed image
        download are only reported via ``print`` and are NOT in this list.
    """
    # Identifiers whose metadata request returned a non-200 status
    missing_metadata = []

    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(output_dir, exist_ok=True)

    for image_number in image_numbers:
        # Path where the image will be saved
        file_path = os.path.join(output_dir, f"{image_number}.jpg")

        # Skip downloading if the file already exists
        if os.path.exists(file_path):
            print(f"Image {image_number}.jpg already exists. Skipping download.")
            continue

        metadata_url = f"{base_url}{image_number}.jpg/info.json"

        try:
            # Probe the metadata endpoint to learn whether the image exists
            response = requests.get(metadata_url, timeout=timeout)

            if response.status_code != 200:
                print(f"Metadata for image {image_number} not found. Skipping.")
                missing_metadata.append(image_number)
                continue

            image_url = f"{base_url}{image_number}.jpg/full/full/0/default.jpg"
            print(f"Fetching image from: {image_url}")

            img_response = requests.get(image_url, timeout=timeout)
            if img_response.status_code == 200:
                with open(file_path, "wb") as f:
                    f.write(img_response.content)
                print(f"Downloaded image: {image_number}.jpg")
            else:
                print(f"Failed to download image: {image_number}.jpg")

        except requests.exceptions.RequestException as e:
            # Network problems (including timeouts) must not abort the whole batch
            print(f"Error fetching image {image_number}: {e}")
            continue

    return missing_metadata

# Example usage
# Each identifier is appended directly after the trailing "!" of this prefix.
base_url = "https://iiif.itatti.harvard.edu/iiif/2/bellegreene-full!"
# Identifier list built by the spreadsheet cell earlier in this notebook; that cell must run first.
image_numbers = all_file_names
missing_metadata_list = download_images(base_url, image_numbers)

# Print the image numbers where metadata was not found
print(f"Images with missing metadata: {missing_metadata_list}")



Error fetching image 32044150446383_001: HTTPSConnectionPool(host='iiif.itatti.harvard.edu', port=443): Max retries exceeded with url: /iiif/2/bellegreene-full!32044150446383_001.jpg/info.json (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002C0D289D1E0>, 'Connection to iiif.itatti.harvard.edu timed out. (connect timeout=None)'))


In [15]:
# Show how many identifiers had no metadata, then display the list itself.
# Requires the download cell to have run so that `missing_metadata_list` exists.
print(len(missing_metadata_list))
missing_metadata_list

24


['32044150448860_008',
 '32044150448860_009',
 '32044150448860_010',
 '32044150448860_011',
 '32044150448860_012',
 '32044150448860_013',
 '32044150448860_014',
 '32044150448860_015',
 '32044150448860_016',
 '32044150448860_017',
 '32044150448860_018',
 '32044150448860_019',
 '32044150448860_020',
 '32044150448860_021',
 '32044150448860_022',
 '32044150448860_023',
 '32044150448860_024',
 '32044150448860_025',
 '32044150448860_026',
 '32044150448860_027',
 '32044150448860_028',
 '32044150448860_029',
 '32044150448860_030',
 '32044150448860_031']

In [17]:
import os

def count_elements_in_folder(directory):
    """
    Return how many entries (files and subdirectories) ``directory`` contains.

    Parameters:
        directory (str): Path of the directory to inspect.

    Returns:
        int: Number of directory entries, or 0 (after printing a message)
        when the path does not exist.
    """
    if os.path.exists(directory):
        # Every name returned by listdir counts, whether file or folder.
        return len(os.listdir(directory))

    print(f"Directory {directory} does not exist.")
    return 0

# Example usage: report how many entries the image folder currently holds.
directory = "BGdataset/images"  # Replace with your folder path
element_count = count_elements_in_folder(directory)
print(f"Total number of elements in '{directory}': {element_count}")


Total number of elements in 'BGdataset/images': 3682


In [32]:
import os
import pandas as pd
import re  # Importing regex for string manipulation

def update_tatti_file_names(folder_path, df):
    """
    Correct known file-name values and attach the matching txt files to each letter.

    Two things happen, both IN PLACE on ``df`` (the same object is returned):

    1. Every string cell of 'I Tatti file name(s)' has a fixed set of
       substrings replaced (see ``substitutions`` below).
    2. A column 'Corresponding txt files' is (re)created. For each row it holds
       the ``<Letter_ID>_<number>.txt`` files found in ``folder_path`` whose
       part before the underscore equals the row's 'Letter_ID', ordered by the
       integer after the underscore and joined with ', '. Rows without a
       match get an empty string.

    Parameters:
        folder_path (str): Folder containing the txt files.
        df (pd.DataFrame): Frame with 'Letter_ID' and 'I Tatti file name(s)' columns.

    Returns:
        pd.DataFrame: ``df`` itself. The 'Corresponding txt files' column is
        always present, even when ``folder_path`` is missing, so callers can
        select it unconditionally. A missing folder is reported via ``print``
        and treated as containing no txt files.
    """
    # Substring -> replacement applied to the 'I Tatti file name(s)' cells
    substitutions = {
        '32044150448397_073': '32044150448397_063',
        '32044150448561_0018': '32044150448561_018',
        '32044150448652_0028': '32044150448652_028',
        '3204450449064_001': '32044150449064_001',
        '3204450449064_002': '32044150449064_002',
        '32044150449130_0010': '32044150449130_010',
        '32044150448860_007': '32044150448850_007'
    }

    def _apply_substitutions(cell):
        """Return ``cell`` with every substitution applied; non-strings (e.g. NaN) pass through."""
        if not isinstance(cell, str):
            return cell
        for old_name, new_name in substitutions.items():
            cell = cell.replace(old_name, new_name)
        return cell

    df['I Tatti file name(s)'] = df['I Tatti file name(s)'].map(_apply_substitutions)

    # A missing folder is not fatal: report it and continue with no files so the
    # returned frame still has the promised column.
    if os.path.isdir(folder_path):
        candidates = os.listdir(folder_path)
    else:
        print(f"Directory {folder_path} does not exist.")
        candidates = []

    # letter_id -> list of (file_name, number) for files named <letter_id>_<number>.txt
    letter_id_files = {}
    for file_name in candidates:
        if not file_name.endswith('.txt'):
            continue
        parts = file_name.split('_')
        if len(parts) != 2:  # Exactly one underscore is expected
            continue
        try:
            number = int(parts[1].replace('.txt', ''))
        except ValueError:
            print(f"Invalid number after underscore in file: {file_name}")
            continue
        letter_id_files.setdefault(parts[0], []).append((file_name, number))

    # Sort once per letter (numerically, so _10 comes after _9) and pre-join.
    joined_by_letter = {
        letter_id: ', '.join(name for name, _ in sorted(entries, key=lambda entry: entry[1]))
        for letter_id, entries in letter_id_files.items()
    }

    df['Corresponding txt files'] = df['Letter_ID'].map(
        lambda letter_id: joined_by_letter.get(letter_id, '')
    )

    return df

# Example usage
folder_path = 'BGdataset/txt'  # Replace with the actual folder path
# Read the sheet straight from its export URL (needs network access).
df = pd.read_excel('https://docs.google.com/spreadsheets/d/1KQZOmXHrjXuQCttNGuAxFUr72SlXcBCYbs8c2Cy3fDE/export?format=xlsx&gid=175388937')  # Load your DataFrame
df['Letter_ID'] = df['Letter_ID'].astype(str)  # Ensure 'Letter_ID' is a string

# Update the DataFrame by adding a new column for corresponding txt files and performing filename substitutions.
# The function modifies `df` in place, so `df` and `updated_df` initially refer to the same object.
updated_df = update_tatti_file_names(folder_path, df)

# Keep only the three columns of interest; from here on `updated_df` is a separate, narrower frame.
updated_df = updated_df[['Letter_ID', 'I Tatti file name(s)', 'Corresponding txt files']]
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
# `display` is provided by IPython/Jupyter; it is not imported in this cell.
display(updated_df)
updated_df.to_csv('updated_dataset.csv', index=False)
print("Updated DataFrame saved to 'updated_dataset.csv'.")


Unnamed: 0,Letter_ID,I Tatti file name(s),Corresponding txt files
0,00001,"32044150446383_001, 32044150446383_002","00001_1.txt, 00001_3.txt"
1,00002,32044150446383_003,00002_1.txt
2,00003,"32044150446383_004, 32044150446383_005, 320441...","00003_1.txt, 00003_3.txt, 00003_4.txt, 00003_6..."
3,00004,"32044150446383_010, 32044150446383_011, 320441...","00004_1.txt, 00004_3.txt, 00004_4.txt, 00004_5..."
4,00005,"32044150446383_014, 32044150446383_015","00005_1.txt, 00005_2.txt"
...,...,...,...
602,00602,"32044150449395_001, 32044150449395_002, 320441...",
603,00603,"32044150449395_005, 32044150449395_006",
604,00604,"32044150449403_001, 32044150449403_002",
605,,,


Updated DataFrame saved to 'updated_dataset.csv'.


In [40]:
import pandas as pd
import os
from pathlib import Path


# Function to safely split strings or return empty list for non-strings
def safe_split(value):
    """Split ``value`` on ', ' when it is a string; otherwise warn and return ``[]``."""
    if not isinstance(value, str):
        # NaN and other non-text cells end up here.
        print(f"Warning: Non-string value found: {value}")
        return []
    return value.split(', ')

# Keep only the first 100 letters
updated_df = updated_df.head(100)

# Map each I Tatti image identifier to its new file name (<txt stem>.jpg).
# Pairing is positional: the n-th image name of a row goes with the n-th txt file.
tatti_to_new = {}
for idx, row in updated_df.iterrows():
    # ''.split(', ') yields [''], so blank entries are dropped explicitly.
    # Otherwise a row without txt files would map an image to an empty name.
    tatti_files = [name for name in safe_split(row['I Tatti file name(s)']) if name]
    txt_files = [name for name in safe_split(row['Corresponding txt files']) if name]

    if not tatti_files or not txt_files:
        print(f"Warning: Empty data in row {idx}")
        continue

    # zip() would silently truncate to the shorter list, so positional pairing
    # is only meaningful when both sides have the same number of entries.
    if len(tatti_files) != len(txt_files):
        print(
            f"Warning: Skipping row {idx}: {len(tatti_files)} image name(s) "
            f"but {len(txt_files)} txt file(s)"
        )
        continue

    for tatti, txt in zip(tatti_files, txt_files):
        tatti_to_new[tatti] = txt.replace('.txt', '.jpg')

# Print the first few items in the mapping
print("\nFirst few items in the tatti_to_new mapping:")
for key, value in list(tatti_to_new.items())[:5]:
    print(f"{key}: {value}")

# Function to rename files
def rename_file(old_path, new_name):
    """Rename the file at ``old_path`` (a ``pathlib.Path``) to ``new_name`` within the same directory.

    Note: this definition replaces any earlier notebook function also called
    ``rename_file`` once the cell runs.
    """
    destination = old_path.parent.joinpath(new_name)
    old_path.rename(destination)
    print(f"Renamed {old_path.name} to {new_name}")

# Iterate over images in the folder and rename those that have an entry in `tatti_to_new`.
# Files are renamed in place. On a re-run, already-renamed files (e.g. 00001_1.jpg)
# have stems that are not keys of the mapping, so they fall into the
# "No matching new name" branch.
image_folder = Path('BGdataset/images')  # Replace with your folder path
for image_path in image_folder.glob('*.jpg'):
    tatti_name = image_path.stem  # Filename without extension
    if tatti_name in tatti_to_new:
        new_name = tatti_to_new[tatti_name]
        rename_file(image_path, new_name)
    else:
        print(f"No matching new name found for {image_path.name}")

print("Renaming complete!")


First few items in the tatti_to_new mapping:
32044150446383_001: 00001_1.jpg
32044150446383_002: 00001_3.jpg
32044150446383_003: 00002_1.jpg
32044150446383_004: 00003_1.jpg
32044150446383_005: 00003_3.jpg
No matching new name found for 00001_1.jpg
No matching new name found for 00001_3.jpg
No matching new name found for 00002_1.jpg
No matching new name found for 00003_1.jpg
No matching new name found for 00003_3.jpg
No matching new name found for 00003_4.jpg
No matching new name found for 00003_6.jpg
No matching new name found for 00003_7.jpg
No matching new name found for 00003_9.jpg
No matching new name found for 00004_1.jpg
No matching new name found for 00004_3.jpg
No matching new name found for 00004_4.jpg
No matching new name found for 00004_5.jpg
No matching new name found for 00005_1.jpg
No matching new name found for 00005_2.jpg
No matching new name found for 00006_1.jpg
No matching new name found for 00006_3.jpg
No matching new name found for 00007_1.jpg
No matching new name

In [1]:
import os
import shutil

# Define the source and destination directories
source_folder = "BGdataset/html"  # Replace with your source folder path
destination_folder = "BGdataset/filtered100html"  # Replace with your destination folder path

# Create the destination folder if it doesn't exist
os.makedirs(destination_folder, exist_ok=True)

# Letter-ID prefixes (the text before the first underscore of a file name)
# whose files must not be copied.
problematic_letters = [
    "00018", "00038", "00059", "00060", "00061", "00080", "00171", "00202",
    "00203", "00214", "00240", "00261", "00312", "00313", "00325", "00341",
    "00450", "00451", "00479"
]

# Function to copy files while respecting the given conditions
def copy_filtered_images(source_folder, destination_folder, problematic_letters, stop_prefix="00101"):
    """Copy the files of the leading letters from one folder to another, skipping excluded letters.

    Entries are visited in sorted name order. The prefix of an entry is the text
    before its first underscore. Prefixes are compared as strings, which is
    only meaningful for equally long, zero-padded IDs.

    Parameters:
        source_folder (str): Folder to copy from.
        destination_folder (str): Folder to copy into; created if missing.
        problematic_letters (iterable of str): Prefixes whose files are skipped.
        stop_prefix (str): Processing stops at the first entry whose prefix is
            >= this value. The default "00101" keeps letters 00001-00100.

    Returns:
        None. Each copied file is reported with ``print``.
    """
    excluded = set(problematic_letters)  # O(1) membership tests
    os.makedirs(destination_folder, exist_ok=True)

    for filename in sorted(os.listdir(source_folder)):
        # Extract the prefix before the underscore
        prefix = filename.split('_')[0]

        # Sorted order means nothing after the cutoff can qualify
        if prefix >= stop_prefix:
            break

        # Skip files with problematic prefixes
        if prefix in excluded:
            continue

        source_path = os.path.join(source_folder, filename)

        # shutil.copy cannot copy directories; ignore anything that is not a regular file
        if not os.path.isfile(source_path):
            continue

        destination_path = os.path.join(destination_folder, filename)
        shutil.copy(source_path, destination_path)
        print(f"Copied: {filename}")

# Run the copy. Despite the function's name, `source_folder` above points at the
# HTML folder, so the files copied here are .html files.
copy_filtered_images(source_folder, destination_folder, problematic_letters)


Copied: 00001_1.html
Copied: 00001_3.html
Copied: 00002_1.html
Copied: 00003_1.html
Copied: 00003_3.html
Copied: 00003_4.html
Copied: 00003_6.html
Copied: 00003_7.html
Copied: 00003_9.html
Copied: 00004_1.html
Copied: 00004_3.html
Copied: 00004_4.html
Copied: 00004_5.html
Copied: 00005_1.html
Copied: 00005_2.html
Copied: 00006_1.html
Copied: 00006_3.html
Copied: 00007_1.html
Copied: 00007_3.html
Copied: 00007_4.html
Copied: 00008_1.html
Copied: 00008_3.html
Copied: 00008_4.html
Copied: 00008_6.html
Copied: 00009_1.html
Copied: 00009_3.html
Copied: 00009_4.html
Copied: 00009_5.html
Copied: 00010_1.html
Copied: 00010_3.html
Copied: 00010_4.html
Copied: 00010_6.html
Copied: 00010_7.html
Copied: 00010_9.html
Copied: 00011_1.html
Copied: 00011_11.html
Copied: 00011_13.html
Copied: 00011_3.html
Copied: 00011_5.html
Copied: 00011_7.html
Copied: 00011_9.html
Copied: 00012_1.html
Copied: 00012_11.html
Copied: 00012_13.html
Copied: 00012_15.html
Copied: 00012_3.html
Copied: 00012_5.html
Copied: 

In [43]:
import pandas as pd

# Works on the in-memory `updated_df` built by the earlier cells (nothing is loaded here).


def _count_entries(cell):
    """Return the number of non-empty ', '-separated names in ``cell`` (NaN or blank -> 0)."""
    if pd.isna(cell):
        return 0
    # str() keeps numeric cells countable; blank fragments are not entries.
    return sum(1 for part in str(cell).split(', ') if part.strip())


# Rows whose number of image names differs from their number of txt files.
# Counting real entries matters: ''.split(', ') has length 1, so a row with one
# image and no txt file would otherwise look like a match.
mismatched_rows = [
    index
    for index, row in updated_df.iterrows()
    if _count_entries(row['I Tatti file name(s)']) != _count_entries(row['Corresponding txt files'])
]

# Create a DataFrame with the mismatched rows
if mismatched_rows:
    # Looked up in the full `df` so every spreadsheet column is shown.
    # This relies on `df` still sharing its index labels with `updated_df`.
    mismatched_df = df.loc[mismatched_rows]
    print(f"Rows with mismatched element counts:\n{mismatched_df}")
    print(f"Total rows with mismatches: {len(mismatched_rows)}")

    # Optionally, save the mismatched rows to a new CSV file
    mismatched_df.to_csv('mismatched_rows.csv', index=False)
else:
    print("All rows have matching element counts.")
    
    




Rows with mismatched element counts:
    Box Number  Folder Number Date (YYYY/MM/DD) Letter_ID  \
17        60.0            3.0        1909_05_05     00018   
37        60.0            4.0        1909_08_24     00038   
58        60.0            6.0        1910_03_22     00059   
59        60.0            6.0        1910_03_27     00060   
79        60.0            8.0        1910_09_17     00080   

                                 I Tatti file name(s)                  sender  \
17             32044150446391_001, 32044150446391_002  Greene, Belle da Costa   
37             32044150448389_052, 32044150448389_053  Greene, Belle da Costa   
58  32044150448405_056, 32044150448405_057, 320441...  Greene, Belle da Costa   
59  32044150448405_063, 32044150448405_064, 320441...  Greene, Belle da Costa   
79  32044150448421_045, 32044150448421_046, 320441...  Greene, Belle da Costa   

            recipient                                         Letterhead  \
17  Berenson, Bernard            

In [6]:
import os
import pandas as pd

# Step 1: Open the folder and get the first 500 items
def get_first_500_files(folder_path):
    """Return the names of the first 500 entries of ``folder_path`` in sorted order.

    ``os.listdir`` returns names in arbitrary order, so the listing is sorted
    first. Otherwise "the first 500" could differ between runs and platforms.

    Parameters:
        folder_path (str): Folder to list.

    Returns:
        list of str: At most 500 entry names (files and subdirectories alike).
    """
    return sorted(os.listdir(folder_path))[:500]

# Step 2: Open the spreadsheet and extract the first 500 rows
def get_first_500_rows_from_spreadsheet(spreadsheet_path, column_letter_id, column_file_names):
    """Read an Excel sheet and return two parallel lists from its first 500 rows.

    Parameters:
        spreadsheet_path (str): Path passed to ``pd.read_excel`` (use ``read_csv`` for CSV input).
        column_letter_id (str): Name of the column holding the letter IDs.
        column_file_names (str): Name of the column holding the file names.

    Returns:
        tuple: ``(letter_ids, file_names)`` as plain Python lists.
    """
    head = pd.read_excel(spreadsheet_path).head(500)
    return head[column_letter_id].tolist(), head[column_file_names].tolist()

# Step 3: Match problematic letters and corresponding filenames
def exclude_problematic_letters(letter_ids, file_names, problematic_letters):
    """Partition ``file_names`` by whether the paired letter ID is problematic.

    ``letter_ids`` and ``file_names`` are walked in parallel (stopping at the
    shorter one).

    Returns:
        tuple: ``(excluded_files, remaining_files)``, each in input order.
    """
    excluded_files, remaining_files = [], []

    for current_id, current_name in zip(letter_ids, file_names):
        # Pick the target bucket, then append once.
        bucket = excluded_files if current_id in problematic_letters else remaining_files
        bucket.append(current_name)

    return excluded_files, remaining_files

# Step 4: Export excluded files list to CSV
def export_excluded_files(excluded_files, export_path):
    """Write ``excluded_files`` to ``export_path`` as a one-column CSV headed 'Excluded Files'."""
    # Single column, no index column in the output file.
    pd.DataFrame(excluded_files, columns=["Excluded Files"]).to_csv(export_path, index=False)
    print(f"Excluded files have been exported to: {export_path}")

# Main function to run the process
def process_files_and_spreadsheet(folder_path, spreadsheet_path, column_letter_id, column_file_names, export_path):
    """Split the spreadsheet's first 500 file-name cells into excluded/remaining and export the excluded ones.

    Parameters:
        folder_path (str): Folder that is listed (the listing itself is not used further).
        spreadsheet_path (str): Excel file to read.
        column_letter_id (str): Column holding the letter IDs.
        column_file_names (str): Column holding the file names.
        export_path (str): CSV path for the excluded file names.

    Returns:
        tuple: ``(excluded_files, remaining_files)``.
    """
    # Listing kept for parity with the original flow; its result is unused,
    # but the call still fails early when the folder is missing.
    _folder_listing = get_first_500_files(folder_path)

    ids, names = get_first_500_rows_from_spreadsheet(
        spreadsheet_path, column_letter_id, column_file_names
    )

    # Letter IDs whose files are to be excluded
    problematic_letters = (
        "00018 00038 00059 00060 00061 00080 00171 00202 "
        "00203 00214 00240 00261 00312 00313 00325 00341 "
        "00450 00451 00479"
    ).split()

    excluded, remaining = exclude_problematic_letters(ids, names, problematic_letters)
    export_excluded_files(excluded, export_path)

    return excluded, remaining

# Example usage
folder_path = 'BGdataset/images'
# NOTE(review): no cell visible in this notebook writes this file; it is assumed
# to have been downloaded by hand beforehand. Confirm.
spreadsheet_path = 'downloaded_spreadsheet.xlsx'
column_letter_id = 'Letter_ID'  # Column containing letter IDs
column_file_names = 'I Tatti file name(s)'  # Column in the spreadsheet containing file IDs
export_path = 'excluded_files.csv'  # Output path for the excluded files list

# Run the process: writes `export_path` and keeps both partitions in memory.
excluded_files, remaining_files = process_files_and_spreadsheet(folder_path, spreadsheet_path, column_letter_id, column_file_names, export_path)


Excluded files have been exported to: excluded_files.csv


In [10]:
import pandas as pd
# Reload the exported list to inspect it.
# NOTE(review): this rebinds the notebook-level name `df`, which earlier cells
# use for the spreadsheet DataFrame; cells run afterwards that read `df` will see this frame.
df = pd.read_csv("excluded_files.csv")
df

Unnamed: 0,Excluded Files
0,"32044150446391_001, 32044150446391_002"
1,"32044150448389_052, 32044150448389_053"
2,"32044150448405_056, 32044150448405_057, 320441..."
3,"32044150448405_063, 32044150448405_064, 320441..."
4,"32044150448405_052, 32044150448405_053, 320441..."
5,"32044150448421_045, 32044150448421_046, 320441..."
6,"32044150448546_051, 32044150448546_052"
7,"32044150448595_021, 32044150448595_022, 320441..."
8,"32044150448595_038, 32044150448595_039, 320441..."
9,"32044150448611_042, 32044150448611_043, 320441..."


In [16]:
import os
import pandas as pd
import shutil

# Step 1: Load the CSV and count the total number of file elements (considering ',' as a separator)
def load_csv_and_count_elements(csv_path):
    """Read the 'Excluded Files' column of a CSV and flatten it into individual file names.

    Each cell may hold several comma-separated names. Surrounding whitespace is
    stripped. Empty cells (read back by pandas as NaN) and blank fragments such
    as the one after a trailing comma are ignored instead of crashing or being
    counted as files.

    Parameters:
        csv_path (str): CSV file with an 'Excluded Files' column.

    Returns:
        tuple: ``(total_file_count, file_names_to_delete)`` where the count is
        the length of the returned list.
    """
    df = pd.read_csv(csv_path)

    file_names_to_delete = []
    for cell in df['Excluded Files']:
        # An empty CSV cell comes back as NaN (a float), which has no .split().
        if pd.isna(cell):
            continue
        file_names_to_delete.extend(
            name.strip() for name in str(cell).split(',') if name.strip()
        )

    return len(file_names_to_delete), file_names_to_delete

# Step 2: Open the folder and get the first 500 items
def get_first_500_files(folder_path):
    """Return the names of the first 500 entries of ``folder_path`` in sorted order.

    ``os.listdir`` returns names in arbitrary order, so the listing is sorted
    first. Otherwise "the first 500" could differ between runs and platforms.

    Parameters:
        folder_path (str): Folder to list.

    Returns:
        list of str: At most 500 entry names (files and subdirectories alike).
    """
    return sorted(os.listdir(folder_path))[:500]

# Step 3: Delete the matching files and move the remaining files to a new folder
def delete_and_move_files(first_500_files, file_names_to_delete, folder_path, destination_folder):
    """Copy every listed file that is NOT excluded into ``destination_folder``.

    Despite the name, nothing is removed or moved. Files whose name without
    extension appears in ``file_names_to_delete`` are only reported and left
    out of the destination; all other files are copied with ``shutil.copy``.
    The source folder is left untouched.

    Parameters:
        first_500_files (list of str): File names (not paths) inside ``folder_path``.
        file_names_to_delete (iterable of str): Extension-less names to leave out.
        folder_path (str): Folder containing the files.
        destination_folder (str): Folder receiving the copies; created if missing.

    Returns:
        None. Each decision is reported with ``print``.
    """
    os.makedirs(destination_folder, exist_ok=True)

    excluded_ids = set(file_names_to_delete)  # O(1) membership tests

    for file in first_500_files:
        file_id = os.path.splitext(file)[0]  # File name without extension
        file_path = os.path.join(folder_path, file)

        if file_id in excluded_ids:
            # The original referenced an undefined `file_path` here (NameError).
            # The file is only left out of the destination, not deleted.
            print(f"Deleting: {file_path}")
        else:
            dst_file = os.path.join(destination_folder, file)
            print(f"Moving: {file_path} to {dst_file}")
            shutil.copy(file_path, dst_file)

# Main function to run the process
def process_images(csv_path, folder_path, destination_folder):
    """Run the exclusion pipeline: read the CSV, list the folder, then filter the files.

    Parameters:
        csv_path (str): CSV listing the excluded file IDs.
        folder_path (str): Folder containing the images.
        destination_folder (str): Folder that receives the non-excluded images.
    """
    # Step 1: flatten the CSV into individual names
    excluded_count, excluded_names = load_csv_and_count_elements(csv_path)
    print(f"Total file elements in the CSV: {excluded_count}")

    # Step 2: list the folder; only a short preview is echoed
    batch = get_first_500_files(folder_path)
    preview = batch[:10]
    print(f"First 500 files in the folder: {preview}")

    # Step 3: hand both lists to the filtering step
    delete_and_move_files(batch, excluded_names, folder_path, destination_folder)
    print("Process completed.")

# Example usage
csv_path = 'excluded_files.csv'  # Path to the CSV file with excluded file IDs
folder_path = 'BGdataset/images'  # Path to the folder containing the images
# Folder that receives the remaining images. They are copied with shutil.copy,
# not moved, even though the log lines say "Moving".
destination_folder = 'BGdataset/remaining_images'

# Run the process
process_images(csv_path, folder_path, destination_folder)


Total file elements in the CSV: 159
First 500 files in the folder: ['32044150448991_002.jpg', '32044150448991_003.jpg', '32044150448991_004.jpg', '32044150448991_005.jpg', '32044150448991_006.jpg', '32044150448991_007.jpg', '32044150448991_008.jpg', '32044150448991_009.jpg', '32044150448991_010.jpg', '32044150448991_011.jpg']
Moving: BGdataset/images\32044150448991_002.jpg to BGdataset/remaining_images\32044150448991_002.jpg
Moving: BGdataset/images\32044150448991_003.jpg to BGdataset/remaining_images\32044150448991_003.jpg
Moving: BGdataset/images\32044150448991_004.jpg to BGdataset/remaining_images\32044150448991_004.jpg
Moving: BGdataset/images\32044150448991_005.jpg to BGdataset/remaining_images\32044150448991_005.jpg
Moving: BGdataset/images\32044150448991_006.jpg to BGdataset/remaining_images\32044150448991_006.jpg
Moving: BGdataset/images\32044150448991_007.jpg to BGdataset/remaining_images\32044150448991_007.jpg
Moving: BGdataset/images\32044150448991_008.jpg to BGdataset/remai

In [18]:
import os
import pandas as pd

def load_spreadsheet(spreadsheet_path, i_tatti_column, letter_id_column):
    """Build a lookup from I Tatti file name to letter ID out of a spreadsheet.

    The sheet is read with ``pd.read_excel``. For every row, the cell in
    ``i_tatti_column`` is split on commas and each whitespace-trimmed piece is
    mapped to the row's ``letter_id_column`` value. Rows whose cell is not a
    non-blank string are reported with a warning and contribute nothing. When
    the same name occurs in several rows, the last row wins.

    Args:
        spreadsheet_path: Path of the Excel file to read.
        i_tatti_column: Header of the column holding the file name(s).
        letter_id_column: Header of the column holding the letter ID.

    Returns:
        dict mapping each file name to the letter ID of its row.
    """
    sheet = pd.read_excel(spreadsheet_path)  # swap for pd.read_csv when the source is a CSV

    mapping = {}
    for _, record in sheet.iterrows():
        letter_id = record[letter_id_column]
        raw_names = record[i_tatti_column]

        # Guard clause: non-string values (e.g. the NaN of an empty cell) and blank strings carry no names
        if not isinstance(raw_names, str) or not raw_names.strip():
            print(f"Warning: Skipping invalid entry for Letter_ID {letter_id}. 'I Tatti file name(s)' value: {raw_names}")
            continue

        # One cell may list several comma-separated files; all of them share this row's ID
        mapping.update({piece.strip(): letter_id for piece in raw_names.split(',')})

    return mapping

def rename_images(remaining_images_folder, filename_to_letter_id):
    """Rename files in a folder to ``<letter_id>_<n><ext>`` using a name-to-ID lookup.

    Each regular file whose name without extension is a key of
    ``filename_to_letter_id`` is renamed in place. ``n`` is a 1-based counter
    kept per letter ID, and the original extension is preserved. Files are
    visited in sorted name order so the counter is reproducible
    (``os.listdir`` alone gives no ordering guarantee).

    A file is left untouched, with a message, when it has no entry in the
    lookup or when the target name already exists. In the latter case the
    counter value is still consumed, mirroring ``rename_text_files``.
    Sub-directories are ignored.

    Args:
        remaining_images_folder: Folder whose files are renamed.
        filename_to_letter_id: dict mapping a file name without extension to
            its letter ID.

    Returns:
        list of the new file names, in the order the renames were performed.
    """
    renamed_files = []

    # Running index per letter ID: the n-th matching file of a letter gets suffix n
    count_dict = {letter_id: 0 for letter_id in set(filename_to_letter_id.values())}

    # Sort the listing: os.listdir returns entries in arbitrary order, which would make
    # the assigned indices depend on the filesystem.
    for file in sorted(os.listdir(remaining_images_folder)):
        src_file = os.path.join(remaining_images_folder, file)

        # Only regular files are candidates; a directory that happens to match a key is not an image
        if not os.path.isfile(src_file):
            continue

        file_id, extension = os.path.splitext(file)

        if file_id not in filename_to_letter_id:
            print(f"Warning: No matching Letter_ID found for file {file}. Skipping.")
            continue

        letter_id = filename_to_letter_id[file_id]
        count_dict[letter_id] += 1

        # The index is deliberately not zero-padded
        new_name = f"{letter_id}_{count_dict[letter_id]}{extension}"
        dst_file = os.path.join(remaining_images_folder, new_name)

        # os.rename silently replaces an existing file on POSIX and raises on Windows;
        # refuse to clobber in both cases.
        if os.path.exists(dst_file):
            print(f"File {dst_file} already exists. Skipping renaming for {file}.")
            continue

        print(f"Renaming {src_file} to {dst_file}")
        os.rename(src_file, dst_file)
        renamed_files.append(new_name)

    return renamed_files

def process_image_renaming(spreadsheet_path, remaining_images_folder, i_tatti_column='I Tatti file name(s)', letter_id_column='Letter_ID'):
    """Rename the images in a folder according to a metadata spreadsheet.

    Loads the file-name-to-letter-ID lookup with ``load_spreadsheet``, hands it
    to ``rename_images``, then prints how many files were renamed followed by
    each new name on its own line.

    Args:
        spreadsheet_path: Path of the spreadsheet passed to ``load_spreadsheet``.
        remaining_images_folder: Folder whose files are renamed.
        i_tatti_column: Header of the column holding the file name(s).
        letter_id_column: Header of the column holding the letter ID.
    """
    # Spreadsheet -> lookup -> renames, chained without extra bookkeeping
    lookup = load_spreadsheet(spreadsheet_path, i_tatti_column, letter_id_column)
    new_names = rename_images(remaining_images_folder, lookup)

    # Summary: a count line, then one line per renamed file (nothing when the list is empty)
    print(f"Renamed {len(new_names)} files:")
    if new_names:
        print('\n'.join(map(str, new_names)))

# Example usage
# Both paths are relative, so they resolve against the notebook's current working directory.
spreadsheet_path = 'downloaded_spreadsheet.xlsx'  # Path to the spreadsheet with metadata
# NOTE(review): the preceding cell moves the remaining images into 'BGdataset/remaining_images',
# while this points at 'BGdataset/images', and the recorded run reports "Renamed 0 files" —
# confirm this is the intended folder.
remaining_images_folder = 'BGdataset/images'  # Folder where the remaining images are stored

# Run the process (renames files in place inside remaining_images_folder)
process_image_renaming(spreadsheet_path, remaining_images_folder)

Renamed 0 files:


In [2]:
import os

# Specify the directory containing the files
directory = 'transcription_IAMD_molmo'  # Change this to your folder path
marker_to_remove = '_transcription'  # Text stripped from every matching file name

# Work on a sorted snapshot of the listing so the order of renames (and of any
# messages below) is the same on every run.
for filename in sorted(os.listdir(directory)):
    # Only .txt files that actually contain the marker need a new name;
    # everything else would be "renamed" onto itself.
    if not filename.endswith('.txt') or marker_to_remove not in filename:
        continue

    # Create new filename by removing the marker from the original filename
    new_filename = filename.replace(marker_to_remove, '')

    # Create full paths for the old and new filenames
    old_file_path = os.path.join(directory, filename)
    new_file_path = os.path.join(directory, new_filename)

    # os.rename silently replaces an existing file on POSIX and raises on Windows;
    # skip instead of losing data or aborting half-way through the folder.
    if os.path.exists(new_file_path):
        print(f"Skipping '{filename}': '{new_filename}' already exists.")
        continue

    # Rename the file
    os.rename(old_file_path, new_file_path)

print("Files renamed successfully.")


Files renamed successfully.


In [1]:
import os
import shutil
import random

# Define the source and destination directories
source_folder = 'BGdataset/images'  # Folder where images are stored
# NOTE(review): the variable says "test split" but the path reads like a training pool —
# confirm the intended role before relying on the name.
test_split_folder = 'BGdataset/BGTRAIN1000img'  # Folder receiving the selected images
validation_split_folder = 'BGdataset/BGVALIDATION10percent'  # Folder for validation split

# Split parameters, gathered here so the sizes in the code and in the comments cannot drift apart
NUM_SELECTED_IMAGES = 1000   # how many images (in sorted order) are copied out of source_folder
NUM_VALIDATION_IMAGES = 100  # how many of those are then moved to the validation folder
SPLIT_SEED = 42              # fixed seed so the validation sample is reproducible

# Ensure test_split and validation_split folders exist
os.makedirs(test_split_folder, exist_ok=True)
os.makedirs(validation_split_folder, exist_ok=True)

# Step 1: List and sort the images in the source folder
images = sorted(f for f in os.listdir(source_folder) if f.endswith(('.jpg', '.png')))

# Step 2: Take the first NUM_SELECTED_IMAGES images
selected_images = images[:NUM_SELECTED_IMAGES]
first_2000_images = selected_images  # legacy name (it never held 2000 items); kept in case another cell uses it

# Copy the selected images to the test_split folder
for image in selected_images:
    shutil.copy(os.path.join(source_folder, image), os.path.join(test_split_folder, image))

# Step 3: Randomly select NUM_VALIDATION_IMAGES images from test_split folder for validation.
# The listing is sorted because os.listdir order is arbitrary; together with the fixed seed
# this makes the same files get picked for the same folder contents.
test_images = sorted(os.listdir(test_split_folder))

# random.sample raises ValueError when asked for more items than exist, so cap the request
validation_count = min(NUM_VALIDATION_IMAGES, len(test_images))
if validation_count < NUM_VALIDATION_IMAGES:
    print(f"Warning: only {len(test_images)} files available; moving {validation_count} instead of {NUM_VALIDATION_IMAGES}.")
validation_images = random.Random(SPLIT_SEED).sample(test_images, validation_count)

# Move the sampled images to the validation_split folder
for image in validation_images:
    shutil.move(os.path.join(test_split_folder, image), os.path.join(validation_split_folder, image))

print("Test split and validation split creation is complete.")


Test split and validation split creation is complete.


In [16]:
import os
import pandas as pd

def load_spreadsheet(spreadsheet_path, i_tatti_column, letter_id_column):
    """Build a lookup from I Tatti file name to letter ID out of a spreadsheet.

    The sheet is read with ``pd.read_excel``. For every row, the cell in
    ``i_tatti_column`` is split on commas and each whitespace-trimmed piece is
    mapped to the row's ``letter_id_column`` value. Rows whose cell is not a
    non-blank string are reported with a warning and contribute nothing. When
    the same name occurs in several rows, the last row wins.

    Note: a function with this name and the same logic is also defined in an
    earlier cell of this notebook; whichever cell ran last is the one in effect.

    Args:
        spreadsheet_path: Path of the Excel file to read.
        i_tatti_column: Header of the column holding the file name(s).
        letter_id_column: Header of the column holding the letter ID.

    Returns:
        dict mapping each file name to the letter ID of its row.
    """
    sheet = pd.read_excel(spreadsheet_path)  # swap for pd.read_csv when the source is a CSV

    mapping = {}
    for _, record in sheet.iterrows():
        letter_id = record[letter_id_column]
        raw_names = record[i_tatti_column]

        # Guard clause: non-string values (e.g. the NaN of an empty cell) and blank strings carry no names
        if not isinstance(raw_names, str) or not raw_names.strip():
            print(f"Warning: Skipping invalid entry for Letter_ID {letter_id}. 'I Tatti file name(s)' value: {raw_names}")
            continue

        # One cell may list several comma-separated files; all of them share this row's ID
        mapping.update({piece.strip(): letter_id for piece in raw_names.split(',')})

    return mapping

def rename_text_files(remaining_texts_folder, filename_to_letter_id):
    """Rename ``.txt`` files in a folder to ``<letter_id>_<n>.txt`` using a name-to-ID lookup.

    Each ``.txt`` file whose name without extension is a key of
    ``filename_to_letter_id`` is renamed in place. ``n`` is a 1-based counter
    kept per letter ID. Files are visited in sorted name order so that the
    counter is reproducible (``os.listdir`` alone gives no ordering guarantee).

    A file is left untouched, with a message, when it has no entry in the
    lookup or when the target name already exists. In the latter case the
    counter value is still consumed. Entries not ending in ``.txt`` are
    ignored silently.

    Args:
        remaining_texts_folder: Folder whose ``.txt`` files are renamed.
        filename_to_letter_id: dict mapping a file name without extension to
            its letter ID.

    Returns:
        list of the new file names, in the order the renames were performed.
    """
    renamed_files = []

    # Running index per letter ID: the n-th matching file of a letter gets suffix n
    count_dict = {letter_id: 0 for letter_id in set(filename_to_letter_id.values())}

    # Sort the listing: os.listdir returns entries in arbitrary order, which would make
    # the assigned indices depend on the filesystem.
    for file in sorted(os.listdir(remaining_texts_folder)):
        if not file.endswith('.txt'):
            continue

        file_id = os.path.splitext(file)[0]  # Get file ID without extension

        if file_id not in filename_to_letter_id:
            print(f"Warning: No matching Letter_ID found for file {file}. Skipping.")
            continue

        letter_id = filename_to_letter_id[file_id]
        count_dict[letter_id] += 1

        # Create the new name for the file
        new_name = f"{letter_id}_{count_dict[letter_id]}.txt"
        dst_file = os.path.join(remaining_texts_folder, new_name)

        # Never overwrite: skip renaming if a file with the new name already exists
        if os.path.exists(dst_file):
            print(f"File {dst_file} already exists. Skipping renaming for {file}.")
            continue

        # Proceed with renaming
        src_file = os.path.join(remaining_texts_folder, file)
        print(f"Renaming {src_file} to {dst_file}")
        os.rename(src_file, dst_file)
        renamed_files.append(new_name)

    return renamed_files

def process_text_renaming(spreadsheet_path, remaining_texts_folder, i_tatti_column='I Tatti file name(s)', letter_id_column='Letter_ID'):
    """Rename the ``.txt`` files in a folder according to a metadata spreadsheet.

    Loads the file-name-to-letter-ID lookup with ``load_spreadsheet``, hands it
    to ``rename_text_files``, then prints how many files were renamed followed
    by each new name on its own line.

    Args:
        spreadsheet_path: Path of the spreadsheet passed to ``load_spreadsheet``.
        remaining_texts_folder: Folder whose ``.txt`` files are renamed.
        i_tatti_column: Header of the column holding the file name(s).
        letter_id_column: Header of the column holding the letter ID.
    """
    # Spreadsheet -> lookup -> renames, chained without extra bookkeeping
    lookup = load_spreadsheet(spreadsheet_path, i_tatti_column, letter_id_column)
    new_names = rename_text_files(remaining_texts_folder, lookup)

    # Summary: a count line, then one line per renamed file (nothing when the list is empty)
    print(f"Renamed {len(new_names)} files:")
    if new_names:
        print('\n'.join(map(str, new_names)))

# Example usage
# Both paths are relative, so they resolve against the notebook's current working directory.
spreadsheet_path = 'downloaded_spreadsheet.xlsx'  # Path to the spreadsheet with metadata
remaining_texts_folder = 'transcriptions_BG_internVL/txt'  # Folder where the remaining .txt files are stored

# Run the process. This renames matching .txt files in place inside remaining_texts_folder,
# printing one "Renaming ..." line per file and then a summary of the new names.
process_text_renaming(spreadsheet_path, remaining_texts_folder)


Renaming transcriptions_BG_internVL/txt\32044150446375_003.txt to transcriptions_BG_internVL/txt\00010_1.txt
Renaming transcriptions_BG_internVL/txt\32044150446375_014.txt to transcriptions_BG_internVL/txt\00012_1.txt
Renaming transcriptions_BG_internVL/txt\32044150446375_019.txt to transcriptions_BG_internVL/txt\00012_2.txt
Renaming transcriptions_BG_internVL/txt\32044150446375_023.txt to transcriptions_BG_internVL/txt\00013_1.txt
Renaming transcriptions_BG_internVL/txt\32044150446375_032.txt to transcriptions_BG_internVL/txt\00014_1.txt
Renaming transcriptions_BG_internVL/txt\32044150446375_045.txt to transcriptions_BG_internVL/txt\00017_1.txt
Renaming transcriptions_BG_internVL/txt\32044150446383_017.txt to transcriptions_BG_internVL/txt\00006_1.txt
Renaming transcriptions_BG_internVL/txt\32044150446383_020.txt to transcriptions_BG_internVL/txt\00007_1.txt
Renaming transcriptions_BG_internVL/txt\32044150446383_021.txt to transcriptions_BG_internVL/txt\00008_1.txt
Renaming transcript