In [7]:
import os
from PIL import Image

# Input/output locations for the cropping step
input_folder = 'IAM/aachen_validation_set'  # Folder containing the images
output_folder = 'IAM/IAMa_cropped'  # Folder to save the cropped images

# Create the output folder (and any missing parents). exist_ok=True makes the
# cell safe to re-run and avoids the check-then-create race of a separate
# os.path.exists() test.
os.makedirs(output_folder, exist_ok=True)

# Function to crop an image
def crop_image(image, crop_box):
    """
    Return the region of an image delimited by a crop box.

    :param image: Object exposing a ``crop(box)`` method (a PIL image in this notebook).
    :param crop_box: Box forwarded unchanged to ``image.crop``; the caller in this
        notebook passes a ``(left, top, right, bottom)`` tuple.
    :return: Whatever ``image.crop(crop_box)`` returns.
    """
    cropped = image.crop(crop_box)
    return cropped

# Crop every image in the input folder and save the result under the same
# file name in the output folder.
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff')  # Add more extensions if needed
TOP_FRACTION = 0.11     # Crop box starts at 11% of the height (top 11% is removed)
BOTTOM_FRACTION = 0.95  # Crop box ends at 95% of the height (bottom 5% is removed)

for filename in os.listdir(input_folder):
    # Compare in lower case so files such as "page.PNG" are not silently skipped
    if not filename.lower().endswith(IMAGE_EXTENSIONS):
        continue

    img_path = os.path.join(input_folder, filename)
    with Image.open(img_path) as img:
        # Get image dimensions (width, height)
        width, height = img.size

        # Keep the full width; only the top and bottom margins are trimmed
        left = 0
        top = height * TOP_FRACTION
        right = width
        bottom = height * BOTTOM_FRACTION

        # Crop the image
        cropped_img = crop_image(img, (left, top, right, bottom))

        # Save the cropped image to the output folder
        cropped_img.save(os.path.join(output_folder, filename))

print("Processing complete. All images have been cropped and saved.")


Processing complete. All images have been cropped and saved.


In [4]:
import os
import shutil

# Define folder paths and the split text file path
input_folder = 'IAM/formsI-Z'  # Folder containing .png images
# NOTE(review): absolute, machine-specific path; consider a path relative to a
# configurable data directory so the notebook runs on other machines.
split_text = "C:/Users/crosi/Downloads/splits/splits/validation.uttlist"  # Path to text file with filenames
output_folder = "IAM/aachen_validation_set"  # Output folder for matched .png files

# Ensure the output folder exists (safe to re-run, no check-then-create race)
os.makedirs(output_folder, exist_ok=True)

# Read the list of base filenames from the split text file (without extensions).
# Blank lines are skipped so the empty string never ends up in the lookup set.
with open(split_text, 'r') as f:
    validation_files = {line.strip() for line in f if line.strip()}

# Copy every .png whose base name is listed in the split file
for filename in os.listdir(input_folder):
    # Compare in lower case so "X.PNG" is treated like "X.png"
    if not filename.lower().endswith('.png'):
        continue

    # Extract the base filename (without extension) from the input folder file
    base_filename = os.path.splitext(filename)[0]
    if base_filename not in validation_files:
        continue

    source_path = os.path.join(input_folder, filename)  # Full path to the .png file
    destination_path = os.path.join(output_folder, filename)  # Destination path

    # Copy the file to the output folder
    shutil.copy(source_path, destination_path)
    print(f"Copied {filename} to {output_folder}")



Copied m01-090.png to IAM/aachen_validation_set
Copied m01-115.png to IAM/aachen_validation_set
Copied m01-160.png to IAM/aachen_validation_set
Copied m02-052.png to IAM/aachen_validation_set
Copied m02-075.png to IAM/aachen_validation_set
Copied m02-083.png to IAM/aachen_validation_set
Copied m02-090.png to IAM/aachen_validation_set
Copied m02-102.png to IAM/aachen_validation_set
Copied m02-106.png to IAM/aachen_validation_set
Copied m03-062.png to IAM/aachen_validation_set
Copied m03-095.png to IAM/aachen_validation_set
Copied m04-000.png to IAM/aachen_validation_set
Copied m04-007.png to IAM/aachen_validation_set
Copied m04-012.png to IAM/aachen_validation_set
Copied m04-019.png to IAM/aachen_validation_set
Copied m04-024.png to IAM/aachen_validation_set
Copied m04-145.png to IAM/aachen_validation_set
Copied m04-152.png to IAM/aachen_validation_set
Copied m04-164.png to IAM/aachen_validation_set
Copied m04-180.png to IAM/aachen_validation_set
Copied m04-190.png to IAM/aachen_validat

In [21]:
import os
from bs4 import BeautifulSoup

def _decode_xml_entities(text):
    """Replace quote/ampersand entity references left in ``text`` with literal characters."""
    # Double-escaped forms go first and the bare ampersand last, so that
    # "&amp;quot;" becomes '"' rather than stopping at "&quot;".
    replacements = (
        ("&amp;quot;", '"'),
        ("&amp;apos;", "'"),
        ("&quot;", '"'),
        ("&apos;", "'"),
        ("&amp;", "&"),
    )
    for entity, literal in replacements:
        text = text.replace(entity, literal)
    return text


def extract_text_from_xml(xml_file_path, output_file_path):
    """
    Extract text from an XML file and save it to a text file, replacing XML entities.

    The ``text`` attribute of every ``machine-print-line`` element inside the
    first ``machine-printed-part`` element is written to the output file, one
    line per element. Elements with a missing or empty ``text`` attribute are
    skipped. If no ``machine-printed-part`` element exists, a message is printed
    and no output file is written.

    :param xml_file_path: Path to the XML file to read.
    :param output_file_path: Path to the text file to save the extracted text
        (written as UTF-8).
    :return: None
    """
    # Read raw bytes so the parser can honour the encoding given in the XML
    # declaration instead of depending on the platform's default text encoding.
    with open(xml_file_path, 'rb') as f:
        file_content = f.read()

    # Parse the XML file with BeautifulSoup
    soup = BeautifulSoup(file_content, 'xml')

    # Find the 'machine-printed-part' tag
    tag1 = soup.find("machine-printed-part")

    if tag1 is None:
        print(f"No 'machine-printed-part' tag found in {xml_file_path}.")
        return

    # Find all 'machine-print-line' tags within 'tag1'
    tag2 = tag1.find_all("machine-print-line")

    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        # Write the text from each 'machine-print-line' tag to the file
        for line in tag2:
            text = line.get("text")
            if text:
                # Replace entities that are still present after parsing
                outfile.write(_decode_xml_entities(text) + '\n')

def process_all_xml_files_in_directory(input_directory, output_directory):
    """
    Process all XML files in the specified directory and save the extracted text
    to the output directory.

    Each ``<name>.xml`` entry of ``input_directory`` is passed to
    ``extract_text_from_xml`` together with ``<name>.txt`` inside
    ``output_directory``. Entries not ending in ``.xml`` are ignored.

    :param input_directory: Path to the directory containing XML files.
    :param output_directory: Path to the directory where text files will be saved;
        created if it does not exist.
    :return: None
    """
    xml_suffix = '.xml'

    # Ensure output directory exists
    os.makedirs(output_directory, exist_ok=True)

    # Iterate through all files in the input directory
    for filename in os.listdir(input_directory):
        if not filename.endswith(xml_suffix):
            continue

        # Swap only the trailing extension. str.replace would also rewrite a
        # '.xml' occurring earlier in the name (e.g. 'a.xml.bak.xml').
        text_filename = filename[:-len(xml_suffix)] + '.txt'
        xml_file_path = os.path.join(input_directory, filename)
        output_file_path = os.path.join(output_directory, text_filename)

        # Extract text and save to file
        extract_text_from_xml(xml_file_path, output_file_path)
        print(f"Processed {filename}")

# Example usage: the .txt files are written next to the .xml files they come
# from, because the input and output directories are the same folder.
input_directory = output_directory = 'IAM/aachen_validation_txt'
process_all_xml_files_in_directory(input_directory, output_directory)


Processed c04-156.xml
Processed c04-160.xml
Processed c06-083.xml
Processed d01-024.xml
Processed d01-056.xml
Processed d01-060.xml
Processed d03-112.xml
Processed d04-005.xml
Processed d04-008.xml
Processed d04-071.xml
Processed d04-075.xml
Processed d04-081.xml
Processed d04-117.xml
Processed d04-121.xml
Processed d06-000.xml
Processed d06-015.xml
Processed d06-030.xml
Processed d06-050.xml
Processed d06-082.xml
Processed d06-091.xml
Processed e01-014.xml
Processed e01-018.xml
Processed e06-006.xml
Processed e06-037.xml
Processed e06-041.xml
Processed f04-046.xml
Processed f04-049.xml
Processed f04-093.xml
Processed f04-096.xml
Processed f04-100.xml
Processed f07-028b.xml
Processed f07-032b.xml
Processed f07-036.xml
Processed f07-039b.xml
Processed f07-042b.xml
Processed f07-046b.xml
Processed g01-083.xml
Processed g03-016.xml
Processed g03-058.xml
Processed g04-036.xml
Processed g04-039.xml
Processed g04-055.xml
Processed g04-060.xml
Processed g04-063.xml
Processed h01-004.xml
Proce

KeyboardInterrupt: 

In [37]:
import os
import re

# Input and output locations for the cleanup step. Both names refer to the same
# folder, so each cleaned .txt file overwrites the file it was read from.
input_folder = "transcriptions_IAM_llava"  # Folder containing the .txt files
output_folder = "transcriptions_IAM_llava"  # Folder to save the cleaned .txt files

# Create the output folder if it is missing (no error when it already exists)
os.makedirs(output_folder, exist_ok=True)


def clean_text(file_path, output_folder):
    """
    Strip every ``[INST] ... [/INST]`` span from a text file and save the result.

    The cleaned text is written to ``output_folder`` under the input file's base
    name with a ``.txt`` extension, and a confirmation line is printed.

    :param file_path: Path to the UTF-8 text file to clean.
    :param output_folder: Directory in which the cleaned file is written.
    :return: None
    """
    # Destination keeps the source's base name (extension replaced by .txt)
    stem, _extension = os.path.splitext(os.path.basename(file_path))
    output_file_path = os.path.join(output_folder, f"{stem}.txt")

    with open(file_path, "r", encoding="utf-8") as source:
        raw_text = source.read()

    # Non-greedy match; DOTALL lets one [INST] block span several lines
    instruction_block = re.compile(r'\[INST\].*?\[/INST\]', flags=re.DOTALL)
    cleaned_content = instruction_block.sub('', raw_text)

    with open(output_file_path, "w", encoding="utf-8") as target:
        target.write(cleaned_content)

    print(f"Processed {file_path}, cleaned version saved to {output_file_path}")

# Main function to iterate over all txt files in the input folder
def main():
    """Run ``clean_text`` on every ``.txt`` entry of ``input_folder``, writing to ``output_folder``."""
    txt_files = (name for name in os.listdir(input_folder) if name.endswith(".txt"))
    for txt_file in txt_files:
        clean_text(os.path.join(input_folder, txt_file), output_folder)

# Run the cleanup only when this code executes as the main module, not when
# it is imported from elsewhere.
if __name__ == "__main__":
    main()


Deleted transcriptions_IAM_llava\c04-160_cleaned.txt
Deleted transcriptions_IAM_llava\c06-083_cleaned.txt
Deleted transcriptions_IAM_llava\d01-024_cleaned.txt
Deleted transcriptions_IAM_llava\d01-056_cleaned.txt
Deleted transcriptions_IAM_llava\d01-060_cleaned.txt
Deleted transcriptions_IAM_llava\d03-112_cleaned.txt
Deleted transcriptions_IAM_llava\d04-005_cleaned.txt
Deleted transcriptions_IAM_llava\d04-008_cleaned.txt
Deleted transcriptions_IAM_llava\d04-071_cleaned.txt
Deleted transcriptions_IAM_llava\d04-075_cleaned.txt
Deleted transcriptions_IAM_llava\d04-081_cleaned.txt
Deleted transcriptions_IAM_llava\d04-117_cleaned.txt
Deleted transcriptions_IAM_llava\d04-121_cleaned.txt
Deleted transcriptions_IAM_llava\d06-000_cleaned.txt
Deleted transcriptions_IAM_llava\d06-015_cleaned.txt
Deleted transcriptions_IAM_llava\d06-030_cleaned.txt
Deleted transcriptions_IAM_llava\d06-050_cleaned.txt
Deleted transcriptions_IAM_llava\d06-082_cleaned.txt
Deleted transcriptions_IAM_llava\d06-091_clean