In [1]:
import os
import shutil
import random
import xml.etree.ElementTree as ET

In [2]:
# Define the directory paths
images_dir = "data/images/"
xml_dir = "data/label/"


def has_table_object(root):
    """Return True if any <object> below ``root`` has a <name> starting with "table".

    An <object> whose <name> child is missing or has no text counts as
    "not a table" instead of raising AttributeError.
    """
    return any(
        (obj.findtext('name') or '').startswith('table')
        for obj in root.findall('.//object')
    )


# Get a list of all XML files
xml_files = [f for f in os.listdir(xml_dir) if f.endswith('.xml')]

# Loop through each XML file
for xml_file in xml_files:
    xml_path = os.path.join(xml_dir, xml_file)

    # Parse the XML file
    try:
        tree = ET.parse(xml_path)
    except ET.ParseError as exc:
        # We cannot tell whether an unreadable annotation contains a table,
        # so keep both files rather than deleting data on a guess.
        print(f"Skipped {xml_file}: could not parse XML ({exc})")
        continue
    root = tree.getroot()

    if has_table_object(root):
        continue

    # No "table" label: remove the annotation and its corresponding image
    image_file = f'{os.path.splitext(xml_file)[0]}.jpg'
    image_path = os.path.join(images_dir, image_file)

    if os.path.exists(image_path):
        # Delete the image first: if that fails, the annotation is still on
        # disk and the pair can be retried on the next run.
        os.remove(image_path)
        os.remove(xml_path)
        print(f"Removed {xml_file} and {image_file}")
    else:
        # A missing image must not abort the whole clean-up run.
        os.remove(xml_path)
        print(f"Removed {xml_file} (no matching image {image_file} found)")

print("Process completed.")

Process completed.


In [3]:
# Folder holding the annotation XML files, and the root folder that the
# split sub-folders are created under
input_folder = 'data/label/'
output_folder = 'data/'

# Fraction of the files assigned to the train / validation / test splits
train_ratio, val_ratio, test_ratio = 0.8, 0.1, 0.1

In [4]:
# Get a list of all XML files in the input folder.
# os.listdir() returns entries in arbitrary order, so sort first; otherwise
# the seeded shuffle below would still give different splits between runs.
xml_files = sorted(f for f in os.listdir(input_folder) if f.endswith('.xml'))

# Calculate the number of files for each split; the test split takes
# whatever is left after the (truncated) train and val counts.
num_files = len(xml_files)
num_train = int(num_files * train_ratio)
num_val = int(num_files * val_ratio)
num_test = num_files - num_train - num_val
if num_train < 0 or num_val < 0 or num_test < 0:
    raise ValueError(
        "train_ratio and val_ratio must be non-negative and sum to at most 1 "
        f"(got {train_ratio} and {val_ratio})"
    )

# Shuffle with a fixed seed so that Restart & Run All reproduces the same split.
# A dedicated Random instance leaves the global random state untouched.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(xml_files)

# Split the files. The test slice starts at an explicit offset: the previous
# xml_files[-num_test:] form returns the WHOLE list when num_test == 0.
train_files = xml_files[:num_train]
val_files = xml_files[num_train:num_train + num_val]
test_files = xml_files[num_train + num_val:]

# Every file must land in exactly one split.
assert len(train_files) + len(val_files) + len(test_files) == num_files

# Create each output folder if needed and move its files into it
for split_name, split_files in (
    ('train', train_files),
    ('val', val_files),
    ('test', test_files),
):
    split_dir = os.path.join(output_folder, split_name)
    os.makedirs(split_dir, exist_ok=True)
    for file in split_files:
        shutil.move(os.path.join(input_folder, file), os.path.join(split_dir, file))

In [5]:
# Remove the source label folder. os.rmdir() only deletes an empty directory,
# so this raises OSError if anything is still inside it (for example files
# without a .xml extension, which the split step does not move).
os.rmdir(input_folder)