In [1]:
import os
import random
from shutil import copyfile

# Path to the source dataset. The split cell treats every sub-directory of
# this folder as one class.
dataset_path = "data/MSID_US_AUG"

# Directory that receives the train/test/validation copies
split_path = "data/MSID_US_AUG_SP"

# Fraction of each class that goes to the training pool and to the test set.
# The split cell sends everything after the first `train_percent` share to the
# test set, so `test_percent` has to be the complement of `train_percent`.
train_percent = 0.8
test_percent = 0.2

# Fraction of the training pool (not of the whole dataset) that is held out
# for validation. With the values above the effective split is roughly
# 64% train / 16% validation / 20% test.
validation_percent = 0.2

# Fail fast on inconsistent settings instead of silently producing a split
# that does not match what was configured.
if not (0.0 <= train_percent <= 1.0 and 0.0 <= test_percent <= 1.0):
    raise ValueError("train_percent and test_percent must be between 0 and 1")
if abs(train_percent + test_percent - 1.0) > 1e-9:
    raise ValueError("train_percent and test_percent must sum to 1.0")
if not 0.0 <= validation_percent <= 1.0:
    raise ValueError("validation_percent must be between 0 and 1")

In [2]:


# Split every class folder of `dataset_path` into train / validation / test
# copies under `split_path`.
#
# For each class the image list is shuffled, then:
#   - the first `train_percent` share forms the training pool,
#   - the first `validation_percent` share of that pool goes to validation,
#   - the rest of the pool goes to train,
#   - everything after the pool (including the rounding remainder) goes to test.
#
# NOTE(review): the dataset name suggests augmented images. If several
# augmented variants of one original image live in the same class folder, a
# per-file random split can place variants of the same image in different
# splits - confirm this is acceptable for evaluation.

# Create subdirectories for train, test, and validation data
train_path = os.path.join(split_path, "train")
test_path = os.path.join(split_path, "test")
validation_path = os.path.join(split_path, "validation")
for split_root in (train_path, test_path, validation_path):
    os.makedirs(split_root, exist_ok=True)

# A dedicated, seeded generator makes the split reproducible without touching
# the global `random` state. Files already present in `split_path` are not
# removed, so reproducibility matters: with a fixed seed, rerunning this cell
# on an unchanged dataset rewrites the same files in the same places instead
# of scattering an image across several splits.
split_seed = 42
split_rng = random.Random(split_seed)

# os.listdir returns entries in arbitrary order, so sort before using them;
# otherwise the seed alone would not pin down the result.
for class_dir in sorted(os.listdir(dataset_path)):
    class_source_path = os.path.join(dataset_path, class_dir)
    if not os.path.isdir(class_source_path):
        continue

    # Create subdirectories for each class in the train, test, and validation directories
    train_class_path = os.path.join(train_path, class_dir)
    os.makedirs(train_class_path, exist_ok=True)
    test_class_path = os.path.join(test_path, class_dir)
    os.makedirs(test_class_path, exist_ok=True)
    validation_class_path = os.path.join(validation_path, class_dir)
    os.makedirs(validation_class_path, exist_ok=True)

    # Keep regular files only: copyfile would raise on a nested directory.
    all_images = sorted(
        entry
        for entry in os.listdir(class_source_path)
        if os.path.isfile(os.path.join(class_source_path, entry))
    )
    split_rng.shuffle(all_images)

    # Size of the training pool and of the validation slice carved out of it.
    # The test set is whatever remains after the pool, so no separate count
    # is needed for it.
    num_train = int(len(all_images) * train_percent)
    num_validation = int(num_train * validation_percent)

    # Copy each image into the directory of the split it falls in
    for i, image_file in enumerate(all_images):
        source_path = os.path.join(class_source_path, image_file)
        if i < num_validation:
            destination_path = os.path.join(validation_class_path, image_file)
        elif i < num_train:
            destination_path = os.path.join(train_class_path, image_file)
        else:
            destination_path = os.path.join(test_class_path, image_file)
        copyfile(source_path, destination_path)

In [3]:
# Report how many entries each class folder holds in every split, followed by
# the per-split total.
for split_dir in ["train", "test", "validation"]:
    split_root = os.path.join(split_path, split_dir)
    print(f"{split_dir.capitalize()} set:")

    # class name -> number of entries found in that class folder
    class_counts = {}
    for class_dir in os.listdir(split_root):
        class_root = os.path.join(split_root, class_dir)
        if not os.path.isdir(class_root):
            # Only directories count as classes; skip stray files
            continue
        num_images = len(os.listdir(class_root))
        print(f" - {class_dir}: {num_images} images")
        class_counts[class_dir] = num_images

    total_count = sum(class_counts.values())
    print(f"Total: {total_count} images\n")

Train set:
 - Measles: 699 images
 - Chickenpox: 822 images
 - Normal: 868 images
 - Monkeypox: 922 images
Total: 3311 images

Test set:
 - Measles: 219 images
 - Chickenpox: 257 images
 - Normal: 272 images
 - Monkeypox: 288 images
Total: 1036 images

Validation set:
 - Measles: 174 images
 - Chickenpox: 205 images
 - Normal: 216 images
 - Monkeypox: 230 images
Total: 825 images

