In [4]:
import os
import shutil
import random
import cv2

def undersample_dataset(input_folder, output_folder, classes_to_undersample, undersampling_ratio):
    num_images_undersampled = {}
    num_images_original = {}

    for class_name in os.listdir(input_folder):
        if class_name in classes_to_undersample:
            class_folder = os.path.join(input_folder, class_name)
            output_class_folder = os.path.join(output_folder, class_name)

            if not os.path.exists(output_class_folder):
                os.makedirs(output_class_folder)

            num_images_undersampled[class_name] = 0
            num_images_original[class_name] = 0

            for image_name in os.listdir(class_folder):
                image_path = os.path.join(class_folder, image_name)
                image = cv2.imread(image_path)

                if random.uniform(0, 1) <= undersampling_ratio:
                    output_image_path = os.path.join(output_class_folder, image_name)
                    cv2.imwrite(output_image_path, image)
                    num_images_undersampled[class_name] += 1

                num_images_original[class_name] += 1
        else:
            shutil.copytree(os.path.join(input_folder, class_name), os.path.join(output_folder, class_name))

    return num_images_undersampled, num_images_original

In [6]:
# Define the paths to your input and output folders
input_folder = "data/MSID"
output_folder = "data/MSID_US"

# Define the classes that you want to undersample
classes_to_undersample = ["Monkeypox", "Normal"]

# Define the undersampling ratio
undersampling_ratio = 0.4

num_images_undersampled, num_images_original = undersample_dataset(input_folder, output_folder, classes_to_undersample, undersampling_ratio)

In [9]:
print('original', num_images_original)

print('undersampled', num_images_undersampled)

original {'Normal': 293, 'Monkeypox': 279}
undersampled {'Normal': 123, 'Monkeypox': 116}
