In [1]:
import os
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
import gc # Garbage collection to clear RAM

#### Benign = 1569
#### Malignant = 803

In [2]:
path = "C:/Users/Gavela Maculuve/PycharmProjects/Projects_/CNN/BreastCancer/images"
trainpath = path + '/train'
validpath = path + '/valid'
testpath = path + '/test'

In [3]:
malignant = [
    os.path.join(trainpath, 'Malignant'),
    os.path.join(validpath, 'Malignant'),
    os.path.join(testpath, 'Malignant')
]

benign = [
    os.path.join(trainpath, 'Benign'),
    os.path.join(validpath, 'Benign'),
    os.path.join(testpath, 'Benign')
]

splits = ['Train', 'Validation', 'Test']

In [4]:
balance_gen = ImageDataGenerator(
    rotation_range = 20,
    width_shift_range= 0.1,
    height_shift_range= 0.1,
    shear_range= 0.1,
    zoom_range= 0.1,
    horizontal_flip= True,
    vertical_flip= False,
    fill_mode= 'nearest'
)

In [5]:
for i in range(3):
    target_count = len(os.listdir(benign[i]))
    current_count = len(os.listdir(malignant[i]))
    needed = target_count - current_count

    print(splits[i])
    print(f"Current Malignant: {current_count} | Target: {target_count}")
    print(f"Generating {needed} variations...\n")


Train
Current Malignant: 918 | Target: 1569
Generating 651 variations...

Validation
Current Malignant: 227 | Target: 448
Generating 221 variations...

Test
Current Malignant: 128 | Target: 208
Generating 80 variations...



In [6]:
for i in range(len(splits)):
    target_folder = benign[i]
    current_folder = malignant[i]

    target_count = len(os.listdir(target_folder))
    current_count = len(os.listdir(current_folder))
    needed = target_count - current_count

    print(splits[i])
    print(f"Current Malignant: {current_count} | Target: {target_count}")
    print(f"Generating {needed} variations...\n")

    if needed <= 0:
        print("You good")
        continue
    print(f"Generating {needed} variations for {splits[i]} Malignant folder..")

    # original files as templates
    files = [f for f in os.listdir(current_folder) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
    count = 0
    while count < needed:
        # picking a random photo
        random_file = np.random.choice(files)
        img_path = os.path.join(current_folder, random_file)

        img = load_img(img_path, target_size = (128, 128))
        x = img_to_array(img)
        x = x.reshape((1,) + x.shape)

        # save the variation
        for batch in balance_gen.flow(x, batch_size=1,
                                     save_to_dir=current_folder,
                                     save_prefix='aug_bal',
                                     save_format='jpg'):
            count += 1
            break
        if count % 10 == 0:
            gc.collect()
    print(f"\n{splits[i]} complete. Malignant: {len(os.listdir(current_folder))}")
print("Succesfull")

Train
Current Malignant: 918 | Target: 1569
Generating 651 variations...

Generating 651 variations for Train Malignant folder..

Train complete. Malignant: 1542
Validation
Current Malignant: 227 | Target: 448
Generating 221 variations...

Generating 221 variations for Validation Malignant folder..

Validation complete. Malignant: 448
Test
Current Malignant: 128 | Target: 208
Generating 80 variations...

Generating 80 variations for Test Malignant folder..

Test complete. Malignant: 208
Succesfull


In [8]:
malignant = [
    os.path.join(trainpath, 'Malignant'),
    os.path.join(validpath, 'Malignant'),
    os.path.join(testpath, 'Malignant')
]

benign = [
    os.path.join(trainpath, 'Benign'),
    os.path.join(validpath, 'Benign'),
    os.path.join(testpath, 'Benign')
]

splits = ['Train', 'Validation', 'Test']

for i in range(3):
    target_count = len(os.listdir(benign[i]))
    current_count = len(os.listdir(malignant[i]))
    needed = target_count - current_count

    print(splits[i])
    print(f"Current Malignant: {current_count} | Target: {target_count}")
    print(f"Missing {needed} variations...\n")

Train
Current Malignant: 1542 | Target: 1569
Missing 27 variations...

Validation
Current Malignant: 448 | Target: 448
Missing 0 variations...

Test
Current Malignant: 208 | Target: 208
Missing 0 variations...

