In [None]:
import os
import cv2
import numpy as np
import random
from tensorflow.keras.preprocessing.image import load_img, img_to_array, ImageDataGenerator, array_to_img

# Root folders: originals are read from main_dir, hair-removed and augmented
# images are written under aug_main_dir
main_dir = "/content/drive/MyDrive/Discipline Specific /4th Model/Original Non-cancerous"
aug_main_dir = "/content/drive/MyDrive/Preprocessed Non Cancerous  "

# Sub-folder layout below each root: <class>/<age group>
classes = ['vasc', 'nv', 'df', 'bkl']
age_groups = ["over_30", "30_and_under"]

# Number of images each <class>/<age group> folder should end up with
target_num = 500

# Random transformations applied when generating augmented images
_augmentation_kwargs = {
    "rotation_range": 30,
    "width_shift_range": 0.1,
    "height_shift_range": 0.1,
    "zoom_range": 0.2,
    "horizontal_flip": True,
    "fill_mode": "nearest",
}
datagen = ImageDataGenerator(**_augmentation_kwargs)

# Hair removal applied to each image before it is augmented
def remove_hair_direct_replace(image, blackhat_kernel_size=(9, 9), blackhat_threshold=10, inpaint_radius=1):
    """Return a copy of ``image`` in which detected hair pixels are inpainted.

    Args:
        image: Image array as handed to ``cv2.cvtColor(..., COLOR_BGR2GRAY)``
            (i.e. treated as BGR).
        blackhat_kernel_size: Size of the rectangular structuring element
            used for the black-hat transform.
        blackhat_threshold: Black-hat responses above this value are marked
            as hair in the binary mask.
        inpaint_radius: Radius passed to ``cv2.inpaint`` (Telea method).

    Returns:
        A new array; pixels outside the hair mask are copied from ``image``
        unchanged, pixels inside it come from the inpainted image.
    """
    # The opening and the dilation both use the same 3x3 rectangular element.
    small_rect = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    hat_element = cv2.getStructuringElement(cv2.MORPH_RECT, blackhat_kernel_size)

    # Black-hat on the grayscale image is used here to highlight hair.
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    hat_response = cv2.morphologyEx(grayscale, cv2.MORPH_BLACKHAT, hat_element)

    # Binary mask -> opening (drop small noise) -> dilation (widen coverage).
    mask = cv2.threshold(hat_response, blackhat_threshold, 255, cv2.THRESH_BINARY)[1]
    mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, small_rect)
    mask = cv2.dilate(mask, small_rect, iterations=1)

    filled = cv2.inpaint(image, mask, inpaint_radius, flags=cv2.INPAINT_TELEA)

    # Copy the original and overwrite only the masked (hair) pixels.
    output = image.copy()
    is_hair = mask.astype(bool)
    output[is_hair] = filled[is_hair]
    return output

# Process each class and age group folder:
#   1. hair-remove every original and write it to the destination folder,
#   2. generate augmented copies until the folder holds target_num images,
#   3. if the folder holds more than target_num images, delete random extras.
image_exts = ('.png', '.jpg', '.jpeg')

for cls in classes:
    for age in age_groups:
        src_dir = os.path.join(main_dir, cls, age)  # Source folder
        dest_dir = os.path.join(aug_main_dir, cls, age)  # Destination folder
        os.makedirs(dest_dir, exist_ok=True)

        # Get list of image files
        image_files = [f for f in os.listdir(src_dir) if f.lower().endswith(image_exts)]

        # Copy and preprocess original images. Only files that were read and
        # written successfully are used as augmentation sources below.
        usable_files = []
        for f in image_files:
            src_path = os.path.join(src_dir, f)
            dest_path = os.path.join(dest_dir, f)

            # cv2.imread returns None (no exception) for unreadable files;
            # passing None on would crash inside cv2.cvtColor.
            image = cv2.imread(src_path)
            if image is None:
                print(f"Warning: could not read {src_path}; skipping it.")
                continue

            # Remove hair from the image
            hair_free_image = remove_hair_direct_replace(image)

            # Save the preprocessed image; cv2.imwrite reports failure via
            # its return value.
            if not cv2.imwrite(dest_path, hair_free_image):
                print(f"Warning: could not write {dest_path}; skipping it.")
                continue
            usable_files.append(f)

        # Count image files only, so stray non-image entries do not eat into
        # the target and the count agrees with the list used for trimming.
        existing_names = {n for n in os.listdir(dest_dir) if n.lower().endswith(image_exts)}
        count = len(existing_names)
        print(f"Folder {dest_dir} initially has {count} images after hair removal preprocessing.")

        # Skip augmentation if there is nothing to augment from
        if not usable_files:
            print(f"No images found in {src_dir}. Skipping augmentation for this folder.")
            continue

        # Index for naming augmented images
        augment_index = 0

        # Generate augmented images until the folder reaches target_num images
        while count < target_num:
            for f in usable_files:
                if count >= target_num:
                    break  # Stop if target reached

                img_path = os.path.join(dest_dir, f)  # Use preprocessed image path
                img = load_img(img_path)  # Load preprocessed image
                x = img_to_array(img)  # Convert to numpy array
                x = x.reshape((1,) + x.shape)  # Add batch axis for ImageDataGenerator

                # Generate augmented image
                augmented = next(datagen.flow(x, batch_size=1))[0].astype(np.uint8)

                # Create a filename that is not already present, so resuming
                # an interrupted run never overwrites earlier augmentations
                # (an overwrite would bump count without adding a file).
                stem = os.path.splitext(f)[0]
                aug_filename = f"aug_{stem}_{augment_index}.jpg"
                while aug_filename in existing_names:
                    augment_index += 1
                    aug_filename = f"aug_{stem}_{augment_index}.jpg"
                aug_dest_path = os.path.join(dest_dir, aug_filename)

                # Save augmented image. scale=False keeps the pixel values
                # as generated; the default (scale=True) min-max rescales
                # each image to 0..255 and would alter its contrast.
                array_to_img(augmented, scale=False).save(aug_dest_path)
                existing_names.add(aug_filename)

                count += 1
                augment_index += 1

                # Optional: Print progress every 50 images
                if count % 50 == 0:
                    print(f"{dest_dir}: {count} images generated.")

        print(f"After augmentation, folder {dest_dir} has {count} images.\n")

        # Remove extra images if the folder exceeds the target limit
        if count > target_num:
            print(f"Folder {dest_dir} has {count} images, removing extra to maintain {target_num}.")

            # Get list of all images in the folder
            all_images = [os.path.join(dest_dir, f) for f in os.listdir(dest_dir) if f.lower().endswith(image_exts)]

            # Randomly shuffle the images.
            # NOTE: preprocessed originals are as likely to be removed as
            # augmented copies.
            random.shuffle(all_images)

            # Remove extra images
            num_to_remove = count - target_num
            for extra_path in all_images[:num_to_remove]:
                os.remove(extra_path)

            print(f"Folder {dest_dir} now has exactly {target_num} images.\n")


Folder /content/drive/MyDrive/Augmented Non Cancerous /vasc/over_30 initially has 115 images after hair removal preprocessing.
/content/drive/MyDrive/Augmented Non Cancerous /vasc/over_30: 150 images generated.
/content/drive/MyDrive/Augmented Non Cancerous /vasc/over_30: 200 images generated.
/content/drive/MyDrive/Augmented Non Cancerous /vasc/over_30: 250 images generated.
/content/drive/MyDrive/Augmented Non Cancerous /vasc/over_30: 300 images generated.
/content/drive/MyDrive/Augmented Non Cancerous /vasc/over_30: 350 images generated.
/content/drive/MyDrive/Augmented Non Cancerous /vasc/over_30: 400 images generated.
/content/drive/MyDrive/Augmented Non Cancerous /vasc/over_30: 450 images generated.
/content/drive/MyDrive/Augmented Non Cancerous /vasc/over_30: 500 images generated.
After augmentation, folder /content/drive/MyDrive/Augmented Non Cancerous /vasc/over_30 has 500 images.

Folder /content/drive/MyDrive/Augmented Non Cancerous /vasc/30_and_under initially has 27 images

## New BKL folder, created because the previous one gave low recall

In [None]:
import os
import cv2
import numpy as np
import random
from tensorflow.keras.preprocessing.image import ImageDataGenerator

def remove_hair_direct_replace(
    image,
    blackhat_kernel_size=(9, 9),
    blackhat_threshold=10,
    inpaint_radius=1
):
    """
    Inpaint detected hair in ``image`` and return the result as a new array.

    Pipeline:
      1. Build a hair mask: grayscale -> black-hat -> binary threshold ->
         3x3 opening -> 3x3 dilation (one iteration).
      2. Inpaint the image over that mask (Telea method).
      3. Copy the original and overwrite only the masked pixels with the
         inpainted ones, so every other pixel keeps its original value.

    ``blackhat_kernel_size`` sets the black-hat structuring element,
    ``blackhat_threshold`` the cut-off for the binary mask, and
    ``inpaint_radius`` the radius handed to ``cv2.inpaint``.
    """
    def rect(size):
        # Rectangular structuring element of the given (width, height)
        return cv2.getStructuringElement(cv2.MORPH_RECT, size)

    # Step 1: hair mask
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    hat_response = cv2.morphologyEx(grayscale, cv2.MORPH_BLACKHAT, rect(blackhat_kernel_size))
    _, mask = cv2.threshold(hat_response, blackhat_threshold, 255, cv2.THRESH_BINARY)
    mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, rect((3, 3)))  # drop small noise
    mask = cv2.dilate(mask, rect((3, 3)), iterations=1)  # widen coverage of thin strands

    # Step 2: inpaint over the mask
    repaired = cv2.inpaint(image, mask, inpaint_radius, flags=cv2.INPAINT_TELEA)

    # Step 3: splice inpainted pixels into a copy of the original
    hair_pixels = mask.astype(bool)
    cleaned = image.copy()
    cleaned[hair_pixels] = repaired[hair_pixels]
    return cleaned

# Random transformations used for every augmented image
_augmentation_kwargs = {
    "rotation_range": 20,
    "width_shift_range": 0.1,
    "height_shift_range": 0.1,
    "shear_range": 0.1,
    "zoom_range": 0.1,
    "horizontal_flip": True,
    "fill_mode": 'nearest',
}
datagen = ImageDataGenerator(**_augmentation_kwargs)

def augment_images(source_folder, target_folder, target_count=500):
    """
    Write hair-removed, randomly augmented copies of the images in
    source_folder to target_folder until target_count images have been saved.

    The list of source images is reshuffled at the start of every cycle and
    each image contributes at most one augmented copy per cycle. Images are
    read and written with OpenCV, so channel order is unchanged end to end.

    Args:
        source_folder: Folder holding the original images (.jpg, .jpeg,
            .png, .bmp, .tiff; extension match is case-insensitive).
        target_folder: Output folder; created if missing.
        target_count: Number of augmented images to save.

    Returns:
        None. Progress and the final count are printed.

    Files that OpenCV cannot read are reported once and dropped from later
    cycles. If a whole cycle saves nothing (all files unreadable, or every
    write fails) the function stops instead of looping forever, so fewer
    than target_count images may be produced.
    """
    os.makedirs(target_folder, exist_ok=True)

    # Look for common image file formats
    valid_ext = {".jpg", ".jpeg", ".png", ".bmp", ".tiff"}
    image_files = [os.path.join(source_folder, f) for f in os.listdir(source_folder)
                   if os.path.splitext(f)[1].lower() in valid_ext]

    if not image_files:
        print("No images found in", source_folder)
        return

    augmented_count = 0
    cycle = 0
    while augmented_count < target_count:
        # Shuffle the image list each cycle for diversity
        random.shuffle(image_files)
        saved_this_cycle = 0
        unreadable = set()
        for image_path in image_files:
            if augmented_count >= target_count:
                break
            # cv2.imread returns None (no exception) when a file cannot be decoded
            image = cv2.imread(image_path)
            if image is None:
                print(f"Warning: could not read {image_path}; skipping it.")
                unreadable.add(image_path)
                continue

            # Pre-process image: remove hair
            image_preprocessed = remove_hair_direct_replace(image)

            # Add a leading batch axis, as expected by the generator
            image_array = np.expand_dims(image_preprocessed, 0)

            # Generate one augmented image for this original image
            aug_iter = datagen.flow(image_array, batch_size=1)
            aug_image = next(aug_iter)[0].astype(np.uint8)

            # Construct a unique filename for the augmented image
            filename = os.path.basename(image_path)
            name, _ = os.path.splitext(filename)
            save_path = os.path.join(target_folder, f"{name}_aug_cycle{cycle}_{augmented_count}.png")

            # Only count images that were actually written to disk
            if not cv2.imwrite(save_path, aug_image):
                print(f"Warning: could not write {save_path}; skipping it.")
                continue
            print(f"Saved {save_path}")
            augmented_count += 1
            saved_this_cycle += 1
        cycle += 1

        # Do not retry files that are known to be unreadable
        if unreadable:
            image_files = [p for p in image_files if p not in unreadable]

        # Without this guard a folder of unreadable files would spin forever
        if saved_this_cycle == 0 and augmented_count < target_count:
            print(f"Stopping early: no image could be processed from {source_folder}")
            break
    print(f"Augmentation complete: {augmented_count} images saved in {target_folder}")

if __name__ == "__main__":
    # Adjust both paths to your own Google Drive layout.
    # base_drive_folder: holds the "over_30" and "30_and_under" subfolders.
    # target_drive_folder: new folder that receives the augmented images.
    base_drive_folder = "/content/drive/MyDrive/Discipline Specific /4th Model/Original Non-cancerous/bkl"
    target_drive_folder = "/content/drive/MyDrive/Discipline Specific /4th Model/New BKL"

    # Age-group subfolders processed one after the other
    subfolders = ["over_30", "30_and_under"]

    for sub in subfolders:
        # Same subfolder name under the source root and the target root
        source_path, target_path = (
            os.path.join(root, sub)
            for root in (base_drive_folder, target_drive_folder)
        )
        print(f"Augmenting images in {source_path}...")
        augment_images(source_path, target_path, target_count=500)