In [1]:
import pandas as pd
import os
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import numpy as np
import tensorflow as tf
import cv2
import matplotlib.pyplot as plt
from PIL import Image

In [2]:
# Mount drive and download data from EDA step 1
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# configure google drive folders from EDA_Preprocessing.ipynb S1 file
GDIR_PROJECT = '/content/drive/MyDrive/AAI-521/Module7/TeamProject'
GDIR_PROJECT_EDA = GDIR_PROJECT + '/EDA_PrePro'
GDIR_PROJECT_EDA_S1 = GDIR_PROJECT_EDA + '/S1'
GDIR_PROJECT_EDA_S1_VAL = GDIR_PROJECT_EDA_S1 + '/val'
GDIR_PROJECT_EDA_S1_TRAIN = GDIR_PROJECT_EDA_S1 + '/train'

Mounted at /content/drive


In [4]:
# read train and val set metadata from S1
metadata_df_train = pd.read_csv(GDIR_PROJECT_EDA_S1_TRAIN + '/metadata_train.csv')
metadata_df_val = pd.read_csv(GDIR_PROJECT_EDA_S1_VAL + '/metadata_val.csv')


In [5]:
metadata_df_train.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


In [6]:
# create new folder to save further preprocessed images
GDIR_PROJECT_EDA_S2 = GDIR_PROJECT_EDA + '/S2'
GDIR_PROJECT_EDA_S2_VAL = GDIR_PROJECT_EDA_S2 + '/val'
GDIR_PROJECT_EDA_S2_TRAIN = GDIR_PROJECT_EDA_S2 + '/train'
os.makedirs(GDIR_PROJECT_EDA_S2, exist_ok=True)
os.makedirs(GDIR_PROJECT_EDA_S2_VAL, exist_ok=True)
os.makedirs(GDIR_PROJECT_EDA_S2_TRAIN, exist_ok=True)


**RESIZE IMAGES TO A FIXED TARGET SIZE**
> Resize images to a smaller target shape, but first, perform a quick EDA on the shape of the training images <br>

In [7]:
# EDA on TRINING IMAGE SHAPES
shapes = []

# get image shapes in train_ds
shapes = []
image_count =0
for root, dirs, files in os.walk(GDIR_PROJECT_EDA_S1_TRAIN):
  for filename in files:
     if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')):
        image_path = os.path.join(root, filename)
        # Process the image here
        with Image.open(image_path) as img:
              shapes.append(img.size)
        #print(f"Processing image: {image_path}")
        print(f"Processing image {image_count}", end='\r')
        image_count += 1

print(f"Total images: {len(shapes)}")

Total images: 8864


In [13]:
# show shape count/distribution
shape_distribution = np.unique(shapes, return_counts=True)
shape_distribution

(array([450, 600]), array([8864, 8864]))

> All images are of shape 450x600. Images will be resized to 192x256 to maintain aspect ratio

In [14]:
# Set common parameters
TARGET_SIZE = (192,256) # in preparation for CNN, From EDA, size in S1 is (450,600)
BATCH_SIZE = 32
RANDOM_SEED = 42


In [15]:
# define function to resize images in a directory to a specified size
# and saves resulting images to a target directory
def resize_images(source_dir, target_dir):

  # generate resized image dataset using keras
  images_ds = tf.keras.utils.image_dataset_from_directory(
    source_dir,
    seed=RANDOM_SEED,
    image_size=TARGET_SIZE,
    batch_size=BATCH_SIZE,
    label_mode='categorical',
    crop_to_aspect_ratio=False, # prevents cropping
    # This approach will resize the images while preserving their aspect ratio,
    # avoiding distortion or loss of important visual information1.
    # For skin cancer images, maintaining the original proportions is crucial for accurate diagnosis and analysis.
    interpolation="bilinear" # smooth resizing
  )

  # create subdirectories in target directory for resized images
  for dx_type in images_ds.class_names:
    os.makedirs(target_dir+'/'+dx_type, exist_ok=True)

  # save resized images from images_ds into their respective dx type
  image_num = 0
  for batch, labels in images_ds:
    for i, img in enumerate(batch):
        # Convert the tensor to a numpy array
        img_array = (img.numpy()).astype('uint8')

        # get dx type
        dx_type = images_ds.class_names[np.argmax(labels[i])]

        # Generate a filename
        filename = f'image_{image_num}.jpg'

        # log
        #print(f'i:{i} IMAGE_NO:{image_num} LABEL:{dx_type} FILENAME:{filename}')

        image_num += 1

        # Save Path
        save_dir = os.path.join(target_dir, dx_type)
        filepath = os.path.join(save_dir, filename)

        # remove file if it exists
        if os.path.exists(filepath):
          os.remove(filepath)

        tf.keras.utils.save_img(
            filepath,
            img_array,
            scale=False  # We've already scaled the image

        )


In [16]:
# resize validation set
resize_images(GDIR_PROJECT_EDA_S1_VAL, GDIR_PROJECT_EDA_S2_VAL)

Found 1094 files belonging to 7 classes.


In [20]:
# check how many val images we have in each dx type folder
# should be consistent with EDAPREPROP PART I
for dx_type in metadata_df_val['dx'].unique():
  print(dx_type, len(os.listdir(GDIR_PROJECT_EDA_S2_VAL+'/'+dx_type)))

nv 876
mel 46
df 8
bcc 35
bkl 86
akiec 30
vasc 13


In [21]:
# resize training set
resize_images(GDIR_PROJECT_EDA_S1_TRAIN, GDIR_PROJECT_EDA_S2_TRAIN)

Found 8864 files belonging to 7 classes.


In [22]:
# check how many val images we have in each dx type folder
# should be consistent with EDAPREPROP PART I
for dx_type in metadata_df_val['dx'].unique():
  print(dx_type, len(os.listdir(GDIR_PROJECT_EDA_S2_TRAIN+'/'+dx_type)))

nv 5784
mel 1065
df 107
bcc 479
bkl 1003
akiec 297
vasc 129


**GENERATE AUGMENTED IMAGES TO HANDLE CLASS IMBALANCE (PERFORMED ONLY ON TRAINING SET)**

> In each 'dx' class, generate augmented images from the existing ones until the total number of images reaches a target number.

In [25]:
# define function that generates augmented images
def generate_augmented_images(source_dir, target_dir, target_count, target_class):

  # source_dir: basis of generation of augmented images, must contain subdirectories with different classes
  # target_dir: where all augmented images and base images are saved, a subdirectory will be created for the specified target_class
  #
  # target_count: required number of images per target_class
  # target_class: Image Data Generator will only consider images from the specified target_class

  # uses global BATCH_SIZE and TARGET_SIZE variables
  print("TARGET_CLASS: ", target_class)
  # initialize image data generator
  # many of the the configuratios below are from (1)
  datagen = ImageDataGenerator(
    rotation_range = 25, # as per (1)
    width_shift_range=0.15, # as per (1)
    height_shift_range=0.15, # as per (1)
    zoom_range=0.1, # arbitrarily chosen
    shear_range=0.15, # (1)
    horizontal_flip=True, #(1)
    vertical_flip=True, #(1)
    brightness_range=(0.9,1.5), #(1)
    fill_mode='nearest') # keras default

  print("\tSOURCE DIR: ", source_dir)

  print("\tTARGET_COUNT: ", target_count)

  # create target directory for this class
  target_dir = os.path.join(target_dir, target_class)
  os.makedirs(target_dir, exist_ok=True)
  print("\tTARGET DIR: ", target_dir)

  # apply image data generator above on source_dir
  aug_generator = datagen.flow_from_directory(
    source_dir,
    seed=RANDOM_SEED,
    save_to_dir=target_dir,
    save_format='jpg',
    target_size=TARGET_SIZE,
    batch_size=BATCH_SIZE,
    classes=[target_class])

  # count number of batches needed to generate augmented images for this class
  num_base_images = len(os.listdir(target_dir))
  print("\tNUM BASE IMAGES: ", num_base_images)

  num_aug_images_wanted = target_count - num_base_images
  if(num_aug_images_wanted < 0): num_aug_images_wanted = 0
  print("\tNUM AUG IMAGES NEEDED: ", num_aug_images_wanted)

  num_batches = int(np.ceil(num_aug_images_wanted / BATCH_SIZE))
  print("\tNUM BATCHES: ", num_batches, "\n")

  # call generator enough number of times to generate/save desired number of images
  for i in range(num_batches):
    images, labels = next(aug_generator)

  return

# the max number of samples per dx type is
TARGET_NUMSAMPLES_PER_CLASS = metadata_df_train.groupby('dx').size().max()
TARGET_NUMSAMPLES_PER_CLASS


5784

In [26]:
# apply above function on each class in train
source_dir = GDIR_PROJECT_EDA_S2_TRAIN
target_dir = GDIR_PROJECT_EDA_S2_TRAIN

for dx_type in metadata_df_train['dx'].unique():
  generate_augmented_images(source_dir, target_dir, target_count = TARGET_NUMSAMPLES_PER_CLASS, target_class = dx_type)

TARGET_CLASS:  bkl
	SOURCE DIR:  /content/drive/MyDrive/AAI-521/Module7/TeamProject/EDA_PrePro/S2/train
	TARGET_COUNT:  5784
	TARGET DIR:  /content/drive/MyDrive/AAI-521/Module7/TeamProject/EDA_PrePro/S2/train/bkl
Found 1003 images belonging to 1 classes.
	NUM BASE IMAGES:  1003
	NUM AUG IMAGES NEEDED:  4781
	NUM BATCHES:  150 

TARGET_CLASS:  nv
	SOURCE DIR:  /content/drive/MyDrive/AAI-521/Module7/TeamProject/EDA_PrePro/S2/train
	TARGET_COUNT:  5784
	TARGET DIR:  /content/drive/MyDrive/AAI-521/Module7/TeamProject/EDA_PrePro/S2/train/nv
Found 5784 images belonging to 1 classes.
	NUM BASE IMAGES:  5784
	NUM AUG IMAGES NEEDED:  0
	NUM BATCHES:  0 

TARGET_CLASS:  df
	SOURCE DIR:  /content/drive/MyDrive/AAI-521/Module7/TeamProject/EDA_PrePro/S2/train
	TARGET_COUNT:  5784
	TARGET DIR:  /content/drive/MyDrive/AAI-521/Module7/TeamProject/EDA_PrePro/S2/train/df
Found 107 images belonging to 1 classes.
	NUM BASE IMAGES:  107
	NUM AUG IMAGES NEEDED:  5677
	NUM BATCHES:  178 

TARGET_CLASS:  mel

In [28]:
# check how many val images we have in each dx type folder
# should be consistent with EDAPREPROP PART I
for dx_type in metadata_df_train['dx'].unique():
  print(dx_type, len(os.listdir(GDIR_PROJECT_EDA_S2_TRAIN+'/'+dx_type)))

bkl 5719
nv 5784
df 4879
mel 5709
vasc 4708
bcc 5780
akiec 5410


**REFERENCES**
[1] Kumar Lilhore, U., Simaiya, S., Sharma, Y.K. et al. A precise model for skin cancer diagnosis using hybrid U-Net and improved MobileNet-V3 with hyperparameters optimization. Sci Rep 14, 4299 (2024). https://doi.org/10.1038/s41598-024-54212-8