### 2. Data Preparation

In [20]:
import os
import numpy as np
import random
import shutil
import time
import zipfile
import PIL
from PIL import Image

### Data Augmentation

In [2]:
!pip install Augmentor



In [11]:
# Build an Augmentor pipeline over the acne image folder and generate
# augmented samples from it.
import Augmentor

# NOTE(review): this is an absolute, machine-specific path; the later cells use
# the relative "cleaned_images" root instead — consider unifying the two.
p = Augmentor.Pipeline(
    "/home/innovation/Documents/Projects/LEPROSY/LEPROSY/cleaned_images/acne_images"
)

# Queue a random rotation: applied with probability 0.7, limited to 10 on the
# left and 10 on the right (presumably degrees — confirm in the Augmentor docs).
p.rotate(max_left_rotation=10, max_right_rotation=10, probability=0.7)

# Queue a random zoom: applied with probability 0.3, factor between 1.1 and 1.6.
p.zoom(max_factor=1.6, min_factor=1.1, probability=0.3)

# Ask the pipeline for 500 augmented samples. The run log shows they are written
# to an "output" sub-folder of the source directory.
# NOTE(review): sample() appears to return nothing, so training_augmented is
# probably None — confirm before relying on this name.
training_augmented = p.sample(500)

Processing <PIL.Image.Image image mode=RGB size=294x222 at 0x7FE2B0752B20>:   2%|▏         | 11/500 [00:00<00:49,  9.82 Samples/s]                 

Initialised with 500 image(s) found.
Output directory set to /home/innovation/Documents/Projects/LEPROSY/LEPROSY/cleaned_images/acne_images/output.

Processing <PIL.Image.Image image mode=RGB size=294x222 at 0x7FE2B0A667C0>: 100%|██████████| 500/500 [00:01<00:00, 257.86 Samples/s]                  


### Checking for class imbalance

In [22]:
# Root directory of the dataset (relative to the notebook's working directory)
root_dir = "cleaned_images"

# One sub-folder per skin-disorder class
subdirs = ["acne_images","psoriasis_images", "leprosy_images"]

# Number of images per class.
# Only regular files are counted: os.listdir() also returns sub-folders (for
# example an Augmentor "output" folder inside a class directory), and those
# must not be counted as images.
{
    disorder: sum(
        os.path.isfile(os.path.join(root_dir, disorder, entry))
        for entry in os.listdir(os.path.join(root_dir, disorder))
    )
    for disorder in subdirs
}

# Observation:
# There is class imbalance (the classes do not all contain the same number of images)

{'acne_images': 500, 'psoriasis_images': 612, 'leprosy_images': 500}

**Finding the average resolution of the images in the dataset**

In [23]:
# Collect the pixel width and height of every image in every class folder,
# then report the (rounded) average of each dimension.
widths = []
heights = []

for disorder in subdirs:
    disorder_dir = os.path.join(root_dir, disorder)
    for img in os.listdir(disorder_dir):
        img_path = os.path.join(disorder_dir, img) # Making image file path

        # Skip anything that is not a regular file (e.g. a nested "output"
        # folder); Image.open() would fail on a directory.
        if not os.path.isfile(img_path):
            continue

        # The context manager closes the underlying file handle right away
        # instead of leaving one open handle per image.
        with Image.open(img_path) as im:
            width, height = im.size  # PIL reports size as (width, height)
        widths.append(width)
        heights.append(height)

# Fail with a clear message instead of a ZeroDivisionError when nothing was found
if not widths:
    raise ValueError(f'No image files found under {root_dir} for classes {subdirs}')

avg_height = round(sum(heights)/len(heights))
avg_width = round(sum(widths)/len(widths))

print(f'The average height is:{avg_height} and the average width is:{avg_width}')

The average height is:347 and the average width is:396


### Train Validation Test Split 

In [24]:
# Split every class folder under root_dir into train / validation / test
# folders by copying the image files (the originals are left in place).

# Root directory of the dataset and the class sub-folders to split
root_dir = "cleaned_images"

subdirs = ["acne_images","psoriasis_images", "leprosy_images"]

# Fixed seed and a dedicated generator so that re-running this cell on a fresh
# kernel reproduces exactly the same split without touching the global
# `random` state.
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)

# Remove split folders left over from a previous run so that stale files from
# an older split cannot leak into the new one.
for folder_name in ['train', 'validation', 'test']:
    if os.path.exists(f'{root_dir}/{folder_name}'):
        # deleting the folder and its contents
        shutil.rmtree(f'{root_dir}/{folder_name}')

# create subdirectories for train, validation and test sets
train_dir = os.path.join(root_dir, 'train')
validation_dir = os.path.join(root_dir, 'validation')
test_dir = os.path.join(root_dir, 'test')

for split_dir in (train_dir, validation_dir, test_dir):
    os.makedirs(split_dir, exist_ok=True)

# set the ratio for train, validation and test sets
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

# The test share is "whatever is left", so the three ratios must add up to 1
# for test_ratio to describe what actually happens.
if abs(train_ratio + val_ratio + test_ratio - 1.0) > 1e-9:
    raise ValueError('train_ratio, val_ratio and test_ratio must sum to 1')

for subdir in subdirs:
    subdir_path = os.path.join(root_dir, subdir)
    train_subdir = os.path.join(train_dir, subdir)
    validation_subdir = os.path.join(validation_dir, subdir)
    test_subdir = os.path.join(test_dir, subdir)
    os.makedirs(train_subdir, exist_ok=True)
    os.makedirs(validation_subdir, exist_ok=True)
    os.makedirs(test_subdir, exist_ok=True)

    # Keep regular files only (a nested folder such as an Augmentor "output"
    # directory would make shutil.copyfile fail) and sort them, because
    # os.listdir() returns entries in arbitrary order and the seeded shuffle
    # is only reproducible when it starts from a fixed order.
    image_names = sorted(
        name for name in os.listdir(subdir_path)
        if os.path.isfile(os.path.join(subdir_path, name))
    )
    split_rng.shuffle(image_names)

    # Index thresholds: positions below train_cutoff go to train, positions
    # below val_cutoff go to validation, everything else goes to test.
    train_cutoff = len(image_names) * train_ratio
    val_cutoff = len(image_names) * (train_ratio + val_ratio)

    for i, image_name in enumerate(image_names):
        # Defining the source path:
        source_path = os.path.join(subdir_path, image_name)

        # Pick the destination folder from the image's position in the
        # shuffled list, then copy once.
        if i < train_cutoff:
            target_subdir = train_subdir
        elif i < val_cutoff:
            target_subdir = validation_subdir
        else:
            target_subdir = test_subdir

        target_path = os.path.join(target_subdir, image_name)
        shutil.copyfile(source_path, target_path)

In [25]:
# Per-class image counts in the train split
train_counts = {}
for disorder in subdirs:
    train_counts[disorder] = len(os.listdir(f'{root_dir}/train/{disorder}'))
print(train_counts)

# Observation:
# The class imbalance is still present in the train split: the split is done
# per class, so each class keeps its share of the full dataset.

{'acne_images': 400, 'psoriasis_images': 490, 'leprosy_images': 400}


In [26]:
# Flat list of every entry name found in the class folders of the train split
train_images = []
for disease in subdirs:
    train_images.extend(os.listdir(f'{root_dir}/train/{disease}'))

# Total size of the train split
print("Number of images in train set ",len(train_images))

Number of images in train set  1290


In [27]:
# Flat list of every entry name found in the class folders of the validation split
validation_images = []
for disease in subdirs:
    validation_images.extend(os.listdir(f'{root_dir}/validation/{disease}'))

# Total size of the validation split
print("Number of images in validation set ",len(validation_images))

Number of images in validation set  161


In [28]:
# Flat list of every entry name found in the class folders of the test split
test_images = []
for disease in subdirs:
    test_images.extend(os.listdir(f'{root_dir}/test/{disease}'))

# Total size of the test split
print("Number of images in test set ",len(test_images))

Number of images in test set  161
