# Training, Validation and Testing Image Data Partition and Preprocessing Notebook

The purpose of this notebook is to show how the training, validation and testing preprocessed image datasets were created from a raw, unprocessed dataset.

This notebook comprises two parts: splitting the unprocessed data into three subsets (training, validation and testing), and preprocessing the images in a manner similar to the SPARK preprocessing script.

## Image Data Splitting

In [1]:
# splitfolders handles the copy of 'raw_data' into separate train/val/test folders
import splitfolders

# Partition 'raw_data' into 'img_data' using a 90% : 7.5% : 2.5%
# (training : validation : testing) ratio, with the seed fixed at 1337.
splitfolders.ratio(
    'raw_data',
    output="img_data",
    seed=1337,
    ratio=(0.9, 0.075, 0.025),
)

Copying files: 41924 files [24:27, 28.58 files/s]


## Image Data Preprocessing

In [2]:
# Importing relevant packages
from scipy import ndimage, misc
from skimage.filters import threshold_otsu
from skimage import io, color, exposure
import numpy as np
import os
import cv2
from PIL import Image
import matplotlib.image

In [3]:
# Folder layout used below: <root>/<split>/<class label>, with class labels 0 and 1.

# Input folders (written by the splitting step above), one pair per split
train0_path, train1_path = "img_data/train/0", "img_data/train/1"
val0_path, val1_path = "img_data/val/0", "img_data/val/1"
test0_path, test1_path = "img_data/test/0", "img_data/test/1"

# Output folders for the processed images, mirroring the input layout
ptrain0_path, ptrain1_path = "processed/train/0", "processed/train/1"
pval0_path, pval1_path = "processed/val/0", "processed/val/1"
ptest0_path, ptest1_path = "processed/test/0", "processed/test/1"

In [4]:
# Pair each input folder with the output folder that receives its processed images.
# Each entry is [input folder, output folder].
path_list = [
    [train0_path, ptrain0_path],
    [train1_path, ptrain1_path],
    [val0_path, pval0_path],
    [val1_path, pval1_path],
    [test0_path, ptest0_path],
    [test1_path, ptest1_path],
]

# Display the pairs for a quick visual check
path_list

[['img_data/train/0', 'processed/train/0'],
 ['img_data/train/1', 'processed/train/1'],
 ['img_data/val/0', 'processed/val/0'],
 ['img_data/val/1', 'processed/val/1'],
 ['img_data/test/0', 'processed/test/0'],
 ['img_data/test/1', 'processed/test/1']]

In [7]:
# Preprocess every image in every [input folder, output folder] pair of path_list.
# Per image: load as RGB -> resize to 300x300 -> min-max normalise ->
# DullRazor hair removal -> exposure adjustment -> Otsu segmentation ->
# black out every pixel outside the segmented skinmark -> save to the output folder.

# The black-hat structuring element is the same for every image, so build it once.
# cv2.MORPH_CROSS is the named constant for shape code 1.
hair_kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (9, 9))

# For each input path (and output path) in the list of paths...
for in_path, out_path in path_list:
    # Saving fails if the destination folder is missing, so create it up front
    os.makedirs(out_path, exist_ok=True)

    # and for each image in the input folder in question...
    for image_path in os.listdir(in_path):
        # ------------------------------------------------------------------
        # Section 1: Translating image into array
        # ------------------------------------------------------------------

        # Load the image and close the file handle as soon as the pixels are read.
        # Force 3-channel RGB: rgb2gray, inpaint and the masking step below all
        # expect an (H, W, 3) array, so grayscale/RGBA/palette files would fail.
        with Image.open(os.path.join(in_path, image_path)) as raw_img:
            # Resize the image to a consistent size of 300x300
            img = raw_img.convert("RGB").resize((300, 300))

        # Convert the image to a NumPy array
        img_array = np.array(img)

        # Min-max normalise pixel values into the 0-255 uint8 range
        img_array = cv2.normalize(img_array, None, 255, 0, cv2.NORM_MINMAX, cv2.CV_8U)

        # ------------------------------------------------------------------
        # Section 2: Creating the mask to remove unnecessary features like
        # healthy skin and hair. Hair is first removed with the DullRazor
        # algorithm, resulting in an image which is then used to create a mask
        # highlighting the skinmark.
        # ------------------------------------------------------------------

        # Converting RGB picture to greyscale for hair removal DullRazor algorithm
        img_gc = color.rgb2gray(img_array)

        # DullRazor algorithm starts here
        # Black hat filter
        blackhat = cv2.morphologyEx(img_gc, cv2.MORPH_BLACKHAT, hair_kernel)

        # Gaussian filter
        # NOTE(review): the third positional argument of GaussianBlur is sigmaX,
        # so cv2.BORDER_DEFAULT is being used as the blur sigma here, not as a
        # border mode. Left unchanged so the output matches previously generated
        # datasets -- confirm whether sigmaX=0 with borderType=... was intended.
        bhg = cv2.GaussianBlur(blackhat, (3, 3), cv2.BORDER_DEFAULT)

        # Masking hair: pixels above the 0.03 threshold are set to 255
        _, hair_mask = cv2.threshold(bhg, 0.03, 255, cv2.THRESH_BINARY)

        # Normalise mask to the 8-bit single-channel form passed to inpaint
        hair_mask = cv2.normalize(hair_mask, None, 255, 0, cv2.NORM_MINMAX, cv2.CV_8U)

        # Replace pixels of the mask
        dst = cv2.inpaint(img_array, hair_mask, 6, cv2.INPAINT_TELEA)
        # DullRazor algorithm ends here

        # Adjusting exposure: log correction, then contrast stretching between
        # the 2nd and 98th percentiles
        img_ex1 = exposure.adjust_log(dst)
        p2, p98 = np.percentile(img_ex1, (2, 98))
        img_ex2 = exposure.rescale_intensity(img_ex1, in_range=(p2, p98))

        # Converting cleaned photo into greyscale for thresholding/segmentation
        img_ex2 = color.rgb2gray(img_ex2)

        # Global thresholding with Otsu
        thresh = threshold_otsu(img_ex2)

        # Boolean mask: True where the pixel is at or below the Otsu threshold
        # (the region that is kept), False elsewhere
        lesion_mask = img_ex2 <= thresh

        # ------------------------------------------------------------------
        # Section 3: Creating the final processed photo by only including parts
        # of img_array that are highlighted by the mask.
        # ------------------------------------------------------------------

        # Keep the original pixel where the mask is True and make every other
        # pixel black. One vectorised assignment replaces the per-pixel,
        # per-channel Python loops and produces the same array.
        img_array[~lesion_mask] = 0

        # Creating full output path and saving the file to disk
        fullpath = os.path.join(out_path, 'processed_' + image_path)
        matplotlib.image.imsave(fullpath, img_array)