# Images Preprocessing

Author: Endrit Mustafa

---

This notebook builds the main CSV file from the images themselves. <br>
From the path <code>../data/dataset/dataset/images/</code> we collect the paths of all images and then preprocess each image by extracting its pixel information.<br>
Next, we combine this extracted information with the original <code>metadata.csv</code> file. <br>
The resulting main CSV file will be used to train the classification model and then to classify the target images into one of the classes.

---

## Importing Libraries

In [2]:
import pandas as pd
import numpy as np
from zipfile import ZipFile
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from glob import glob

import os
from PIL import Image
import concurrent.futures

## Creating the main DataFrame

In [3]:
# Load the metadata CSV file into the main DataFrame

df = pd.read_csv(filepath_or_buffer='../data/metadata.csv')

In [4]:
# Build a lookup from image id (file name without its extension) to the file path.
# The glob pattern matches .jpg files exactly one folder below the images directory.
image_path = {
    os.path.splitext(os.path.basename(x))[0]: x
    for x in glob(os.path.join('../data/dataset/dataset/images/','*','*.jpg'))
}

# Add the resolved path as a new column; ids without a matching file get a missing value
df['path'] = df['image_id'].map(image_path.get)

# Keep only the rows whose image file was found. The explicit copy makes the
# filtered frame independent, so columns added later never target a slice.
df = df.loc[df['path'].notna()].copy()

In [5]:
# Creating a function to preprocess the image

def load_and_resize_image(x, size):
    """
    Load an image from disk and return it as a square RGB pixel array.

    Parameters
    ----------
    x : str or path-like
        Path of the image file to open.
    size : int
        Width and height, in pixels, of the resized image.

    Returns
    -------
    numpy.ndarray
        Array of shape (size, size, 3) with the resized RGB pixel values.
    """
    # The context manager releases the underlying file as soon as we are done with it
    with Image.open(x) as img:
        # Force three channels so grayscale / palette / RGBA files yield the
        # same array shape as ordinary RGB images
        resized = img.convert('RGB').resize((size, size))
        return np.asarray(resized)

In [6]:
# Target edge length (in pixels) of every resized image
SIZE = 32

# Size of the thread pool used to load the images
NUM_WORKERS = 4

# Load and resize every image on the thread pool, then store the arrays in a new
# 'image' column. Results are collected in submission order, so each array stays
# aligned with the row its path came from.
with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_WORKERS) as pool:
    pending = [pool.submit(load_and_resize_image, path, SIZE) for path in df['path']]
    df['image'] = [job.result() for job in pending]

In [7]:
# Creating a function to extract the image folder name (it will be used to split the data by folder: train, test, validation, in the model training part)

def extract_subset_from_path(image_path):
    """
    Return the name of the subset folder ('test', 'train' or 'validation')
    that an image path belongs to.

    The directory components of the path are inspected from the folder closest
    to the file outwards, and the first one named exactly like a subset wins.
    This keeps unrelated ancestors (e.g. a 'latest' or 'test_project' folder)
    from deciding the result. If no directory matches exactly, the function
    falls back to a substring search over the whole path, checking 'test',
    then 'train', then 'validation'.

    Parameters
    ----------
    image_path : str
        Path of the image file; '/' and '\\' separators may be mixed.

    Returns
    -------
    str
        'test', 'train', 'validation', or 'unknown' when nothing matches.
    """
    subsets = ('test', 'train', 'validation')

    # Paths can mix both separator styles, so normalise before splitting
    parts = image_path.replace('\\', '/').split('/')

    # Skip the last component (the file name) and look at the nearest folder first
    for folder in reversed(parts[:-1]):
        if folder in subsets:
            return folder

    # Fallback for folder names that merely contain a subset name (e.g. 'train_images')
    for subset in subsets:
        if subset in image_path:
            return subset

    return 'unknown'

In [8]:
# Derive the 'subset' label of every row from its image path

df['subset'] = df['path'].apply(extract_subset_from_path)

In [9]:
# Count how many rows (images) carry each subset label

df['subset'].value_counts()

train         8515
test          1000
validation     500
Name: subset, dtype: int64

In [10]:
# Display the main DataFrame, which now also holds the 'path', 'image' and 'subset' columns
df

Unnamed: 0,image_id,lesion_type,confirmation,age,sex,localization,path,image,subset
0,IMAGE_0000244,0,histo,80.0,male,scalp,../data/dataset/dataset/images\train\IMAGE_000...,"[[[191, 152, 192], [194, 154, 193], [196, 152,...",train
1,IMAGE_0001916,0,histo,80.0,male,scalp,../data/dataset/dataset/images\train\IMAGE_000...,"[[[24, 14, 29], [56, 39, 64], [106, 80, 112], ...",train
2,IMAGE_0006461,0,histo,80.0,male,scalp,../data/dataset/dataset/images\validation\IMAG...,"[[[190, 136, 150], [199, 145, 161], [200, 141,...",validation
3,IMAGE_0003197,0,histo,80.0,male,scalp,../data/dataset/dataset/images\validation\IMAG...,"[[[35, 18, 28], [83, 51, 64], [128, 88, 103], ...",validation
4,IMAGE_0009907,0,histo,75.0,male,ear,../data/dataset/dataset/images\train\IMAGE_000...,"[[[155, 110, 135], [188, 139, 168], [210, 157,...",train
...,...,...,...,...,...,...,...,...,...
10010,IMAGE_0007158,6,histo,40.0,male,abdomen,../data/dataset/dataset/images\validation\IMAG...,"[[[181, 164, 179], [179, 162, 176], [180, 163,...",validation
10011,IMAGE_0005939,6,histo,40.0,male,abdomen,../data/dataset/dataset/images\train\IMAGE_000...,"[[[4, 5, 3], [24, 22, 21], [101, 88, 88], [128...",train
10012,IMAGE_0001721,6,histo,40.0,male,abdomen,../data/dataset/dataset/images\train\IMAGE_000...,"[[[132, 119, 120], [157, 139, 138], [177, 158,...",train
10013,IMAGE_0008064,6,histo,80.0,male,face,../data/dataset/dataset/images\validation\IMAG...,"[[[160, 123, 144], [163, 131, 152], [166, 128,...",validation


In [11]:
# Flatten every image into a single row of pixel values.
# Stacking gives one array of shape (n_images, height, width, channels); the
# reshape turns it into (n_images, height * width * channels).
pixel_matrix = np.stack(df['image'].tolist()).reshape(len(df), -1)

# Build all pixel columns in one step instead of inserting them one at a time,
# which is slow and leaves the DataFrame fragmented. The number of columns
# follows the actual image shape rather than a hard-coded 32*32*3.
pixel_df = pd.DataFrame(
    pixel_matrix,
    columns=[f'pixel_{i+1}' for i in range(pixel_matrix.shape[1])],
    index=df.index,
)

# Replace the raw 'image' arrays with the per-pixel columns
df = pd.concat([df.drop(columns=['image']), pixel_df], axis=1)

In [12]:
# Display the DataFrame after the image arrays were expanded into one column per pixel value
df

Unnamed: 0,image_id,lesion_type,confirmation,age,sex,localization,path,subset,pixel_1,pixel_2,...,pixel_3063,pixel_3064,pixel_3065,pixel_3066,pixel_3067,pixel_3068,pixel_3069,pixel_3070,pixel_3071,pixel_3072
0,IMAGE_0000244,0,histo,80.0,male,scalp,../data/dataset/dataset/images\train\IMAGE_000...,train,191,152,...,136,176,131,146,184,149,169,184,153,176
1,IMAGE_0001916,0,histo,80.0,male,scalp,../data/dataset/dataset/images\train\IMAGE_000...,train,24,14,...,69,43,26,43,25,14,28,25,14,27
2,IMAGE_0006461,0,histo,80.0,male,scalp,../data/dataset/dataset/images\validation\IMAG...,validation,190,136,...,143,165,129,144,155,121,138,134,102,114
3,IMAGE_0003197,0,histo,80.0,male,scalp,../data/dataset/dataset/images\validation\IMAG...,validation,35,18,...,56,34,18,26,25,12,16,25,12,15
4,IMAGE_0009907,0,histo,75.0,male,ear,../data/dataset/dataset/images\train\IMAGE_000...,train,155,110,...,196,200,158,175,160,124,137,103,73,87
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10010,IMAGE_0007158,6,histo,40.0,male,abdomen,../data/dataset/dataset/images\validation\IMAG...,validation,181,164,...,186,207,184,185,207,186,186,205,187,188
10011,IMAGE_0005939,6,histo,40.0,male,abdomen,../data/dataset/dataset/images\train\IMAGE_000...,train,4,5,...,108,70,59,57,13,13,11,4,5,3
10012,IMAGE_0001721,6,histo,40.0,male,abdomen,../data/dataset/dataset/images\train\IMAGE_000...,train,132,119,...,177,206,184,181,171,148,142,176,156,151
10013,IMAGE_0008064,6,histo,80.0,male,face,../data/dataset/dataset/images\validation\IMAG...,validation,160,123,...,165,184,159,167,185,158,167,184,161,170


In [13]:
# Write the main DataFrame to a CSV file, leaving out the index column

df.to_csv(path_or_buf='../data/main_image_data.csv', index=False)