# Preprocessing

## Import

In [48]:
import os
import pandas as pd

import cv2 as cv
from sklearn.model_selection import train_test_split

from keras_preprocessing.image import ImageDataGenerator

%run ../scripts/save_utils.py

## Data split

First of all, we need to properly split the data.  
  
I have combined train and test data into one folder resp. to proper class. Now we need to combine images and their labels.  
  
I decided to make a dataframe containing path to the image and its label as columns:

In [49]:
images_path = '../data/raw/merged_data'

In [50]:
def get_folder_names(directory_path):
    entries = os.listdir(directory_path)
    folders = [entry for entry in entries if os.path.isdir(os.path.join(directory_path, entry))]
    return folders

def images_to_dataframe(directory_path):
    folders = get_folder_names(directory_path)
    file_paths = []
    labels = []

    for folder in folders:
        folder_path = os.path.join(directory_path, folder)
        files = os.listdir(folder_path)
        for file in files:
            file_path = os.path.relpath(os.path.join(folder_path, file), directory_path)
            file_path = file_path.replace('\\', '/')  # Replace backslashes with forward slashes
            file_paths.append(file_path)
            labels.append(folder)

    df = pd.DataFrame({
        'image_path': file_paths,
        'label': labels
    })

    df = df.sample(frac=1).reset_index(drop=True)

    return df

In [51]:
df = images_to_dataframe(images_path)

In [52]:
print(df)

                     image_path       label
0        notumor/Tr-no_0552.jpg     notumor
1      pituitary/Tr-pi_0121.jpg   pituitary
2         glioma/Tr-gl_0728.jpg      glioma
3         glioma/Tr-gl_0584.jpg      glioma
4      pituitary/Tr-pi_1440.jpg   pituitary
...                         ...         ...
7018   pituitary/Tr-pi_1141.jpg   pituitary
7019     notumor/Te-no_0346.jpg     notumor
7020     notumor/Tr-no_0752.jpg     notumor
7021  meningioma/Tr-me_0287.jpg  meningioma
7022   pituitary/Tr-pi_0524.jpg   pituitary

[7023 rows x 2 columns]


Now we have a dataset containing all paths to images and their resp. labels.  
Let's proceed to splitting:

In [53]:
x_tmp, x_test, y_tmp, y_test = train_test_split(df['image_path'], df['label'], test_size=0.2,
                                                shuffle=True, random_state=73, stratify=df['label'])

x_train, x_val, y_train, y_val = train_test_split(x_tmp, y_tmp, test_size=0.25,
                                                  shuffle=True, random_state=73, stratify=y_tmp)

Since indices got all mixed up after splitting, we will reset them:

In [54]:
data = [x_train, y_train, x_val, y_val, x_test, y_test]
for entry in data:
    entry.reset_index(drop=True, inplace=True)

Now we have 3 separate sets : **train**, with **80%** of initial data, and **validation** and **test** sets **each** containing **20%** of the initial data.  
  
Let's now check how many samples each set has:

In [55]:
print('Number of samples in:')
print('  train set:      ', x_train.shape[0])
print('  validation set: ', x_val.shape[0])
print('  test set:       ', x_test.shape[0])

Number of samples in:
  train set:       4213
  validation set:  1405
  test set:        1405


And now we save it to use later when it comes to building actual models:

In [56]:
save_data('../save_files/processed_data.pkl', x_train, y_train, x_val, y_val, x_test, y_test)

All variables have been successfully saved to ../save_files/processed_data.pkl.


## ImageDataGenerator

Let's now assemble data into dataframes to use in ImageDataGenerator:

In [57]:
df_train = pd.DataFrame({'image_path':x_train, 'label':y_train})
df_val = pd.DataFrame({'image_path':x_val, 'label':y_val})
df_test = pd.DataFrame({'image_path':x_test, 'label':y_test})

In [58]:
train_datagen = ImageDataGenerator(rescale=1./255)
val_datagen = ImageDataGenerator(rescale=1./255)
test_datagen = ImageDataGenerator(rescale=1./255)

In [59]:
train_generator = train_datagen.flow_from_dataframe(df_train, '..data/raw/merged_data',
                                                    x_col='image_path', y_col='label',
                                                    target_size=(512, 512), batch_size=20,
                                                    class_mode='categorical', validate_filenames=False)

validation_generator = val_datagen.flow_from_dataframe(df_val, '..data/raw/merged_data',
                                                       x_col='image_path', y_col='label',
                                                       target_size=(512, 512), batch_size=20,
                                                       class_mode='categorical', validate_filenames=False)

test_generator = test_datagen.flow_from_dataframe(df_test, '..data/raw/merged_data',
                                                  x_col='image_path', y_col='label',
                                                  target_size=(512, 512), batch_size=20,
                                                  class_mode='categorical', validate_filenames=False)

# be cautious with validate_filenames=False, might lead to errors in the future;

Found 4213 non-validated image filenames belonging to 4 classes.
Found 1405 non-validated image filenames belonging to 4 classes.
Found 1405 non-validated image filenames belonging to 4 classes.


Let's now save generators to use later:

In [60]:
save_data('../save_files/generators.pkl', train_generator, validation_generator, test_generator)

An error occurred while saving data: cannot pickle '_thread.lock' object
