## Data Preprocessing and Model Training

In [12]:
# General Libraries:
import cv2
import os
from PIL import Image
import numpy as np
from sklearn.utils import shuffle
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Deep Learning libraries:
import tensorflow as tf
from tensorflow import keras
from keras.optimizers import RMSprop
from keras.preprocessing.image import ImageDataGenerator
from keras.applications import VGG16
from keras.models import Sequential
from keras.layers import Flatten, Dense, Dropout
from keras.optimizers import Adam
from keras.utils import to_categorical



### Data Loading and Cleaning

In [2]:
def load_data(directory):
    """Load every image under ``directory`` together with its class label.

    ``directory`` is expected to contain one sub-folder per class; the
    sub-folder name becomes the label of every image inside it. Images are
    returned exactly as stored on disk (no resizing or colour conversion is
    done here).

    Args:
        directory: Path of the dataset root.

    Returns:
        A pair ``(images, labels)``: a list of numpy arrays and a list of
        label strings, shuffled in unison with a fixed seed.
    """
    images = []
    labels = []

    # os.listdir returns entries in arbitrary order; sorting keeps the input
    # to the seeded shuffle below stable across machines and file systems.
    for label in sorted(os.listdir(directory)):
        class_dir = os.path.join(directory, label)

        # Ignore stray files that sit next to the class folders.
        if not os.path.isdir(class_dir):
            continue

        for image in sorted(os.listdir(class_dir)):
            img_path = os.path.join(class_dir, image)

            # np.array() forces the pixel data to be read before the context
            # manager closes the underlying file handle.
            with Image.open(img_path) as img:
                img_array = np.array(img)

            # Append the image array and the label to the lists
            images.append(img_array)
            labels.append(label)

    # Shuffle images and labels together using sklearn.utils.shuffle
    images, labels = shuffle(images, labels, random_state=42)

    return images, labels

In [3]:
# Dataset root. Set the DLNN_DATASET_DIR environment variable to run the
# notebook on another machine; the default is the original local checkout.
dataset_dir = os.environ.get(
    'DLNN_DATASET_DIR',
    r'C:\Users\andmpereira\OneDrive - Grupo Jerónimo Martins\Documentos\GitHub\DLNN-Group2\00 - Dataset',
)
train_dir = os.path.join(dataset_dir, 'Training')
test_dir = os.path.join(dataset_dir, 'Testing')

# Load training data
train_images, train_labels = load_data(train_dir)

# Load testing data
test_images, test_labels = load_data(test_dir)

In [4]:
# Sanity check: every split must have exactly one label per image.

num_train_images, num_train_labels = len(train_images), len(train_labels)
num_test_images, num_test_labels = len(test_images), len(test_labels)

for caption, size in (
    ("Number of Images in Train Set:", num_train_images),
    ("Number of Labels in Train Set:", num_train_labels),
    ("Number of Images in Test Set:", num_test_images),
    ("Number of Labels in Test Set:", num_test_labels),
):
    print(caption, size)

Number of Images in Train Set: 5712
Number of Labels in Train Set: 5712
Number of Images in Test Set: 1311
Number of Labels in Test Set: 1311


In [5]:
def remove_duplicates(images, labels):
    """Drop repeated (image, label) pairs, keeping the first occurrence.

    Two entries count as duplicates only when the arrays have the same shape,
    dtype and pixel bytes AND carry the same label. The same picture stored
    under two different labels is therefore kept twice.

    Args:
        images: Sequence of numpy arrays.
        labels: Sequence of labels, aligned with ``images``.

    Returns:
        The result of ``shuffle(unique_images, unique_labels, random_state=99)``,
        i.e. the de-duplicated images and labels reshuffled in unison.
    """
    seen_pairs = set()
    unique_images = []
    unique_labels = []
    num_removed = 0

    for img, label in zip(images, labels):
        # tobytes() alone discards the array layout, so e.g. a 2x8 and a 4x4
        # image with identical bytes would collide; shape and dtype are part
        # of the key to keep such images apart.
        pair = (img.shape, img.dtype.str, img.tobytes(), label)

        if pair in seen_pairs:
            num_removed += 1
            continue

        seen_pairs.add(pair)
        unique_images.append(img)
        unique_labels.append(label)

    print(f"Removed {num_removed} duplicate images.")

    return shuffle(unique_images, unique_labels, random_state=99)

In [6]:
# Remove duplicates from training data
# (the names are rebound: from here on train_images/train_labels hold the
# de-duplicated, reshuffled lists returned by remove_duplicates)
train_images, train_labels = remove_duplicates(train_images, train_labels)

# Remove duplicates from testing data
# NOTE(review): each split is de-duplicated on its own, so an image that is
# present in both Training and Testing is not detected here — confirm there
# is no train/test overlap.
test_images, test_labels = remove_duplicates(test_images, test_labels)

Removed 207 duplicate images.
Removed 30 duplicate images.


In [7]:
def print_class_distribution(labels, dataset_name):
    """Print how many samples each class has, followed by the overall total.

    Args:
        labels: Sequence of class labels, one per sample.
        dataset_name: Name of the split, used only in the printed headings.
    """
    # distribution[0] holds the sorted distinct labels, distribution[1] their counts.
    distribution = np.unique(labels, return_counts=True)

    print(f"\nClass Distribution in {dataset_name} Dataset:")
    for position, class_label in enumerate(distribution[0]):
        count = distribution[1][position]
        print(f"Class {class_label}: {count} images")

    total_images = len(labels)
    print(f"Total number of images in {dataset_name} Dataset: {total_images}")

# Report the class balance of the training split first, then the testing split
for split_labels, split_name in ((train_labels, 'Training'), (test_labels, 'Testing')):
    print_class_distribution(split_labels, split_name)



Class Distribution in Training Dataset:
Class glioma: 1321 images
Class meningioma: 1333 images
Class notumor: 1406 images
Class pituitary: 1445 images
Total number of images in Training Dataset: 5505

Class Distribution in Testing Dataset:
Class glioma: 299 images
Class meningioma: 301 images
Class notumor: 381 images
Class pituitary: 300 images
Total number of images in Testing Dataset: 1281


### Data Preprocessing

In [8]:
# Split the training data into training (80%) and validation (20%) sets.
# stratify keeps the class proportions the same in both parts, so the
# validation metrics are not skewed by an unlucky draw.
# NOTE: this cell rebinds train_images/train_labels; running it more than
# once without reloading the data splits the already-reduced training set again.
train_images, val_images, train_labels, val_labels = train_test_split(
    train_images,
    train_labels,
    test_size=0.2,
    random_state=99,
    stratify=train_labels,
)


In [9]:
def preprocess_images(images, desired_size, normalize=True):
    """Convert raw image arrays to fixed-size RGB arrays.

    Args:
        images: Iterable of numpy arrays that ``Image.fromarray`` accepts.
        desired_size: Target size as ``(width, height)``; it is passed
            unchanged to Pillow's ``Image.resize``. The resulting arrays are
            laid out as ``(height, width, 3)``.
        normalize: When True (default) pixel values are scaled to [0, 1] and
            returned as float32. When False the arrays are returned as
            produced by ``np.array`` on the resized RGB image.

    Returns:
        A list with one processed numpy array per input image.
    """
    preprocessed_images = []

    for img_array in images:
        # Convert numpy array to Pillow Image
        img = Image.fromarray(img_array)

        # Convert grayscale (or any other non-RGB mode) images to RGB
        if img.mode != 'RGB':
            img = img.convert('RGB')

        # Resize the image; desired_size is (width, height)
        img = img.resize(desired_size)

        if normalize:
            # Scale to [0, 1] in float32: dividing the uint8 array directly
            # would give float64 and double the memory of the whole dataset.
            img_array = np.asarray(img, dtype=np.float32) / 255.0
        else:
            img_array = np.array(img)

        preprocessed_images.append(img_array)

    return preprocessed_images

In [10]:
# Target image size. VGG16's ImageNet weights were trained on 224x224 inputs,
# so both sides are 224 (the previous height of 244 was a typo).
width = 224
height = 224

# Pillow takes the size as (width, height); each processed array is laid out
# as (height, width, 3).
train_images_processed = preprocess_images(train_images, desired_size=(width, height))
val_images_processed = preprocess_images(val_images, desired_size=(width, height))
test_images_processed = preprocess_images(test_images, desired_size=(width, height))

# Stack every split into a single (n_images, height, width, 3) array; the
# test split is converted as well so all three splits have the same type.
train_images_processed = np.array(train_images_processed)
val_images_processed = np.array(val_images_processed)
test_images_processed = np.array(test_images_processed)

In [14]:

# Learn the label -> integer mapping from the training labels only, then
# apply that same mapping to every split.
label_encoder = LabelEncoder().fit(train_labels)
num_classes = len(label_encoder.classes_)

train_labels_encoded, val_labels_encoded, test_labels_encoded = (
    label_encoder.transform(split_labels)
    for split_labels in (train_labels, val_labels, test_labels)
)

# One-hot encode the integer labels for use with categorical cross-entropy.
train_labels_one_hot, val_labels_one_hot, test_labels_one_hot = (
    to_categorical(encoded, num_classes=num_classes)
    for encoded in (train_labels_encoded, val_labels_encoded, test_labels_encoded)
)

### Data Augmentation

In [15]:
# Create an ImageDataGenerator with data augmentation parameters.
# Augmentation is applied to the training stream only.
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Validation images are passed through unchanged.
val_datagen = ImageDataGenerator()

# np.asarray reuses the existing arrays; np.array would allocate a second
# full copy of every image just to hand it to the generator.
train_generator = datagen.flow(
    np.asarray(train_images_processed),
    train_labels_one_hot,
    batch_size=32,
)

# Validation order does not affect the metrics, so keep it fixed; this also
# keeps the batches aligned with val_labels_one_hot.
val_generator = val_datagen.flow(
    np.asarray(val_images_processed),
    val_labels_one_hot,
    batch_size=32,
    shuffle=False,
)

### Model Architecture

In [16]:
# Load the pre-trained VGG16 model without the top (fully connected) layers.
# The input shape is built from the resize settings in (height, width, channels)
# order, which is the layout of the preprocessed image arrays.
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(height, width, 3))

# NOTE(review): the images are scaled to [0, 1] during preprocessing, whereas
# the ImageNet weights are normally paired with
# keras.applications.vgg16.preprocess_input — confirm this scaling is intended.

# Take the number of classes from the fitted label encoder instead of a literal.
num_classes = len(label_encoder.classes_)

# Freeze the pre-trained convolutional base so only the new head is trained
base_model.trainable = False

# Classification head on top of the frozen VGG16 features
model = Sequential([
    base_model,
    # Flatten the output of the base model
    Flatten(),
    # Fully connected layer with ReLU activation and Dropout
    Dense(256, activation='relu'),
    Dropout(0.5),  # Adjust dropout rate as needed
    # Second fully connected layer with ReLU activation and Dropout
    Dense(128, activation='relu'),
    Dropout(0.5),  # Adjust dropout rate as needed
    # Output layer for classification
    Dense(num_classes, activation='softmax'),
])

# Print the model summary
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 vgg16 (Functional)          (None, 7, 7, 512)         14714688  
                                                                 
 flatten (Flatten)           (None, 25088)             0         
                                                                 
 dense (Dense)               (None, 256)               6422784   
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 4)                 5

In [17]:
from keras.optimizers import RMSprop

# Compile the model using RMSprop optimizer
model.compile(optimizer=RMSprop(learning_rate=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])

num_epochs = 10
# Kept for reference only: the generators already yield batches of this size,
# and Keras documents that batch_size should not be passed to fit() when the
# input is a generator.
batch_size = 32

# Train the model
history = model.fit(
    x=train_generator,
    epochs=num_epochs,
    validation_data=val_generator,
)

Epoch 1/10
Epoch 2/10

KeyboardInterrupt: 