## Final Exam Machine Learning

Davide Tateo; 167275

Francesca Salute; 167284

Nicole Favero; 167340 

Tomás Gonçalves; 167288

In [2]:
#!git clone https://github.com/frasalute/ML_FinalExam.git

In [3]:
import os
print(f"Current working directory: {os.getcwd()}")
print(f"Contents of the current directory: {os.listdir('.')}")

Current working directory: c:\Users\davit\Desktop\CBS\MACHINE LEARNING\GithubExam\ML_FinalExam
Contents of the current directory: ['.DS_Store', '.git', 'Final_Exam_Code.ipynb', 'images', 'LICENSE', 'README.md']


In [4]:
base_path = '/work/ML_FinalExam/images'
print("Base path is:", base_path)

Base path is: /work/ML_FinalExam/images


In [5]:
# pip install tensorflow
# pip install scikit-learn
# pip install seaborn

In [7]:
import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

In [8]:
# Importing required API and modules
import warnings
import numpy as np
import pandas as pd
from tensorflow.keras.utils import img_to_array
from keras.preprocessing.image import array_to_img
from tensorflow.keras.utils import load_img
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [None]:
def load_and_resize_images(base_path, categories, target_size=(64, 64)):  # changed size from 128 to 64 to make it faster
    data = {'photo_id': [], 'image': [], 'image_array': [], 'category': [], 'subset': []}
    
    for subset in ['train', 'test']:
        for category in categories.keys():
            folder_path = os.path.join(base_path, subset, category)
            for filename in os.listdir(folder_path):
                if filename.lower().endswith(('.jpg', '.jpeg')):  # Check for both .jpg and .jpeg extensions
                    file_path = os.path.join(folder_path, filename)
                    # Load and resize the image
                    image = load_img(file_path, target_size=target_size)
                    image_array = img_to_array(image)
                    # Add data to the DataFrame
                    data['photo_id'].append(filename)
                    data['image'].append(image)
                    data['image_array'].append(image_array)
                    data['category'].append(categories[category])
                    data['subset'].append(subset)
    
    df = pd.DataFrame(data)
    return df

# Load the DataFrame
categories = {'Benign': 0, 'Malignant': 1, 'Undetected': 2}
base_path = '/work/ML_FinalExam/images'
df = load_and_resize_images(base_path,categories)

In [None]:
print(df.head())
row_count = len(df)
print("Number of rows in the DataFrame:", row_count)

   photo_id                                              image  \
0  6154.jpg  <PIL.JpegImagePlugin.JpegImageFile image mode=...   
1  3582.jpg  <PIL.JpegImagePlugin.JpegImageFile image mode=...   
2  2657.jpg  <PIL.JpegImagePlugin.JpegImageFile image mode=...   
3     2.jpg  <PIL.JpegImagePlugin.JpegImageFile image mode=...   
4  4941.jpg  <PIL.JpegImagePlugin.JpegImageFile image mode=...   

                                         image_array  category  
0  [[[170.0, 129.0, 145.0], [172.0, 131.0, 147.0]...         0  
1  [[[140.0, 108.0, 87.0], [141.0, 109.0, 88.0], ...         0  
2  [[[170.0, 130.0, 156.0], [174.0, 134.0, 160.0]...         0  
3  [[[253.0, 202.0, 245.0], [255.0, 205.0, 248.0]...         0  
4  [[[182.0, 144.0, 155.0], [183.0, 145.0, 156.0]...         0  
Number of rows in the DataFrame: 13879


# Augmentation of "undetected" datasets

In [11]:

# Set the source directory and the target directories
original_undetected_dir = './images/undetec_to_augm'
train_augm_undetected_dir = './images/train/Undetected'
test_augm_undetected_dir ='./images/test/Undetected'

# Create a data generator for augmentation
datagen = ImageDataGenerator(
    rotation_range=30,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    brightness_range = [0.8, 1.3],
    fill_mode='nearest'
)

def augment_img_dataset(source_dir, target_dir, target_count, imagedata_generator):

    """
        #this function performs augmentation

        # source_dir:           directory containing the original dataset to augment
        # target_dir:           directory where to save the augmented images
        # target_count:         desired number of images after augmentation
        # imagedata_generator:  image data generator object to be used for aumentation

        # returns:              augmented set of images
    """ 

    # List the files in the source directory containing the original dataset 
    file_list = os.listdir(source_dir)
    # Number of images in the original dataset
    original_count = len(file_list)

    # Number of times to repeat each image to reach the target count
    repeats_1 = min(target_count // original_count + 1, len(file_list))

    # Ensure the target directories exist
    os.makedirs(target_dir, exist_ok=True)    

    # Augment and save the images for the 6000 train images
    for file in file_list:
        img_path = os.path.join(source_dir, file)
        img = load_img(img_path)
        img = img_to_array(img)
        img = img.reshape((1,) + img.shape)

        i = 0
        for batch in imagedata_generator.flow(img, batch_size=1, save_to_dir = target_dir, save_prefix='augm', save_format='jpeg'): 
            #if in the previous line i save them as jpg, it anyway augments only the jpeg ones
            i += 1
            if i >= repeats_1:
                break  # break the loop after reaching the desired number of augmented images



#hhi 
### AVOID RUNNING THE FOLLOWING IF DATASET HAS ALREADY BEEN AUGMENTED ###
            
# Perform augmentation to expand the train and test datasets for "undetected"
    #augment_img_dataset(original_undetected_dir, train_augm_undetected_dir, 6000, datagen)
    #augment_img_dataset(original_undetected_dir, test_augm_undetected_dir, 1000, datagen)


In [10]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [11]:
# Making sure it's a 2D array
image_matrix = np.stack(df['image_array'].values)
print(image_matrix)

[[[[170. 129. 145.]
   [172. 131. 147.]
   [170. 129. 145.]
   ...
   [202. 167. 174.]
   [200. 165. 172.]
   [199. 164. 171.]]

  [[169. 128. 144.]
   [171. 130. 146.]
   [172. 131. 147.]
   ...
   [202. 167. 174.]
   [200. 165. 172.]
   [199. 164. 171.]]

  [[170. 129. 145.]
   [171. 130. 146.]
   [173. 132. 146.]
   ...
   [202. 167. 174.]
   [201. 166. 173.]
   [199. 164. 171.]]

  ...

  [[176. 144. 149.]
   [179. 147. 152.]
   [183. 148. 154.]
   ...
   [195. 159. 163.]
   [193. 157. 161.]
   [191. 155. 159.]]

  [[174. 142. 147.]
   [177. 145. 150.]
   [182. 147. 153.]
   ...
   [196. 160. 164.]
   [193. 157. 161.]
   [191. 155. 159.]]

  [[173. 141. 146.]
   [176. 144. 149.]
   [181. 146. 152.]
   ...
   [195. 159. 163.]
   [193. 157. 161.]
   [190. 154. 158.]]]


 [[[140. 108.  87.]
   [141. 109.  88.]
   [143. 111.  90.]
   ...
   [133. 100.  85.]
   [147. 114.  99.]
   [137. 104.  89.]]

  [[140. 108.  87.]
   [141. 109.  88.]
   [143. 111.  90.]
   ...
   [134. 101.  86.]
 

In [None]:
# It's a three dimensions array so we have to flatten it 
image_matrix = np.array([img.flatten() for img in df['image_array']])

# Scale the data
scaler = StandardScaler()
image_matrix_scaled = scaler.fit_transform(image_matrix)

In [None]:
import time

# Start timing the overall process
start_time_overall = time.time()

# Principal Component Analysis
pca_ratio=PCA()
pca_ratio.fit(image_matrix_scaled) # fit the PCA so it can learn

# Using cumulative variance
cumulative_variance_ratio=np.cumsum(pca_ratio.explained_variance_ratio_)
variance=0.95 # set to 95% to keep a sufficiently large portion of the variance
n_components= np.argmax(cumulative_variance_ratio >= variance) +1 # find the number of components needed 

print(f"Number of principal components: {n_components}")

# Calculate and print the overall running time
overall_time = time.time() - start_time_overall
print(f"Total running time: {overall_time:.2f} seconds.")

# There is another way to obtain the number of components which is setting the threshold 
# pca_threshold = PCA(n_components=0.95)
# pca_threshold.fit(image_matrix_scaled) - fit the pca model to the data to learn patterns
# print(f"Number of principal components: {pca_threshold.n_components_}")

Number of principal components: 72

In [None]:

# Screen plot eigenvalues - number of principal components
plt.figure(figsize=(10, 6))
sns.lineplot(x=np.arange(1, len(cumulative_variance_ratio) + 1), y=cumulative_variance_ratio, marker='o', color='#FF69B4')
plt.title('Scree Plot')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.grid(True)
plt.show()

In [None]:
# Transform the original data using retained principal components 
# Start from inputting in PCA the number of components found necessary for 95% variance
pca_opt=PCA(n_components=72)
pca_opt.fit(image_matrix_scaled)

df_matrix_reduced= pca_opt.transform(image_matrix_scaled) # transform the flatten original data to reduced dimensionality

# SVM

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
import time

# Start timing the overall process
start_time_overall = time.time()

# Split the Data in test and train
train_indices = df[df['subset'] == 'train'].index
test_indices = df[df['subset'] == 'test'].index

X_train = df_matrix_reduced[train_indices]
X_test = df_matrix_reduced[test_indices]
y_train = df.loc[train_indices, 'category']
y_test = df.loc[test_indices, 'category']

# Definition of the parameter grid for GridSearchCV
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.0001, 0.001, 0.1, 1],
    'kernel': ['rbf', 'poly']
}

# Creation of an SVC
svc = SVC(probability=True)

# Creating the model using GridSearchCV with the parameter grid
model = GridSearchCV(svc, param_grid, cv=3, n_jobs=-1)

# Training the model using training data
model.fit(X_train, y_train)

# Print the best parameters
print("Best parameters found:", model.best_params_)

# Testing the model using test data
y_pred = model.predict(X_test)

# Printing the classification report
print(classification_report(y_test, y_pred))

# Calculate and print the overall running time
overall_time = time.time() - start_time_overall
print(f"Total running time: {overall_time:.2f} seconds.")

SVM with PCA - sample of dataset

In [None]:
from sklearn.model_selection import train_test_split
import time

# Start timing the overall process
start_time_overall = time.time()

# Indices for training and testing data as defined in the 'subset' column
train_indices = df[df['subset'] == 'train'].index
test_indices = df[df['subset'] == 'test'].index

# Randomly select 20% of the data from both training and testing subsets
train_indices_sample = train_test_split(train_indices, train_size=0.2, random_state=42)[0]
test_indices_sample = train_test_split(test_indices, test_size=0.2, random_state=42)[1]

# Assign features and targets based on the selected indices
X_train = df_matrix_reduced[train_indices_sample]
X_test = df_matrix_reduced[test_indices_sample]
y_train = df.loc[train_indices_sample, 'category']
y_test = df.loc[test_indices_sample, 'category']

# Import and configure the SVM model with GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.0001, 0.001, 0.1, 1],
    'kernel': ['rbf', 'poly']
}

svc = SVC(probability=True)
model = GridSearchCV(svc, param_grid, cv=5, n_jobs=-1)
model.fit(X_train, y_train)

# Print the best parameters
print("Best parameters found:", model.best_params_)

# Testing the model using test data
y_pred = model.predict(X_test)

# Printing the classification report
print(classification_report(y_test, y_pred))

# Calculate and print the overall running time
overall_time = time.time() - start_time_overall
print(f"Total running time: {overall_time:.2f} seconds.")

## SVM without PCA

In [None]:
import time

# Start timing the overall process
start_time_overall = time.time()

# SVM without PCA
X_train = image_matrix_scaled[df['subset'] == 'train']
y_train = df[df['subset'] == 'train']['category']
X_test = image_matrix_scaled[df['subset'] == 'test']
y_test = df[df['subset'] == 'test']['category']

# Definition of the parameter grid for GridSearchCV
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.0001, 0.001, 0.1, 1],
    'kernel': ['rbf', 'poly']
}

# Creation of an SVC
svc = SVC(probability=True)

# Creating the model using GridSearchCV with the parameter grid
model = GridSearchCV(svc, param_grid, cv=3, n_jobs=-1)

# Training the model using training data
model.fit(X_train, y_train)

# Print the best parameters
print("Best parameters found:", model.best_params_)

# Testing the model using test data
y_pred = model.predict(X_test)

# Printing the classification report
print(classification_report(y_test, y_pred))

# Calculate and print the overall running time
overall_time = time.time() - start_time_overall
print(f"Total running time: {overall_time:.2f} seconds.")

SVM without PCA - sample of dataset

In [None]:
from sklearn.model_selection import train_test_split
import time

# Start timing the overall process
start_time_overall = time.time()

# Indices for training and testing data as defined in the 'subset' column
train_indices = df[df['subset'] == 'train'].index
test_indices = df[df['subset'] == 'test'].index

# Randomly select 20% of the data from both training and testing subsets
train_indices_sample = train_test_split(train_indices, train_size=0.2, random_state=42)[0]
test_indices_sample = train_test_split(test_indices, test_size=0.2, random_state=42)[1]

# Assign features and targets based on the selected indices
X_train = image_matrix_scaled[train_indices_sample]
X_test = image_matrix_scaled[test_indices_sample]
y_train = df.loc[train_indices_sample, 'category']
y_test = df.loc[test_indices_sample, 'category']

# Import and configure the SVM model with GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.0001, 0.001, 0.1, 1],
    'kernel': ['rbf', 'poly']
}

svc = SVC(probability=True)
model = GridSearchCV(svc, param_grid, cv=3, n_jobs=-1)
model.fit(X_train, y_train)

# Divide test data into 4 batches for classification
batch_size = len(X_test) // 4  # Calculate the batch size based on the total number of samples in the test set

for i in range(4):
    start_index = i * batch_size
    end_index = start_index + batch_size if i < 3 else len(X_test)  # Ensure the last batch includes all remaining data

    X_batch = X_test[start_index:end_index]
    y_batch = y_test[start_index:end_index]
    
    # Predict model on the batch
    y_pred_batch = model.predict(X_batch)
    
    # Print the classification report for the current batch
    print(f"Classification Report for Batch {i+1}")
    print(classification_report(y_batch, y_pred_batch))
    print("\n" + "-"*80 + "\n")

# Calculate and print the overall running time
overall_time = time.time() - start_time_overall
print(f"Total running time: {overall_time:.2f} seconds.")

# SVM without PCA with SGDClassifier

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
import time

# Start timing the overall process
start_time_overall = time.time()

# SVM without PCA
X_train = image_matrix_scaled[df['subset'] == 'train']
y_train = df[df['subset'] == 'train']['category']
X_test = image_matrix_scaled[df['subset'] == 'test']
y_test = df[df['subset'] == 'test']['category']

# Define parameter grid for SGDClassifier
param_grid_sgd = {
    'alpha': [0.0001, 0.001, 0.01, 0.1],  # Regularization parameter
    'penalty': ['l2', 'l1', 'elasticnet'],
    'loss': ['hinge']  # Hinge loss corresponds to a linear SVM
}

# Create SGDClassifier
sgd = SGDClassifier(max_iter=1000, tol=1e-3)

# Create GridSearchCV model
grid_search_sgd = GridSearchCV(sgd, param_grid_sgd, cv=3, n_jobs=-1)

# Train model using training data
grid_search_sgd.fit(X_train, y_train)

# Print the best parameters
print("Best parameters found:", model.best_params_)

# Test model using test data
y_pred_sgd = grid_search_sgd.predict(X_test)

# Print classification report
print(classification_report(y_test,y_pred_sgd))

# Calculate and print the overall running time
overall_time = time.time() - start_time_overall
print(f"Total running time: {overall_time:.2f} seconds.")

CNN

In [None]:
#IMPORTING LIBRARIES
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
from keras import layers

LOAD AND NORMALISE DATA

In [None]:
# To make this notebook’s output stable across runs
np.random.seed(50)
tf.random.set_seed(50)


# Load the Dataset - we already run this above, but we run it again just to make sure this cell runs without running all the above cells
X_train = image_matrix_scaled[df['subset'] == 'train']
y_train = df[df['subset'] == 'train']['category']
X_test = image_matrix_scaled[df['subset'] == 'test']
y_test = df[df['subset'] == 'test']['category']


# Normalize pixel values to be between 0 and 1
X_train = X_train.astype(np.float32) / 255
X_test = X_test.astype(np.float32) / 255

### THE FOLLOWING CELL IS AN ATTEMPT TO VISUALISE DATA. IT IS NOT PART OF THE CNN MODEL. 
### DO NOT PANIC IF IT DOES NOT RUN.

In [None]:
#we already run this above, but we run it again just to make sure this cell runs without running all the above cells
categories = {'Benign': 0, 'Malignant': 1, 'Undetected': 2}

# visualising images with numeric tags and names  
def display_images_with_labels(images, labels, categories, num_images=10, tagnum = None):
    plt.figure(figsize=(10,10))
    # select only images with specified tag number
    if tagnum >=0 and tagnum<3: 
        indices = np.where(np.array([str(label) for label in labels]) == str(tagnum))[0][:num_images]
    # otherwise select randomly
    else: 
        indices = np.random.choice(range(len(images)), num_images, replace=False)
    for i, index in enumerate(indices):
        plt.subplot(5, 5, i + 1)
        plt.xticks([])
        plt.yticks([])
        plt.grid(False)
        plt.imshow(images[index], cmap=plt.cm.binary)
         # if there is a list of class names, show both tag number and class name
        if categories:
            label_with_name = f"{labels[index]} ({categories[labels[index]]})"
            plt.xlabel(label_with_name)
        # othwerwise visualise only tag number
        else:
            plt.xlabel(labels[index])
    plt.show()

display_images_with_labels(X_train, y_train, categories, num_images=25, category = 0)

CREATING THE MODEL

In [None]:
## Creating the models
# Lower Model
lower_model = keras.Sequential()
lower_model.add(keras.Input(shape=(28, 28, 1)))  # 28x28 B&W

lower_model.add(layers.Conv2D(32, 3, activation="relu", padding='same', kernel_initializer='he_normal'))
lower_model.add(layers.MaxPooling2D(2))

lower_model.add(layers.Conv2D(64, 3, activation="relu", padding='same', kernel_initializer='he_normal'))
lower_model.add(layers.MaxPooling2D(2))

# Upper Model
upper_model = keras.Sequential()
upper_model.add(layers.Flatten()) 

upper_model.add(layers.Dense(128, activation='relu', kernel_initializer='he_normal'))
upper_model.add(layers.Dense(10, activation='softmax'))

# Prepare the training dataset.
batch_size = 32
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(1024).batch(batch_size)
val_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(batch_size)

# Optimizer for the lower layers
lower_optimizer = keras.optimizers.SGD(learning_rate=1e-4)
# Optimizer for the upper layers
upper_optimizer = keras.optimizers.Nadam(learning_rate=1e-3)
# Instantiate a loss function.
loss_fn = keras.losses.SparseCategoricalCrossentropy()

# Prepare the testing dataset.
val_dataset = tf.data.Dataset.from_tensor_slices((X_valid, y_valid))
val_dataset = val_dataset.batch(batch_size)

INITIALISING METRICS AND RUNNING CNN MODEL

In [None]:
# Initialize metrics at the start of each epoch
loss_avg = keras.metrics.Mean()
accuracy = keras.metrics.SparseCategoricalAccuracy()
val_loss_avg = keras.metrics.Mean()
val_accuracy = keras.metrics.SparseCategoricalAccuracy()

# Training Loop
epochs = 5
for epoch in range(epochs):
    print(f"\nStart of epoch {epoch}")

    # Iterate over the batches of the training dataset
    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
        # Calculating the intermediate activations of the lower and upper model 
        with tf.GradientTape(persistent=True) as tape:
            lower_logits = lower_model(x_batch_train, training=True)
            logits = upper_model(lower_logits, training=True)
        # Calcutating the loss
            loss_value = loss_fn(y_batch_train, logits)

        # Gradient computation for both the models
        lower_grads = tape.gradient(loss_value, lower_model.trainable_weights)
        upper_grads = tape.gradient(loss_value, upper_model.trainable_weights)
        
        # Apply optimizers
        lower_optimizer.apply_gradients(zip(lower_grads, lower_model.trainable_weights))
        upper_optimizer.apply_gradients(zip(upper_grads, upper_model.trainable_weights))

        # Update training metrics
        loss_avg.update_state(loss_value)
        accuracy.update_state(y_batch_train, logits)

    # Print the mean training loss and accuracy over the epoch
    train_loss = loss_avg.result()
    train_accuracy = accuracy.result()
    print(f"Training loss over epoch: {float(train_loss):.4f}")
    print(f"Training accuracy over epoch: {float(train_accuracy):.4f}")


    # Perform validation at the end of the epoch
    for x_batch_val, y_batch_val in val_dataset:
        lower_val_logits = lower_model(x_batch_val, training=False)
        val_logits = upper_model(lower_val_logits, training=False)
        val_loss_value = loss_fn(y_batch_val, val_logits)
    
        val_loss_avg.update_state(val_loss_value)
        val_accuracy.update_state(y_batch_val, val_logits)
    
    # Compute the mean validation loss and accuracy for the epoch
    val_loss = val_loss_avg.result()
    val_acc = val_accuracy.result()
    print(f"Validation loss: {float(val_loss):.4f}")
    print(f"Validation accuracy: {float(val_acc):.4f}")