## Final Exam Machine Learning

Davide Tateo; 167275

Francesca Salute; 167284

Nicole Favero; 167340 

Tomás Gonçalves; 167288

In [None]:
# IPython shell escape: clone the project repository into the current working
# directory. Presumably this is what provides the ML_FinalExam/images folder
# used as base_path later on — confirm against the repository contents.
!git clone https://github.com/frasalute/ML_FinalExam.git

In [None]:
import os
# Report where the notebook is running and what that folder contains.
cwd = os.getcwd()
entries = os.listdir('.')
print(f"Current working directory: {cwd}")
print(f"Contents of the current directory: {entries}")

In [None]:
# Root folder of the image dataset; it holds the train/ and test/ subfolders
# read by the loader.
base_path = '/kaggle/working/ML_FinalExam/images'
print(f"Base path is: {base_path}")

In [None]:
import tensorflow as tf
# Enable memory growth on every GPU TensorFlow can see. When no GPU is
# listed the loop body never runs, so no separate emptiness check is needed.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    # Memory growth currently has to be set identically on all GPUs.
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as e:
    # Raised when memory growth is configured after the GPUs were initialized.
    print(e)

In [None]:
# Importing required API and modules
import warnings
import numpy as np
import pandas as pd
from tensorflow.keras.utils import img_to_array
from keras.preprocessing.image import array_to_img
from tensorflow.keras.utils import load_img
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [None]:
def load_images_to_df(base_path, categories=None):
    """Load the train/test ``.jpg`` images under *base_path* into a DataFrame.

    The folders read are ``<base_path>/<subset>/<category>`` for every subset
    in ``('train', 'test')`` and every key of *categories*.

    Args:
        base_path: Directory that contains the ``train`` and ``test`` folders.
        categories: Mapping of category folder name to the label stored in
            the ``category`` column. When ``None`` (the default) the mapping
            ``{'Benign': 0, 'Malignant': 1}`` is used.

    Returns:
        A pandas DataFrame with one row per image and the columns
        ``photo_id`` (file name), ``image`` (object returned by ``load_img``),
        ``image_array`` (result of ``img_to_array``) and ``category`` (label).

    Raises:
        FileNotFoundError: Propagated from ``os.listdir`` when one of the
            expected subset/category folders does not exist.
    """
    # Honour the caller's mapping; previously it was always overwritten by a
    # hard-coded dict, so extra categories could never be loaded.
    if categories is None:
        categories = {'Benign': 0, 'Malignant': 1}

    data = {'photo_id': [], 'image': [], 'image_array': [], 'category': []}

    for subset in ('train', 'test'):
        for category, label in categories.items():
            folder_path = os.path.join(base_path, subset, category)
            for filename in os.listdir(folder_path):
                # Only files with a lowercase .jpg extension are treated as images.
                if not filename.endswith('.jpg'):
                    continue
                image = load_img(os.path.join(folder_path, filename))
                data['photo_id'].append(filename)
                data['image'].append(image)
                data['image_array'].append(img_to_array(image))
                data['category'].append(label)

    return pd.DataFrame(data)

# The label mapping is kept at module level so further classes can be added
# to it later in the notebook.
categories = dict(Benign=0, Malignant=1)
df = load_images_to_df(base_path=base_path, categories=categories)

In [None]:
# Preview the first rows and report how many images were loaded.
row_count = len(df.index)
print(df.head())
print("Number of rows in the DataFrame:", row_count)

# Code to augment the pictures

In [None]:
# Disabled cell: the augmentation script below is wrapped in one
# triple-single-quoted string literal, so none of it is executed.
# As written, it would read every file in ./images/undetec_to_augm and use
# ImageDataGenerator.flow(..., save_to_dir=...) to write augmented JPEG copies
# into ./images/train/Undetected (prefix 'augm') and ./images/test/Undetected
# (prefix 'aug').
# NOTE(review): the inner """...""" fragments (extra target counts / repeats)
# are just inert text inside the outer string.
'''original_dataset_dir = './images/undetec_to_augm'
train_augm_undetected_dir = './images/train/Undetected'
test_augm_undetected_dir ='./images/test/Undetected'

# Create a data generator for augmentation
datagen = ImageDataGenerator(
    rotation_range=30,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    brightness_range = [0.8, 1.3],
    fill_mode='nearest'
)

# List the files in the original dataset directory
file_list = os.listdir(original_dataset_dir)

# Ensure the target directories exist
os.makedirs(train_augm_undetected_dir, exist_ok=True)
os.makedirs(test_augm_undetected_dir, exist_ok=True)

# Desired number of images after augmentation
target_count_1 = 6000
target_count_2 = 1000
"""= target_count_2 = target_count_3 = target_count_4 = target_count_5 = target_count_6 = target_count_7"""

# Number of images in the original dataset
original_count = len(file_list)


# Number of times to repeat each image to reach the target count
repeats_1 = min(target_count_1 // original_count + 1, len(file_list))
repeats_2 = min(target_count_2 // original_count + 1, len(file_list))
"""repeats_3 = min(target_count_3 // original_count + 1, len(file_list))
repeats_4 = min(target_count_4 // original_count + 1, len(file_list))
repeats_5 = min(target_count_5 // original_count + 1, len(file_list))
repeats_6 = min(target_count_6 // original_count + 1, len(file_list))
repeats_7 = min(target_count_7 // original_count + 1, len(file_list))"""

# Augment and save the images for the 6000 train images
for file in file_list[:original_count]:
    img_path = os.path.join(original_dataset_dir, file)
    img = load_img(img_path)
    img = img_to_array(img)
    img = img.reshape((1,) + img.shape)

    i = 0
    for batch in datagen.flow(img, batch_size=1, save_to_dir = train_augm_undetected_dir, save_prefix='augm', save_format='jpeg'): 
        #if in the previous line i save them as jpg, it anyway augments only the jpeg ones
        i += 1
        if i >= repeats_1:
            break  # break the loop after reaching the desired number of augmented images



# Augment and save the images for the 1000 test images
for file in file_list[:original_count]:
    img_path = os.path.join(original_dataset_dir, file)
    img = load_img(img_path)
    img = img_to_array(img)
    img = img.reshape((1,) + img.shape)

    i = 0
    for batch in datagen.flow(img, batch_size=1, save_to_dir = test_augm_undetected_dir, save_prefix='aug', save_format='jpeg'):
        i += 1
        if i >= repeats_2:
            break  # break the loop after reaching the desired number of augmented images '''

In [None]:
# Register the extra 'Undetected' class (label 2) in the category mapping.
categories.update({'Undetected': 2})

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Stack the per-image arrays along a new leading axis into one ndarray.
# np.stack requires every image array to have the same shape.
image_matrix = np.stack(list(df['image_array']))
print(image_matrix)

In [None]:
# Every image array is multi-dimensional, so unroll each one into a single
# feature row to obtain a (n_images, n_features) matrix.
image_matrix = np.array([img.reshape(-1) for img in df['image_array']])

# Standardize the features before running PCA.
scaler = StandardScaler()
image_matrix_scaled = scaler.fit(image_matrix).transform(image_matrix)

# Full PCA (all components) so the accumulated explained variance can be inspected.
pca_ratio = PCA().fit(image_matrix_scaled)

# Two ways to obtain the number of components.

# 1) From the cumulative explained-variance ratio: the first position at which
#    the running total reaches the threshold, converted to a 1-based count.
variance = 0.95  # keep a sufficiently large portion of the variance
cumulative_variance_ratio = pca_ratio.explained_variance_ratio_.cumsum()
n_components = (cumulative_variance_ratio >= variance).argmax() + 1

print(f"Number of principal components: {n_components}")

# 2) Let PCA choose the count itself by passing a variance fraction (0.99 here).
pca_threshold = PCA(n_components=0.99).fit(image_matrix_scaled)
print(f"Number of principal components: {pca_threshold.n_components_}")

# Scree-style plot: cumulative explained variance against component count.
component_counts = np.arange(len(cumulative_variance_ratio)) + 1
plt.figure(figsize=(10, 6))
sns.lineplot(
    x=component_counts,
    y=cumulative_variance_ratio,
    marker='o',
    color='#FF69B4',
)
plt.title('Scree Plot')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.grid(True)
plt.show()

In [None]:
# Project the standardized image matrix onto the components retained by
# pca_threshold (the PCA fitted with n_components=0.99).
# NOTE(review): despite the df_ prefix, the result is presumably a NumPy array
# rather than a pandas DataFrame — confirm before using DataFrame methods on it.
df_reduced= pca_threshold.transform(image_matrix_scaled) # transform the original data to reduced dimensionality