# Ensemble model - AN2DL - Challenge 1

### Libraries and mounting

In [None]:
import gc
import os
#os.environ["CUDA_VISIBLE_DEVICES"]="0"
import random
import time
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
from tabulate import tabulate
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow.keras as tfk
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Average
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras import mixed_precision

# Image shape shared by every backbone and by the ensemble input layer
# (presumably height, width, channels — TODO confirm against the dataset).
input_shape = (96,96,3)

In [None]:
# Clear out any old model state.
# Run the garbage collector first, then reset the Keras backend session so
# re-running the notebook does not keep stacking up previous models.
gc.collect()
tfk.backend.clear_session()

In [None]:
# Log the TensorFlow version used for this run.
print("TensorFlow:", tf.__version__)

TensorFlow: 2.12.0


In [None]:
# Create the distribution strategy used by every `with strategy.scope():`
# cell below. This cell used to be wrapped in a string literal, which left
# `strategy` undefined on a fresh kernel.
#
# TPU set-up taken from
# https://colab.research.google.com/github/tensorflow/tpu/blob/master/tools/colab/shakespeare_with_tpu_and_keras.ipynb#scrollTo=ExQ922tfzSGA
tf.keras.backend.clear_session()

if 'COLAB_TPU_ADDR' in os.environ:
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
    tf.config.experimental_connect_to_cluster(resolver)
    # This is the TPU initialization code that has to be at the beginning.
    tf.tpu.experimental.initialize_tpu_system(resolver)
    print("All devices: ", tf.config.list_logical_devices('TPU'))
    strategy = tf.distribute.TPUStrategy(resolver)
else:
    # No TPU runtime attached: fall back to TensorFlow's default strategy so
    # that `strategy.scope()` keeps working unchanged on CPU/GPU.
    strategy = tf.distribute.get_strategy()

All devices:  [LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:0', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:1', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:2', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:3', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:4', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:5', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:6', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:7', device_type='TPU')]


In [None]:
# Colab only: mount Google Drive and move into the project folder so that the
# relative '../Data/...' paths used in later cells resolve.
# NOTE(review): the path is specific to one user's Drive layout; the recorded
# output shows it failing with Errno 2 before a shortcut path was used — confirm.
from google.colab import drive
drive.mount('/content/drive')
%cd '/content/drive/My Drive/UNI/MAGISTRALE/ANNDL/Challenge 1/ensemble_trials'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[Errno 2] No such file or directory: '/content/drive/My Drive/UNI/MAGISTRALE/ANNDL/Challenge 1/ensemble_trials'
/content/drive/.shortcut-targets-by-id/17vuJrJgDzlV1IYNFiwDvQFTgjdlP3Xnd/Challenge 1/ensemble_trials


### Dataset generator (for large training dataset)
A larger augmented, balanced dataset with 40k training samples and 5k validation samples. It is loaded into RAM one batch file at a time to avoid out-of-memory errors.

In [None]:
def load_augmented_batches(batch_dir, preprocess_fn, batch_size):
    """Endlessly yield shuffled ``(images, labels)`` mini-batches from ``.npz`` files.

    Every ``.npz`` file in ``batch_dir`` must contain the arrays ``'data'`` and
    ``'labels'``. On each pass the file order is shuffled, then each file is
    loaded whole, its samples are shuffled (images and labels together), and it
    is emitted in slices of ``batch_size``. The last slice of a file may be
    shorter than ``batch_size``.

    Args:
        batch_dir: Directory containing the ``.npz`` batch files.
        preprocess_fn: Currently unused; kept for interface compatibility.
            Images are yielded exactly as stored.
        batch_size: Maximum number of samples per yielded batch (must be > 0).

    Yields:
        Tuples ``(batch_images, batch_labels)`` of NumPy arrays.

    Raises:
        ValueError: If ``batch_size`` is not positive or ``batch_dir`` holds no
            ``.npz`` file. Because this is a generator, the error surfaces on
            the first ``next()`` call.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    batch_files = [
        os.path.join(batch_dir, file_name)
        for file_name in os.listdir(batch_dir)
        if file_name.endswith('.npz')
    ]
    if not batch_files:
        # Without this guard the loop below would spin forever without yielding.
        raise ValueError(f"No .npz files found in {batch_dir!r}")

    while True:  # Loop indefinitely
        np.random.shuffle(batch_files)  # Shuffle the order of files to introduce randomness
        for batch_file in batch_files:
            # Load the entire file, then close the archive handle right away.
            with np.load(batch_file) as batch_data:
                images = batch_data['data']
                labels = batch_data['labels']

            # Shuffle images and labels with the same permutation.
            indices = np.arange(len(images))
            np.random.shuffle(indices)
            images = images[indices]
            labels = labels[indices]

            # Slicing past the end is safe, so the final short batch needs no special case.
            for start in range(0, len(images), batch_size):
                #yield preprocess_fn(batch_images), batch_labels
                yield images[start:start + batch_size], labels[start:start + batch_size]

            # Release the file's arrays once per file rather than once per batch.
            del images, labels
            gc.collect()

In [None]:
#Fede's augmented data path
augmented_data_path_train = "../Data/Augmented_Experimental/Train"
augmented_data_path_val = "../Data/Augmented_Experimental/Validation"

# Samples stored in each .npz file (assumed identical for every file — TODO confirm)
npz_file_size = 5000
batch_size = 64

train_generator = load_augmented_batches(augmented_data_path_train, None, batch_size)
val_generator = load_augmented_batches(augmented_data_path_val, None, batch_size)

# Batches yielded per file. The generator also emits the final partial batch,
# so this must be a ceiling division, not a floor division.
batches_per_file = -(-npz_file_size // batch_size)

# Determine the number of steps per epoch (number of batches)
train_steps = sum(1 for file_name in os.listdir(augmented_data_path_train) if file_name.endswith('.npz')) * batches_per_file
val_steps = sum(1 for file_name in os.listdir(augmented_data_path_val) if file_name.endswith('.npz')) * batches_per_file

### Full dataset loading (for smaller training datasets)
An augmented dataset of 11,500 samples for training and around 1,000 for validation; it fits within the RAM constraints of Colab. The classes were not balanced during augmentation.

In [None]:
# Load data: open both archives, then pull out the image and label arrays.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects —
# only use it with trusted files.
train = np.load('../Data/Augmented_Unbalanced/Train/11.5k.npz', allow_pickle=True)
val = np.load('../Data/Augmented_Unbalanced/Validation/valid_exp_split.npz', allow_pickle=True)

X_train = train["data"]
y_train = train["labels"]

X_val = val["data"]
y_val = val["labels"]

We tried using scikit-learn's `compute_class_weight` to balance the two classes, but we did not get any significant improvement.



In [None]:
# Per-class weights from scikit-learn's 'balanced' mode, computed over the
# distinct labels found in y_train.
train_classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=train_classes, y=y_train)

# Conversion needed for fit function: map each position in `class_weights`
# to its weight.
# NOTE(review): keys are positions 0..n-1, which presumably coincide with the
# label values — confirm.
class_weight_dict = {index: weight for index, weight in enumerate(class_weights)}

## Single feature extractors


### ConvNeXtLarge

In [None]:
# Build the ConvNeXtLarge feature extractor: no classification top, global
# average pooling so the output is a flat feature vector.
with strategy.scope():
  convnext = tfk.applications.ConvNeXtLarge(
    input_shape = input_shape,
    include_top = False,
    pooling='avg'
  )
# Disabled experiment (partial fine-tuning). Kept as comments instead of a bare
# string literal so it is no longer echoed as cell output.
#  convnext.trainable = True
#  for layer in convnext.layers[:-5]:  # Replace n with the number of layers you want to freeze
#      layer.trainable = False


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/convnext/convnext_large_notop.h5


'  convnext.trainable = True\n  for layer in convnext.layers[:-5]:  # Replace n with the number of layers you want to freeze\n      layer.trainable = False'

In [None]:
# Freeze the whole ConvNeXt backbone so its weights are not updated in training.
convnext.trainable = False

In [None]:
# Print the backbone's layers, output shapes and parameter counts.
convnext.summary()

Model: "convnext_large"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 96, 96, 3)]  0           []                               
                                                                                                  
 convnext_large_prestem_normali  (None, 96, 96, 3)   0           ['input_1[0][0]']                
 zation (Normalization)                                                                           
                                                                                                  
 convnext_large_stem (Sequentia  (None, 24, 24, 192)  9792       ['convnext_large_prestem_normaliz
 l)                                                              ation[0][0]']                    
                                                                                     

### MobileNetV2

In [None]:
# Build the MobileNetV2 feature extractor: no classification top, global
# average pooling so the output is a flat feature vector.
with strategy.scope():
  mobilenet = tfk.applications.MobileNetV2(
    input_shape = input_shape,
    include_top = False,
    pooling='avg'
  )
# Disabled experiment (partial fine-tuning). Kept as comments instead of a bare
# string literal so it is no longer echoed as cell output.
#  mobilenet.trainable = True
#  for layer in mobilenet.layers[:-3]:  # Replace n with the number of layers you want to freeze
#      layer.trainable = False

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_96_no_top.h5


'  mobilenet.trainable = True\n  for layer in mobilenet.layers[:-3]:  # Replace n with the number of layers you want to freeze\n      layer.trainable = False'

In [None]:
# Freeze the whole MobileNetV2 backbone so its weights are not updated in training.
mobilenet.trainable = False

In [None]:
# Print the backbone's layers, output shapes and parameter counts.
mobilenet.summary()

Model: "mobilenetv2_1.00_96"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 96, 96, 3)]  0           []                               
                                                                                                  
 Conv1 (Conv2D)                 (None, 48, 48, 32)   864         ['input_2[0][0]']                
                                                                                                  
 bn_Conv1 (BatchNormalization)  (None, 48, 48, 32)   128         ['Conv1[0][0]']                  
                                                                                                  
 Conv1_relu (ReLU)              (None, 48, 48, 32)   0           ['bn_Conv1[0][0]']               
                                                                                

### EfficientNetV2L

In [None]:
# Build the EfficientNetV2L feature extractor: no classification top, global
# average pooling so the output is a flat feature vector.
with strategy.scope():
  efficientnet = tfk.applications.EfficientNetV2L(
    input_shape = input_shape,
    include_top = False,
    pooling='avg'
  )
# Disabled experiment (partial fine-tuning). Kept as comments instead of a bare
# string literal so it is no longer echoed as cell output.
#  efficientnet.trainable = True
#  for layer in efficientnet.layers[:-3]:  # Replace n with the number of layers you want to freeze
#      layer.trainable = False

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/efficientnet_v2/efficientnetv2-l_notop.h5


'  efficientnet.trainable = True\n  for layer in efficientnet.layers[:-3]:  # Replace n with the number of layers you want to freeze\n      layer.trainable = False'

In [None]:
# Freeze the whole EfficientNetV2L backbone so its weights are not updated in training.
efficientnet.trainable = False

In [None]:
# Print the backbone's layers, output shapes and parameter counts.
efficientnet.summary()

Model: "efficientnetv2-l"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 96, 96, 3)]  0           []                               
                                                                                                  
 rescaling (Rescaling)          (None, 96, 96, 3)    0           ['input_3[0][0]']                
                                                                                                  
 stem_conv (Conv2D)             (None, 48, 48, 32)   864         ['rescaling[0][0]']              
                                                                                                  
 stem_bn (BatchNormalization)   (None, 48, 48, 32)   128         ['stem_conv[0][0]']              
                                                                                   

### Xception

In [None]:
# Build the Xception feature extractor: no classification top, global
# average pooling so the output is a flat feature vector.
with strategy.scope():
  xception = tfk.applications.Xception(
    input_shape = input_shape,
    include_top = False,
    pooling='avg'
  )
# Disabled experiment (partial fine-tuning). Kept as comments instead of a bare
# string literal so it is no longer echoed as cell output.
#  xception.trainable = True
#  for layer in xception.layers[:-3]:  # Replace n with the number of layers you want to freeze
#      layer.trainable = False

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/xception/xception_weights_tf_dim_ordering_tf_kernels_notop.h5


'  xception.trainable = True\n  for layer in xception.layers[:-3]:  # Replace n with the number of layers you want to freeze\n      layer.trainable = False'

In [None]:
# Freeze the whole Xception backbone so its weights are not updated in training.
xception.trainable = False

In [None]:
# Print the backbone's layers, output shapes and parameter counts.
xception.summary()

Model: "xception"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, 96, 96, 3)]  0           []                               
                                                                                                  
 block1_conv1 (Conv2D)          (None, 47, 47, 32)   864         ['input_4[0][0]']                
                                                                                                  
 block1_conv1_bn (BatchNormaliz  (None, 47, 47, 32)  128         ['block1_conv1[0][0]']           
 ation)                                                                                           
                                                                                                  
 block1_conv1_act (Activation)  (None, 47, 47, 32)   0           ['block1_conv1_bn[0][0]'] 

## Single models
Creation of the individual models. Each one is based on the feature-extraction part of one of the `keras.applications` models, followed by one or more dense layers (ReLU) and the usual single-node output layer (sigmoid).

In [None]:
dense_units = [512,512,512]


def build_single_model(backbone, prefix, units, l2_factor=1e-4):
    """Attach a dense classification head to ``backbone`` and return the model.

    The head is a stack of ``Dense(relu) -> BatchNormalization`` pairs, one per
    entry in ``units``, followed by a single-unit sigmoid output. Every Dense
    kernel is L2-regularised with ``l2_factor``.

    Layer names follow the scheme ``<prefix>_dense``, ``<prefix>_dense_2``,
    ``<prefix>_dense_3``, ... and ``<prefix>_output``.

    Args:
        backbone: Keras model whose ``output`` feeds the head and whose
            ``inputs`` become the inputs of the returned model.
        prefix: String used to prefix the head's layer names.
        units: Sequence with the width of each hidden Dense layer.
        l2_factor: L2 regularisation factor applied to every Dense kernel.

    Returns:
        A ``Model`` mapping ``backbone.inputs`` to the sigmoid prediction.
    """
    x = backbone.output
    for position, width in enumerate(units, start=1):
        # The first hidden layer has no numeric suffix; later ones get _2, _3, ...
        layer_name = f"{prefix}_dense" if position == 1 else f"{prefix}_dense_{position}"
        x = tfk.layers.Dense(
            width,
            name=layer_name,
            kernel_regularizer=tfk.regularizers.l2(l2_factor),
            activation='relu',
        )(x)
        x = tfk.layers.BatchNormalization()(x)
    prediction = tfk.layers.Dense(
        1,
        activation='sigmoid',
        kernel_regularizer=tfk.regularizers.l2(l2_factor),
        name=f"{prefix}_output",
    )(x)
    return Model(inputs=backbone.inputs, outputs=prediction)


# NOTE(review): the heads are built outside `strategy.scope()` (the original
# `with` statement was commented out) — confirm this is intended on TPU.
# Build order is kept (convnext, mobilenet, efficientnet, xception) so that
# auto-generated layer/model names stay the same.
convnext_model = build_single_model(convnext, "convnext", dense_units)
mobilenet_model = build_single_model(mobilenet, "mobilenet", dense_units)
efficientnet_model = build_single_model(efficientnet, "efficientnet", dense_units)
xception_model = build_single_model(xception, "xception", dense_units)

## Ensembling all single models into one

In [None]:
#with strategy.scope():
# Feed one shared input tensor to each single-backbone classifier and average
# their sigmoid outputs to obtain the ensemble prediction.
single_models = [convnext_model, mobilenet_model, efficientnet_model, xception_model]
inputs = tfk.Input(shape=input_shape)
model_outputs = [member(inputs) for member in single_models]
ensemble_output = Average()(model_outputs)
ensemble_model = Model(inputs=inputs, outputs=ensemble_output, name='ensemble')

# Use custom lr scheduler: start at 1e-3 and multiply by 0.9 every 900
# optimizer steps, in discrete jumps (staircase). With the 11k dataset there
# are roughly 370 steps per epoch, so the first drop comes after about 3 epochs.
lr_schedule = tfk.optimizers.schedules.ExponentialDecay(
  initial_learning_rate=1e-3,
  decay_steps=900,
  decay_rate=0.9,
  staircase=True
)
optimizer = tfk.optimizers.AdamW(learning_rate = lr_schedule)

ensemble_model.compile(
    optimizer=optimizer,
    loss=tfk.losses.BinaryCrossentropy(),
    metrics=['accuracy']
)
ensemble_model.summary()

#callbacks
# Stop after 30 epochs without a better validation accuracy and restore the
# best weights.
es = tfk.callbacks.EarlyStopping(monitor='val_accuracy', mode='max', patience=30, restore_best_weights=True)
# NOTE(review): the optimizer already follows `lr_schedule`; check that
# ReduceLROnPlateau can still adjust the learning rate in this TF version.
reduce_lr_on_plateau = tfk.callbacks.ReduceLROnPlateau(
  monitor='val_loss',
  factor=0.9,
  patience=5
)

Model: "ensemble"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_6 (InputLayer)           [(None, 96, 96, 3)]  0           []                               
                                                                                                  
 model_4 (Functional)           (None, 1)            197017793   ['input_6[0][0]']                
                                                                                                  
 model_5 (Functional)           (None, 1)            2914369     ['input_6[0][0]']                
                                                                                                  
 model_6 (Functional)           (None, 1)            118403233   ['input_6[0][0]']                
                                                                                           

## Training

### Train with data generator

In [None]:
gc.collect()
#with strategy.scope():
# Train from the endless batch generators. Because they never stop, the
# explicit step counts define where each epoch and validation pass ends.
fit_result = ensemble_model.fit(
    train_generator,
    epochs = 1000,
    steps_per_epoch = train_steps,
    validation_data = val_generator,
    validation_steps = val_steps,
    callbacks = [es, reduce_lr_on_plateau],
)
# Keep only the per-epoch metrics dictionary for plotting.
history = fit_result.history

### Train on full data

In [None]:
gc.collect()
with strategy.scope():
  # Train on the in-memory arrays. Early stopping is the only callback here,
  # and class weighting stays disabled.
  fit_result = ensemble_model.fit(
    x = X_train,
    y = y_train,
    batch_size = 64,
    epochs = 1000,
    #class_weight = class_weight_dict,
    validation_data = (X_val, y_val),
    callbacks = [es],
  )
  # Keep only the per-epoch metrics dictionary for plotting.
  history = fit_result.history

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000

## Plot history

In [None]:
# Plot the training: one figure per metric, with the training curve dashed and
# faint and the validation curve solid and labelled.
curve_color = '#4D61E2'
for train_key, val_key, figure_title in (
    ('loss', 'val_loss', 'Binary Crossentropy'),
    ('accuracy', 'val_accuracy', 'Accuracy'),
):
    plt.figure(figsize=(15,5))
    plt.plot(history[train_key], alpha=.3, color=curve_color, linestyle='--')
    plt.plot(history[val_key], label='ensemble model', alpha=.8, color=curve_color)
    plt.legend(loc='upper left')
    plt.title(figure_title)
    plt.grid(alpha=.3)

plt.show()

dense12, no lr schedule, no lr reduce, no kernel reg: <86 \\
dense12, no lr schedule, no lr reduce, with kernel reg: 87.21 \\
dense12, with lr schedule, no lr reduce, with kernel reg: 87.41 \\
dense16, with lr schedule, no lr reduce, with kernel reg: 88.11 \\
same as above but with patience 30: 88.91

## Save (and reload)

In [None]:
# Save the ensemble to the directory 'ensemble_model_v2_DENSE512', routing
# file I/O through the '/job:localhost' device.
save_locally = tf.saved_model.SaveOptions(experimental_io_device='/job:localhost')
ensemble_model.save(filepath='ensemble_model_v2_DENSE512', options=save_locally)
# Variant without the save options:
#ensemble_model.save('ensemble_model_v2_DENSE512')

In [None]:
# Drop the in-memory model before it is reloaded from disk in the next cell.
del ensemble_model

In [None]:
# Reload the model from the directory written by the save cell
# ('ensemble_model_v2_DENSE512'). The previous path 'ensemble_model_dense512'
# did not match it.
ensemble_model = tfk.models.load_model('ensemble_model_v2_DENSE512')

In [None]:
test_dir = '../Data/Augmented/Test'

# Read from the test directory (the original passed the undefined `train_dir`
# and `preprocess_input`). No preprocessing function is passed, matching the
# train/validation generators.
test_generator = load_augmented_batches(test_dir, None, batch_size)

# Count the batches the generator yields in one pass over the test files:
# ceil(samples / batch_size) per file, taken from each file's 'labels' array.
test_steps = 0
for file_name in os.listdir(test_dir):
    if file_name.endswith('.npz'):
        with np.load(os.path.join(test_dir, file_name)) as archive:
            test_steps += -(-len(archive['labels']) // batch_size)

In [None]:
# Evaluate the reloaded ensemble on the test batches. The original called the
# undefined name `model`.
ensemble_model.evaluate(
    x=test_generator,
    steps=test_steps
)