# Ensemble Learning

## Import libraries

In [None]:
# Fix randomness and hide warnings
# SEED is the single seed used for every RNG seeded in this cell and for the
# dataset splits performed later in the notebook.
SEED = 42

import os
# These environment variables are set before TensorFlow and matplotlib are
# imported; presumably the libraries read them at import time, so keep this
# ordering.
# NOTE(review): PYTHONHASHSEED set from inside a running interpreter normally
# only affects child processes - confirm this is sufficient for the use case.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['PYTHONHASHSEED'] = str(SEED)
os.environ['MPLCONFIGDIR'] = os.getcwd()+'/configs/'

import warnings
# Ignore warnings globally. The second filter targets the base class Warning,
# so it already covers FutureWarning as well.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)

import numpy as np
np.random.seed(SEED)  # seed NumPy's global RNG

import logging

import random
random.seed(SEED)  # seed Python's built-in RNG



# Import tensorflow
import tensorflow as tf
from tensorflow import keras as tfk
from tensorflow.keras import layers as tfkl
# Lower TensorFlow's own logging to errors only, then seed its RNGs
# (both the current and the compat.v1 seeding entry points are called).
tf.autograph.set_verbosity(0)
tf.get_logger().setLevel(logging.ERROR)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
tf.random.set_seed(SEED)
tf.compat.v1.set_random_seed(SEED)


# Import other libraries
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import seaborn as sns
# preprocess_input is applied to the images inside predict() further down.
from keras.applications.convnext import preprocess_input
from keras.models import Model, load_model
from tensorflow.keras.utils import plot_model
from sklearn.utils import shuffle
# os was already imported at the top of this cell; the repeated import is harmless.
import os
import shutil

## Load and process the dataset

In [None]:
# Load the dataset archive and pull out the image array and the label array.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
# only use it on archives from a trusted source.
items = np.load('/kaggle/input/dataset/dataset_clean.npz', allow_pickle=True)
leaves = items['data']
labels = items['labels']

# Per-sample shape (everything after the leading sample axis)
print(f'Input shape: {leaves.shape[1:]}\n')

# Calculate the unique target labels and their counts
unique, count = np.unique(labels, return_counts=True)
print('Target labels:', unique)
# np.unique returns labels and counts in matching order, so pair them
# positionally. Indexing the arrays with the label value itself is only
# correct when the labels happen to be exactly the integers 0..k-1.
for label, n_samples in zip(unique, count):
    print(f'Class {label} has {n_samples} samples')

In [None]:
# Convert labels to one-hot encoding format (two classes)
labels = tfk.utils.to_categorical(labels, 2)

# Random shuffle, keeping images and labels aligned.
# The seed is passed explicitly so the permutation does not depend on how much
# of NumPy's global RNG stream was consumed before this cell ran (e.g. when the
# cell is executed a second time), matching the seeded splits below.
leaves, labels = shuffle(leaves, labels, random_state=SEED)

# Split data into train_val and test sets (10% test), stratified on the class
# index recovered from the one-hot labels
X_train_val, X_test, y_train_val, y_test = train_test_split(
    leaves,
    labels,
    random_state=SEED,
    test_size=0.1,
    stratify=np.argmax(labels, axis=1),
)

# Further split train_val into train and validation sets; an integer test_size
# is an absolute sample count, so validation gets as many samples as test
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    random_state=SEED,
    test_size=len(X_test),
    stratify=np.argmax(y_train_val, axis=1),
)

# Per-sample input shape and number of output classes, reused when building models
INPUT_SHAPE = X_train.shape[1:]
OUTPUT_SHAPE = y_train.shape[-1]

# Print shapes of the datasets
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_val shape: {X_val.shape}, y_val shape: {y_val.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

## Ensemble Learning

In [None]:
# The first `num_models` entries of `path_models` are the saved member models;
# the final entry is the location reserved for the ensemble output.
num_models = 6
path_models = [
    "...model1...",
    "...model2...",
    "...model3...",
    "...model4...",
    "...model5...",
    "...model6...",
    "/kaggle/working/ModelEnsemble",  # output model
]
models = []

# Load every member model and rename it (model_1, model_2, ...) so that each
# one carries a unique name.
for i, member_path in enumerate(path_models[:num_models]):
    modelTemp = load_model(member_path)
    modelTemp._name = f"model_{i + 1}"
    models.append(modelTemp)

In [None]:
def ensembleModels(models, model_input):
    """Build a model that averages the outputs of several models.

    Args:
        models: Iterable of callables (the loaded Keras models); each one is
            applied to ``model_input``.
        model_input: Input tensor shared by every member model.

    Returns:
        A ``Model`` named ``'ensemble'`` that maps ``model_input`` to the
        result of ``tfkl.average`` over all member outputs.
    """
    # Apply each member to the shared input and keep its output tensor
    member_outputs = []
    for member in models:
        member_outputs.append(member(model_input))

    # Combine the member outputs into a single averaged tensor
    averaged = tfkl.average(member_outputs)

    # Wrap the shared input and the averaged output into one model
    return Model(inputs=model_input, outputs=averaged, name='ensemble')

# Input tensor with the per-sample shape of the training data (INPUT_SHAPE)
model_input = tfk.Input(shape=INPUT_SHAPE)
# Connect every loaded model to that input and average their outputs
modelEns = ensembleModels(models, model_input)
# Print the layer summary of the resulting ensemble
modelEns.summary()

In [None]:
# Draw the ensemble's graph, annotated with tensor shapes and layer names
plot_model(modelEns, show_shapes=True, show_layer_names=True)

In [None]:
# Save the ensemble under the relative path "model" (resolved against the
# current working directory).
# NOTE(review): the output location listed as the last entry of path_models
# is not used here - confirm which destination is intended.
modelEns.save("model")

In [None]:
# Switch to the Kaggle working directory, then use an IPython shell escape to
# recursively zip that directory into file.zip (created in the current directory)
os.chdir(r'/kaggle/working')
!zip -r file.zip /kaggle/working

In [None]:
def predict(model, X_test, y_test):
    """Evaluate ``model`` on a labelled set, print metrics and plot the confusion matrix.

    Args:
        model: Object exposing ``predict``; its output is arg-maxed over the
            last axis to obtain the predicted class index.
        X_test: Input samples. They are multiplied by 255 and passed through
            ``preprocess_input`` before prediction.
            NOTE(review): the ``* 255`` suggests the samples are stored in
            the [0, 1] range - confirm against the dataset.
        y_test: Targets with one score/indicator per class on the last axis
            (e.g. one-hot); arg-maxed to obtain the true class index.

    Returns:
        None. Metrics are printed and the confusion matrix is plotted.
    """
    # Predict labels for the entire test set
    predictions = model.predict(preprocess_input(X_test * 255))

    # Convert targets and predictions to class indices once and reuse them
    y_true = np.argmax(y_test, axis=-1)
    y_pred = np.argmax(predictions, axis=-1)

    # Compute classification metrics (macro-averaged over the classes)
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')

    # Display the computed metrics. The built-in round() accepts both plain
    # Python floats and NumPy scalars, whereas the `.round` method only exists
    # on NumPy scalars and would fail if a metric is returned as a float.
    print('Accuracy:', round(accuracy, 4))
    print('Precision:', round(precision, 4))
    print('Recall:', round(recall, 4))
    print('F1:', round(f1, 4))

    # Compute the confusion matrix, normalized over the true labels (rows)
    cm = confusion_matrix(y_true, y_pred, normalize="true")
    print(f"Confusion matrix : {cm}")

    # Plot the confusion matrix. It is transposed so that rows (y axis) are
    # the predicted labels and columns (x axis) are the true labels.
    class_names = ['healthy', 'unhealthy']
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm.T, xticklabels=class_names, yticklabels=class_names, cmap='Blues', annot=True)
    plt.xlabel('True labels')
    plt.ylabel('Predicted labels')
    plt.show()

In [None]:
# Report the ensemble's metrics and confusion matrix on the held-out test split
print("Ensemble model")
predict(modelEns, X_test, y_test)

In [None]:
# shutil.rmtree("/kaggle/working")