In [95]:
import os
import pandas as pd
import numpy as np
import cv2
from keras import backend as K
from keras.layers import Input, Dense, Lambda, Convolution2D, MaxPooling2D, GlobalAveragePooling2D, UpSampling2D, Reshape, Flatten
from keras.models import Model
from keras.utils import to_categorical
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.utils import class_weight
import tensorflow as tf
import optuna
import mlflow
from mlflow.models import infer_signature

# Switch TensorFlow to graph (non-eager) mode for the whole notebook.
# NOTE(review): presumably required because the VAE loss defined later closes over
# the encoder's symbolic z_mean / z_log_var tensors — confirm before removing.
tf.compat.v1.disable_eager_execution()

In [2]:
# Show the current working directory; the relative "public/..." paths used below resolve against it.
pwd

'C:\\Users\\admin\\Downloads'

In [3]:
groundtruth_file_path = "public/public.csv"  # ground-truth CSV, relative to the working directory

In [4]:
# Load the ground-truth table (image file names and their labels) for inspection.
groundtruth_df = pd.read_csv(groundtruth_file_path)


In [5]:
# Display the table as the cell's output.
groundtruth_df

Unnamed: 0,name,ground truth
0,S-2006-005094_PAS_1of2_64552732435c92704a3d37c...,0
1,S-2006-005094_PAS_1of2_64552732435c92704a3d37c...,0
2,S-2006-005094_PAS_1of2_64552732435c92704a3d37c...,0
3,S-2006-005094_PAS_1of2_64552732435c92704a3d37c...,0
4,S-2006-005094_PAS_1of2_64552732435c92704a3d37d...,0
...,...,...
5753,S-2001-019413_PAS_1of2_64762d5a435c92704a44caf...,1
5754,S-2001-019413_PAS_1of2_64762d5a435c92704a44caf...,1
5755,S-2001-019413_PAS_1of2_64762d5a435c92704a44caf...,1
5756,S-2001-019413_PAS_1of2_64762d5a435c92704a44caf...,1


In [6]:
# List the column labels of the ground-truth table.
groundtruth_df.columns


Index(['name', 'ground truth'], dtype='object')

In [7]:
# Peek at the first record only: print its label, then its file name.
for index, series in groundtruth_df.iterrows():
    filename, label = series['name'], series['ground truth']
    print(label, filename, sep="\n")
    break

0
S-2006-005094_PAS_1of2_64552732435c92704a3d37ca.png


In [8]:
# Same first-row peek using itertuples without the index: positional field 0 is
# the file name and field 1 is the label. Also print the label's Python type.
for row in groundtruth_df.itertuples(index=False):
    filename, label = row[0], row[1]
    print(label, filename, type(label), sep="\n")
    break

0
S-2006-005094_PAS_1of2_64552732435c92704a3d37ca.png
<class 'int'>


In [9]:
# Count missing values per column.
groundtruth_df.isna().sum()

name            0
ground truth    0
dtype: int64

In [10]:
# Count distinct values per column.
groundtruth_df.nunique()

name            5758
ground truth       2
dtype: int64

In [11]:
# Load and preprocess the dataset using the folder path
def image_preprocessing():
    """Load every labelled image, normalise it, and split into train/val and test sets.

    Reads ``public/public.csv``; in each row the first field is the image file
    name and the second field is the label. Images labelled 1 are read from
    ``public/globally_sclerotic_glomeruli``; all other images are read from
    ``public/non_globally_sclerotic_glomeruli``. Every image is resized to
    224x224 and divided by 255.

    Returns:
        tuple: ``(train_val_images, train_val_labels, test_images, test_labels)``
        from a single stratified 80/20 shuffle split (``random_state=42``).
        Images are float32 arrays of shape ``(n, 224, 224, 3)``.

    Raises:
        FileNotFoundError: if an image listed in the CSV cannot be read from disk.
    """
    sclerotic_folder_path = "public/globally_sclerotic_glomeruli"
    non_sclerotic_folder_path = "public/non_globally_sclerotic_glomeruli"
    groundtruth_file_path = "public/public.csv"
    groundtruth_df = pd.read_csv(groundtruth_file_path)

    # Preallocate one float32 array instead of collecting float64 images in a list
    # and copying them with np.array(); this keeps peak memory to a fraction of the
    # original while Keras would cast the inputs to float32 anyway.
    images = np.empty((len(groundtruth_df), 224, 224, 3), dtype=np.float32)
    labels = []

    for position, row in enumerate(groundtruth_df.itertuples(index=False)):
        label = row[1]
        filename = row[0]
        if label == 1:
            image_path = os.path.join(sclerotic_folder_path, filename)
        else:
            image_path = os.path.join(non_sclerotic_folder_path, filename)

        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals a missing/unreadable file by returning None; fail
            # here with the offending path instead of crashing inside cv2.resize.
            raise FileNotFoundError(f"Could not read image: {image_path}")

        image = cv2.resize(image, (224, 224))
        images[position] = image.astype(np.float32) / 255.0
        labels.append(label)

    labels = np.array(labels)

    # Split the dataset into train/val and test sets, preserving class proportions.
    strat_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_val_idx, test_idx = next(strat_split.split(images, labels))
    train_val_images, test_images = images[train_val_idx], images[test_idx]
    train_val_labels, test_labels = labels[train_val_idx], labels[test_idx]

    return train_val_images, train_val_labels, test_images, test_labels


In [12]:
# Sampling layer body: draws z from the (z_mean, z_log_var) pair produced by the encoder
def sampling(args):
    """Return ``z_mean + exp(0.5 * z_log_var) * epsilon`` with standard-normal epsilon.

    Args:
        args: a pair ``(z_mean, z_log_var)`` of tensors; the noise is drawn with
            the shape given by the first two dynamic dimensions of ``z_mean``.
    """
    mean, log_var = args
    # Use the runtime shape so the noise matches whatever batch size is fed in.
    n_rows = tf.shape(mean)[0]
    n_cols = tf.shape(mean)[1]
    noise = tf.random.normal(shape=(n_rows, n_cols))
    scale = tf.exp(0.5 * log_var)
    return mean + scale * noise

In [13]:
# Create the Encoder-Decoder model
def encoder_decoder(num_layers, activation_function, filter_counts, latent_dimension=3):
    """Build the combined VAE/classifier model together with its encoder and decoder.

    Args:
        num_layers: number of convolution layers in the encoder; the decoder
            mirrors it with ``num_layers - 1`` conv + upsampling stages.
        activation_function: activation used by the conv layers and the first
            decoder dense layer.
        filter_counts: sequence indexed ``0 .. num_layers - 1`` giving the
            number of filters for each encoder convolution.
        latent_dimension: size of the latent vector (default 3).

    Returns:
        tuple: ``(vae_model, encoder, decoder)``. ``vae_model`` maps a
        224x224x3 input to ``[classification_output, decoded image]``;
        ``encoder`` outputs ``[z_mean, z_log_var, z]``.
    """
    image_input = Input(shape=(224, 224, 3))

    # ---- Encoder: conv stack, with 2x2 max-pooling after every conv but the last ----
    features = image_input
    final_conv_index = num_layers - 1
    for layer_index in range(num_layers):
        conv_layer = Convolution2D(
            filters=filter_counts[layer_index],
            kernel_size=(3, 3),
            padding="same",
            activation=activation_function,
        )
        features = conv_layer(features)
        if layer_index != final_conv_index:
            features = MaxPooling2D((2, 2), padding='same')(features)

    # Remember the feature-map shape so the decoder can rebuild it from the latent vector.
    conv_output_shape = K.int_shape(features)
    flat_features = Flatten()(features)
    z_mean = Dense(latent_dimension, name='z_mean')(flat_features)
    z_log_var = Dense(latent_dimension, name='z_log_var')(flat_features)
    z = Lambda(sampling, name='sampling')([z_mean, z_log_var])

    encoder = Model(image_input, [z_mean, z_log_var, z], name='encoder')

    # ---- Decoder: dense projection back to the feature map, then conv + 2x upsampling ----
    decoder_input = Input(shape=(latent_dimension,))
    feature_map_dims = conv_output_shape[1:]
    rebuilt = Dense(np.prod(feature_map_dims), activation=activation_function)(decoder_input)
    rebuilt = Reshape((feature_map_dims[0], feature_map_dims[1], feature_map_dims[2]))(rebuilt)

    # Walk the encoder's filter counts backwards, skipping the last encoder conv.
    for layer_index in range(num_layers - 2, -1, -1):
        rebuilt = Convolution2D(
            filters=filter_counts[layer_index],
            kernel_size=(3, 3),
            padding="same",
            activation=activation_function,
        )(rebuilt)
        rebuilt = UpSampling2D((2, 2))(rebuilt)

    reconstructed_image = Convolution2D(3, (3, 3), padding='same', activation='sigmoid')(rebuilt)
    decoder = Model(decoder_input, reconstructed_image, name='decoder')

    # ---- Joint model: reconstruction plus a 2-way softmax classifier on the sampled z ----
    z_decoded = decoder(z)
    classification_output = Dense(2, activation='softmax', name='classification_output')(z)
    vae_model = Model(image_input, [classification_output, z_decoded], name='vae')

    return vae_model, encoder, decoder


In [14]:
# Computing the VAE loss
def vae_loss(inputs, decoded_output, z_mean, z_log_var):
    """Return the batch mean of (reconstruction loss + KL divergence).

    Args:
        inputs: the original images.
        decoded_output: the decoder's reconstruction of ``inputs``.
        z_mean: latent mean tensor.
        z_log_var: latent log-variance tensor.
    """
    # Binary cross-entropy between input and reconstruction, summed over axes 1 and 2.
    crossentropy_map = tf.keras.losses.binary_crossentropy(inputs, decoded_output)
    reconstruction_term = tf.reduce_sum(crossentropy_map, axis=(1, 2))

    # KL term: -0.5 * sum(1 + log_var - mean^2 - exp(log_var)) over the latent axis.
    kl_summand = 1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
    kl_term = -0.5 * tf.reduce_sum(kl_summand, axis=1)

    return tf.reduce_mean(reconstruction_term + kl_term)

In [15]:
def vae_loss_parameters(z_mean, z_log_var):
    """Bind the latent tensors into a two-argument loss callable.

    The returned function has the ``(targets, predictions)`` shape that
    ``Model.compile`` is given in this notebook and forwards to ``vae_loss``
    together with the captured ``z_mean`` and ``z_log_var``.
    """
    # The inner function keeps the name ``loss``; it is the callable handed to compile().
    def loss(target_images, reconstructed_images):
        return vae_loss(target_images, reconstructed_images, z_mean, z_log_var)

    return loss

In [16]:
# Objective function for hyperparameters tuning
def _build_compiled_vae(num_layers, activation_function, filter_counts, latent_dimension):
    """Build a freshly initialised VAE/classifier and compile it with the joint loss."""
    vae_model, encoder, _decoder = encoder_decoder(num_layers, activation_function, filter_counts, latent_dimension)
    z_mean, z_log_var, _ = encoder.output
    vae_model.compile(optimizer=tf.keras.optimizers.Adam(),
                      loss={'classification_output': 'categorical_crossentropy',
                            'decoder': vae_loss_parameters(z_mean, z_log_var)},
                      metrics={'classification_output': 'accuracy'})
    return vae_model


def objective(trial, train_val_images, train_val_labels):
    """Optuna objective: mean final-epoch validation loss over 5 stratified folds.

    Samples the number of conv layers (1-5), the filter count of each layer
    (16-128) and the activation function, then runs stratified 5-fold
    cross-validation (20 epochs, batch size 32 per fold).

    A new model is built for every fold. Reusing one model across folds would
    carry trained weights over, so later folds would be validated on images the
    model had already been trained on, biasing the tuning metric.

    Side effects:
        * Calls ``K.clear_session()`` before each fold to release the previous
          fold's graph; models built earlier in the kernel become unusable.
        * Writes the aggregated fold metrics to
          ``public/k_fold_validation_metrics.csv``. The file is overwritten on
          every call, so it only holds the most recent trial; the same values
          are therefore also stored on the trial via ``trial.set_user_attr``.

    Args:
        trial: the Optuna trial used to sample hyperparameters.
        train_val_images: image array used for training/validation folds.
        train_val_labels: integer class labels aligned with ``train_val_images``.

    Returns:
        The mean of the per-fold final-epoch validation losses.
    """
    num_layers = trial.suggest_int("num_layers", 1, 5)
    filter_counts = [trial.suggest_int(f"filters_{i}", 16, 128) for i in range(num_layers)]
    activation_function = trial.suggest_categorical("activation_function", ["relu", "sigmoid"])
    latent_dimension = 3

    # K-Fold Cross-Validation during Hyperparameter Tuning
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=223)
    fold_scores = {'val_loss': [], 'precision': [], 'recall': [], 'f1': [], 'auc': []}

    for train_idx, val_idx in skf.split(train_val_images, train_val_labels):
        # Start every fold from scratch so no weights leak between folds.
        K.clear_session()
        vae_model = _build_compiled_vae(num_layers, activation_function, filter_counts, latent_dimension)

        train_images, val_images = train_val_images[train_idx], train_val_images[val_idx]
        train_labels, val_labels = train_val_labels[train_idx], train_val_labels[val_idx]
        # Pin the width to the classifier's two outputs regardless of which labels a fold contains.
        one_hot_train_labels = to_categorical(train_labels, num_classes=2)
        one_hot_val_labels = to_categorical(val_labels, num_classes=2)

        history = vae_model.fit(train_images, [one_hot_train_labels, train_images], batch_size=32, epochs=20,
                                validation_data=(val_images, [one_hot_val_labels, val_images]), verbose=0)
        fold_scores['val_loss'].append(history.history['val_loss'][-1])

        # The first model output is the class-probability head.
        val_predictions = vae_model.predict(val_images)[0]
        val_pred_labels = np.argmax(val_predictions, axis=1)
        fold_scores['precision'].append(precision_score(val_labels, val_pred_labels, average='weighted'))
        fold_scores['recall'].append(recall_score(val_labels, val_pred_labels, average='weighted'))
        fold_scores['f1'].append(f1_score(val_labels, val_pred_labels, average='weighted'))
        fold_scores['auc'].append(roc_auc_score(one_hot_val_labels, val_predictions, multi_class='ovr'))

    # Aggregate as avg_<metric>/std_<metric> pairs, in the same column order as before.
    metrics_results = {}
    for metric_name, values in fold_scores.items():
        metrics_results[f'avg_{metric_name}'] = np.mean(values)
        metrics_results[f'std_{metric_name}'] = np.std(values)

    # Keep the per-trial metrics on the trial itself; the CSV below only survives for the last trial.
    for key, value in metrics_results.items():
        trial.set_user_attr(key, float(value))

    metrics_df = pd.DataFrame([metrics_results])
    metrics_df.to_csv('public/k_fold_validation_metrics.csv', index=False)
    return metrics_results['avg_val_loss']

In [17]:
# Create Optuna study for Hyperparameter Tuning
def optuna_study(train_val_images, train_val_labels, n_trials=10):
    """Run an Optuna study that minimises the value returned by ``objective``.

    Args:
        train_val_images: image array forwarded to ``objective``.
        train_val_labels: labels forwarded to ``objective``.
        n_trials: number of trials to run (default 10, the previously
            hard-coded value).

    Returns:
        The finished ``optuna`` study object.
    """
    study = optuna.create_study(direction="minimize")

    def run_trial(trial):
        # Bind the dataset so Optuna only has to supply the trial.
        return objective(trial, train_val_images, train_val_labels)

    study.optimize(run_trial, n_trials=n_trials)
    return study

In [19]:
# Load the dataset: reads and resizes every image, then returns the stratified
# train/val and test splits produced by image_preprocessing().
train_val_images, train_val_labels, test_images, test_labels = image_preprocessing()

[20]

In [55]:
# Run hyperparameter tuning on the train/val split only (the test split is not passed in).
study = optuna_study(train_val_images, train_val_labels)

[I 2024-10-14 14:30:45,105] A new study created in memory with name: no-name-855d8e10-861e-44ae-995a-b20accf5d6b3
[W 2024-10-14 14:30:50,265] Trial 0 failed with parameters: {'num_layers': 5, 'filters_0': 49, 'filters_1': 119, 'filters_2': 126, 'filters_3': 96, 'filters_4': 19, 'activation_function': 'relu'} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "C:\Users\admin\anaconda3\envs\ciml\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\admin\AppData\Local\Temp\ipykernel_7500\4131729830.py", line 4, in <lambda>
    study.optimize(lambda trial: objective(trial, train_val_images, train_val_labels), n_trials=10)
  File "C:\Users\admin\AppData\Local\Temp\ipykernel_7500\2394663582.py", line 31, in objective
    history = vae_model.fit(train_images, [one_hot_train_labels, train_images], batch_size=32, epochs=20,
  File "C:\Users\admin\anaconda3\envs\ciml\lib\site-packages\

KeyboardInterrupt: 

In [101]:
def deployed_model_workflow(train_val_images, train_val_labels, test_images, test_labels, optimal_params):
    """Train the final model, evaluate it on the test set, and log everything to MLflow.

    Args:
        train_val_images: images used to fit the final model.
        train_val_labels: integer class labels aligned with ``train_val_images``.
        test_images: held-out images used only for evaluation.
        test_labels: integer class labels aligned with ``test_images``.
        optimal_params: dict with keys ``"num_layers"``, ``"activation_function"``
            and ``"filters_<i>"`` for each ``i`` in ``range(num_layers)``.

    Side effects:
        Ends any active MLflow run, points MLflow at ``http://127.0.0.1:8080``,
        logs params/metrics/model under the run ``Final_VAE_Model`` and writes
        ``best_deployed_vae_model.h5`` to the working directory.

    Returns:
        The trained Keras model.
    """
    if mlflow.active_run():
        mlflow.end_run()

    mlflow.set_tracking_uri("http://127.0.0.1:8080")
    mlflow.set_experiment("Deep learning classification sclerotic vs non-sclerotic")

    num_layers = optimal_params["num_layers"]
    activation_function = optimal_params["activation_function"]
    # Computed once and reused for both model construction and parameter logging.
    filter_counts = [optimal_params[f"filters_{i}"] for i in range(num_layers)]

    vae_model, encoder, decoder = encoder_decoder(
        num_layers=num_layers,
        activation_function=activation_function,
        filter_counts=filter_counts
    )

    z_mean, z_log_var, _ = encoder.output

    vae_model.compile(optimizer="adam",
                      loss={'classification_output': 'categorical_crossentropy',
                            'decoder': vae_loss_parameters(z_mean, z_log_var)},
                      metrics={'classification_output': ['accuracy',
                                                         tf.keras.metrics.Precision(name='precision'),
                                                         tf.keras.metrics.Recall(name='recall'),
                                                         tf.keras.metrics.AUC(name='auc')]})

    # Pin the one-hot width to the classifier's two outputs.
    one_hot_train_labels = tf.keras.utils.to_categorical(train_val_labels, num_classes=2)

    history = vae_model.fit(train_val_images,
                            [one_hot_train_labels, train_val_images],
                            batch_size=32,
                            epochs=50,
                            verbose=1)

    one_hot_test_labels = tf.keras.utils.to_categorical(test_labels, num_classes=2)
    test_results = vae_model.evaluate(test_images, [one_hot_test_labels, test_images], verbose=1)

    with mlflow.start_run(run_name="Final_VAE_Model"):
        mlflow.log_param("num_layers", num_layers)
        mlflow.log_param("activation_function", activation_function)
        mlflow.log_param("filter_counts", filter_counts)

        # NOTE(review): assumes evaluate() returns [total loss, classification loss,
        # decoder loss, accuracy, precision, recall, auc] — confirm against
        # vae_model.metrics_names if the compile() call changes.
        mlflow.log_metric("test_classification_accuracy", test_results[3])
        mlflow.log_metric("test_precision", test_results[4])
        mlflow.log_metric("test_recall", test_results[5])
        mlflow.log_metric("test_auc", test_results[6])

        vae_model.save('best_deployed_vae_model.h5')

        # Only shapes and dtypes matter for the signature, so a handful of images is
        # enough; predicting on the entire training set here was needlessly slow.
        signature_inputs = train_val_images[:8]
        signature_outputs = vae_model.predict(signature_inputs)
        # The model has two outputs; describe them as a name -> array mapping.
        named_outputs = dict(zip(vae_model.output_names, signature_outputs))
        signature = infer_signature(signature_inputs, named_outputs)
        mlflow.keras.log_model(vae_model, "vae_model", signature=signature)

    # The ``with`` block above already ended the run, so no explicit end_run() is needed.
    return vae_model


Optimal parameters, taken from the first trial (Optuna log line): [I 2024-10-14 12:54:47,387] Trial 0 finished with value: 30464.67412810825 and parameters: {'num_layers': 3, 'filters_0': 39, 'filters_1': 105, 'filters_2': 21, 'activation_function': 'relu'}. Best is trial 0 with value: 30464.67412810825.

In [102]:
# Hyperparameters copied by hand from the trial-0 log line: three conv layers with ReLU.
optimal_params = {
    "num_layers": 3,
    "filters_0": 39, "filters_1": 105, "filters_2": 21,
    "activation_function": "relu",
}

# Fit on the train/val split, evaluate on the held-out test split, and log the run.
deployed_model = deployed_model_workflow(
    train_val_images=train_val_images,
    train_val_labels=train_val_labels,
    test_images=test_images,
    test_labels=test_labels,
    optimal_params=optimal_params,
)


2024/10/15 04:40:23 INFO mlflow.tracking.fluent: Experiment with name 'Deep learning classification sclerotic vs non-sclerotic' does not exist. Creating a new experiment.


Train on 4606 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


2024/10/15 17:30:02 INFO mlflow.tracking._tracking_service.client: 🏃 View run Final_VAE_Model at: http://127.0.0.1:8080/#/experiments/11/runs/12bd7fbc508c469dbaf12d25e9d3b80c.
2024/10/15 17:30:02 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/11.


KeyboardInterrupt: 