In [None]:
pip install scikeras

Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
# Process-wide settings that must be in place before TensorFlow is imported.
# NOTE(review): "-1" presumably hides every CUDA device so training runs on CPU,
# and level "2" presumably silences TensorFlow's INFO/WARNING C++ logs — confirm.
os.environ.update({
    "CUDA_VISIBLE_DEVICES": "-1",
    "TF_CPP_MIN_LOG_LEVEL": "2",
})

In [None]:
from glob import glob
from os.path import join, getctime
import pandas as pd
import numpy as np
from string import ascii_lowercase
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.utils.multiclass import type_of_target
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, auc

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

import tensorflow as tf
from tensorflow import keras
from scikeras.wrappers import KerasClassifier
from keras.utils import to_categorical

In [None]:
# Lowercase letters 'a'..'z'; the evaluation loops index into this string
# (by dataset position) to name the Excel sheet written for each dataset.
lowercase_alphabet = ascii_lowercase

# List of Evaluation Models

In [None]:
def knn(X_train, y_train):
    """Tune a k-nearest-neighbours classifier with a 3-fold grid search.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The ``best_estimator_`` found by ``GridSearchCV``.

    Side effects:
        Prints the best cross-validation score and the winning parameters.
    """
    # Candidate settings explored exhaustively by the search.
    search_space = dict(
        n_neighbors=[3, 5, 7, 10],
        weights=['uniform', 'distance'],
        metric=['euclidean', 'manhattan', 'minkowski'],
    )

    tuner = GridSearchCV(KNeighborsClassifier(), search_space, cv=3, n_jobs=-1)
    tuner.fit(X_train, y_train)
    print("Best: %f using %s" % (tuner.best_score_, tuner.best_params_))
    return tuner.best_estimator_

def logistic_regression(X_train, y_train):
    """Tune an L2-penalised logistic regression with a 3-fold grid search.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The ``best_estimator_`` found by ``GridSearchCV``.

    Side effects:
        Prints the best cross-validation score and the winning parameters.
    """
    # Only the inverse regularisation strength C is varied; the penalty is fixed to l2.
    search_space = dict(penalty=['l2'], C=[0.01, 0.1, 1.0, 10])

    tuner = GridSearchCV(LogisticRegression(), search_space, cv=3, n_jobs=-1)
    tuner.fit(X_train, y_train)
    print("Best: %f using %s" % (tuner.best_score_, tuner.best_params_))
    return tuner.best_estimator_

def svm(X_train, y_train, random_state=42):
    """Tune a support-vector classifier with a 3-fold grid search.

    The classifier is built with ``probability=True`` so that the returned
    model exposes ``predict_proba``, which the evaluation code relies on.
    The probability calibration shuffles data internally, so a fixed
    ``random_state`` is passed to make repeated runs give the same model.

    Args:
        X_train: Training features.
        y_train: Training labels.
        random_state: Seed forwarded to ``SVC``; pass ``None`` to restore
            unseeded behaviour.

    Returns:
        The ``best_estimator_`` found by ``GridSearchCV``.

    Side effects:
        Prints the best cross-validation score and the winning parameters.
    """
    svm_model = SVC(probability=True, random_state=random_state)
    # Define the parameter grid for grid search
    param_grid = {'C': [0.01, 0.1, 1.0, 10],
                  'kernel': ['linear', 'rbf', 'sigmoid', 'poly']}

    # Perform grid search to find the best hyperparameters
    grid_search = GridSearchCV(svm_model, param_grid, cv=3, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    print("Best: %f using %s" % (grid_search.best_score_, grid_search.best_params_))
    return grid_search.best_estimator_

def naive_bayes(X_train, y_train):
    """Fit a Gaussian naive Bayes classifier with its default settings.

    No hyperparameter search is performed for this model.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted ``GaussianNB`` instance.
    """
    # fit() returns the estimator itself, so the fitted model can be returned directly.
    return GaussianNB().fit(X_train, y_train)

def random_forest(X_train, y_train, random_state=42):
    """Tune a random forest classifier with a 3-fold grid search.

    A fixed ``random_state`` is passed to the forest so that the grid-search
    scores and the returned model are reproducible across runs.

    Args:
        X_train: Training features.
        y_train: Training labels.
        random_state: Seed forwarded to ``RandomForestClassifier``; pass
            ``None`` to restore unseeded behaviour.

    Returns:
        The ``best_estimator_`` found by ``GridSearchCV``.

    Side effects:
        Prints the best cross-validation score and the winning parameters.
    """
    rf_model = RandomForestClassifier(random_state=random_state)
    # Define the parameter grid for grid search
    param_grid = {'n_estimators': [10, 100, 200],
                  'max_depth': [None, 10, 50]}
    # Perform grid search to find the best hyperparameters
    grid_search = GridSearchCV(rf_model, param_grid, cv=3, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    print("Best: %f using %s" % (grid_search.best_score_, grid_search.best_params_))
    return grid_search.best_estimator_


def binary_neural_network(X_train, y_train,net_type):
    """Tune a feed-forward network for binary classification via grid search.

    Two architectures are available:

    * ``'snn'``: Dense(64) -> Dense(32) -> Dense(1, sigmoid)
    * ``'dnn'``: Dense(128) -> Dropout(0.2) -> Dense(64) -> Dropout(0.2)
      -> Dense(32) -> Dense(1, sigmoid)

    Hidden layers use ReLU; the model is compiled with Adam and binary
    cross-entropy. Batch size and epoch count are selected with a 3-fold
    ``GridSearchCV`` over a scikeras ``KerasClassifier``.

    Args:
        X_train: Training features; ``X_train.shape[1]`` sets the input width.
        y_train: Binary training labels.
        net_type: ``'snn'`` or ``'dnn'``.

    Returns:
        The ``best_estimator_`` found by the grid search.

    Raises:
        ValueError: If ``net_type`` is not ``'snn'`` or ``'dnn'`` (previously
            the function silently returned ``None``).

    Side effects:
        Prints the best cross-validation score and the winning parameters.
    """
    # Hidden-layer recipe per architecture: ('dense', units) or ('dropout', rate).
    architectures = {
        'snn': [('dense', 64), ('dense', 32)],
        'dnn': [('dense', 128), ('dropout', 0.2),
                ('dense', 64), ('dropout', 0.2),
                ('dense', 32)],
    }
    if net_type not in architectures:
        raise ValueError("net_type must be 'snn' or 'dnn', got %r" % (net_type,))

    hidden_layers = architectures[net_type]
    n_features = X_train.shape[1]

    def create_model():
        model = keras.Sequential()
        for position, (kind, value) in enumerate(hidden_layers):
            if kind == 'dropout':
                model.add(keras.layers.Dropout(value))
            elif position == 0:
                # Only the first layer declares the input shape.
                model.add(keras.layers.Dense(value, activation='relu', input_shape=(n_features,)))
            else:
                model.add(keras.layers.Dense(value, activation='relu'))
        model.add(keras.layers.Dense(1, activation='sigmoid'))
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        return model

    model = KerasClassifier(model=create_model, verbose=0)
    param_grid = {'batch_size': [32, 64, 128],
                  'epochs': [10, 30, 50, 100]}
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
    grid_search.fit(X_train, y_train)
    print("Best: %f using %s" % (grid_search.best_score_, grid_search.best_params_))
    return grid_search.best_estimator_


def multiclass_neural_network(X_train, y_train,net_type, num_classes=3):
    """Tune a feed-forward network for multiclass classification via grid search.

    Labels are one-hot encoded with ``to_categorical`` before fitting. Two
    architectures are available:

    * ``'snn'``: Dense(64) -> Dense(32) -> Dense(num_classes, softmax)
    * ``'dnn'``: Dense(128) -> Dropout(0.2) -> Dense(64) -> Dropout(0.2)
      -> Dense(32) -> Dense(num_classes, softmax)

    Hidden layers use ReLU; the model is compiled with Adam and categorical
    cross-entropy. Batch size and epoch count are selected with a 3-fold
    ``GridSearchCV`` over a scikeras ``KerasClassifier``.

    Args:
        X_train: Training features; ``X_train.shape[1]`` sets the input width.
        y_train: Integer class labels (passed to ``to_categorical``).
        net_type: ``'snn'`` or ``'dnn'``.
        num_classes: Width of the one-hot encoding and of the softmax output.
            Defaults to 3, the value that was previously hard-coded.

    Returns:
        The ``best_estimator_`` found by the grid search.

    Raises:
        ValueError: If ``net_type`` is not ``'snn'`` or ``'dnn'`` (previously
            the function silently returned ``None``).

    Side effects:
        Prints the best cross-validation score and the winning parameters.
    """
    # Hidden-layer recipe per architecture: ('dense', units) or ('dropout', rate).
    architectures = {
        'snn': [('dense', 64), ('dense', 32)],
        'dnn': [('dense', 128), ('dropout', 0.2),
                ('dense', 64), ('dropout', 0.2),
                ('dense', 32)],
    }
    if net_type not in architectures:
        raise ValueError("net_type must be 'snn' or 'dnn', got %r" % (net_type,))

    hidden_layers = architectures[net_type]
    n_features = X_train.shape[1]
    y_train = to_categorical(y_train, num_classes)

    def create_model():
        model = keras.Sequential()
        for position, (kind, value) in enumerate(hidden_layers):
            if kind == 'dropout':
                model.add(keras.layers.Dropout(value))
            elif position == 0:
                # Only the first layer declares the input shape.
                model.add(keras.layers.Dense(value, activation='relu', input_shape=(n_features,)))
            else:
                model.add(keras.layers.Dense(value, activation='relu'))
        model.add(keras.layers.Dense(num_classes, activation='softmax'))
        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
        return model

    model = KerasClassifier(model=create_model, verbose=0)
    param_grid = {'batch_size': [32, 64, 128],
                  'epochs': [10, 30, 50, 100]}
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
    grid_search.fit(X_train, y_train)
    print("Best: %f using %s" % (grid_search.best_score_, grid_search.best_params_))
    return grid_search.best_estimator_

# Binary-class Evaluation
- accuracy, sensitivity, specificity, ppv, npv, roc_auc
- d_accuracy, d_sensitivity, d_specificity, d_ppv, d_npv, d_roc_auc

In [None]:
def binary_metrics(TP, FP, TN, FN):
    """Derive standard binary classification metrics from confusion counts.

    The counts may be plain integers or weighted (float) totals. Every ratio
    whose denominator is not positive is reported as 0.0; this now includes
    accuracy, which previously raised ``ZeroDivisionError`` on an all-zero
    table while the other four metrics were already guarded.

    Args:
        TP: True-positive count (or weighted total).
        FP: False-positive count (or weighted total).
        TN: True-negative count (or weighted total).
        FN: False-negative count (or weighted total).

    Returns:
        Tuple ``(accuracy, sensitivity, specificity, ppv, npv)``.
    """
    total = TP + FP + TN + FN
    accuracy = (TP + TN) / total if total > 0 else 0.0
    sensitivity = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    specificity = TN / (TN + FP) if (TN + FP) > 0 else 0.0
    ppv = TP / (TP + FP) if (TP + FP) > 0 else 0.0
    npv = TN / (TN + FN) if (TN + FN) > 0 else 0.0
    return accuracy, sensitivity, specificity, ppv, npv

def binary_confusion_matrix(true_labels, y_pred, class_weights):
    """Count plain and weight-adjusted confusion outcomes for positive label 1.

    In the weighted tallies a correct outcome (TP, TN) contributes the case's
    weight ``w`` and an erroneous outcome (FP, FN) contributes ``1 - w``.

    Args:
        true_labels: Iterable of true labels; the value 1 is the positive class.
        y_pred: Iterable of predicted labels, aligned with ``true_labels``.
        class_weights: Iterable of per-case weights, aligned with the labels.

    Returns:
        ``(TP, FP, TN, FN, d_TP, d_FP, d_TN, d_FN)``. When the weights sum to
        zero a message is printed and the plain counts are returned in the
        weighted positions as well.
    """
    def tally(is_outcome):
        # Plain count of cases matching the outcome predicate.
        return sum(is_outcome(actual, predicted)
                   for actual, predicted in zip(true_labels, y_pred))

    def weighted_tally(scale, is_outcome):
        # Sum of scaled weights over cases matching the outcome predicate.
        return sum(scale(weight) * is_outcome(actual, predicted)
                   for actual, predicted, weight in zip(true_labels, y_pred, class_weights))

    def is_tp(actual, predicted):
        return (actual == 1) and (predicted == 1)

    def is_fp(actual, predicted):
        return (actual != 1) and (predicted == 1)

    def is_tn(actual, predicted):
        return (actual != 1) and (predicted != 1)

    def is_fn(actual, predicted):
        return (actual == 1) and (predicted != 1)

    def as_is(weight):
        return weight

    def complement(weight):
        return 1-weight

    TP = tally(is_tp)
    FP = tally(is_fp)
    TN = tally(is_tn)
    FN = tally(is_fn)

    d_TP = weighted_tally(as_is, is_tp)
    d_FP = weighted_tally(complement, is_fp)
    d_TN = weighted_tally(as_is, is_tn)
    d_FN = weighted_tally(complement, is_fn)

    if sum(class_weights) != 0:
        return TP, FP, TN, FN, d_TP, d_FP, d_TN, d_FN

    # All-zero weights: fall back to the unweighted counts.
    print("Sum of class weights is 0")
    return TP, FP, TN, FN, TP, FP, TN, FN


def binary_evaluation(true_labels, predicted_labels, class_weights):
    """Compute plain and weight-adjusted binary metrics, including ROC AUC.

    Point metrics are taken at a 0.5 cut-off on the positive-class
    probability. The ROC curves are traced by sweeping 100 evenly spaced
    thresholds over [0, 1] and are integrated with ``metrics.auc``.

    A TPR/FPR ratio whose denominator is zero is recorded as 0.0 (the same
    convention ``binary_metrics`` uses) instead of raising
    ``ZeroDivisionError`` or yielding NaN.

    Args:
        true_labels: True labels; 1 is the positive class.
        predicted_labels: Array-like of shape (n_cases, 2) as indexed by
            ``[:, 1]``; column 1 is used as the positive-class score.
        class_weights: Per-case weights forwarded to
            ``binary_confusion_matrix``.

    Returns:
        List ``[accuracy, sensitivity, specificity, ppv, npv, roc_auc,
        d_accuracy, d_sensitivity, d_specificity, d_ppv, d_npv, d_roc_auc]``.
    """
    def safe_ratio(numerator, denominator):
        # Undefined rates are reported as 0.0 rather than raising / giving NaN.
        return numerator / denominator if denominator != 0 else 0.0

    positive_scores = np.array(predicted_labels)[:, 1]

    # Point metrics at the conventional 0.5 cut-off.
    y_pred_binary = np.where(positive_scores >= 0.5, 1, 0)
    TP, FP, TN, FN, d_TP, d_FP, d_TN, d_FN = binary_confusion_matrix(true_labels, y_pred_binary, class_weights)
    accuracy, sensitivity, specificity, ppv, npv = binary_metrics(TP, FP, TN, FN)
    d_accuracy, d_sensitivity, d_specificity, d_ppv, d_npv = binary_metrics(d_TP, d_FP, d_TN, d_FN)

    # ROC curves: one (FPR, TPR) point per threshold, plain and weighted.
    thresholds = np.linspace(0, 1, 100)
    tpr_list = []
    fpr_list = []
    d_tpr_list = []
    d_fpr_list = []
    for threshold in thresholds:
        y_pred_binary = np.where(positive_scores >= threshold, 1, 0)
        TP, FP, TN, FN, d_TP, d_FP, d_TN, d_FN = binary_confusion_matrix(true_labels, y_pred_binary, class_weights)

        tpr_list.append(safe_ratio(TP, TP + FN))
        fpr_list.append(safe_ratio(FP, FP + TN))
        d_tpr_list.append(safe_ratio(d_TP, d_TP + d_FN))
        d_fpr_list.append(safe_ratio(d_FP, d_FP + d_TN))

    roc_auc = metrics.auc(fpr_list, tpr_list)
    d_roc_auc = metrics.auc(d_fpr_list, d_tpr_list)

    return [accuracy, sensitivity, specificity, ppv, npv, roc_auc,
            d_accuracy, d_sensitivity, d_specificity, d_ppv, d_npv, d_roc_auc]

# Multiclass Evaluation
- micro_accuracy, micro_sensitivity, micro_specificity, micro_ppv, micro_npv, micro_auc
- macro_accuracy, macro_sensitivity, macro_specificity, macro_ppv, macro_npv, macro_auc
- d_micro_accuracy, d_micro_sensitivity, d_micro_specificity, d_micro_ppv, d_micro_npv, d_micro_auc
- d_macro_accuracy, d_macro_sensitivity, d_macro_specificity, d_macro_ppv, d_macro_npv, d_macro_auc

In [None]:
def multiclass_metrics(confusion_matrix):
    """Compute micro- and macro-averaged metrics from per-class confusion counts.

    Macro metrics average the per-class values; micro metrics are computed
    from the counts summed over all classes. Any ratio whose denominator is
    not positive is reported as 0.0. This now also covers accuracy and an
    empty table, both of which previously raised ``ZeroDivisionError``.

    Args:
        confusion_matrix: Mapping ``class -> {'TP': .., 'FP': .., 'TN': ..,
            'FN': ..}`` with integer or weighted (float) totals.

    Returns:
        List of ten values: ``[micro_accuracy, micro_sensitivity,
        micro_specificity, micro_ppv, micro_npv, macro_accuracy,
        macro_sensitivity, macro_specificity, macro_ppv, macro_npv]``.
    """
    def safe_ratio(numerator, denominator):
        return numerator / denominator if denominator > 0 else 0.0

    def five_metrics(TP, FP, TN, FN):
        # accuracy, sensitivity, specificity, ppv, npv for one set of counts
        return [safe_ratio(TP + TN, TP + FP + TN + FN),
                safe_ratio(TP, TP + FN),
                safe_ratio(TN, TN + FP),
                safe_ratio(TP, TP + FP),
                safe_ratio(TN, TN + FN)]

    num_classes = len(confusion_matrix)
    if num_classes == 0:
        # Nothing to average; keep the ten-element shape callers rely on.
        return [0.0] * 10

    # Macro average: mean of the per-class metrics.
    macro_totals = [0.0] * 5
    for cls in confusion_matrix:
        counts = confusion_matrix[cls]
        per_class = five_metrics(counts['TP'], counts['FP'], counts['TN'], counts['FN'])
        macro_totals = [running + value for running, value in zip(macro_totals, per_class)]
    macro = [total / num_classes for total in macro_totals]

    # Micro average: metrics of the counts aggregated over all classes.
    total_TP = sum(confusion_matrix[cls]['TP'] for cls in confusion_matrix)
    total_FP = sum(confusion_matrix[cls]['FP'] for cls in confusion_matrix)
    total_TN = sum(confusion_matrix[cls]['TN'] for cls in confusion_matrix)
    total_FN = sum(confusion_matrix[cls]['FN'] for cls in confusion_matrix)
    micro = five_metrics(total_TP, total_FP, total_TN, total_FN)

    # Named `results` rather than `metrics` to avoid shadowing sklearn's module.
    results = micro + macro
    return results

def multiclass_confusion_matrix(true_labels, predicted_labels, classes, case_weights):
    """Build one-vs-rest confusion tables, plain and weight-adjusted.

    For every class each case is scored as TP, FP, TN or FN in a one-vs-rest
    sense. In the weighted table a correct outcome (TP, TN) adds the case's
    weight ``w`` and an erroneous outcome (FP, FN) adds ``1 - w``.

    Args:
        true_labels: Iterable of true class labels.
        predicted_labels: Iterable of predicted class labels.
        classes: Class labels to tabulate.
        case_weights: Iterable of per-case weights.

    Returns:
        ``(confusion_matrix, d_confusion_matrix)``, each a mapping
        ``class -> {'TP', 'FP', 'TN', 'FN'}``. When the weights sum to zero a
        message is printed and the plain table is returned in both positions.
    """
    def blank_table():
        return {cls: {'TP': 0, 'FP': 0, 'TN': 0, 'FN': 0} for cls in classes}

    def outcome_flags(actual, predicted, cls):
        # One-vs-rest outcome indicators for a single case, in TP/FP/TN/FN order.
        actual_is, actual_not = (actual == cls), (actual != cls)
        predicted_is, predicted_not = (predicted == cls), (predicted != cls)
        return (actual_is and predicted_is,
                actual_not and predicted_is,
                actual_not and predicted_not,
                actual_is and predicted_not)

    confusion_matrix = blank_table()
    for actual, predicted in zip(true_labels, predicted_labels):
        for cls in classes:
            tp, fp, tn, fn = outcome_flags(actual, predicted, cls)
            cell = confusion_matrix[cls]
            cell['TP'] += tp
            cell['FP'] += fp
            cell['TN'] += tn
            cell['FN'] += fn

    d_confusion_matrix = blank_table()
    for actual, predicted, weight in zip(true_labels, predicted_labels, case_weights):
        for cls in classes:
            tp, fp, tn, fn = outcome_flags(actual, predicted, cls)
            cell = d_confusion_matrix[cls]
            cell['TP'] += weight * tp
            cell['FP'] += (1-weight) * fp
            cell['TN'] += weight * tn
            cell['FN'] += (1-weight) * fn

    if sum(case_weights) != 0:
        return confusion_matrix, d_confusion_matrix

    # All-zero weights: fall back to the unweighted table.
    print("Sum of class weights is 0")
    return confusion_matrix, confusion_matrix


def multi_evaluation(true_labels, prediction_probabilities, classes, case_weights):
    """Compute plain and weight-adjusted multiclass metrics, including AUCs.

    Point metrics come from ``multiclass_confusion_matrix`` applied to the
    arg-max prediction, summarised by ``multiclass_metrics``. ROC curves are
    traced over 100 evenly spaced thresholds in [0, 1]:

    * micro AUC: the one-hot labels and probabilities are flattened and
      treated as one binary problem;
    * macro AUC: the mean of the per-class one-vs-rest AUCs.

    Fixes relative to the previous version:

    * the one-hot encoding is given an explicit width ``len(classes)`` so it
      always matches the probability matrix, even when the highest class is
      absent from ``true_labels``;
    * a TPR/FPR ratio with a zero denominator is recorded as 0.0 instead of
      NaN.

    Args:
        true_labels: Integer class labels (passed to ``to_categorical``).
        prediction_probabilities: Array with one row per case and one column
            per class (uses ``argmax(axis=1)``, ``ravel`` and ``[:, i]``).
        classes: Class labels; ``len(classes)`` must equal the number of
            probability columns.
        case_weights: Per-case weights.

    Returns:
        List of 24 values rounded to 3 decimals: 12 conventional results
        (micro x5, micro_auc, macro x5, macro_auc) followed by the 12
        weight-adjusted counterparts in the same order.
    """
    confusion_matrix, d_confusion_matrix = multiclass_confusion_matrix(
        true_labels, np.argmax(prediction_probabilities, axis=1), classes, case_weights)
    conventional_result = multiclass_metrics(confusion_matrix)
    new_result = multiclass_metrics(d_confusion_matrix)

    # AUC inputs. The explicit num_classes keeps y_true column-aligned with y_pred.
    y_true = to_categorical(true_labels, num_classes=len(classes))
    y_pred = prediction_probabilities
    # Each case contributes len(classes) entries after flattening, so repeat its weight.
    expanded_weight = np.repeat(case_weights, len(classes))

    # Flatten the true labels and predicted probabilities
    y_true_flat = y_true.ravel()
    y_pred_flat = y_pred.ravel()

    def safe_ratio(numerator, denominator):
        # Undefined rates are reported as 0.0 rather than NaN.
        return numerator / denominator if denominator != 0 else 0.0

    # Compute true positive rate (TPR) and false positive rate (FPR) for binary classification
    def calculate_tpr_fpr_binary(y_true, y_pred, threshold, expanded_weight):
        y_pred_binary = (y_pred >= threshold).astype(int)
        tp = np.sum((y_true == 1) & (y_pred_binary == 1))
        fp = np.sum((y_true == 0) & (y_pred_binary == 1))
        tn = np.sum((y_true == 0) & (y_pred_binary == 0))
        fn = np.sum((y_true == 1) & (y_pred_binary == 0))
        tpr = safe_ratio(tp, tp + fn)
        fpr = safe_ratio(fp, fp + tn)

        # Weighted counts: correct outcomes weigh w, errors weigh (1 - w).
        d_tp = np.sum(expanded_weight * ((y_true == 1) & (y_pred_binary == 1)))
        d_fp = np.sum((1.0 - np.array(expanded_weight)) * ((y_true == 0) & (y_pred_binary == 1)))
        d_tn = np.sum(expanded_weight * ((y_true == 0) & (y_pred_binary == 0)))
        d_fn = np.sum((1.0 - np.array(expanded_weight)) * ((y_true == 1) & (y_pred_binary == 0)))
        d_tpr = safe_ratio(d_tp, d_tp + d_fn)
        d_fpr = safe_ratio(d_fp, d_fp + d_tn)

        if sum(expanded_weight) == 0:  # All-zero weights: fall back to the unweighted rates
            print("Sum of class weights is 0")
            return tpr, fpr, tpr, fpr
        else:
            return tpr, fpr, d_tpr, d_fpr

    # Calculate TPR and FPR for different thresholds for binary classification
    thresholds = np.linspace(0, 1, 100)
    tprs = []
    fprs = []
    d_tprs = []
    d_fprs = []
    for threshold in thresholds:
        tpr, fpr, d_tpr, d_fpr = calculate_tpr_fpr_binary(y_true_flat, y_pred_flat, threshold, expanded_weight)
        tprs.append(tpr)
        fprs.append(fpr)
        d_tprs.append(d_tpr)
        d_fprs.append(d_fpr)

    # Calculate the micro AUC by integrating the area under the micro-average ROC curve
    micro_auc = np.trapz(tprs, fprs)
    d_micro_auc = np.trapz(d_tprs, d_fprs)

    # Calculate the macro AUC by averaging the AUC for each class
    macro_auc = 0.0
    d_macro_auc = 0.0
    for i in range(len(classes)):
        class_tprs = []
        class_fprs = []
        d_class_tprs = []
        d_class_fprs = []
        for threshold in thresholds:
            tpr, fpr, d_tpr, d_fpr = calculate_tpr_fpr_binary(y_true[:, i], y_pred[:, i], threshold, case_weights)
            class_tprs.append(tpr)
            class_fprs.append(fpr)
            d_class_tprs.append(d_tpr)
            d_class_fprs.append(d_fpr)

        macro_auc += np.trapz(class_tprs, class_fprs)
        d_macro_auc += np.trapz(d_class_tprs, d_class_fprs)

    macro_auc /= len(classes)
    d_macro_auc /= len(classes)

    # FPR shrinks as the threshold grows, so the trapezoid integral is signed
    # negative; abs() restores the area. Index 5 follows the five micro metrics,
    # index 11 (the end of the list by then) follows the five macro metrics.
    conventional_result.insert(5, abs(micro_auc))
    conventional_result.insert(11, abs(macro_auc))
    new_result.insert(5, abs(d_micro_auc))
    new_result.insert(11, abs(d_macro_auc))

    conventional_result = [np.round(x, 3) for x in conventional_result]
    new_result = [np.round(x, 3) for x in new_result]
    return conventional_result + new_result

# Main Evaluation

## Case difficulty from CDmc

In [None]:
# Call the files
def get_sort_files(path, extension):
    """List files in ``path`` matching ``*{extension}``, ordered by ctime.

    Args:
        path: Directory to search (not recursive).
        extension: Suffix appended to ``*`` to form the glob pattern.

    Returns:
        Matching paths sorted by ``getctime`` timestamp, with ties broken by
        the path string itself.
    """
    pattern = join(path, f'*{extension}')
    stamped = sorted((getctime(match), match) for match in glob(pattern))
    return [match for _, match in stamped]

# Gather the App1 simulated workbooks; get_sort_files returns them ordered by ctime.
file_list_random_order = get_sort_files('../Evaluation/App1/simulated/', '*.xlsx')
file_list_random_order
# Re-order by path with its last 8 characters dropped, so that the evaluation
# loop visits the datasets in a stable order (the cell output shows (a), (b), ...).
# NOTE(review): why exactly 8 characters are trimmed is not evident here — confirm.
file_list = sorted(file_list_random_order, key=lambda x:x[:-8])
file_list

['../Evaluation/App1/simulated/(a)_Approach1_neural_network.xlsx',
 '../Evaluation/App1/simulated/(b)_Approach1_neural_network.xlsx',
 '../Evaluation/App1/simulated/(c)_Approach1_neural_network.xlsx',
 '../Evaluation/App1/simulated/(d)_Approach1_neural_network.xlsx',
 '../Evaluation/App1/simulated/(e)_Approach1_neural_network_with_pool.xlsx',
 '../Evaluation/App1/simulated/(f)_Approach1_neural_network_with_pool.xlsx',
 '../Evaluation/App1/simulated/(g)_Approach1_neural_network.xlsx',
 '../Evaluation/App1/simulated/(h)_Approach1_neural_network.xlsx',
 '../Evaluation/App1/simulated/(i)_Approach1_neural_network.xlsx',
 '../Evaluation/App1/simulated/(j)_Approach1_neural_network_with_pool.xlsx',
 '../Evaluation/App1/simulated/(k)_Approach1_neural_network_with_pool.xlsx',
 '../Evaluation/App1/simulated/(l)_Approach1_neural_network_with_pool.xlsx',
 '../Evaluation/App1/simulated/(m)_Approach1_neural_network.xlsx',
 '../Evaluation/App1/simulated/(n)_Approach1_neural_network.xlsx',
 '../Evaluat

In [None]:
# Evaluate every App1 simulated dataset with seven classifiers and write
#   * one summary sheet per dataset (metrics per model), and
#   * one detail sheet per dataset (test cases with each model's prediction).
#
# Changes relative to the previous version of this cell:
#   * both workbooks are opened as context managers, so they are closed (and
#     the sheets written so far are saved) even if a dataset raises;
#   * an unsupported or inconsistent target type raises instead of silently
#     re-using `result` from the previous dataset;
#   * the seven copy-pasted train / predict / evaluate stanzas are driven by
#     one ordered mapping, which also fixes the "Simpe Neural Network" label.

# Short model name -> label printed before that model is evaluated.
model_labels = {'knn': "KNN",
                'lr': "Logistic Regression",
                'svm': "SVM",
                'nb': "Naive Bayes",
                'rf': "Random Forest",
                'snn': "Simple Neural Network",
                'dnn': "Deep Neural Network"}

# Column order matches the lists returned by binary_evaluation / multi_evaluation.
binary_columns = ['accuracy', 'sensitivity', 'specificity', 'ppv', 'npv', 'roc_auc',
                  'd_accuracy', 'd_sensitivity', 'd_specificity', 'd_ppv', 'd_npv', 'd_roc_auc']
multiclass_columns = ['micro_accuracy', 'micro_sensitivity', 'micro_specificity', 'micro_ppv', 'micro_npv', 'micro_auc',
                      'macro_accuracy', 'macro_sensitivity', 'macro_specificity', 'macro_ppv', 'macro_npv', 'macro_auc',
                      'd_micro_accuracy', 'd_micro_sensitivity', 'd_micro_specificity', 'd_micro_ppv', 'd_micro_npv', 'd_micro_auc',
                      'd_macro_accuracy', 'd_macro_sensitivity', 'd_macro_specificity', 'd_macro_ppv', 'd_macro_npv', 'd_macro_auc']

with pd.ExcelWriter('../Evaluation/outcome/'+
                    'Simulated_dataset_evaluation_App1.xlsx',engine='openpyxl') as evaluation_result, \
     pd.ExcelWriter('../Evaluation/outcome/'+
                    'Simulated_dataset_evaluation_detailed_App1.xlsx',engine='openpyxl') as evaluatoin_detail:
    for i, file_path in enumerate(file_list):
        print(file_path)
        simulated_data = pd.read_excel(file_path, index_col=[0])
        X = simulated_data[['x1','x2','difficulty']]
        y = simulated_data['label']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

        # 'difficulty' travels through the split only so the test rows keep their
        # weights; the models themselves see x1 and x2 only.
        X_test_difficulty = X_test['difficulty']
        X_train = X_train[['x1','x2']]
        X_test = X_test[['x1','x2']]

        # The network head depends on the number of classes seen in training.
        n_train_classes = len(np.unique(y_train))
        if n_train_classes == 2:
            neural_network = binary_neural_network
        elif n_train_classes > 2:
            neural_network = multiclass_neural_network
        else:
            raise ValueError("%s: training labels contain fewer than two classes" % file_path)

        # Train in the same order as before; dict order drives everything below.
        models = {'knn': knn(X_train, y_train),
                  'lr': logistic_regression(X_train, y_train),
                  'svm': svm(X_train, y_train),
                  'nb': naive_bayes(X_train, y_train),
                  'rf': random_forest(X_train, y_train),
                  'snn': neural_network(X_train, y_train, 'snn'),
                  'dnn': neural_network(X_train, y_train, 'dnn')}

        probabilities = {name: model.predict_proba(X_test) for name, model in models.items()}

        y_type = type_of_target(y_test)
        true_labels = y_test
        weights = X_test_difficulty

        if y_type not in ('binary', 'multiclass'):
            raise ValueError("%s: unsupported target type %r" % (file_path, y_type))
        if (y_type == 'binary') != (n_train_classes == 2):
            # Models were fitted for a different number of classes than the
            # test labels show; the metrics below would be meaningless.
            raise ValueError("%s: train and test splits disagree on the number of classes" % file_path)

        if y_type == 'binary':
            columns = binary_columns

            def evaluate(probs):
                return binary_evaluation(true_labels, probs, weights)
        else:
            classes = [0, 1, 2]
            columns = multiclass_columns

            def evaluate(probs):
                return multi_evaluation(true_labels, probs, classes, weights)

        result_list = []
        for name, label in model_labels.items():
            print(label)
            result_list.append(evaluate(probabilities[name]))

        result = pd.DataFrame(result_list, index=list(model_labels), columns=columns)
        print(result)

        # Sheets are named 'a', 'b', ... by dataset position (at most 26 datasets).
        sheet_name = lowercase_alphabet[i]
        result.to_excel(evaluation_result, sheet_name=sheet_name)

        # Per-case detail: inputs, truth, weight and each model's arg-max class.
        predictions = pd.DataFrame({'x1': X_test['x1'], 'x2': X_test['x2'],
                                    'label': y_test, 'difficulty': X_test_difficulty,
                                    **{name: np.argmax(probabilities[name], axis=1) for name in model_labels}})
        predictions.to_excel(evaluatoin_detail, sheet_name=sheet_name)

../Evaluation/App1/simulated/(a)_Approach1_neural_network.xlsx
Best: 1.000000 using {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
Best: 1.000000 using {'C': 0.01, 'penalty': 'l2'}
Best: 1.000000 using {'C': 0.01, 'kernel': 'linear'}
Best: 1.000000 using {'max_depth': None, 'n_estimators': 100}
Best: 1.000000 using {'batch_size': 32, 'epochs': 10}
Best: 1.000000 using {'batch_size': 32, 'epochs': 10}
KNN
Logistic Regression
SVM
Naive Bayes
Random Forest
Simple Neural Network
Deep Neural Network
     accuracy  sensitivity  specificity  ppv  npv  roc_auc  d_accuracy  \
knn       1.0          1.0          1.0  1.0  1.0      1.0         1.0   
lr        1.0          1.0          1.0  1.0  1.0      1.0         1.0   
svm       1.0          1.0          1.0  1.0  1.0      1.0         1.0   
nb        1.0          1.0          1.0  1.0  1.0      1.0         1.0   
rf        1.0          1.0          1.0  1.0  1.0      1.0         1.0   
snn       1.0          1.0          1.0

## Case difficulty from CDdm

In [None]:
# Call the files
def get_sort_files(path, extension):
    """Return the files in ``path`` matching ``*<extension>``, ordered by ``getctime``.

    Args:
        path: Directory to search (joined with the pattern via ``os.path.join``).
        extension: Text appended after a leading ``*`` wildcard to build the
            glob pattern, e.g. ``'.xlsx'`` gives ``*.xlsx``.

    Returns:
        A list of matching paths sorted in ascending order of the timestamp
        reported by ``os.path.getctime``; paths with equal timestamps are
        ordered by the path string itself.
    """
    pattern = join(path, f'*{extension}')
    # Sorting (timestamp, path) tuples orders by timestamp first, then by path.
    stamped = sorted((getctime(found), found) for found in glob(pattern))
    return [found for _, found in stamped]

# get_sort_files already prepends the `*` wildcard, so only the suffix is passed
# (passing '*.xlsx' produced the redundant pattern '**.xlsx').
file_list_random_order = get_sort_files('../Evaluation/App2/simulated/', '.xlsx')

# Re-order by path with its last 8 characters dropped. sorted() is stable, so
# paths whose truncated keys are equal keep the creation-time order from above.
# NOTE(review): the 8-character cut presumably removes a '_XX.xlsx' tail so the
# leading '(a)', '(b)', ... tag drives the order -- confirm against the file names.
file_list = sorted(file_list_random_order, key=lambda path: path[:-8])
file_list

['../Evaluation/App2/simulated/(a)two_class_ND.xlsx',
 '../Evaluation/App2/simulated/(b)two_class_ND.xlsx',
 '../Evaluation/App2/simulated/(c)two_class_ND.xlsx',
 '../Evaluation/App2/simulated/(d)three_class_ND.xlsx',
 '../Evaluation/App2/simulated/(e)three_class_ND.xlsx',
 '../Evaluation/App2/simulated/(f)three_class_ND.xlsx',
 '../Evaluation/App2/simulated/(g)two_class_MS.xlsx',
 '../Evaluation/App2/simulated/(h)two_class_MS.xlsx',
 '../Evaluation/App2/simulated/(i)two_class_MS.xlsx',
 '../Evaluation/App2/simulated/(j)three_class_MS.xlsx',
 '../Evaluation/App2/simulated/(k)three_class_MS.xlsx',
 '../Evaluation/App2/simulated/(l)three_class_MS.xlsx',
 '../Evaluation/App2/simulated/(m)two_class_cs.xlsx',
 '../Evaluation/App2/simulated/(n)two_class_cs.xlsx',
 '../Evaluation/App2/simulated/(o)two_class_cs.xlsx',
 '../Evaluation/App2/simulated/(p)three_class_CS.xlsx',
 '../Evaluation/App2/simulated/(q)three_class_CS.xlsx',
 '../Evaluation/App2/simulated/(r)three_class_CS.xlsx']

In [None]:
# Evaluate every simulated dataset in `file_list` with seven classifiers.
# For each dataset two sheets are written (sheet name = a, b, c, ... by position):
#   * a summary table of evaluation metrics per model, and
#   * the per-sample test-set predictions of every model.

# Short model key (used as DataFrame index / column name) -> label printed as progress.
MODEL_LABELS = {
    'knn': 'KNN',
    'lr': 'Logistic Regression',
    'svm': 'SVM',
    'nb': 'Naive Bayes',
    'rf': 'Random Forest',
    'snn': 'Simple Neural Network',
    'dnn': 'Deep Neural Network',
}

# Column names applied to the values returned by binary_evaluation.
BINARY_COLUMNS = ['accuracy', 'sensitivity', 'specificity', 'ppv', 'npv', 'roc_auc',
                  'd_accuracy', 'd_sensitivity', 'd_specificity', 'd_ppv', 'd_npv', 'd_roc_auc']

# Column names applied to the values returned by multi_evaluation.
MULTICLASS_COLUMNS = ['micro_accuracy', 'micro_sensitivity', 'micro_specificity', 'micro_ppv', 'micro_npv', 'micro_auc',
                      'macro_accuracy', 'macro_sensitivity', 'macro_specificity', 'macro_ppv', 'macro_npv', 'macro_auc',
                      'd_micro_accuracy', 'd_micro_sensitivity', 'd_micro_specificity', 'd_micro_ppv', 'd_micro_npv', 'd_micro_auc',
                      'd_macro_accuracy', 'd_macro_sensitivity', 'd_macro_specificity', 'd_macro_ppv', 'd_macro_npv', 'd_macro_auc']

result_path = '../Evaluation/outcome/Simulated_dataset_evaluation_App2.xlsx'
detail_path = '../Evaluation/outcome/Simulated_dataset_evaluation_detailed_App2.xlsx'

# The context managers close (and save) both workbooks even if a dataset fails
# part-way through; previously an exception left both writers open.
with pd.ExcelWriter(result_path, engine='openpyxl') as evaluation_result, \
        pd.ExcelWriter(detail_path, engine='openpyxl') as evaluatoin_detail:
    for i, file_path in enumerate(file_list):
        print(file_path)
        simulated_data = pd.read_excel(file_path)
        X = simulated_data[['x1', 'x2', 'difficulty']]
        y = simulated_data['label']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

        # 'difficulty' travels through the split only so the test rows keep their
        # difficulty values; the models are trained on x1 and x2 alone.
        X_test_difficulty = X_test['difficulty']
        X_train = X_train[['x1', 'x2']]
        X_test = X_test[['x1', 'x2']]

        best_knn_model = knn(X_train, y_train)
        best_lr_model = logistic_regression(X_train, y_train)
        best_svm_model = svm(X_train, y_train)
        nb_model = naive_bayes(X_train, y_train)
        best_rf_model = random_forest(X_train, y_train)

        # The neural-network builder depends on how many classes the training labels contain.
        n_train_classes = len(np.unique(y_train))
        if n_train_classes == 2:
            best_snn_model = binary_neural_network(X_train, y_train, 'snn')
            best_dnn_model = binary_neural_network(X_train, y_train, 'dnn')
        elif n_train_classes > 2:
            best_snn_model = multiclass_neural_network(X_train, y_train, 'snn')
            best_dnn_model = multiclass_neural_network(X_train, y_train, 'dnn')
        else:
            # Without this guard the networks from the previous dataset would be reused.
            raise ValueError(f"{file_path}: need at least two classes in the training labels, "
                             f"found {n_train_classes}")

        y_val_pred_knn = best_knn_model.predict_proba(X_test)
        y_val_pred_lr = best_lr_model.predict_proba(X_test)
        y_val_pred_svm = best_svm_model.predict_proba(X_test)
        y_val_pred_nb = nb_model.predict_proba(X_test)
        y_val_pred_rf = best_rf_model.predict_proba(X_test)
        y_val_pred_snn = best_snn_model.predict_proba(X_test)
        y_val_pred_dnn = best_dnn_model.predict_proba(X_test)

        # Same key order as MODEL_LABELS so rows and columns line up with the labels.
        probabilities = {
            'knn': y_val_pred_knn,
            'lr': y_val_pred_lr,
            'svm': y_val_pred_svm,
            'nb': y_val_pred_nb,
            'rf': y_val_pred_rf,
            'snn': y_val_pred_snn,
            'dnn': y_val_pred_dnn,
        }

        y_type = type_of_target(y_test)
        true_labels = y_test
        # The test-set difficulty values are handed to the evaluation helpers as weights.
        weights = X_test_difficulty

        if y_type == 'binary':
            columns = BINARY_COLUMNS
        elif y_type == 'multiclass':
            # NOTE(review): assumes the multiclass datasets use labels 0, 1, 2 -- confirm.
            classes = [0, 1, 2]
            columns = MULTICLASS_COLUMNS
        else:
            # Previously this case silently re-wrote the previous dataset's `result`
            # (or failed with a NameError on the first file).
            raise ValueError(f"{file_path}: unsupported target type {y_type!r}")

        result_list = []
        for key, label in MODEL_LABELS.items():
            print(label)
            if y_type == 'binary':
                result_list.append(binary_evaluation(true_labels, probabilities[key], weights))
            else:
                result_list.append(multi_evaluation(true_labels, probabilities[key], classes, weights))

        result = pd.DataFrame(result_list, index=list(MODEL_LABELS), columns=columns)
        print(result)

        # One sheet per dataset, named by its position in file_list: 'a', 'b', 'c', ...
        sheet_name = lowercase_alphabet[i]
        result.to_excel(evaluation_result, sheet_name=sheet_name)

        # Per-sample detail: inputs, true label, difficulty and each model's
        # predicted class (column index of the highest predicted probability).
        predictions = pd.DataFrame({
            'x1': X_test['x1'],
            'x2': X_test['x2'],
            'label': y_test,
            'difficulty': X_test_difficulty,
            **{key: np.argmax(proba, axis=1) for key, proba in probabilities.items()},
        })
        predictions.to_excel(evaluatoin_detail, sheet_name=sheet_name)

../Evaluation/App2/simulated/(a)two_class_ND.xlsx
Best: 1.000000 using {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
Best: 1.000000 using {'C': 0.01, 'penalty': 'l2'}
Best: 1.000000 using {'C': 0.01, 'kernel': 'linear'}
Best: 1.000000 using {'max_depth': None, 'n_estimators': 10}
Best: 1.000000 using {'batch_size': 32, 'epochs': 10}
Best: 1.000000 using {'batch_size': 32, 'epochs': 10}
KNN
Sum of class weights is 0
Sum of class weights is 0
Sum of class weights is 0
Sum of class weights is 0
Sum of class weights is 0
Sum of class weights is 0
Sum of class weights is 0
Sum of class weights is 0
Sum of class weights is 0
Sum of class weights is 0
Sum of class weights is 0
Sum of class weights is 0
Sum of class weights is 0
Sum of class weights is 0
Sum of class weights is 0
Sum of class weights is 0
Sum of class weights is 0
Sum of class weights is 0
Sum of class weights is 0
Sum of class weights is 0
Sum of class weights is 0
Sum of class weights is 0
Sum of class weig

  d_tpr = d_tp / (d_tp + d_fn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr 

Logistic Regression
Sum of class weights is 0
SVM


  d_tpr = d_tp / (d_tp + d_fn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr 

Sum of class weights is 0
Naive Bayes
Sum of class weights is 0


  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr 

Random Forest
Sum of class weights is 0
Simpe Neural Network


  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr 

Sum of class weights is 0
Deep Neural Network
Sum of class weights is 0


  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr = d_tp / (d_tp + d_fn)
  d_fpr = d_fp / (d_fp + d_tn)
  d_tpr 

     micro_accuracy  micro_sensitivity  micro_specificity  micro_ppv  \
knn             1.0                1.0                1.0        1.0   
lr              1.0                1.0                1.0        1.0   
svm             1.0                1.0                1.0        1.0   
nb              1.0                1.0                1.0        1.0   
rf              1.0                1.0                1.0        1.0   
snn             1.0                1.0                1.0        1.0   
dnn             1.0                1.0                1.0        1.0   

     micro_npv  micro_auc  macro_accuracy  macro_sensitivity  \
knn        1.0        1.0             1.0                1.0   
lr         1.0        1.0             1.0                1.0   
svm        1.0        1.0             1.0                1.0   
nb         1.0        1.0             1.0                1.0   
rf         1.0        1.0             1.0                1.0   
snn        1.0        1.0             1

## Case difficulty from CDpu

In [None]:
# Collect the simulated dataset files to evaluate
def get_sort_files(path, extension):
    """List the files in ``path`` that match ``*<extension>``, sorted by ``getctime``.

    Args:
        path: Directory to search.
        extension: Text placed after a leading ``*`` wildcard to form the glob
            pattern (``'.xlsx'`` yields ``*.xlsx``).

    Returns:
        The matching paths as a list, ascending by the ``os.path.getctime``
        timestamp and, for equal timestamps, by the path string.
    """
    matches = glob(join(path, f'*{extension}'))
    # Pair each path with its timestamp; tuple ordering sorts on the timestamp first.
    by_ctime = sorted(zip(map(getctime, matches), matches))
    return [name for _, name in by_ctime]

# get_sort_files already prepends the `*` wildcard, so only the suffix is passed
# (passing '*.xlsx' produced the redundant pattern '**.xlsx').
file_list_random_order = get_sort_files('../Evaluation/App3/simulated/', '.xlsx')

# Re-order by path with its last 8 characters dropped. sorted() is stable, so
# paths whose truncated keys are equal keep the creation-time order from above.
# NOTE(review): the 8-character cut presumably removes a trailing 'NNN.xlsx' so
# the letter tag in the file name drives the order -- confirm against the file names.
file_list = sorted(file_list_random_order, key=lambda path: path[:-8])
file_list

['../Evaluation/App3/simulated/app3_a_100.xlsx',
 '../Evaluation/App3/simulated/app3_b_100.xlsx',
 '../Evaluation/App3/simulated/app3_c_100.xlsx',
 '../Evaluation/App3/simulated/app3_d_100.xlsx',
 '../Evaluation/App3/simulated/app3_e_100.xlsx',
 '../Evaluation/App3/simulated/app3_f_100.xlsx',
 '../Evaluation/App3/simulated/app3_g_100.xlsx',
 '../Evaluation/App3/simulated/app3_h_100.xlsx',
 '../Evaluation/App3/simulated/app3_i_100.xlsx',
 '../Evaluation/App3/simulated/app3_j_100.xlsx',
 '../Evaluation/App3/simulated/app3_k_100.xlsx',
 '../Evaluation/App3/simulated/app3_l_100.xlsx',
 '../Evaluation/App3/simulated/app3_m_100.xlsx',
 '../Evaluation/App3/simulated/app3_n_100.xlsx',
 '../Evaluation/App3/simulated/app3_o_100.xlsx',
 '../Evaluation/App3/simulated/app3_p_100.xlsx',
 '../Evaluation/App3/simulated/app3_q_100.xlsx',
 '../Evaluation/App3/simulated/app3_r_100.xlsx']

In [None]:
# Evaluate every simulated dataset in `file_list` with seven classifiers.
# For each dataset two sheets are written (sheet name = a, b, c, ... by position):
#   * a summary table of evaluation metrics per model, and
#   * the per-sample test-set predictions of every model.

# Short model key (used as DataFrame index / column name) -> label printed as progress.
MODEL_LABELS = {
    'knn': 'KNN',
    'lr': 'Logistic Regression',
    'svm': 'SVM',
    'nb': 'Naive Bayes',
    'rf': 'Random Forest',
    'snn': 'Simple Neural Network',
    'dnn': 'Deep Neural Network',
}

# Column names applied to the values returned by binary_evaluation.
BINARY_COLUMNS = ['accuracy', 'sensitivity', 'specificity', 'ppv', 'npv', 'roc_auc',
                  'd_accuracy', 'd_sensitivity', 'd_specificity', 'd_ppv', 'd_npv', 'd_roc_auc']

# Column names applied to the values returned by multi_evaluation.
MULTICLASS_COLUMNS = ['micro_accuracy', 'micro_sensitivity', 'micro_specificity', 'micro_ppv', 'micro_npv', 'micro_auc',
                      'macro_accuracy', 'macro_sensitivity', 'macro_specificity', 'macro_ppv', 'macro_npv', 'macro_auc',
                      'd_micro_accuracy', 'd_micro_sensitivity', 'd_micro_specificity', 'd_micro_ppv', 'd_micro_npv', 'd_micro_auc',
                      'd_macro_accuracy', 'd_macro_sensitivity', 'd_macro_specificity', 'd_macro_ppv', 'd_macro_npv', 'd_macro_auc']

result_path = '../Evaluation/outcome/Simulated_dataset_evaluation_App3.xlsx'
detail_path = '../Evaluation/outcome/Simulated_dataset_evaluation_detailed_App3.xlsx'

# The context managers close (and save) both workbooks even if a dataset fails
# part-way through; previously an exception left both writers open.
with pd.ExcelWriter(result_path, engine='openpyxl') as evaluation_result, \
        pd.ExcelWriter(detail_path, engine='openpyxl') as evaluatoin_detail:
    for i, file_path in enumerate(file_list):
        print(file_path)
        # The first spreadsheet column is read as the row index.
        simulated_data = pd.read_excel(file_path, index_col=[0])
        X = simulated_data[['x1', 'x2', 'difficulty']]
        y = simulated_data['label']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

        # 'difficulty' travels through the split only so the test rows keep their
        # difficulty values; the models are trained on x1 and x2 alone.
        X_test_difficulty = X_test['difficulty']
        X_train = X_train[['x1', 'x2']]
        X_test = X_test[['x1', 'x2']]

        # Cap difficulty values above 1 at 1; everything else is kept as-is.
        # The result is a plain list in test-row order.
        values = X_test_difficulty.to_list()
        converted_values = [1 if value > 1 else value for value in values]
        X_test_difficulty = converted_values

        best_knn_model = knn(X_train, y_train)
        best_lr_model = logistic_regression(X_train, y_train)
        best_svm_model = svm(X_train, y_train)
        nb_model = naive_bayes(X_train, y_train)
        best_rf_model = random_forest(X_train, y_train)

        # The neural-network builder depends on how many classes the training labels contain.
        n_train_classes = len(np.unique(y_train))
        if n_train_classes == 2:
            best_snn_model = binary_neural_network(X_train, y_train, 'snn')
            best_dnn_model = binary_neural_network(X_train, y_train, 'dnn')
        elif n_train_classes > 2:
            best_snn_model = multiclass_neural_network(X_train, y_train, 'snn')
            best_dnn_model = multiclass_neural_network(X_train, y_train, 'dnn')
        else:
            # Without this guard the networks from the previous dataset would be reused.
            raise ValueError(f"{file_path}: need at least two classes in the training labels, "
                             f"found {n_train_classes}")

        y_val_pred_knn = best_knn_model.predict_proba(X_test)
        y_val_pred_lr = best_lr_model.predict_proba(X_test)
        y_val_pred_svm = best_svm_model.predict_proba(X_test)
        y_val_pred_nb = nb_model.predict_proba(X_test)
        y_val_pred_rf = best_rf_model.predict_proba(X_test)
        y_val_pred_snn = best_snn_model.predict_proba(X_test)
        y_val_pred_dnn = best_dnn_model.predict_proba(X_test)

        # Same key order as MODEL_LABELS so rows and columns line up with the labels.
        probabilities = {
            'knn': y_val_pred_knn,
            'lr': y_val_pred_lr,
            'svm': y_val_pred_svm,
            'nb': y_val_pred_nb,
            'rf': y_val_pred_rf,
            'snn': y_val_pred_snn,
            'dnn': y_val_pred_dnn,
        }

        y_type = type_of_target(y_test)
        true_labels = y_test
        # The capped test-set difficulty values are handed to the evaluation helpers as weights.
        weights = X_test_difficulty

        if y_type == 'binary':
            columns = BINARY_COLUMNS
        elif y_type == 'multiclass':
            # NOTE(review): assumes the multiclass datasets use labels 0, 1, 2 -- confirm.
            classes = [0, 1, 2]
            columns = MULTICLASS_COLUMNS
        else:
            # Previously this case silently re-wrote the previous dataset's `result`
            # (or failed with a NameError on the first file).
            raise ValueError(f"{file_path}: unsupported target type {y_type!r}")

        result_list = []
        for key, label in MODEL_LABELS.items():
            print(label)
            if y_type == 'binary':
                result_list.append(binary_evaluation(true_labels, probabilities[key], weights))
            else:
                result_list.append(multi_evaluation(true_labels, probabilities[key], classes, weights))

        result = pd.DataFrame(result_list, index=list(MODEL_LABELS), columns=columns)
        print(result)

        # One sheet per dataset, named by its position in file_list: 'a', 'b', 'c', ...
        sheet_name = lowercase_alphabet[i]
        result.to_excel(evaluation_result, sheet_name=sheet_name)

        # Per-sample detail: inputs, true label, difficulty and each model's
        # predicted class (column index of the highest predicted probability).
        predictions = pd.DataFrame({
            'x1': X_test['x1'],
            'x2': X_test['x2'],
            'label': y_test,
            'difficulty': X_test_difficulty,
            **{key: np.argmax(proba, axis=1) for key, proba in probabilities.items()},
        })
        predictions.to_excel(evaluatoin_detail, sheet_name=sheet_name)

../Evaluation/App3/simulated/app3_a_100.xlsx
Best: 1.000000 using {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
Best: 1.000000 using {'C': 0.01, 'penalty': 'l2'}
Best: 1.000000 using {'C': 0.01, 'kernel': 'linear'}
Best: 1.000000 using {'max_depth': None, 'n_estimators': 10}
Best: 1.000000 using {'batch_size': 32, 'epochs': 10}
Best: 1.000000 using {'batch_size': 32, 'epochs': 10}
KNN
Logistic Regression
SVM
Naive Bayes
Random Forest
Simple Neural Network
Deep Neural Network
     accuracy  sensitivity  specificity  ppv  npv  roc_auc  d_accuracy  \
knn       1.0          1.0          1.0  1.0  1.0      1.0         1.0   
lr        1.0          1.0          1.0  1.0  1.0      1.0         1.0   
svm       1.0          1.0          1.0  1.0  1.0      1.0         1.0   
nb        1.0          1.0          1.0  1.0  1.0      1.0         1.0   
rf        1.0          1.0          1.0  1.0  1.0      1.0         1.0   
snn       1.0          1.0          1.0  1.0  1.0      1.0