In [None]:
pip install scikeras

Note: you may need to restart the kernel to use updated packages.


In [None]:
pip install scikit-learn==1.2.2

Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
# Expose no CUDA device to this process, so TensorFlow (imported in a later
# cell) runs on the CPU. NOTE(review): this must execute before TensorFlow is
# imported to take effect.
os.environ["CUDA_VISIBLE_DEVICES"]="-1"
# Raise TensorFlow's native log threshold so routine INFO/WARNING chatter is
# hidden; errors are still printed.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [None]:
from glob import glob
from os.path import join, getctime
import pandas as pd
import numpy as np
from string import ascii_lowercase
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.utils.multiclass import type_of_target
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

import tensorflow as tf
from tensorflow import keras
from scikeras.wrappers import KerasClassifier
from keras.utils import to_categorical

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [None]:
# Continuous features that `standard` imputes and scales; every other column
# is passed through untouched.
numeric_columns = ['Age', 'Work_Experience', 'Family_Size']
# Shared preprocessing pipeline for the numeric columns: fill missing values
# with the column's most frequent value, then standardize.
# It is a module-level object, so whatever it was last fitted on stays stored
# in it until the next fit.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', StandardScaler())
])

def standard(original_data, fit=True):
  """Impute and standardize the numeric customer columns of a feature table.

  The columns listed in ``numeric_columns`` are passed through the shared
  ``numeric_transformer`` pipeline; all remaining columns are kept as they are.

  Parameters
  ----------
  original_data : pandas.DataFrame
      Feature table containing every column named in ``numeric_columns``.
  fit : bool, default True
      When True (the historical behaviour) the pipeline is re-fitted on
      ``original_data`` before transforming it. Pass False to re-use the
      statistics learned by an earlier call, e.g. to apply the training-set
      imputation values and scaling to a held-out split.

  Returns
  -------
  pandas.DataFrame
      The transformed numeric columns followed by the untouched remaining
      columns, carrying the same index as ``original_data``.
  """
  numeric_part = original_data[numeric_columns]
  if fit:
    numeric_values = numeric_transformer.fit_transform(numeric_part)
  else:
    numeric_values = numeric_transformer.transform(numeric_part)

  # Build the numeric frame directly on the caller's index so the two halves
  # line up without a temporary positional re-index.
  numeric_transformed = pd.DataFrame(numeric_values, columns=numeric_columns,
                                     index=original_data.index)
  categorical_transformed = original_data.drop(columns=numeric_columns)
  return pd.concat([numeric_transformed, categorical_transformed], axis=1)

# Machine Learning Models

In [None]:
def knn(X_train, y_train):
    """Tune a k-nearest-neighbours classifier with a 3-fold grid search.

    The search covers neighbour count, vote weighting and distance metric,
    prints the best cross-validated score with its parameters, and returns
    the best estimator found by the search.
    """
    search_space = {
        'n_neighbors': [3, 5, 7, 10],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan', 'minkowski'],
    }

    # n_jobs=-1 lets the search use every available core.
    tuner = GridSearchCV(KNeighborsClassifier(), search_space, cv=3, n_jobs=-1)
    tuner.fit(X_train, y_train)
    print("Best: %f using %s" % (tuner.best_score_, tuner.best_params_))
    return tuner.best_estimator_


def logistic_regression(X_train, y_train):
    """Tune an L2-penalised logistic regression with a 3-fold grid search.

    Only the regularisation strength ``C`` is varied. The best
    cross-validated score and parameters are printed and the best estimator
    found by the search is returned.
    """
    search_space = {
        'penalty': ['l2'],
        'C': [0.01, 0.1, 1.0, 10],
    }

    # max_iter is raised to 1000 for the solver.
    tuner = GridSearchCV(LogisticRegression(max_iter=1000), search_space,
                         cv=3, n_jobs=-1)
    tuner.fit(X_train, y_train)
    print("Best: %f using %s" % (tuner.best_score_, tuner.best_params_))
    return tuner.best_estimator_


def svm(X_train, y_train):
    """Tune a support-vector classifier with a 3-fold grid search.

    The search varies the regularisation strength ``C`` and the kernel, prints
    the best cross-validated score with its parameters, and returns the best
    estimator found by the search.

    ``probability=True`` is required because callers use ``predict_proba``.
    Those probability estimates rely on an internal randomised procedure, so
    the estimator is seeded (same seed as the train/test split) to make the
    reported probabilities reproducible between runs.
    """
    svm_model = SVC(probability=True, random_state=42)

    # Define the parameter grid for grid search
    param_grid = {'C': [0.01, 0.1, 1.0, 10],
                  'kernel': ['linear', 'rbf', 'sigmoid', 'poly']}

    # Perform grid search to find the best hyperparameters
    grid_search = GridSearchCV(svm_model, param_grid, cv=3, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    print("Best: %f using %s" % (grid_search.best_score_, grid_search.best_params_))
    return grid_search.best_estimator_


def naive_bayes(X_train, y_train):
    """Fit a Gaussian naive Bayes classifier and return it.

    No grid search is run for this model; it is fitted once with its default
    settings.
    """
    # fit() hands back the fitted estimator itself, so it can be returned directly.
    return GaussianNB().fit(X_train, y_train)


def random_forest(X_train, y_train):
    """Tune a random-forest classifier with a 3-fold grid search.

    The search varies the number of trees and the maximum tree depth, prints
    the best cross-validated score with its parameters, and returns the best
    estimator found by the search.

    The forest is seeded (same seed as the train/test split) so that the
    selected hyper-parameters and the reported metrics are reproducible
    between runs.
    """
    rf_model = RandomForestClassifier(random_state=42)

    # Define the parameter grid for grid search
    param_grid = {'n_estimators': [10, 100, 200],
                  'max_depth': [None, 10, 50]}

    # Perform grid search to find the best hyperparameters
    grid_search = GridSearchCV(rf_model, param_grid, cv=3, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    print("Best: %f using %s" % (grid_search.best_score_, grid_search.best_params_))
    return grid_search.best_estimator_


def binary_neural_network(X_train, y_train,net_type):
  """Tune a binary-output Keras network with a 3-fold grid search.

  Parameters
  ----------
  X_train : table with a ``shape`` attribute
      Training features; ``X_train.shape[1]`` sets the input width.
  y_train : array-like
      Training labels, passed unchanged to the grid search.
  net_type : {'snn', 'dnn'}
      'snn' builds Dense(64) -> Dense(32) -> Dense(1, sigmoid).
      'dnn' builds Dense(128) -> Dropout(0.2) -> Dense(64) -> Dropout(0.2)
      -> Dense(32) -> Dense(1, sigmoid).

  Returns
  -------
  The best ``KerasClassifier`` found over batch size and epoch count. The
  best cross-validated score and parameters are printed.

  Raises
  ------
  ValueError
      If ``net_type`` is neither 'snn' nor 'dnn'. Previously such a value
      fell through both branches and returned None, which only surfaced
      later as an AttributeError at prediction time.
  """
  if net_type not in ('snn', 'dnn'):
    raise ValueError("net_type must be 'snn' or 'dnn', got %r" % (net_type,))

  input_shape = (X_train.shape[1],)

  def create_model():
    # The two architectures differ only in the layers before the shared
    # Dense(32) -> Dense(1) tail.
    model = keras.Sequential()
    if net_type == 'snn':
      model.add(keras.layers.Dense(64, activation='relu', input_shape=input_shape))
    else:
      model.add(keras.layers.Dense(128, activation='relu', input_shape=input_shape))
      model.add(keras.layers.Dropout(0.2))
      model.add(keras.layers.Dense(64, activation='relu'))
      model.add(keras.layers.Dropout(0.2))
    model.add(keras.layers.Dense(32, activation='relu'))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

  model = KerasClassifier(model=create_model, verbose=0)
  param_grid = {'batch_size': [32, 64, 128],
                'epochs': [10, 30, 50, 100]}
  grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
  grid_search.fit(X_train, y_train)
  print("Best: %f using %s" % (grid_search.best_score_, grid_search.best_params_))
  return grid_search.best_estimator_


def multiclass_neural_network(X_train, y_train,net_type, num_classes=4):
  """Tune a softmax Keras network with a 3-fold grid search.

  Parameters
  ----------
  X_train : table with a ``shape`` attribute
      Training features; ``X_train.shape[1]`` sets the input width.
  y_train : array-like
      Training labels. They are one-hot encoded with ``to_categorical``
      before training (assumes integer labels in ``0 .. num_classes - 1``
      -- TODO confirm against the data).
  net_type : {'snn', 'dnn'}
      'snn' builds Dense(64) -> Dense(32) -> Dense(num_classes, softmax).
      'dnn' builds Dense(128) -> Dropout(0.2) -> Dense(64) -> Dropout(0.2)
      -> Dense(32) -> Dense(num_classes, softmax).
  num_classes : int, default 4
      Number of target classes. It sets both the one-hot width and the size
      of the output layer; the default keeps the previous hard-coded value.

  Returns
  -------
  The best ``KerasClassifier`` found over batch size and epoch count. The
  best cross-validated score and parameters are printed.

  Raises
  ------
  ValueError
      If ``net_type`` is neither 'snn' nor 'dnn'. Previously such a value
      fell through both branches and returned None, which only surfaced
      later as an AttributeError at prediction time.
  """
  if net_type not in ('snn', 'dnn'):
    raise ValueError("net_type must be 'snn' or 'dnn', got %r" % (net_type,))

  y_train = to_categorical(y_train, num_classes)
  input_shape = (X_train.shape[1],)

  def create_model():
    # The two architectures differ only in the layers before the shared
    # Dense(32) -> softmax tail.
    model = keras.Sequential()
    if net_type == 'snn':
      model.add(keras.layers.Dense(64, activation='relu', input_shape=input_shape))
    else:
      model.add(keras.layers.Dense(128, activation='relu', input_shape=input_shape))
      model.add(keras.layers.Dropout(0.2))
      model.add(keras.layers.Dense(64, activation='relu'))
      model.add(keras.layers.Dropout(0.2))
    model.add(keras.layers.Dense(32, activation='relu'))
    model.add(keras.layers.Dense(num_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

  model = KerasClassifier(model=create_model, verbose=0)
  param_grid = {'batch_size': [32, 64, 128],
                'epochs': [10, 30, 50, 100]}
  grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
  grid_search.fit(X_train, y_train)
  print("Best: %f using %s" % (grid_search.best_score_, grid_search.best_params_))
  return grid_search.best_estimator_

# Evaluation metrics
- micro_accuracy, micro_sensitivity, micro_specificity, micro_ppv, micro_npv, micro_auc
- macro_accuracy, macro_sensitivity, macro_specificity, macro_ppv, macro_npv, macro_auc
- d_micro_accuracy, d_micro_sensitivity, d_micro_specificity, d_micro_ppv, d_micro_npv, d_micro_auc
- d_macro_accuracy, d_macro_sensitivity, d_macro_specificity, d_macro_ppv, d_macro_npv, d_macro_auc

In [None]:
def multiclass_metrics(confusion_matrix):
    """Compute micro- and macro-averaged metrics from per-class counts.

    Parameters
    ----------
    confusion_matrix : dict
        Maps each class to a dict with the keys 'TP', 'FP', 'TN' and 'FN'.
        The counts may be plain integers or weighted (float) totals.

    Returns
    -------
    list of 10 floats
        ``[micro_accuracy, micro_sensitivity, micro_specificity, micro_ppv,
        micro_npv, macro_accuracy, macro_sensitivity, macro_specificity,
        macro_ppv, macro_npv]``.

        Micro values are computed from the counts summed over all classes;
        macro values are the unweighted mean of the per-class values.
        Any ratio whose denominator is not positive is reported as 0.0. This
        now also covers accuracy and the macro average, so an all-zero class
        or an empty matrix no longer raises ZeroDivisionError.
    """
    def _ratio(numerator, denominator):
        # Undefined ratios are reported as 0.0 instead of raising.
        return numerator / denominator if denominator > 0 else 0.0

    def _rates(TP, FP, TN, FN):
        # Order: accuracy, sensitivity, specificity, PPV, NPV.
        return [
            _ratio(TP + TN, TP + FP + TN + FN),
            _ratio(TP, TP + FN),
            _ratio(TN, TN + FP),
            _ratio(TP, TP + FP),
            _ratio(TN, TN + FN),
        ]

    num_classes = len(confusion_matrix)

    # Macro: accumulate the per-class rates, then average across classes.
    macro_totals = [0.0] * 5
    # Micro: accumulate the raw counts, then compute the rates once.
    pooled = {'TP': 0, 'FP': 0, 'TN': 0, 'FN': 0}
    for counts in confusion_matrix.values():
        class_rates = _rates(counts['TP'], counts['FP'], counts['TN'], counts['FN'])
        for position, value in enumerate(class_rates):
            macro_totals[position] += value
        for key in pooled:
            pooled[key] += counts[key]

    macro_rates = [_ratio(total, num_classes) for total in macro_totals]
    micro_rates = _rates(pooled['TP'], pooled['FP'], pooled['TN'], pooled['FN'])

    return micro_rates + macro_rates

def multiclass_confusion_matrix(true_labels, predicted_labels, classes, case_weights):
    """Build one-vs-rest confusion counts, plain and case-weighted.

    For every class each sample is tallied as TP, FP, TN or FN. In the
    weighted table a correct outcome (TP/TN) contributes the sample's weight
    ``w`` and an incorrect outcome (FP/FN) contributes ``1 - w``.

    Returns a pair ``(plain_table, weighted_table)``; each table maps a class
    to a dict with the keys 'TP', 'FP', 'TN' and 'FN'. When the weights sum
    to zero a notice is printed and the plain table is returned in both
    positions.
    """
    plain_counts = {label: {'TP': 0, 'FP': 0, 'TN': 0, 'FN': 0} for label in classes}
    for actual, predicted in zip(true_labels, predicted_labels):
        for label in classes:
            tally = plain_counts[label]
            tally['TP'] += (actual == label) and (predicted == label)
            tally['FP'] += (actual != label) and (predicted == label)
            tally['TN'] += (actual != label) and (predicted != label)
            tally['FN'] += (actual == label) and (predicted != label)

    weighted_counts = {label: {'TP': 0, 'FP': 0, 'TN': 0, 'FN': 0} for label in classes}
    for actual, predicted, weight in zip(true_labels, predicted_labels, case_weights):
        penalty = 1 - weight
        for label in classes:
            tally = weighted_counts[label]
            tally['TP'] += weight * ((actual == label) and (predicted == label))
            tally['FP'] += penalty * ((actual != label) and (predicted == label))
            tally['TN'] += weight * ((actual != label) and (predicted != label))
            tally['FN'] += penalty * ((actual == label) and (predicted != label))

    if sum(case_weights) != 0:
        return plain_counts, weighted_counts

    # All-zero weights: fall back to the conventional counts for both results.
    print("Sum of class weights is 0")
    return plain_counts, plain_counts


def multi_evaluation(true_labels, prediction_probabilities, classes, case_weights):
    """Score one model with conventional and case-weighted multiclass metrics.

    Parameters
    ----------
    true_labels : array-like
        True class of every sample (assumes integer labels in
        ``0 .. len(classes) - 1`` -- TODO confirm against the data).
    prediction_probabilities : 2-D array-like
        One row per sample and one column per class; the predicted label is
        the column with the largest value.
    classes : sequence
        The class labels to evaluate.
    case_weights : array-like
        One weight per sample. Correct outcomes count with weight ``w`` and
        incorrect outcomes with ``1 - w``.

    Returns
    -------
    list of 24 values, each rounded to 3 decimals
        Twelve conventional metrics followed by the twelve weighted ones.
        Each group is ordered: micro accuracy, sensitivity, specificity, PPV,
        NPV, AUC, then the same six as macro averages.

    Notes
    -----
    * AUC is approximated by trapezoidal integration over 100 evenly spaced
      thresholds in [0, 1], not over the exact ROC curve.
    * The one-hot width is fixed to ``len(classes)`` so it always matches the
      probability matrix, even if the highest class is missing from
      ``true_labels``.
    * When the weights sum to zero, the weighted AUCs equal the conventional
      ones. ``multiclass_confusion_matrix`` prints the notice for that case.
    * A class with no positive (or no negative) samples yields an undefined
      rate, which propagates as NaN into the corresponding AUC.
    """
    confusion_matrix, d_confusion_matrix = multiclass_confusion_matrix(
        true_labels, np.argmax(prediction_probabilities, axis=1), classes, case_weights)
    conventional_result = multiclass_metrics(confusion_matrix)
    new_result = multiclass_metrics(d_confusion_matrix)

    n_classes = len(classes)
    y_true = to_categorical(true_labels, n_classes)
    y_pred = np.asarray(prediction_probabilities)

    # Plain float arrays keep the arithmetic below purely positional and avoid
    # re-converting the weights on every threshold.
    sample_weight = np.asarray(case_weights, dtype=float)
    # Row-major flattening lists all classes of sample 0 first, so each
    # sample's weight is repeated once per class to stay aligned with ravel().
    expanded_weight = np.repeat(sample_weight, n_classes)
    use_weights = sample_weight.sum() != 0

    def calculate_tpr_fpr_binary(y_true, y_pred, threshold, weight):
        """Return (tpr, fpr, weighted_tpr, weighted_fpr) at one threshold."""
        predicted_positive = y_pred >= threshold
        actual_positive = y_true == 1
        actual_negative = y_true == 0

        hits = actual_positive & predicted_positive
        false_alarms = actual_negative & predicted_positive
        rejections = actual_negative & ~predicted_positive
        misses = actual_positive & ~predicted_positive

        tp, fp, tn, fn = np.sum(hits), np.sum(false_alarms), np.sum(rejections), np.sum(misses)
        tpr = tp / (tp + fn)
        fpr = fp / (fp + tn)
        if not use_weights:
            return tpr, fpr, tpr, fpr

        penalty = 1.0 - weight
        d_tp = np.sum(weight * hits)
        d_fp = np.sum(penalty * false_alarms)
        d_tn = np.sum(weight * rejections)
        d_fn = np.sum(penalty * misses)
        return tpr, fpr, d_tp / (d_tp + d_fn), d_fp / (d_fp + d_tn)

    thresholds = np.linspace(0, 1, 100)

    def roc_auc_pair(labels, scores, weight):
        """Integrate the conventional and the weighted ROC curve."""
        points = [calculate_tpr_fpr_binary(labels, scores, threshold, weight)
                  for threshold in thresholds]
        tprs, fprs, d_tprs, d_fprs = zip(*points)
        return np.trapz(tprs, fprs), np.trapz(d_tprs, d_fprs)

    # Micro AUC: treat every (sample, class) cell as one binary decision.
    micro_auc, d_micro_auc = roc_auc_pair(y_true.ravel(), y_pred.ravel(), expanded_weight)

    # Macro AUC: one-vs-rest AUC per class, averaged over the classes.
    macro_auc = 0.0
    d_macro_auc = 0.0
    for i in range(n_classes):
        auc_i, d_auc_i = roc_auc_pair(y_true[:, i], y_pred[:, i], sample_weight)
        macro_auc += auc_i
        d_macro_auc += d_auc_i
    macro_auc /= n_classes
    d_macro_auc /= n_classes

    # FPR falls as the threshold rises, so the integrals come out negative;
    # abs() restores the area. Each AUC is slotted in after the five metrics
    # of its averaging scheme.
    conventional_result = (conventional_result[:5] + [abs(micro_auc)]
                           + conventional_result[5:] + [abs(macro_auc)])
    new_result = (new_result[:5] + [abs(d_micro_auc)]
                  + new_result[5:] + [abs(d_macro_auc)])

    conventional_result = [np.round(x, 3) for x in conventional_result]
    new_result = [np.round(x, 3) for x in new_result]
    return conventional_result + new_result

# Evaluation

## CDmc

In [None]:
# Load the App1 customer table; the first spreadsheet column becomes the index.
# The evaluation cell below relies on the last two columns being the target
# and the per-case difficulty.
app1_customer = pd.read_excel('../Evaluation/real_world/app1_customer_one_hot.xlsx',
                              index_col=[0])
# Bare last expression: the frame is rendered as this cell's output.
app1_customer

Unnamed: 0_level_0,Gender_Female,Gender_Male,Ever_Married_No,Ever_Married_Yes,Graduated_No,Graduated_Yes,Profession_Artist,Profession_Doctor,Profession_Engineer,Profession_Entertainment,...,Var_1_Cat_3,Var_1_Cat_4,Var_1_Cat_5,Var_1_Cat_6,Var_1_Cat_7,Age,Work_Experience,Family_Size,y,difficulty
case_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,1,1,0,1,0,0,0,0,0,...,0,1,0,0,0,22,1.0,4.0,3,0.012346
1,1,0,0,1,0,1,0,0,1,0,...,0,1,0,0,0,38,,3.0,0,1.000000
2,1,0,0,1,0,1,0,0,1,0,...,0,0,0,1,0,67,1.0,1.0,1,0.395062
3,0,1,0,1,0,1,0,0,0,0,...,0,0,0,1,0,67,0.0,2.0,1,0.024691
4,1,0,0,1,0,1,0,0,0,1,...,0,0,0,1,0,40,,6.0,0,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8063,0,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,22,0.0,7.0,3,0.024691
8064,0,1,1,0,1,0,0,0,0,0,...,0,1,0,0,0,35,3.0,4.0,3,0.012346
8065,1,0,1,0,0,1,0,0,0,0,...,0,0,0,1,0,33,1.0,1.0,3,0.012346
8066,1,0,1,0,0,1,0,0,0,0,...,0,0,0,1,0,27,1.0,4.0,1,1.000000


In [None]:
# Train every classifier on the App1 (CDmc) data, score each one with the
# conventional and the difficulty-weighted metrics, and export both the metric
# table and the per-case predictions to Excel.
sheet_name = 'App1'
outcome_dir = '../Evaluation/outcome/'
# Make sure the output folder exists before the long training run, so the
# export at the end cannot fail on a missing directory.
os.makedirs(outcome_dir, exist_ok=True)

# The last two columns are the target and the per-case difficulty. .copy()
# gives X its own data, so adding 'difficulty' does not write into a slice of
# app1_customer (avoids pandas' SettingWithCopyWarning).
X = app1_customer.iloc[:,:-2].copy()
X['difficulty'] = app1_customer['difficulty']
y = app1_customer['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# 'difficulty' rides through the split only to stay aligned with X_test; it is
# an evaluation weight and is removed again before any model sees the features.
X_test_difficulty = X_test['difficulty']
X_train = X_train.iloc[:,:-1]
X_test = X_test.iloc[:,:-1]

# standard() fits the shared numeric_transformer on the training split. The
# test split is then transformed with those same statistics rather than being
# re-fitted on itself, so both splits are imputed and scaled identically and
# no test-set information enters the preprocessing.
X_train = standard(X_train)
X_test_numeric = pd.DataFrame(numeric_transformer.transform(X_test[numeric_columns]),
                              columns=numeric_columns, index=X_test.index)
X_test = pd.concat([X_test_numeric, X_test.drop(columns=numeric_columns)], axis=1)

best_knn_model = knn(X_train,y_train)
best_lr_model = logistic_regression(X_train,y_train)
best_svm_model = svm(X_train, y_train)
nb_model = naive_bayes(X_train, y_train)
best_rf_model = random_forest(X_train, y_train)

if len(np.unique(y_train)) == 2:
  best_snn_model = binary_neural_network(X_train, y_train,'snn')
  best_dnn_model = binary_neural_network(X_train, y_train,'dnn')
elif len(np.unique(y_train)) > 2:
  best_snn_model = multiclass_neural_network(X_train, y_train,'snn')
  best_dnn_model = multiclass_neural_network(X_train, y_train,'dnn')

y_val_pred_knn = best_knn_model.predict_proba(X_test)
y_val_pred_lr = best_lr_model.predict_proba(X_test)
y_val_pred_svm = best_svm_model.predict_proba(X_test)
y_val_pred_nb = nb_model.predict_proba(X_test)
y_val_pred_rf = best_rf_model.predict_proba(X_test)
y_val_pred_snn = best_snn_model.predict_proba(X_test)
y_val_pred_dnn = best_dnn_model.predict_proba(X_test)

y_type = type_of_target(y_test)
# multi_evaluation only handles multiclass targets. Stop with a clear message
# instead of hitting an undefined `result` below, or silently exporting a
# stale `result` left over from a previously executed cell.
if y_type != 'multiclass':
    raise ValueError("Expected a multiclass target for %s, got %r" % (sheet_name, y_type))

classes = [0, 1, 2, 3]
true_labels = y_test
weights = X_test_difficulty

print("KNN")
knn_result = multi_evaluation(true_labels, y_val_pred_knn, classes, weights)

print("Logistic Regression")
lr_result = multi_evaluation(true_labels, y_val_pred_lr, classes, weights)

print("SVM")
svm_result = multi_evaluation(true_labels, y_val_pred_svm, classes, weights)

print("Naive Bayes")
nb_result = multi_evaluation(true_labels, y_val_pred_nb, classes, weights)

print("Random Forest")
rf_result = multi_evaluation(true_labels, y_val_pred_rf, classes, weights)

print("Simpe Neural Network")
snn_result = multi_evaluation(true_labels, y_val_pred_snn, classes, weights)

print("Deep Neural Network")
dnn_result = multi_evaluation(true_labels, y_val_pred_dnn, classes, weights)

result_list = [knn_result, lr_result, svm_result, nb_result, rf_result,snn_result,dnn_result]
result = pd.DataFrame(result_list, index=['knn','lr','svm','nb','rf','snn','dnn'],
                      columns=['micro_accuracy', 'micro_sensitivity', 'micro_specificity', 'micro_ppv', 'micro_npv', 'micro_auc',
                               'macro_accuracy', 'macro_sensitivity', 'macro_specificity', 'macro_ppv', 'macro_npv', 'macro_auc',
                               'd_micro_accuracy', 'd_micro_sensitivity', 'd_micro_specificity', 'd_micro_ppv', 'd_micro_npv', 'd_micro_auc',
                               'd_macro_accuracy', 'd_macro_sensitivity', 'd_macro_specificity', 'd_macro_ppv', 'd_macro_npv', 'd_macro_auc'])
print(result)

# Per-case table: test features next to the label, the difficulty and every
# model's predicted class (argmax of its probabilities).
predictions = pd.DataFrame({'label': y_test,'difficulty':X_test_difficulty,
         'knn': np.argmax(y_val_pred_knn, axis=1),'lr': np.argmax(y_val_pred_lr, axis=1),'svm': np.argmax(y_val_pred_svm, axis=1),
         'nb': np.argmax(y_val_pred_nb, axis=1),'rf': np.argmax(y_val_pred_rf, axis=1),'snn':np.argmax(y_val_pred_snn, axis=1),
                          'dnn':np.argmax(y_val_pred_dnn, axis=1)})
new_df = pd.concat([X_test, predictions], axis=1)

# The context managers save and close each workbook even if writing raises.
with pd.ExcelWriter(outcome_dir + 'Customer_evaluation_App1.xlsx',
                    engine='openpyxl') as Customer_evaluation:
    result.to_excel(Customer_evaluation, sheet_name=sheet_name)

with pd.ExcelWriter(outcome_dir + 'Customer_evaluation_App1_detailed.xlsx',
                    engine='openpyxl') as Customer_detail:
    new_df.to_excel(Customer_detail, sheet_name=sheet_name)

Best: 0.494775 using {'metric': 'manhattan', 'n_neighbors': 10, 'weights': 'uniform'}
Best: 0.508058 using {'C': 0.1, 'penalty': 'l2'}
Best: 0.522579 using {'C': 1.0, 'kernel': 'rbf'}
Best: 0.528069 using {'max_depth': 10, 'n_estimators': 200}


2023-10-07 00:08:35.272088: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


Best: 0.529663 using {'batch_size': 128, 'epochs': 30}
Best: 0.526830 using {'batch_size': 128, 'epochs': 30}
KNN
Logistic Regression
SVM
Naive Bayes
Random Forest
Simpe Neural Network
Deep Neural Network
     micro_accuracy  micro_sensitivity  micro_specificity  micro_ppv  \
knn           0.743              0.487              0.829      0.487   
lr            0.752              0.505              0.835      0.505   
svm           0.763              0.526              0.842      0.526   
nb            0.747              0.494              0.831      0.494   
rf            0.768              0.537              0.846      0.537   
snn           0.761              0.523              0.841      0.523   
dnn           0.766              0.532              0.844      0.532   

     micro_npv  micro_auc  macro_accuracy  macro_sensitivity  \
knn      0.829      0.757           0.743              0.477   
lr       0.835      0.776           0.752              0.491   
svm      0.842      0.790 

## CDdm

In [None]:
# Load the App2 customer table; the first spreadsheet column becomes the index.
# The evaluation cell below relies on the last two columns being the target
# and the per-case difficulty.
app2_customer = pd.read_excel('../Evaluation/real_world/app2_customer_one_hot.xlsx',
                              index_col=[0])
# Bare last expression: the frame is rendered as this cell's output.
app2_customer

Unnamed: 0,Gender_Female,Gender_Male,Ever_Married_No,Ever_Married_Yes,Graduated_No,Graduated_Yes,Profession_Artist,Profession_Doctor,Profession_Engineer,Profession_Entertainment,...,Var_1_Cat_3,Var_1_Cat_4,Var_1_Cat_5,Var_1_Cat_6,Var_1_Cat_7,Age,Work_Experience,Family_Size,label,difficulty
0,0,1,1,0,1,0,0,0,0,0,...,0,1,0,0,0,22,1.0,4.0,3,3.576279e-07
1,1,0,1,0,0,1,0,0,0,0,...,0,0,0,1,0,33,1.0,3.0,3,5.083613e-01
2,1,0,1,0,1,0,0,0,0,0,...,0,0,0,1,0,18,3.0,4.0,3,9.298325e-06
3,1,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,58,1.0,3.0,1,7.616209e-01
4,0,1,0,1,1,0,1,0,0,0,...,0,0,0,1,0,56,1.0,3.0,2,3.303696e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8063,0,1,0,1,1,0,0,0,0,0,...,0,1,0,0,0,39,8.0,4.0,0,6.890299e-01
8064,0,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,23,1.0,3.0,3,1.637352e-02
8065,0,1,0,1,1,0,0,0,0,0,...,0,0,0,1,0,85,1.0,1.0,3,6.728786e-01
8066,0,1,0,1,1,0,1,0,0,0,...,0,0,0,1,0,65,0.0,2.0,2,5.811445e-01


In [None]:
# Train every classifier on the App2 (CDdm) data, score each one with the
# conventional and the difficulty-weighted metrics, and export both the metric
# table and the per-case predictions to Excel.
sheet_name = 'App2'
outcome_dir = '../Evaluation/outcome/'
# Make sure the output folder exists before the long training run, so the
# export at the end cannot fail on a missing directory.
os.makedirs(outcome_dir, exist_ok=True)

# The last two columns are the target and the per-case difficulty. .copy()
# gives X its own data, so adding 'difficulty' does not write into a slice of
# app2_customer (avoids pandas' SettingWithCopyWarning).
X = app2_customer.iloc[:,:-2].copy()
X['difficulty'] = app2_customer['difficulty']
y = app2_customer['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# 'difficulty' rides through the split only to stay aligned with X_test; it is
# an evaluation weight and is removed again before any model sees the features.
X_test_difficulty = X_test['difficulty']
X_train = X_train.iloc[:,:-1]
X_test = X_test.iloc[:,:-1]

# standard() fits the shared numeric_transformer on the training split. The
# test split is then transformed with those same statistics rather than being
# re-fitted on itself, so both splits are imputed and scaled identically and
# no test-set information enters the preprocessing.
X_train = standard(X_train)
X_test_numeric = pd.DataFrame(numeric_transformer.transform(X_test[numeric_columns]),
                              columns=numeric_columns, index=X_test.index)
X_test = pd.concat([X_test_numeric, X_test.drop(columns=numeric_columns)], axis=1)

best_knn_model = knn(X_train,y_train)
best_lr_model = logistic_regression(X_train,y_train)
best_svm_model = svm(X_train, y_train)
nb_model = naive_bayes(X_train, y_train)
best_rf_model = random_forest(X_train, y_train)

if len(np.unique(y_train)) == 2:
  best_snn_model = binary_neural_network(X_train, y_train,'snn')
  best_dnn_model = binary_neural_network(X_train, y_train,'dnn')
elif len(np.unique(y_train)) > 2:
  best_snn_model = multiclass_neural_network(X_train, y_train,'snn')
  best_dnn_model = multiclass_neural_network(X_train, y_train,'dnn')

y_val_pred_knn = best_knn_model.predict_proba(X_test)
y_val_pred_lr = best_lr_model.predict_proba(X_test)
y_val_pred_svm = best_svm_model.predict_proba(X_test)
y_val_pred_nb = nb_model.predict_proba(X_test)
y_val_pred_rf = best_rf_model.predict_proba(X_test)
y_val_pred_snn = best_snn_model.predict_proba(X_test)
y_val_pred_dnn = best_dnn_model.predict_proba(X_test)

y_type = type_of_target(y_test)
# multi_evaluation only handles multiclass targets. Stop with a clear message
# instead of hitting an undefined `result` below, or silently exporting a
# stale `result` left over from a previously executed cell.
if y_type != 'multiclass':
    raise ValueError("Expected a multiclass target for %s, got %r" % (sheet_name, y_type))

classes = [0, 1, 2, 3]
true_labels = y_test
weights = X_test_difficulty

print("KNN")
knn_result = multi_evaluation(true_labels, y_val_pred_knn, classes, weights)

print("Logistic Regression")
lr_result = multi_evaluation(true_labels, y_val_pred_lr, classes, weights)

print("SVM")
svm_result = multi_evaluation(true_labels, y_val_pred_svm, classes, weights)

print("Naive Bayes")
nb_result = multi_evaluation(true_labels, y_val_pred_nb, classes, weights)

print("Random Forest")
rf_result = multi_evaluation(true_labels, y_val_pred_rf, classes, weights)

print("Simpe Neural Network")
snn_result = multi_evaluation(true_labels, y_val_pred_snn, classes, weights)

print("Deep Neural Network")
dnn_result = multi_evaluation(true_labels, y_val_pred_dnn, classes, weights)

result_list = [knn_result, lr_result, svm_result, nb_result, rf_result,snn_result,dnn_result]
result = pd.DataFrame(result_list, index=['knn','lr','svm','nb','rf','snn','dnn'],
                      columns=['micro_accuracy', 'micro_sensitivity', 'micro_specificity', 'micro_ppv', 'micro_npv', 'micro_auc',
                               'macro_accuracy', 'macro_sensitivity', 'macro_specificity', 'macro_ppv', 'macro_npv', 'macro_auc',
                               'd_micro_accuracy', 'd_micro_sensitivity', 'd_micro_specificity', 'd_micro_ppv', 'd_micro_npv', 'd_micro_auc',
                               'd_macro_accuracy', 'd_macro_sensitivity', 'd_macro_specificity', 'd_macro_ppv', 'd_macro_npv', 'd_macro_auc'])
print(result)

# Per-case table: test features next to the label, the difficulty and every
# model's predicted class (argmax of its probabilities).
predictions = pd.DataFrame({'label': y_test,'difficulty':X_test_difficulty,
         'knn': np.argmax(y_val_pred_knn, axis=1),'lr': np.argmax(y_val_pred_lr, axis=1),'svm': np.argmax(y_val_pred_svm, axis=1),
         'nb': np.argmax(y_val_pred_nb, axis=1),'rf': np.argmax(y_val_pred_rf, axis=1),'snn':np.argmax(y_val_pred_snn, axis=1),
                          'dnn':np.argmax(y_val_pred_dnn, axis=1)})
new_df = pd.concat([X_test, predictions], axis=1)

# The context managers save and close each workbook even if writing raises.
with pd.ExcelWriter(outcome_dir + 'Customer_evaluation_App2.xlsx',
                    engine='openpyxl') as Customer_evaluation:
    result.to_excel(Customer_evaluation, sheet_name=sheet_name)

with pd.ExcelWriter(outcome_dir + 'Customer_evaluation_App2_detailed.xlsx',
                    engine='openpyxl') as Customer_detail:
    new_df.to_excel(Customer_detail, sheet_name=sheet_name)

Best: 0.491058 using {'metric': 'euclidean', 'n_neighbors': 10, 'weights': 'uniform'}
Best: 0.508235 using {'C': 10, 'penalty': 'l2'}
Best: 0.525413 using {'C': 1.0, 'kernel': 'rbf'}
Best: 0.525059 using {'max_depth': 10, 'n_estimators': 200}
Best: 0.523465 using {'batch_size': 128, 'epochs': 30}
Best: 0.528778 using {'batch_size': 64, 'epochs': 30}
KNN
Logistic Regression
SVM
Naive Bayes
Random Forest
Simpe Neural Network
Deep Neural Network
     micro_accuracy  micro_sensitivity  micro_specificity  micro_ppv  \
knn           0.746              0.493              0.831      0.493   
lr            0.752              0.504              0.835      0.504   
svm           0.762              0.525              0.842      0.525   
nb            0.738              0.475              0.825      0.475   
rf            0.767              0.534              0.845      0.534   
snn           0.759              0.519              0.840      0.519   
dnn           0.761              0.523           

## CDpu

In [None]:
# Load the App3 customer table; the first spreadsheet column becomes the index.
# The evaluation cell below relies on the last two columns being the target
# and the per-case difficulty.
app3_customer = pd.read_excel('../Evaluation/real_world/app3_customer_one_hot.xlsx',
                              index_col=[0])
# Bare last expression: the frame is rendered as this cell's output.
app3_customer

Unnamed: 0,Gender_Female,Gender_Male,Ever_Married_No,Ever_Married_Yes,Graduated_No,Graduated_Yes,Profession_Artist,Profession_Doctor,Profession_Engineer,Profession_Entertainment,...,Var_1_Cat_3,Var_1_Cat_4,Var_1_Cat_5,Var_1_Cat_6,Var_1_Cat_7,Age,Work_Experience,Family_Size,label,difficulty
0,0,1,1,0,1,0,0,0,0,0,...,0,1,0,0,0,22,1.0,4.0,3,0.063331
1,1,0,0,1,0,1,0,0,1,0,...,0,1,0,0,0,38,,3.0,0,0.327182
2,1,0,0,1,0,1,0,0,1,0,...,0,0,0,1,0,67,1.0,1.0,1,0.287890
3,0,1,0,1,0,1,0,0,0,0,...,0,0,0,1,0,67,0.0,2.0,1,0.273518
4,1,0,0,1,0,1,0,0,0,1,...,0,0,0,1,0,40,,6.0,0,0.358977
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8063,0,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,22,0.0,7.0,3,0.306748
8064,0,1,1,0,1,0,0,0,0,0,...,0,1,0,0,0,35,3.0,4.0,3,0.255205
8065,1,0,1,0,0,1,0,0,0,0,...,0,0,0,1,0,33,1.0,1.0,3,0.229414
8066,1,0,1,0,0,1,0,0,0,0,...,0,0,0,1,0,27,1.0,4.0,1,0.308741


In [None]:
# Train every classifier on the App3 (CDpu) data, score each one with the
# conventional and the difficulty-weighted metrics, and export both the metric
# table and the per-case predictions to Excel.
sheet_name = 'App3'
outcome_dir = '../Evaluation/outcome/'
# Make sure the output folder exists before the long training run, so the
# export at the end cannot fail on a missing directory.
os.makedirs(outcome_dir, exist_ok=True)

# The last two columns are the target and the per-case difficulty. .copy()
# gives X its own data, so adding 'difficulty' does not write into a slice of
# app3_customer (avoids pandas' SettingWithCopyWarning).
X = app3_customer.iloc[:,:-2].copy()
X['difficulty'] = app3_customer['difficulty']
y = app3_customer['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# 'difficulty' rides through the split only to stay aligned with X_test; it is
# an evaluation weight and is removed again before any model sees the features.
X_test_difficulty = X_test['difficulty']
X_train = X_train.iloc[:,:-1]
X_test = X_test.iloc[:,:-1]

# standard() fits the shared numeric_transformer on the training split. The
# test split is then transformed with those same statistics rather than being
# re-fitted on itself, so both splits are imputed and scaled identically and
# no test-set information enters the preprocessing.
X_train = standard(X_train)
X_test_numeric = pd.DataFrame(numeric_transformer.transform(X_test[numeric_columns]),
                              columns=numeric_columns, index=X_test.index)
X_test = pd.concat([X_test_numeric, X_test.drop(columns=numeric_columns)], axis=1)

best_knn_model = knn(X_train,y_train)
best_lr_model = logistic_regression(X_train,y_train)
best_svm_model = svm(X_train, y_train)
nb_model = naive_bayes(X_train, y_train)
best_rf_model = random_forest(X_train, y_train)

if len(np.unique(y_train)) == 2:
  best_snn_model = binary_neural_network(X_train, y_train,'snn')
  best_dnn_model = binary_neural_network(X_train, y_train,'dnn')
elif len(np.unique(y_train)) > 2:
  best_snn_model = multiclass_neural_network(X_train, y_train,'snn')
  best_dnn_model = multiclass_neural_network(X_train, y_train,'dnn')

y_val_pred_knn = best_knn_model.predict_proba(X_test)
y_val_pred_lr = best_lr_model.predict_proba(X_test)
y_val_pred_svm = best_svm_model.predict_proba(X_test)
y_val_pred_nb = nb_model.predict_proba(X_test)
y_val_pred_rf = best_rf_model.predict_proba(X_test)
y_val_pred_snn = best_snn_model.predict_proba(X_test)
y_val_pred_dnn = best_dnn_model.predict_proba(X_test)

y_type = type_of_target(y_test)
# multi_evaluation only handles multiclass targets. Stop with a clear message
# instead of hitting an undefined `result` below, or silently exporting a
# stale `result` left over from a previously executed cell.
if y_type != 'multiclass':
    raise ValueError("Expected a multiclass target for %s, got %r" % (sheet_name, y_type))

classes = [0, 1, 2, 3]
true_labels = y_test
weights = X_test_difficulty

print("KNN")
knn_result = multi_evaluation(true_labels, y_val_pred_knn, classes, weights)

print("Logistic Regression")
lr_result = multi_evaluation(true_labels, y_val_pred_lr, classes, weights)

print("SVM")
svm_result = multi_evaluation(true_labels, y_val_pred_svm, classes, weights)

print("Naive Bayes")
nb_result = multi_evaluation(true_labels, y_val_pred_nb, classes, weights)

print("Random Forest")
rf_result = multi_evaluation(true_labels, y_val_pred_rf, classes, weights)

print("Simpe Neural Network")
snn_result = multi_evaluation(true_labels, y_val_pred_snn, classes, weights)

print("Deep Neural Network")
dnn_result = multi_evaluation(true_labels, y_val_pred_dnn, classes, weights)

result_list = [knn_result, lr_result, svm_result, nb_result, rf_result,snn_result,dnn_result]
result = pd.DataFrame(result_list, index=['knn','lr','svm','nb','rf','snn','dnn'],
                      columns=['micro_accuracy', 'micro_sensitivity', 'micro_specificity', 'micro_ppv', 'micro_npv', 'micro_auc',
                               'macro_accuracy', 'macro_sensitivity', 'macro_specificity', 'macro_ppv', 'macro_npv', 'macro_auc',
                               'd_micro_accuracy', 'd_micro_sensitivity', 'd_micro_specificity', 'd_micro_ppv', 'd_micro_npv', 'd_micro_auc',
                               'd_macro_accuracy', 'd_macro_sensitivity', 'd_macro_specificity', 'd_macro_ppv', 'd_macro_npv', 'd_macro_auc'])
print(result)

# Per-case table: test features next to the label, the difficulty and every
# model's predicted class (argmax of its probabilities).
predictions = pd.DataFrame({'label': y_test,'difficulty':X_test_difficulty,
         'knn': np.argmax(y_val_pred_knn, axis=1),'lr': np.argmax(y_val_pred_lr, axis=1),'svm': np.argmax(y_val_pred_svm, axis=1),
         'nb': np.argmax(y_val_pred_nb, axis=1),'rf': np.argmax(y_val_pred_rf, axis=1),'snn':np.argmax(y_val_pred_snn, axis=1),
                          'dnn':np.argmax(y_val_pred_dnn, axis=1)})
new_df = pd.concat([X_test, predictions], axis=1)

# The context managers save and close each workbook even if writing raises.
with pd.ExcelWriter(outcome_dir + 'Customer_evaluation_App3.xlsx',
                    engine='openpyxl') as Customer_evaluation:
    result.to_excel(Customer_evaluation, sheet_name=sheet_name)

with pd.ExcelWriter(outcome_dir + 'Customer_evaluation_App3_detailed.xlsx',
                    engine='openpyxl') as Customer_detail:
    new_df.to_excel(Customer_detail, sheet_name=sheet_name)

Best: 0.494775 using {'metric': 'manhattan', 'n_neighbors': 10, 'weights': 'uniform'}
Best: 0.508058 using {'C': 0.1, 'penalty': 'l2'}
Best: 0.522579 using {'C': 1.0, 'kernel': 'rbf'}
Best: 0.528069 using {'max_depth': 10, 'n_estimators': 200}
Best: 0.527185 using {'batch_size': 128, 'epochs': 30}
Best: 0.525059 using {'batch_size': 64, 'epochs': 10}
KNN
Logistic Regression
SVM
Naive Bayes
Random Forest
Simpe Neural Network
Deep Neural Network
     micro_accuracy  micro_sensitivity  micro_specificity  micro_ppv  \
knn           0.743              0.487              0.829      0.487   
lr            0.752              0.505              0.835      0.505   
svm           0.764              0.527              0.842      0.527   
nb            0.747              0.494              0.831      0.494   
rf            0.768              0.535              0.845      0.535   
snn           0.768              0.535              0.845      0.535   
dnn           0.770              0.539          