In [None]:
pip install scikeras

Note: you may need to restart the kernel to use updated packages.


In [None]:
pip install scikit-learn==1.2.2

Note: you may need to restart the kernel to use updated packages.


In [None]:
import os

# Both variables are set before TensorFlow is imported in the next cell.
# CUDA_VISIBLE_DEVICES="-1" presumably forces CPU-only execution and
# TF_CPP_MIN_LOG_LEVEL='2' presumably quiets TensorFlow's native logging - TODO confirm.
os.environ.update({
    "CUDA_VISIBLE_DEVICES": "-1",
    'TF_CPP_MIN_LOG_LEVEL': '2',
})

In [None]:
from glob import glob
from os.path import join, getctime
import pandas as pd
import numpy as np
from string import ascii_lowercase
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.utils.multiclass import type_of_target
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

import tensorflow as tf
from tensorflow import keras
from scikeras.wrappers import KerasClassifier
from keras.utils import to_categorical

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Machine Learning Models

In [None]:
def knn(X_train, y_train):
    """Tune a k-nearest-neighbours classifier with a 3-fold grid search.

    The search covers the neighbour count, the vote weighting scheme and the
    distance metric. The best score and its parameters are printed, and the
    best estimator found by the search is returned.
    """
    search_space = dict(
        n_neighbors=[3, 5, 7, 10],
        weights=['uniform', 'distance'],
        metric=['euclidean', 'manhattan', 'minkowski'],
    )

    tuner = GridSearchCV(
        estimator=KNeighborsClassifier(),
        param_grid=search_space,
        cv=3,
        n_jobs=-1,
    )
    # fit() returns the search object itself, so `tuner` carries the results
    tuner.fit(X_train, y_train)
    print("Best: %f using %s" % (tuner.best_score_, tuner.best_params_))
    return tuner.best_estimator_

def logistic_regression(X_train, y_train):
    """Tune an L2-penalised logistic regression with a 3-fold grid search.

    Only the regularisation strength ``C`` is varied. The best score and its
    parameters are printed, and the best estimator found is returned.
    """
    search_space = dict(
        penalty=['l2'],
        C=[0.01, 0.1, 1.0, 10],
    )

    tuner = GridSearchCV(
        estimator=LogisticRegression(max_iter=1000),
        param_grid=search_space,
        cv=3,
        n_jobs=-1,
    )
    # fit() returns the search object itself, so `tuner` carries the results
    tuner.fit(X_train, y_train)
    print("Best: %f using %s" % (tuner.best_score_, tuner.best_params_))
    return tuner.best_estimator_

def svm(X_train, y_train):
    """Tune a support-vector classifier with a 3-fold grid search.

    The classifier is built with ``probability=True`` so that the returned
    estimator exposes ``predict_proba``. The search covers ``C`` and the
    kernel type. The best score and its parameters are printed, and the best
    estimator found is returned.
    """
    search_space = dict(
        C=[0.01, 0.1, 1.0, 10],
        kernel=['linear', 'rbf', 'sigmoid', 'poly'],
    )

    tuner = GridSearchCV(
        estimator=SVC(probability=True),
        param_grid=search_space,
        cv=3,
        n_jobs=-1,
    )
    # fit() returns the search object itself, so `tuner` carries the results
    tuner.fit(X_train, y_train)
    print("Best: %f using %s" % (tuner.best_score_, tuner.best_params_))
    return tuner.best_estimator_

def naive_bayes(X_train, y_train):
    """Fit a Gaussian naive Bayes classifier and return it.

    No grid search is run for this model; it is fitted once with its
    default settings.
    """
    classifier = GaussianNB()
    classifier.fit(X_train, y_train)
    return classifier


def random_forest(X_train, y_train):
    """Tune a random forest classifier with a 3-fold grid search.

    The search covers the number of trees and the maximum tree depth. The
    best score and its parameters are printed, and the best estimator found
    is returned.
    """
    search_space = dict(
        n_estimators=[10, 100, 200],
        max_depth=[10, 50],
    )

    tuner = GridSearchCV(
        estimator=RandomForestClassifier(),
        param_grid=search_space,
        cv=3,
        n_jobs=-1,
    )
    # fit() returns the search object itself, so `tuner` carries the results
    tuner.fit(X_train, y_train)
    print("Best: %f using %s" % (tuner.best_score_, tuner.best_params_))
    return tuner.best_estimator_


def binary_neural_network(X_train, y_train,net_type):
  """Tune a Keras binary classifier with a 3-fold grid search.

  Parameters
  ----------
  X_train : 2-D feature table; ``X_train.shape[1]`` sets the input width.
  y_train : binary training labels.
  net_type : ``'snn'`` for the shallow network (64-32-1) or ``'dnn'`` for the
      deeper network with dropout (128-64-32-1).

  Returns
  -------
  The best scikeras ``KerasClassifier`` found over batch size and epoch count.

  Raises
  ------
  ValueError
      If ``net_type`` is neither ``'snn'`` nor ``'dnn'``. Previously this
      case silently returned ``None``.
  """
  if net_type not in ('snn', 'dnn'):
    raise ValueError("net_type must be 'snn' or 'dnn', got %r" % (net_type,))

  n_features = X_train.shape[1]

  def create_model():
    # Both variants end in the same 32-unit layer and single sigmoid output;
    # only the front of the network differs.
    model = keras.Sequential()
    if net_type == 'snn':
      model.add(keras.layers.Dense(64, activation='relu', input_shape=(n_features,)))
    else:
      model.add(keras.layers.Dense(128, activation='relu', input_shape=(n_features,)))
      model.add(keras.layers.Dropout(0.2))
      model.add(keras.layers.Dense(64, activation='relu'))
      model.add(keras.layers.Dropout(0.2))
    model.add(keras.layers.Dense(32, activation='relu'))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

  # The search itself is identical for both architectures
  model = KerasClassifier(model=create_model, verbose=0)
  param_grid = {'batch_size': [32, 64, 128],
                'epochs': [10, 30, 50, 100]}
  grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
  grid_result = grid_search.fit(X_train, y_train)
  print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
  return grid_search.best_estimator_


def multiclass_neural_network(X_train, y_train,net_type):
  """Tune a Keras multi-class classifier with a 3-fold grid search.

  Parameters
  ----------
  X_train : 2-D feature table; ``X_train.shape[1]`` sets the input width.
  y_train : integer class labels; they are one-hot encoded here.
  net_type : ``'snn'`` for the shallow network or ``'dnn'`` for the deeper
      network with dropout.

  Returns
  -------
  The best scikeras ``KerasClassifier`` found over batch size and epoch count.

  Raises
  ------
  ValueError
      If ``net_type`` is neither ``'snn'`` nor ``'dnn'``. Previously this
      case silently returned ``None``.
  """
  if net_type not in ('snn', 'dnn'):
    raise ValueError("net_type must be 'snn' or 'dnn', got %r" % (net_type,))

  # The output width used to be hard-coded to 3. It is still never smaller
  # than 3, so existing 3-class callers see no change, but it now grows when
  # the labels contain a larger class index.
  n_classes = max(3, int(np.max(y_train)) + 1)
  y_train = to_categorical(y_train, n_classes)
  n_features = X_train.shape[1]

  def create_model():
    # Both variants end in the same 32-unit layer and softmax output;
    # only the front of the network differs.
    model = keras.Sequential()
    if net_type == 'snn':
      model.add(keras.layers.Dense(64, activation='relu', input_shape=(n_features,)))
    else:
      model.add(keras.layers.Dense(128, activation='relu', input_shape=(n_features,)))
      model.add(keras.layers.Dropout(0.2))
      model.add(keras.layers.Dense(64, activation='relu'))
      model.add(keras.layers.Dropout(0.2))
    model.add(keras.layers.Dense(32, activation='relu'))
    model.add(keras.layers.Dense(n_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

  # The search itself is identical for both architectures
  model = KerasClassifier(model=create_model, verbose=0)
  param_grid = {'batch_size': [32, 64, 128],
                'epochs': [10, 30, 50, 100]}
  grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
  grid_result = grid_search.fit(X_train, y_train)
  print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
  return grid_search.best_estimator_

# Evaluation metrics
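- Unweighted metrics: accuracy, sensitivity, specificity, ppv, npv, roc_auc
- Difficulty-weighted metrics (prefix `d_`, computed from confusion-matrix counts weighted by each case's difficulty): d_accuracy, d_sensitivity, d_specificity, d_ppv, d_npv, d_roc_auc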

In [None]:
def binary_metrics(TP, FP, TN, FN):
    """Derive the standard binary classification ratios from confusion counts.

    Parameters
    ----------
    TP, FP, TN, FN : numbers (plain or weighted counts) of true positives,
        false positives, true negatives and false negatives.

    Returns
    -------
    tuple
        ``(accuracy, sensitivity, specificity, ppv, npv)``. Any ratio whose
        denominator is not positive is reported as ``0.0``.
    """
    total = TP + FP + TN + FN
    # Accuracy was the only ratio without a zero-denominator guard, so
    # all-zero counts (e.g. empty input) raised ZeroDivisionError.
    accuracy = (TP + TN) / total if total > 0 else 0.0
    sensitivity = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    specificity = TN / (TN + FP) if (TN + FP) > 0 else 0.0
    ppv = TP / (TP + FP) if (TP + FP) > 0 else 0.0
    npv = TN / (TN + FN) if (TN + FN) > 0 else 0.0
    return accuracy, sensitivity, specificity, ppv, npv

def binary_confusion_matrix(true_labels, y_pred, class_weights):
    """Count plain and weighted confusion-matrix cells for the positive class 1.

    Any label or prediction other than 1 is treated as negative. In the
    weighted counts, correct predictions (TP, TN) contribute ``w`` and wrong
    predictions (FP, FN) contribute ``1 - w``, where ``w`` is the case's
    entry in ``class_weights``.

    Returns ``(TP, FP, TN, FN, d_TP, d_FP, d_TN, d_FN)``. When the weights
    sum to zero, a notice is printed and the plain counts are returned in the
    weighted slots as well.
    """
    # Plain counts
    tp = fp = tn = fn = 0
    for actual, predicted in zip(true_labels, y_pred):
        tp = tp + ((actual == 1) and (predicted == 1))
        fp = fp + ((actual != 1) and (predicted == 1))
        tn = tn + ((actual != 1) and (predicted != 1))
        fn = fn + ((actual == 1) and (predicted != 1))

    # Weighted counts; every case adds a term (possibly zero) to each cell
    weighted_tp = weighted_fp = weighted_tn = weighted_fn = 0
    for actual, predicted, weight in zip(true_labels, y_pred, class_weights):
        weighted_tp = weighted_tp + weight * ((actual == 1) and (predicted == 1))
        weighted_fp = weighted_fp + (1-weight) * ((actual != 1) and (predicted == 1))
        weighted_tn = weighted_tn + weight * ((actual != 1) and (predicted != 1))
        weighted_fn = weighted_fn + (1-weight) * ((actual == 1) and (predicted != 1))

    if sum(class_weights) != 0:
        return tp, fp, tn, fn, weighted_tp, weighted_fp, weighted_tn, weighted_fn

    # All weights are zero: fall back to the plain counts
    print("Sum of class weights is 0")
    return tp, fp, tn, fn, tp, fp, tn, fn


def binary_evaluation(true_labels, predicted_labels, class_weights):
    """Score one classifier's probability output, plain and difficulty-weighted.

    Parameters
    ----------
    true_labels : iterable of ground-truth labels; 1 is the positive class.
    predicted_labels : 2-D array-like of class probabilities; column 1 is
        used as the positive-class score.
    class_weights : iterable of per-case weights, passed through to
        ``binary_confusion_matrix``.

    Returns
    -------
    list
        ``[accuracy, sensitivity, specificity, ppv, npv, roc_auc,
        d_accuracy, d_sensitivity, d_specificity, d_ppv, d_npv, d_roc_auc]``.
        The first five of each group are taken at a 0.5 decision threshold.
    """
    def _rate(numerator, denominator):
        # An exactly-zero denominator means the rate is undefined; report 0.0
        # instead of raising or emitting NaN, which would break metrics.auc.
        return numerator / denominator if denominator != 0 else 0.0

    predicted_labels =  np.array(predicted_labels)[:,1]
    y_pred_binary = np.where(predicted_labels >= 0.5, 1, 0)
    TP, FP, TN, FN, d_TP, d_FP, d_TN, d_FN = binary_confusion_matrix(true_labels,y_pred_binary,class_weights)
    accuracy, sensitivity, specificity, ppv, npv = binary_metrics(TP, FP, TN, FN)
    d_accuracy, d_sensitivity, d_specificity, d_ppv, d_npv = binary_metrics(d_TP, d_FP, d_TN, d_FN)

    # AUC: sweep the decision threshold and integrate TPR over FPR.
    # The extra +inf threshold predicts everything negative, which closes the
    # curve at (0, 0). Without it, scores of exactly 1.0 still pass the final
    # `>= 1.0` test, the curve stops short and the area is under-estimated.
    thresholds = np.append(np.linspace(0, 1, 100), np.inf)
    tpr_list = []
    fpr_list = []
    d_tpr_list = []
    d_fpr_list = []
    for threshold in thresholds:    # Calculate TPR and FPR for each threshold
        y_pred_binary = np.where(predicted_labels >= threshold, 1, 0)
        TP, FP, TN, FN, d_TP, d_FP, d_TN, d_FN = binary_confusion_matrix(true_labels,y_pred_binary,class_weights)

        tpr_list.append(_rate(TP, TP + FN))
        fpr_list.append(_rate(FP, FP + TN))
        d_tpr_list.append(_rate(d_TP, d_TP + d_FN))
        d_fpr_list.append(_rate(d_FP, d_FP + d_TN))

    roc_auc = metrics.auc(fpr_list,tpr_list)
    d_roc_auc = metrics.auc(d_fpr_list,d_tpr_list)

    return [accuracy, sensitivity, specificity, ppv, npv, roc_auc , d_accuracy, d_sensitivity, d_specificity, d_ppv, d_npv, d_roc_auc]

# Evaluation

## CDmc

In [None]:
# Load the CDmc (App1) Telco table; the first spreadsheet column is used as the row index.
app1_telco = pd.read_excel(
    io='../Evaluation/real_world/app1_telco_one_hot.xlsx',
    index_col=[0],
)
app1_telco

Unnamed: 0_level_0,tenure,TotalCharges,MonthlyCharges,gender_0,gender_1,SeniorCitizen_0,SeniorCitizen_1,Partner_0,Partner_1,Dependents_0,...,Contract_1,Contract_2,PaperlessBilling_0,PaperlessBilling_1,PaymentMethod_0,PaymentMethod_1,PaymentMethod_2,PaymentMethod_3,y,difficulty
case_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,29.85,29.85,1,0,1,0,0,1,1,...,0,0,0,1,0,0,1,0,0,1.000000
1,34,56.95,1889.50,0,1,1,0,1,0,1,...,1,0,1,0,0,0,0,1,0,0.014286
2,2,53.85,108.15,0,1,1,0,1,0,1,...,0,0,0,1,0,0,0,1,1,1.000000
3,45,42.30,1840.75,0,1,1,0,1,0,1,...,1,0,1,0,1,0,0,0,0,0.014286
4,2,70.70,151.65,1,0,1,0,1,0,1,...,0,0,0,1,0,0,1,0,1,0.014286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,24,84.80,1990.50,0,1,1,0,0,1,0,...,1,0,0,1,0,0,0,1,0,0.014286
7039,72,103.20,7362.90,1,0,1,0,0,1,0,...,1,0,0,1,0,1,0,0,0,0.014286
7040,11,29.60,346.45,1,0,1,0,0,1,0,...,0,0,0,1,0,0,1,0,0,0.014286
7041,4,74.40,306.60,0,1,0,1,0,1,1,...,0,0,0,1,0,0,0,1,1,0.028571


In [None]:
# CDmc (App1): split the Telco data, tune every classifier, score each one with
# and without difficulty weighting, then export the summary and per-case predictions.

# .copy() gives an independent frame, so adding 'difficulty' does not write into a slice of app1_telco
X = app1_telco.iloc[:,:-2].copy()
X['difficulty'] = app1_telco['difficulty']
y = app1_telco['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# 'difficulty' rides through the split only to stay aligned with the test rows;
# it is stripped again right away so it is never used as a model feature
X_test_difficulty = X_test['difficulty']
X_train = X_train.iloc[:,:-1].copy()
X_test = X_test.iloc[:,:-1].copy()

# The scaler is fitted on the training rows only and then applied to the test rows
numeric_columns = ["tenure", "TotalCharges", "MonthlyCharges"]
X_train[numeric_columns]  =  scaler.fit_transform(X_train[numeric_columns])
X_test[numeric_columns]  =  scaler.transform(X_test[numeric_columns])

best_knn_model = knn(X_train,y_train)
best_lr_model = logistic_regression(X_train,y_train)
best_svm_model = svm(X_train, y_train)
nb_model = naive_bayes(X_train, y_train)
best_rf_model = random_forest(X_train, y_train)

if len(np.unique(y_train)) == 2:
  print('binary')
  best_snn_model = binary_neural_network(X_train, y_train,'snn')
  best_dnn_model = binary_neural_network(X_train, y_train,'dnn')
elif len(np.unique(y_train)) > 2:
  print('multi')
  best_snn_model = multiclass_neural_network(X_train, y_train,'snn')
  best_dnn_model = multiclass_neural_network(X_train, y_train,'dnn')

# Every model was fitted on a DataFrame, so all of them are given the DataFrame here
# (KNN used to receive X_test.values, unlike the other models)
y_val_pred_knn = best_knn_model.predict_proba(X_test)
y_val_pred_lr = best_lr_model.predict_proba(X_test)
y_val_pred_svm = best_svm_model.predict_proba(X_test)
y_val_pred_nb = nb_model.predict_proba(X_test)
y_val_pred_rf = best_rf_model.predict_proba(X_test)
y_val_pred_snn = best_snn_model.predict_proba(X_test)
y_val_pred_dnn = best_dnn_model.predict_proba(X_test)

y_type = type_of_target(y_test)
if y_type != 'binary':
  # `result` is only built for binary targets. Stop here rather than hit a
  # NameError below or export a stale `result` left over from another cell.
  raise ValueError("Only binary targets can be evaluated here, got %r" % (y_type,))

true_labels = y_test
weights = X_test_difficulty

print("KNN")
knn_result =  binary_evaluation(true_labels, y_val_pred_knn,weights)

print("Logistic Regression")
lr_result =   binary_evaluation(true_labels, y_val_pred_lr,weights)

print("SVM")
svm_result =   binary_evaluation(true_labels, y_val_pred_svm,weights)

print("Naive Bayes")
nb_result =  binary_evaluation(true_labels, y_val_pred_nb,weights)

print("Random Forest")
rf_result =  binary_evaluation(true_labels, y_val_pred_rf,weights)

print("Simple Neural Network")
snn_result =  binary_evaluation(true_labels, y_val_pred_snn,weights)

print("Deep Neural Network")
dnn_result =  binary_evaluation(true_labels, y_val_pred_dnn,weights)

result_list = [knn_result, lr_result, svm_result, nb_result, rf_result,snn_result,dnn_result]
result = pd.DataFrame(result_list, index=['knn','lr','svm','nb','rf','snn','dnn'],
                      columns=['accuracy', 'sensitivity', 'specificity', 'ppv', 'npv', 'roc_auc',
                               'd_accuracy', 'd_sensitivity', 'd_specificity', 'd_ppv', 'd_npv', 'd_roc_auc'])
print(result)

sheet_name = 'App1'

predictions = pd.DataFrame({'label': y_test,'difficulty':X_test_difficulty,
         'knn': np.argmax(y_val_pred_knn, axis=1),'lr': np.argmax(y_val_pred_lr, axis=1),'svm': np.argmax(y_val_pred_svm, axis=1),
         'nb': np.argmax(y_val_pred_nb, axis=1),'rf': np.argmax(y_val_pred_rf, axis=1),'snn':np.argmax(y_val_pred_snn, axis=1),
                          'dnn':np.argmax(y_val_pred_dnn, axis=1)})

new_df = pd.concat([X_test, predictions], axis=1)

# The writers are opened only once there is something to write, and as context
# managers, so a failed training run neither touches the previous output files
# nor leaves a workbook handle open.
with pd.ExcelWriter('../Evaluation/outcome/'+
                    'Telco_evaluation_App1.xlsx',engine='openpyxl') as telco_evaluation:
  result.to_excel(telco_evaluation, sheet_name=sheet_name)

with pd.ExcelWriter('../Evaluation/outcome/'+
                    'Telco_evaluation_App1_detailed.xlsx',engine='openpyxl') as telco_detail:
  new_df.to_excel(telco_detail, sheet_name=sheet_name)

Best: 0.782960 using {'metric': 'manhattan', 'n_neighbors': 10, 'weights': 'uniform'}
Best: 0.802838 using {'C': 0.01, 'penalty': 'l2'}
Best: 0.800201 using {'C': 0.01, 'kernel': 'linear'}
Best: 0.794116 using {'max_depth': 10, 'n_estimators': 200}
binary
Best: 0.803244 using {'batch_size': 64, 'epochs': 10}
Best: 0.801418 using {'batch_size': 64, 'epochs': 10}




KNN
Logistic Regression
SVM
Naive Bayes
Random Forest
Simple Neural Network
Deep Neural Network
     accuracy  sensitivity  specificity       ppv       npv   roc_auc  \
knn  0.772362     0.639373     0.821962  0.572543  0.859375  0.822189   
lr   0.810696     0.527875     0.916179  0.701389  0.838786  0.856183   
svm  0.807856     0.541812     0.907083  0.685022  0.841471  0.851521   
nb   0.699953     0.869338     0.636777  0.471645  0.928910  0.784951   
rf   0.806910     0.517422     0.914880  0.693925  0.835608  0.854964   
snn  0.806910     0.550523     0.902534  0.678112  0.843352  0.855962   
dnn  0.804543     0.562718     0.894737  0.665979  0.845823  0.853414   

     d_accuracy  d_sensitivity  d_specificity     d_ppv     d_npv  d_roc_auc  
knn    0.442045       0.786561       0.222353  0.392094  0.620303   0.486762  
lr     0.736575       0.529007       0.875121  0.738734  0.735707   0.809630  
svm    0.663972       0.583799       0.742896  0.690909  0.644533   0.731828  
nb 

## CDdm

In [None]:
# Load the CDdm (App2) Telco table with its per-case difficulty column.
app2_telco = pd.read_excel(
    io='../Evaluation/real_world/app2_telco_case_difficulty.xlsx',
)
app2_telco

Unnamed: 0,tenure,TotalCharges,MonthlyCharges,gender_0,gender_1,SeniorCitizen_0,SeniorCitizen_1,Partner_0,Partner_1,Dependents_0,...,Contract_1,Contract_2,PaperlessBilling_0,PaperlessBilling_1,PaymentMethod_0,PaymentMethod_1,PaymentMethod_2,PaymentMethod_3,label,difficulty
0,1,29.85,29.85,1,0,1,0,0,1,1,...,0,0,0,1,0,0,1,0,0,0.580291
1,10,29.75,301.90,1,0,1,0,1,0,1,...,0,0,1,0,0,0,0,1,0,0.009262
2,12,19.80,202.25,0,1,1,0,0,1,1,...,1,0,1,0,1,0,0,0,0,0.029072
3,49,59.60,2970.30,0,1,1,0,0,1,0,...,0,0,1,0,0,1,0,0,0,0.580291
4,30,55.30,1530.60,1,0,1,0,1,0,1,...,0,0,0,1,1,0,0,0,0,0.027438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,68,64.10,4326.25,1,0,1,0,0,1,1,...,0,1,1,0,1,0,0,0,0,0.063983
7039,2,20.05,39.25,1,0,1,0,1,0,1,...,0,0,0,1,0,0,0,1,0,0.452563
7040,1,75.75,75.75,0,1,0,1,1,0,1,...,0,0,0,1,0,0,1,0,1,0.073197
7041,38,69.50,2625.25,0,1,1,0,1,0,1,...,0,0,0,1,0,1,0,0,0,0.133840


In [None]:
# CDdm (App2): split the Telco data, tune every classifier, score each one with
# and without difficulty weighting, then export the summary and per-case predictions.

# .copy() gives an independent frame, so adding 'difficulty' does not write into a slice of app2_telco
X = app2_telco.iloc[:,:-2].copy()
X['difficulty'] = app2_telco['difficulty']
y = app2_telco['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# 'difficulty' rides through the split only to stay aligned with the test rows;
# it is stripped again right away so it is never used as a model feature
X_test_difficulty = X_test['difficulty']
X_train = X_train.iloc[:,:-1].copy()
X_test = X_test.iloc[:,:-1].copy()

# The scaler is fitted on the training rows only and then applied to the test rows
numeric_columns = ["tenure", "TotalCharges", "MonthlyCharges"]
X_train[numeric_columns]  =  scaler.fit_transform(X_train[numeric_columns])
X_test[numeric_columns]  =  scaler.transform(X_test[numeric_columns])

best_knn_model = knn(X_train,y_train)
best_lr_model = logistic_regression(X_train,y_train)
best_svm_model = svm(X_train, y_train)
nb_model = naive_bayes(X_train, y_train)
best_rf_model = random_forest(X_train, y_train)

if len(np.unique(y_train)) == 2:
  print('binary')
  best_snn_model = binary_neural_network(X_train, y_train,'snn')
  best_dnn_model = binary_neural_network(X_train, y_train,'dnn')
elif len(np.unique(y_train)) > 2:
  print('multi')
  best_snn_model = multiclass_neural_network(X_train, y_train,'snn')
  best_dnn_model = multiclass_neural_network(X_train, y_train,'dnn')

y_val_pred_knn = best_knn_model.predict_proba(X_test)
y_val_pred_lr = best_lr_model.predict_proba(X_test)
y_val_pred_svm = best_svm_model.predict_proba(X_test)
y_val_pred_nb = nb_model.predict_proba(X_test)
y_val_pred_rf = best_rf_model.predict_proba(X_test)
y_val_pred_snn = best_snn_model.predict_proba(X_test)
y_val_pred_dnn = best_dnn_model.predict_proba(X_test)

y_type = type_of_target(y_test)
if y_type != 'binary':
  # `result` is only built for binary targets. Stop here rather than hit a
  # NameError below or export a stale `result` left over from another cell.
  raise ValueError("Only binary targets can be evaluated here, got %r" % (y_type,))

true_labels = y_test
weights = X_test_difficulty

print("KNN")
knn_result =  binary_evaluation(true_labels, y_val_pred_knn,weights)

print("Logistic Regression")
lr_result =   binary_evaluation(true_labels, y_val_pred_lr,weights)

print("SVM")
svm_result =   binary_evaluation(true_labels, y_val_pred_svm,weights)

print("Naive Bayes")
nb_result =  binary_evaluation(true_labels, y_val_pred_nb,weights)

print("Random Forest")
rf_result =  binary_evaluation(true_labels, y_val_pred_rf,weights)

print("Simple Neural Network")
snn_result =  binary_evaluation(true_labels, y_val_pred_snn,weights)

print("Deep Neural Network")
dnn_result =  binary_evaluation(true_labels, y_val_pred_dnn,weights)

result_list = [knn_result, lr_result, svm_result, nb_result, rf_result,snn_result,dnn_result]
result = pd.DataFrame(result_list, index=['knn','lr','svm','nb','rf','snn','dnn'],
                      columns=['accuracy', 'sensitivity', 'specificity', 'ppv', 'npv', 'roc_auc',
                               'd_accuracy', 'd_sensitivity', 'd_specificity', 'd_ppv', 'd_npv', 'd_roc_auc'])
print(result)

sheet_name = 'App2'

predictions = pd.DataFrame({'label': y_test,'difficulty':X_test_difficulty,
         'knn': np.argmax(y_val_pred_knn, axis=1),'lr': np.argmax(y_val_pred_lr, axis=1),'svm': np.argmax(y_val_pred_svm, axis=1),
         'nb': np.argmax(y_val_pred_nb, axis=1),'rf': np.argmax(y_val_pred_rf, axis=1),'snn':np.argmax(y_val_pred_snn, axis=1),
                          'dnn':np.argmax(y_val_pred_dnn, axis=1)})
new_df = pd.concat([X_test, predictions], axis=1)

# The writers are opened only once there is something to write, and as context
# managers, so a failed training run neither touches the previous output files
# nor leaves a workbook handle open.
with pd.ExcelWriter('../Evaluation/outcome/'+
                    'Telco_evaluation_App2.xlsx',engine='openpyxl') as telco_evaluation:
  result.to_excel(telco_evaluation, sheet_name=sheet_name)

with pd.ExcelWriter('../Evaluation/outcome/'+
                    'Telco_evaluation_App2_detailed.xlsx',engine='openpyxl') as telco_detail:
  new_df.to_excel(telco_detail, sheet_name=sheet_name)

Best: 0.775254 using {'metric': 'manhattan', 'n_neighbors': 10, 'weights': 'uniform'}
Best: 0.801624 using {'C': 0.01, 'penalty': 'l2'}
Best: 0.796553 using {'C': 0.01, 'kernel': 'linear'}
Best: 0.795741 using {'max_depth': 10, 'n_estimators': 200}
binary
Best: 0.800409 using {'batch_size': 64, 'epochs': 10}
Best: 0.795742 using {'batch_size': 64, 'epochs': 10}
KNN
Logistic Regression
SVM
Naive Bayes
Random Forest
Simple Neural Network
Deep Neural Network
     accuracy  sensitivity  specificity       ppv       npv   roc_auc  \
knn  0.769522     0.631679     0.814978  0.529600  0.870296  0.805751   
lr   0.809749     0.517176     0.906230  0.645238  0.850561  0.838728   
svm  0.809276     0.522901     0.903713  0.641686  0.851720  0.832868   
nb   0.678183     0.856870     0.619257  0.425996  0.929178  0.766958   
rf   0.812115     0.524809     0.906860  0.650118  0.852663  0.844151   
snn  0.804070     0.572519     0.880428  0.612245  0.861984  0.840710   
dnn  0.806910     0.520992   

## CDpu

In [None]:
# Load the CDpu (App3) Telco table; the first spreadsheet column is used as the row index.
app3_telco = pd.read_excel(
    io='../Evaluation/real_world/app3_telco_one_hot.xlsx',
    index_col=[0],
)
app3_telco

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x39,x40,x41,x42,x43,x44,x45,x46,label,difficulty
0,1,29.85,29.85,1,0,1,0,0,1,1,...,0,0,0,1,0,0,1,0,0,0.478178
1,34,56.95,1889.50,0,1,1,0,1,0,1,...,1,0,1,0,0,0,0,1,0,0.109456
2,2,53.85,108.15,0,1,1,0,1,0,1,...,0,0,0,1,0,0,0,1,1,0.496709
3,45,42.30,1840.75,0,1,1,0,1,0,1,...,1,0,1,0,1,0,0,0,0,0.178446
4,2,70.70,151.65,1,0,1,0,1,0,1,...,0,0,0,1,0,0,1,0,1,0.290220
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,24,84.80,1990.50,0,1,1,0,0,1,0,...,1,0,0,1,0,0,0,1,0,0.156939
7039,72,103.20,7362.90,1,0,1,0,0,1,0,...,1,0,0,1,0,1,0,0,0,0.198271
7040,11,29.60,346.45,1,0,1,0,0,1,0,...,0,0,0,1,0,0,1,0,0,0.375517
7041,4,74.40,306.60,0,1,0,1,0,1,1,...,0,0,0,1,0,0,0,1,1,0.271145


In [None]:
# CDpu (App3): split the Telco data, tune every classifier, score each one with
# and without difficulty weighting, then export the summary and per-case predictions.

# .copy() gives an independent frame, so adding 'difficulty' does not write into a slice of app3_telco
X = app3_telco.iloc[:,:-2].copy()
X['difficulty'] = app3_telco['difficulty']
y = app3_telco['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# 'difficulty' rides through the split only to stay aligned with the test rows;
# it is stripped again right away so it is never used as a model feature
X_test_difficulty = X_test['difficulty']
X_train = X_train.iloc[:,:-1].copy()
X_test = X_test.iloc[:,:-1].copy()

# The scaler is fitted on the training rows only and then applied to the test rows
numeric_columns = ["x1", "x2", "x3"]
X_train[numeric_columns]  =  scaler.fit_transform(X_train[numeric_columns])
X_test[numeric_columns]  =  scaler.transform(X_test[numeric_columns])

best_knn_model = knn(X_train,y_train)
best_lr_model = logistic_regression(X_train,y_train)
best_svm_model = svm(X_train, y_train)
nb_model = naive_bayes(X_train, y_train)
best_rf_model = random_forest(X_train, y_train)

if len(np.unique(y_train)) == 2:
  print('binary')
  best_snn_model = binary_neural_network(X_train, y_train,'snn')
  best_dnn_model = binary_neural_network(X_train, y_train,'dnn')
elif len(np.unique(y_train)) > 2:
  print('multi')
  best_snn_model = multiclass_neural_network(X_train, y_train,'snn')
  best_dnn_model = multiclass_neural_network(X_train, y_train,'dnn')

y_val_pred_knn = best_knn_model.predict_proba(X_test)
y_val_pred_lr = best_lr_model.predict_proba(X_test)
y_val_pred_svm = best_svm_model.predict_proba(X_test)
y_val_pred_nb = nb_model.predict_proba(X_test)
y_val_pred_rf = best_rf_model.predict_proba(X_test)
y_val_pred_snn = best_snn_model.predict_proba(X_test)
y_val_pred_dnn = best_dnn_model.predict_proba(X_test)

y_type = type_of_target(y_test)
if y_type != 'binary':
  # `result` is only built for binary targets. Stop here rather than hit a
  # NameError below or export a stale `result` left over from another cell.
  raise ValueError("Only binary targets can be evaluated here, got %r" % (y_type,))

true_labels = y_test
weights = X_test_difficulty

print("KNN")
knn_result =  binary_evaluation(true_labels, y_val_pred_knn,weights)

print("Logistic Regression")
lr_result =   binary_evaluation(true_labels, y_val_pred_lr,weights)

print("SVM")
svm_result =   binary_evaluation(true_labels, y_val_pred_svm,weights)

print("Naive Bayes")
nb_result =  binary_evaluation(true_labels, y_val_pred_nb,weights)

print("Random Forest")
rf_result =  binary_evaluation(true_labels, y_val_pred_rf,weights)

print("Simple Neural Network")
snn_result =  binary_evaluation(true_labels, y_val_pred_snn,weights)

print("Deep Neural Network")
dnn_result =  binary_evaluation(true_labels, y_val_pred_dnn,weights)

result_list = [knn_result, lr_result, svm_result, nb_result, rf_result,snn_result,dnn_result]
result = pd.DataFrame(result_list, index=['knn','lr','svm','nb','rf','snn','dnn'],
                      columns=['accuracy', 'sensitivity', 'specificity', 'ppv', 'npv', 'roc_auc',
                               'd_accuracy', 'd_sensitivity', 'd_specificity', 'd_ppv', 'd_npv', 'd_roc_auc'])
print(result)

sheet_name = 'App3'

predictions = pd.DataFrame({'label': y_test,'difficulty':X_test_difficulty,
         'knn': np.argmax(y_val_pred_knn, axis=1),'lr': np.argmax(y_val_pred_lr, axis=1),'svm': np.argmax(y_val_pred_svm, axis=1),
         'nb': np.argmax(y_val_pred_nb, axis=1),'rf': np.argmax(y_val_pred_rf, axis=1),'snn':np.argmax(y_val_pred_snn, axis=1),
                          'dnn':np.argmax(y_val_pred_dnn, axis=1)})
new_df = pd.concat([X_test, predictions], axis=1)

# The writers are opened only once there is something to write, and as context
# managers, so a failed training run neither touches the previous output files
# nor leaves a workbook handle open.
with pd.ExcelWriter('../Evaluation/outcome/'+
                    'Telco_evaluation_App3.xlsx',engine='openpyxl') as telco_evaluation:
  result.to_excel(telco_evaluation, sheet_name=sheet_name)

with pd.ExcelWriter('../Evaluation/outcome/'+
                    'Telco_evaluation_App3_detailed.xlsx',engine='openpyxl') as telco_detail:
  new_df.to_excel(telco_detail, sheet_name=sheet_name)

Best: 0.782960 using {'metric': 'manhattan', 'n_neighbors': 10, 'weights': 'uniform'}
Best: 0.802838 using {'C': 0.01, 'penalty': 'l2'}
Best: 0.800201 using {'C': 0.01, 'kernel': 'linear'}
Best: 0.793913 using {'max_depth': 10, 'n_estimators': 200}
binary
Best: 0.803852 using {'batch_size': 128, 'epochs': 10}
Best: 0.802230 using {'batch_size': 128, 'epochs': 10}
KNN
Logistic Regression
SVM
Naive Bayes
Random Forest
Simple Neural Network
Deep Neural Network
     accuracy  sensitivity  specificity       ppv       npv   roc_auc  \
knn  0.772362     0.639373     0.821962  0.572543  0.859375  0.822189   
lr   0.810696     0.527875     0.916179  0.701389  0.838786  0.856183   
svm  0.807856     0.541812     0.907083  0.685022  0.841471  0.851554   
nb   0.699953     0.869338     0.636777  0.471645  0.928910  0.784951   
rf   0.800284     0.505226     0.910331  0.677570  0.831454  0.855390   
snn  0.804070     0.515679     0.911631  0.685185  0.834622  0.856505   
dnn  0.804070     0.529617 