In [None]:
pip install scikeras

Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
# Configure TensorFlow through the environment before it is imported below:
#   CUDA_VISIBLE_DEVICES = -1 -> expose no CUDA device, so everything runs on CPU
#   TF_CPP_MIN_LOG_LEVEL = 2  -> hide INFO and WARNING messages from the C++ backend
os.environ.update({
    "CUDA_VISIBLE_DEVICES": "-1",
    "TF_CPP_MIN_LOG_LEVEL": "2",
})

In [None]:
from glob import glob
from os.path import join, getctime
import pandas as pd
import numpy as np
from string import ascii_lowercase
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.utils.multiclass import type_of_target
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

import tensorflow as tf
from tensorflow import keras
from scikeras.wrappers import KerasClassifier
from keras.utils import to_categorical

# Machine Learning Models

In [None]:
def knn(X_train, y_train):
    """Tune a k-nearest-neighbours classifier and return the best one.

    Runs a 3-fold cross-validated grid search over the number of neighbours,
    the vote weighting and the distance metric, prints the best score and
    parameter combination, and returns the winning estimator.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        The ``best_estimator_`` selected by the grid search.
    """
    search_space = {
        'n_neighbors': [3, 5, 7, 10],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan', 'minkowski'],
    }

    # n_jobs=-1 lets the search use every available core.
    tuner = GridSearchCV(KNeighborsClassifier(), search_space, cv=3, n_jobs=-1)
    tuner.fit(X_train, y_train)

    print("Best: %f using %s" % (tuner.best_score_, tuner.best_params_))
    return tuner.best_estimator_

def logistic_regression(X_train, y_train):
    """Tune an L2-penalised logistic regression and return the best one.

    Runs a 3-fold cross-validated grid search over the inverse regularisation
    strength ``C``, prints the best score and parameters, and returns the
    winning estimator.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        The ``best_estimator_`` selected by the grid search.
    """
    search_space = {
        'penalty': ['l2'],
        'C': [0.01, 0.1, 1.0, 10],
    }

    tuner = GridSearchCV(LogisticRegression(), search_space, cv=3, n_jobs=-1)
    tuner.fit(X_train, y_train)

    print("Best: %f using %s" % (tuner.best_score_, tuner.best_params_))
    return tuner.best_estimator_

def svm(X_train, y_train):
    """Tune a support-vector classifier and return the best one.

    Runs a 3-fold cross-validated grid search over ``C`` and the kernel type,
    prints the best score and parameters, and returns the winning estimator.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        The ``best_estimator_`` selected by the grid search.
    """
    search_space = {
        'C': [0.01, 0.1, 1.0, 10],
        'kernel': ['linear', 'rbf', 'sigmoid', 'poly'],
    }

    # probability=True is needed because the evaluation cells call
    # predict_proba on the returned model.
    base_estimator = SVC(probability=True)
    tuner = GridSearchCV(base_estimator, search_space, cv=3, n_jobs=-1)
    tuner.fit(X_train, y_train)

    print("Best: %f using %s" % (tuner.best_score_, tuner.best_params_))
    return tuner.best_estimator_

def naive_bayes(X_train, y_train):
    """Fit a Gaussian naive Bayes classifier with its default settings.

    No hyperparameter search is performed for this model.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        The fitted ``GaussianNB`` instance.
    """
    # fit() returns the estimator itself, so it can be returned directly.
    return GaussianNB().fit(X_train, y_train)


def random_forest(X_train, y_train):
    """Tune a random-forest classifier and return the best one.

    Runs a 3-fold cross-validated grid search over the number of trees and the
    maximum tree depth, prints the best score and parameters, and returns the
    winning estimator.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        The ``best_estimator_`` selected by the grid search.
    """
    search_space = {
        'n_estimators': [10, 100, 200],
        'max_depth': [None, 10, 50],
    }

    tuner = GridSearchCV(RandomForestClassifier(), search_space, cv=3, n_jobs=-1)
    tuner.fit(X_train, y_train)

    print("Best: %f using %s" % (tuner.best_score_, tuner.best_params_))
    return tuner.best_estimator_


def binary_neural_network(X_train, y_train,net_type):
  """Tune a feed-forward binary classifier and return the best one.

  Two architectures are available:

  * ``'snn'`` - Dense(64) -> Dense(32) -> Dense(1, sigmoid)
  * ``'dnn'`` - Dense(128) -> Dropout(0.2) -> Dense(64) -> Dropout(0.2)
    -> Dense(32) -> Dense(1, sigmoid)

  Hidden layers use ReLU; the model is compiled with Adam and binary
  cross-entropy. Batch size and number of epochs are chosen by a 3-fold
  cross-validated grid search, whose best score and parameters are printed.

  Args:
    X_train: Training feature matrix; ``X_train.shape[1]`` sets the input width.
    y_train: Binary training labels.
    net_type: ``'snn'`` or ``'dnn'``.

  Returns:
    The ``best_estimator_`` selected by the grid search.

  Raises:
    ValueError: If ``net_type`` is not ``'snn'`` or ``'dnn'`` (previously the
      function silently returned ``None`` in that case).
  """
  # Hidden layers per architecture as (units, dropout rate applied after the
  # layer, or None for no dropout).
  architectures = {
      'snn': [(64, None), (32, None)],
      'dnn': [(128, 0.2), (64, 0.2), (32, None)],
  }
  if net_type not in architectures:
    raise ValueError("net_type must be 'snn' or 'dnn', got %r" % (net_type,))

  hidden_layers = architectures[net_type]
  n_features = X_train.shape[1]

  def create_model():
    model = keras.Sequential()
    for position, (units, dropout_rate) in enumerate(hidden_layers):
      if position == 0:
        # Only the first layer declares the input shape.
        model.add(keras.layers.Dense(units, activation='relu', input_shape=(n_features,)))
      else:
        model.add(keras.layers.Dense(units, activation='relu'))
      if dropout_rate is not None:
        model.add(keras.layers.Dropout(dropout_rate))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

  model = KerasClassifier(model=create_model, verbose=0)
  param_grid = {'batch_size': [32, 64, 128],
                'epochs': [10, 30, 50, 100]}
  grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
  grid_result = grid_search.fit(X_train, y_train)
  print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
  return grid_search.best_estimator_


def multiclass_neural_network(X_train, y_train,net_type, num_classes=3):
  """Tune a feed-forward multiclass classifier and return the best one.

  Labels are one-hot encoded with ``to_categorical`` before training. Two
  architectures are available:

  * ``'snn'`` - Dense(64) -> Dense(32) -> Dense(num_classes, softmax)
  * ``'dnn'`` - Dense(128) -> Dropout(0.2) -> Dense(64) -> Dropout(0.2)
    -> Dense(32) -> Dense(num_classes, softmax)

  Hidden layers use ReLU; the model is compiled with Adam and categorical
  cross-entropy. Batch size and number of epochs are chosen by a 3-fold
  cross-validated grid search, whose best score and parameters are printed.

  Args:
    X_train: Training feature matrix; ``X_train.shape[1]`` sets the input width.
    y_train: Integer class labels, passed to ``to_categorical``.
    net_type: ``'snn'`` or ``'dnn'``.
    num_classes: Number of classes, used for the one-hot width and the size of
      the output layer. Defaults to 3, the value that used to be hard-coded.

  Returns:
    The ``best_estimator_`` selected by the grid search.

  Raises:
    ValueError: If ``net_type`` is not ``'snn'`` or ``'dnn'`` (previously the
      function silently returned ``None`` in that case).
  """
  # Hidden layers per architecture as (units, dropout rate applied after the
  # layer, or None for no dropout).
  architectures = {
      'snn': [(64, None), (32, None)],
      'dnn': [(128, 0.2), (64, 0.2), (32, None)],
  }
  # Validate before doing any work so a typo fails immediately.
  if net_type not in architectures:
    raise ValueError("net_type must be 'snn' or 'dnn', got %r" % (net_type,))

  hidden_layers = architectures[net_type]
  n_features = X_train.shape[1]
  y_train = to_categorical(y_train, num_classes)

  def create_model():
    model = keras.Sequential()
    for position, (units, dropout_rate) in enumerate(hidden_layers):
      if position == 0:
        # Only the first layer declares the input shape.
        model.add(keras.layers.Dense(units, activation='relu', input_shape=(n_features,)))
      else:
        model.add(keras.layers.Dense(units, activation='relu'))
      if dropout_rate is not None:
        model.add(keras.layers.Dropout(dropout_rate))
    model.add(keras.layers.Dense(num_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

  model = KerasClassifier(model=create_model, verbose=0)
  param_grid = {'batch_size': [32, 64, 128],
                'epochs': [10, 30, 50, 100]}
  grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
  grid_result = grid_search.fit(X_train, y_train)
  print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
  return grid_search.best_estimator_

# Evaluation metrics
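- Unweighted metrics: accuracy, sensitivity, specificity, ppv, npv, roc_auc
- Difficulty-weighted counterparts (prefix `d_`, computed from the per-case `difficulty` weights): d_accuracy, d_sensitivity, d_specificity, d_ppv, d_npv, d_roc_auc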

In [None]:
def binary_metrics(TP, FP, TN, FN):
    """Derive the standard binary classification metrics from confusion counts.

    The counts may be plain integers or weighted (float) totals. Any metric
    whose denominator is zero is reported as 0.0 instead of raising; this now
    includes accuracy, which previously raised ZeroDivisionError when all four
    counts were zero.

    Args:
        TP: True positives.
        FP: False positives.
        TN: True negatives.
        FN: False negatives.

    Returns:
        Tuple ``(accuracy, sensitivity, specificity, ppv, npv)``.
    """
    total = TP + FP + TN + FN
    accuracy = (TP + TN) / total if total > 0 else 0.0
    sensitivity = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    specificity = TN / (TN + FP) if (TN + FP) > 0 else 0.0
    ppv = TP / (TP + FP) if (TP + FP) > 0 else 0.0
    npv = TN / (TN + FN) if (TN + FN) > 0 else 0.0
    return accuracy, sensitivity, specificity, ppv, npv

def binary_confusion_matrix(true_labels, y_pred, class_weights):
    """Count the confusion-matrix cells, both plainly and weighted per sample.

    Label ``1`` is the positive class; any other value counts as negative.
    In the weighted variant a correct prediction (TP/TN) contributes its
    weight ``w`` and a wrong one (FP/FN) contributes ``1 - w``.

    Args:
        true_labels: Iterable of ground-truth labels.
        y_pred: Iterable of predicted labels, aligned with ``true_labels``.
        class_weights: Iterable of per-sample weights, aligned with the labels.

    Returns:
        Tuple ``(TP, FP, TN, FN, d_TP, d_FP, d_TN, d_FN)``. When the weights
        sum to zero a message is printed and the plain counts are returned in
        place of the weighted ones.
    """
    # One predicate per confusion-matrix cell, keyed by the cell name.
    cell_tests = {
        'TP': lambda actual, guess: (actual == 1) and (guess == 1),
        'FP': lambda actual, guess: (actual != 1) and (guess == 1),
        'TN': lambda actual, guess: (actual != 1) and (guess != 1),
        'FN': lambda actual, guess: (actual == 1) and (guess != 1),
    }

    plain = {
        cell: sum(check(actual, guess) for actual, guess in zip(true_labels, y_pred))
        for cell, check in cell_tests.items()
    }

    weighted = {}
    for cell, check in cell_tests.items():
        if cell in ('TP', 'TN'):
            # Correct predictions are credited with the sample weight itself.
            weighted[cell] = sum(
                weight * check(actual, guess)
                for actual, guess, weight in zip(true_labels, y_pred, class_weights)
            )
        else:
            # Errors are charged with the complement of the sample weight.
            weighted[cell] = sum(
                (1 - weight) * check(actual, guess)
                for actual, guess, weight in zip(true_labels, y_pred, class_weights)
            )

    counts = (plain['TP'], plain['FP'], plain['TN'], plain['FN'])

    if sum(class_weights) == 0:
        # No usable weights: fall back to the plain counts for the d_* slots.
        print("Sum of class weights is 0")
        return counts + counts

    return counts + (weighted['TP'], weighted['FP'], weighted['TN'], weighted['FN'])


def binary_evaluation(true_labels, predicted_labels, class_weights):
    """Evaluate binary class probabilities with plain and weighted metrics.

    Point metrics are computed at a 0.5 decision threshold. The ROC AUC is
    obtained by sweeping 100 evenly spaced thresholds over [0, 1], collecting
    the (FPR, TPR) pairs and integrating them with ``metrics.auc``. Every
    metric is produced twice: from the plain confusion counts and from the
    weighted counts returned by ``binary_confusion_matrix``.

    Args:
        true_labels: Ground-truth labels; ``1`` is the positive class.
        predicted_labels: 2-D array-like of class probabilities; column 1 is
            used as the positive-class score (the callers pass the output of
            ``predict_proba``).
        class_weights: Per-sample weights aligned with ``true_labels``.

    Returns:
        List ``[accuracy, sensitivity, specificity, ppv, npv, roc_auc,
        d_accuracy, d_sensitivity, d_specificity, d_ppv, d_npv, d_roc_auc]``.
    """
    def _rate(numerator, denominator):
        # A zero denominator (e.g. no positive samples, or weighted counts that
        # cancel out) makes the rate undefined; report 0.0, as binary_metrics
        # does, instead of raising ZeroDivisionError or producing NaN.
        return numerator / denominator if denominator > 0 else 0.0

    positive_scores = np.array(predicted_labels)[:, 1]

    # Point metrics at the conventional 0.5 threshold.
    y_pred_binary = np.where(positive_scores >= 0.5, 1, 0)
    TP, FP, TN, FN, d_TP, d_FP, d_TN, d_FN = binary_confusion_matrix(true_labels, y_pred_binary, class_weights)
    accuracy, sensitivity, specificity, ppv, npv = binary_metrics(TP, FP, TN, FN)
    d_accuracy, d_sensitivity, d_specificity, d_ppv, d_npv = binary_metrics(d_TP, d_FP, d_TN, d_FN)

    # ROC curve: one (FPR, TPR) point per threshold, plain and weighted.
    tpr_list = []
    fpr_list = []
    d_tpr_list = []
    d_fpr_list = []
    for threshold in np.linspace(0, 1, 100):
        y_pred_binary = np.where(positive_scores >= threshold, 1, 0)
        TP, FP, TN, FN, d_TP, d_FP, d_TN, d_FN = binary_confusion_matrix(true_labels, y_pred_binary, class_weights)

        tpr_list.append(_rate(TP, TP + FN))
        fpr_list.append(_rate(FP, FP + TN))
        d_tpr_list.append(_rate(d_TP, d_TP + d_FN))
        d_fpr_list.append(_rate(d_FP, d_FP + d_TN))

    roc_auc = metrics.auc(fpr_list, tpr_list)
    d_roc_auc = metrics.auc(d_fpr_list, d_tpr_list)

    return [accuracy, sensitivity, specificity, ppv, npv, roc_auc , d_accuracy, d_sensitivity, d_specificity, d_ppv, d_npv, d_roc_auc]

# Evaluation

## CDmc

In [None]:
# Load the App1 workbook; the first spreadsheet column becomes the DataFrame index.
app1_uci = pd.read_excel('../Evaluation/real_world/app1_UCI_breast_cancer.xlsx',index_col=[0])
# Bare last expression so the notebook renders the loaded frame.
app1_uci

Unnamed: 0_level_0,CT,UCSize,UCShape,MA,SECSize,BN,BC,NN,Mitoses,y,difficulty
case_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,5,1,1,1,2,1.0,3,1,1,0,0.142857
1,5,4,4,5,7,10.0,3,2,1,0,1.000000
2,3,1,1,1,2,2.0,3,1,1,0,0.142857
3,6,8,8,1,3,4.0,3,7,1,0,1.000000
4,4,1,1,3,2,1.0,3,1,1,0,0.142857
...,...,...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2.0,1,1,1,0,0.142857
695,2,1,1,1,2,1.0,1,1,1,0,0.142857
696,5,10,10,3,7,3.0,8,10,2,1,0.142857
697,4,8,6,4,3,4.0,10,6,1,1,0.142857


In [None]:
# --- Output locations for this approach ---
sheet_name = 'App1'
evaluation_path = '../Evaluation/outcome/' + 'UCI_breast_cancer_evaluation_App1.xlsx'
detail_path = '../Evaluation/outcome/' + 'UCI_breast_cancer_evaluation_App1_detailed.xlsx'

# --- Features, target and per-case difficulty ---
# The last two columns of the frame are the target and the difficulty score;
# everything before them is a feature.
X = app1_uci.iloc[:, :-2]
y = app1_uci['y']
difficulty = app1_uci['difficulty']

# Splitting the difficulty together with X and y keeps the rows aligned and
# avoids temporarily writing a 'difficulty' column into a slice of the frame
# (chained assignment). The split itself is the same as before: it depends only
# on the number of rows, test_size and random_state.
X_train, X_test, y_train, y_test, difficulty_train, X_test_difficulty = train_test_split(
    X, y, difficulty, test_size=0.3, random_state=42)

# binary_evaluation only handles binary targets. Fail before the expensive
# training instead of later writing an undefined (or stale, from a previously
# run cell) `result` to the workbook.
y_type = type_of_target(y_test)
if y_type != 'binary':
  raise ValueError("Only binary targets can be evaluated here, got %r" % (y_type,))

# --- Model selection / training ---
best_knn_model = knn(X_train,y_train)
best_lr_model = logistic_regression(X_train,y_train)
best_svm_model = svm(X_train, y_train)
nb_model = naive_bayes(X_train, y_train)
best_rf_model = random_forest(X_train, y_train)

n_train_classes = len(np.unique(y_train))
if n_train_classes == 2:
  print('binary')
  best_snn_model = binary_neural_network(X_train, y_train,'snn')
  best_dnn_model = binary_neural_network(X_train, y_train,'dnn')
elif n_train_classes > 2:
  print('multi')
  best_snn_model = multiclass_neural_network(X_train, y_train,'snn')
  best_dnn_model = multiclass_neural_network(X_train, y_train,'dnn')
else:
  raise ValueError("y_train must contain at least two classes")

# --- Class probabilities on the held-out set ---
y_val_pred_knn = best_knn_model.predict_proba(X_test)
y_val_pred_lr = best_lr_model.predict_proba(X_test)
y_val_pred_svm = best_svm_model.predict_proba(X_test)
y_val_pred_nb = nb_model.predict_proba(X_test)
y_val_pred_rf = best_rf_model.predict_proba(X_test)
y_val_pred_snn = best_snn_model.predict_proba(X_test)
y_val_pred_dnn = best_dnn_model.predict_proba(X_test)

# (column key, printed title, probabilities) for every model, in report order.
evaluated_models = [
    ('knn', "KNN", y_val_pred_knn),
    ('lr', "Logistic Regression", y_val_pred_lr),
    ('svm', "SVM", y_val_pred_svm),
    ('nb', "Naive Bayes", y_val_pred_nb),
    ('rf', "Random Forest", y_val_pred_rf),
    ('snn', "Simple Neural Network", y_val_pred_snn),
    ('dnn', "Deep Neural Network", y_val_pred_dnn),
]

# --- Metrics ---
result_list = []
for model_key, model_title, probabilities in evaluated_models:
  print(model_title)
  result_list.append(binary_evaluation(y_test, probabilities, X_test_difficulty))

result = pd.DataFrame(result_list, index=[model_key for model_key, _title, _proba in evaluated_models],
                      columns=['accuracy', 'sensitivity', 'specificity', 'ppv', 'npv', 'roc_auc',
                               'd_accuracy', 'd_sensitivity', 'd_specificity', 'd_ppv', 'd_npv', 'd_roc_auc'])
print(result)

# --- Per-case predictions ---
# Use the same ">= 0.5 on the positive-class probability" rule as
# binary_evaluation so the detailed sheet agrees with the reported metrics even
# on exact 0.5 ties (argmax would pick class 0 there).
predicted_classes = {model_key: np.where(np.asarray(probabilities)[:, 1] >= 0.5, 1, 0)
                     for model_key, _title, probabilities in evaluated_models}
predictions = pd.DataFrame({'label': y_test, 'difficulty': X_test_difficulty, **predicted_classes})
new_df = pd.concat([X_test, predictions], axis=1)

# --- Persist ---
# The writers are opened only once there is something to write, and the context
# managers close them even if writing fails.
with pd.ExcelWriter(evaluation_path, engine='openpyxl') as breast_cancer_evaluation:
  result.to_excel(breast_cancer_evaluation, sheet_name=sheet_name)

with pd.ExcelWriter(detail_path, engine='openpyxl') as breast_cancer_detail:
  new_df.to_excel(breast_cancer_detail, sheet_name=sheet_name)

Best: 0.965235 using {'metric': 'euclidean', 'n_neighbors': 10, 'weights': 'distance'}
Best: 0.965235 using {'C': 1.0, 'penalty': 'l2'}
Best: 0.969325 using {'C': 0.1, 'kernel': 'rbf'}
Best: 0.959100 using {'max_depth': None, 'n_estimators': 100}
binary
Best: 0.957055 using {'batch_size': 32, 'epochs': 100}
Best: 0.961145 using {'batch_size': 32, 'epochs': 50}
KNN
Logistic Regression
SVM
Naive Bayes
Random Forest
Simple Neural Network
Deep Neural Network
     accuracy  sensitivity  specificity       ppv       npv   roc_auc  \
knn  0.966667     0.925373     0.986014  0.968750  0.965753  0.997234   
lr   0.952381     0.880597     0.986014  0.967213  0.946309  0.997704   
svm  0.961905     0.955224     0.965035  0.927536  0.978723  0.996712   
nb   0.961905     0.970149     0.958042  0.915493  0.985612  0.983718   
rf   0.966667     0.940299     0.979021  0.954545  0.972222  0.995460   
snn  0.971429     0.955224     0.979021  0.955224  0.979021  0.996399   
dnn  0.980952     0.985075    

## CDdm

In [None]:
# Load the App2 workbook. No index_col is passed, so pandas assigns a default integer index.
# NOTE(review): the App1 and App3 loads pass index_col=[0] - confirm this workbook really has
# no index column; otherwise that column would be treated as a feature by the evaluation cell.
app2_uci = pd.read_excel('../Evaluation/real_world/app2_UCI_breast_cancer.xlsx')
# Bare last expression so the notebook renders the loaded frame.
app2_uci

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,label,difficulty
0,4,2,1,1,2,1.000000,2,1,1,0,0.000590
1,2,1,1,1,2,1.000000,2,1,1,0,0.000560
2,6,1,1,1,2,1.000000,3,1,1,0,0.004322
3,8,4,5,1,2,3.113636,7,3,1,1,0.028894
4,2,1,1,1,2,1.000000,2,1,1,0,0.000560
...,...,...,...,...,...,...,...,...,...,...,...
694,2,1,1,1,2,1.000000,3,1,1,0,0.000987
695,1,1,1,1,2,1.000000,1,1,1,0,0.000815
696,3,1,1,1,2,1.000000,2,3,1,0,0.005097
697,5,10,10,5,4,5.000000,4,4,1,1,0.224570


In [None]:
# --- Output locations for this approach ---
sheet_name = 'App2'
evaluation_path = '../Evaluation/outcome/' + 'UCI_breast_cancer_evaluation_App2.xlsx'
detail_path = '../Evaluation/outcome/' + 'UCI_breast_cancer_evaluation_App2_detailed.xlsx'

# --- Features, target and per-case difficulty ---
# The last two columns of the frame are the target and the difficulty score;
# everything before them is a feature.
X = app2_uci.iloc[:, :-2]
y = app2_uci['label']
difficulty = app2_uci['difficulty']

# Splitting the difficulty together with X and y keeps the rows aligned and
# avoids temporarily writing a 'difficulty' column into a slice of the frame
# (chained assignment). The split itself is the same as before: it depends only
# on the number of rows, test_size and random_state.
X_train, X_test, y_train, y_test, difficulty_train, X_test_difficulty = train_test_split(
    X, y, difficulty, test_size=0.3, random_state=42)

# binary_evaluation only handles binary targets. Fail before the expensive
# training instead of later writing an undefined (or stale, from a previously
# run cell) `result` to the workbook.
y_type = type_of_target(y_test)
if y_type != 'binary':
  raise ValueError("Only binary targets can be evaluated here, got %r" % (y_type,))

# --- Model selection / training ---
best_knn_model = knn(X_train,y_train)
best_lr_model = logistic_regression(X_train,y_train)
best_svm_model = svm(X_train, y_train)
nb_model = naive_bayes(X_train, y_train)
best_rf_model = random_forest(X_train, y_train)

n_train_classes = len(np.unique(y_train))
if n_train_classes == 2:
  print('binary')
  best_snn_model = binary_neural_network(X_train, y_train,'snn')
  best_dnn_model = binary_neural_network(X_train, y_train,'dnn')
elif n_train_classes > 2:
  print('multi')
  best_snn_model = multiclass_neural_network(X_train, y_train,'snn')
  best_dnn_model = multiclass_neural_network(X_train, y_train,'dnn')
else:
  raise ValueError("y_train must contain at least two classes")

# --- Class probabilities on the held-out set ---
y_val_pred_knn = best_knn_model.predict_proba(X_test)
y_val_pred_lr = best_lr_model.predict_proba(X_test)
y_val_pred_svm = best_svm_model.predict_proba(X_test)
y_val_pred_nb = nb_model.predict_proba(X_test)
y_val_pred_rf = best_rf_model.predict_proba(X_test)
y_val_pred_snn = best_snn_model.predict_proba(X_test)
y_val_pred_dnn = best_dnn_model.predict_proba(X_test)

# (column key, printed title, probabilities) for every model, in report order.
evaluated_models = [
    ('knn', "KNN", y_val_pred_knn),
    ('lr', "Logistic Regression", y_val_pred_lr),
    ('svm', "SVM", y_val_pred_svm),
    ('nb', "Naive Bayes", y_val_pred_nb),
    ('rf', "Random Forest", y_val_pred_rf),
    ('snn', "Simple Neural Network", y_val_pred_snn),
    ('dnn', "Deep Neural Network", y_val_pred_dnn),
]

# --- Metrics ---
result_list = []
for model_key, model_title, probabilities in evaluated_models:
  print(model_title)
  result_list.append(binary_evaluation(y_test, probabilities, X_test_difficulty))

result = pd.DataFrame(result_list, index=[model_key for model_key, _title, _proba in evaluated_models],
                      columns=['accuracy', 'sensitivity', 'specificity', 'ppv', 'npv', 'roc_auc',
                               'd_accuracy', 'd_sensitivity', 'd_specificity', 'd_ppv', 'd_npv', 'd_roc_auc'])
print(result)

# --- Per-case predictions ---
# Use the same ">= 0.5 on the positive-class probability" rule as
# binary_evaluation so the detailed sheet agrees with the reported metrics even
# on exact 0.5 ties (argmax would pick class 0 there).
predicted_classes = {model_key: np.where(np.asarray(probabilities)[:, 1] >= 0.5, 1, 0)
                     for model_key, _title, probabilities in evaluated_models}
predictions = pd.DataFrame({'label': y_test, 'difficulty': X_test_difficulty, **predicted_classes})
new_df = pd.concat([X_test, predictions], axis=1)

# --- Persist ---
# The writers are opened only once there is something to write, and the context
# managers close them even if writing fails.
with pd.ExcelWriter(evaluation_path, engine='openpyxl') as breast_cancer_evaluation:
  result.to_excel(breast_cancer_evaluation, sheet_name=sheet_name)

with pd.ExcelWriter(detail_path, engine='openpyxl') as breast_cancer_detail:
  new_df.to_excel(breast_cancer_detail, sheet_name=sheet_name)

Best: 0.975460 using {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'}
Best: 0.963190 using {'C': 0.1, 'penalty': 'l2'}
Best: 0.967280 using {'C': 0.01, 'kernel': 'linear'}
Best: 0.965235 using {'max_depth': None, 'n_estimators': 100}
binary
Best: 0.957055 using {'batch_size': 32, 'epochs': 30}
Best: 0.963190 using {'batch_size': 64, 'epochs': 30}
KNN
Logistic Regression
SVM
Naive Bayes
Random Forest
Simple Neural Network
Deep Neural Network
     accuracy  sensitivity  specificity       ppv       npv   roc_auc  \
knn  0.971429     0.958904     0.978102  0.958904  0.978102  0.983302   
lr   0.966667     0.945205     0.978102  0.958333  0.971014  0.994501   
svm  0.961905     0.945205     0.970803  0.945205  0.970803  0.994251   
nb   0.957143     0.972603     0.948905  0.910256  0.984848  0.976202   
rf   0.971429     0.986301     0.963504  0.935065  0.992481  0.987401   
snn  0.928571     0.863014     0.963504  0.926471  0.929577  0.987451   
dnn  0.938095     0.972603   

## CDpu

In [None]:
# Load the App3 workbook; the first spreadsheet column becomes the DataFrame index.
app3_uci = pd.read_excel('../Evaluation/real_world/app3_UCI_breast_cancer.xlsx',index_col=[0])
# Bare last expression so the notebook renders the loaded frame.
app3_uci

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,label,difficulty
0,5,1,1,1,2,1.0,3,1,1,0,0.016598
1,5,4,4,5,7,10.0,3,2,1,0,0.611221
2,3,1,1,1,2,2.0,3,1,1,0,0.007619
3,6,8,8,1,3,4.0,3,7,1,0,0.545318
4,4,1,1,3,2,1.0,3,1,1,0,0.005387
...,...,...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2.0,1,1,1,0,0.067376
695,2,1,1,1,2,1.0,1,1,1,0,0.005687
696,5,10,10,3,7,3.0,8,10,2,1,0.054388
697,4,8,6,4,3,4.0,10,6,1,1,0.538430


In [None]:
# --- Output locations for this approach ---
sheet_name = 'App3'
evaluation_path = '../Evaluation/outcome/' + 'UCI_breast_cancer_evaluation_App3.xlsx'
detail_path = '../Evaluation/outcome/' + 'UCI_breast_cancer_evaluation_App3_detailed.xlsx'

# --- Features, target and per-case difficulty ---
# The last two columns of the frame are the target and the difficulty score;
# everything before them is a feature.
X = app3_uci.iloc[:, :-2]
y = app3_uci['label']
difficulty = app3_uci['difficulty']

# Splitting the difficulty together with X and y keeps the rows aligned and
# avoids temporarily writing a 'difficulty' column into a slice of the frame
# (chained assignment). The split itself is the same as before: it depends only
# on the number of rows, test_size and random_state.
X_train, X_test, y_train, y_test, difficulty_train, X_test_difficulty = train_test_split(
    X, y, difficulty, test_size=0.3, random_state=42)

# binary_evaluation only handles binary targets. Fail before the expensive
# training instead of later writing an undefined (or stale, from a previously
# run cell) `result` to the workbook.
y_type = type_of_target(y_test)
if y_type != 'binary':
  raise ValueError("Only binary targets can be evaluated here, got %r" % (y_type,))

# --- Model selection / training ---
best_knn_model = knn(X_train,y_train)
best_lr_model = logistic_regression(X_train,y_train)
best_svm_model = svm(X_train, y_train)
nb_model = naive_bayes(X_train, y_train)
best_rf_model = random_forest(X_train, y_train)

n_train_classes = len(np.unique(y_train))
if n_train_classes == 2:
  print('binary')
  best_snn_model = binary_neural_network(X_train, y_train,'snn')
  best_dnn_model = binary_neural_network(X_train, y_train,'dnn')
elif n_train_classes > 2:
  print('multi')
  best_snn_model = multiclass_neural_network(X_train, y_train,'snn')
  best_dnn_model = multiclass_neural_network(X_train, y_train,'dnn')
else:
  raise ValueError("y_train must contain at least two classes")

# --- Class probabilities on the held-out set ---
y_val_pred_knn = best_knn_model.predict_proba(X_test)
y_val_pred_lr = best_lr_model.predict_proba(X_test)
y_val_pred_svm = best_svm_model.predict_proba(X_test)
y_val_pred_nb = nb_model.predict_proba(X_test)
y_val_pred_rf = best_rf_model.predict_proba(X_test)
y_val_pred_snn = best_snn_model.predict_proba(X_test)
y_val_pred_dnn = best_dnn_model.predict_proba(X_test)

# (column key, printed title, probabilities) for every model, in report order.
evaluated_models = [
    ('knn', "KNN", y_val_pred_knn),
    ('lr', "Logistic Regression", y_val_pred_lr),
    ('svm', "SVM", y_val_pred_svm),
    ('nb', "Naive Bayes", y_val_pred_nb),
    ('rf', "Random Forest", y_val_pred_rf),
    ('snn', "Simple Neural Network", y_val_pred_snn),
    ('dnn', "Deep Neural Network", y_val_pred_dnn),
]

# --- Metrics ---
result_list = []
for model_key, model_title, probabilities in evaluated_models:
  print(model_title)
  result_list.append(binary_evaluation(y_test, probabilities, X_test_difficulty))

result = pd.DataFrame(result_list, index=[model_key for model_key, _title, _proba in evaluated_models],
                      columns=['accuracy', 'sensitivity', 'specificity', 'ppv', 'npv', 'roc_auc',
                               'd_accuracy', 'd_sensitivity', 'd_specificity', 'd_ppv', 'd_npv', 'd_roc_auc'])
print(result)

# --- Per-case predictions ---
# Use the same ">= 0.5 on the positive-class probability" rule as
# binary_evaluation so the detailed sheet agrees with the reported metrics even
# on exact 0.5 ties (argmax would pick class 0 there).
predicted_classes = {model_key: np.where(np.asarray(probabilities)[:, 1] >= 0.5, 1, 0)
                     for model_key, _title, probabilities in evaluated_models}
predictions = pd.DataFrame({'label': y_test, 'difficulty': X_test_difficulty, **predicted_classes})
new_df = pd.concat([X_test, predictions], axis=1)

# --- Persist ---
# The writers are opened only once there is something to write, and the context
# managers close them even if writing fails.
with pd.ExcelWriter(evaluation_path, engine='openpyxl') as breast_cancer_evaluation:
  result.to_excel(breast_cancer_evaluation, sheet_name=sheet_name)

with pd.ExcelWriter(detail_path, engine='openpyxl') as breast_cancer_detail:
  new_df.to_excel(breast_cancer_detail, sheet_name=sheet_name)

Best: 0.965235 using {'metric': 'euclidean', 'n_neighbors': 10, 'weights': 'distance'}
Best: 0.965235 using {'C': 1.0, 'penalty': 'l2'}
Best: 0.969325 using {'C': 0.1, 'kernel': 'rbf'}
Best: 0.959100 using {'max_depth': None, 'n_estimators': 100}
binary
Best: 0.959100 using {'batch_size': 32, 'epochs': 50}
Best: 0.961145 using {'batch_size': 32, 'epochs': 50}
KNN
Logistic Regression
SVM
Naive Bayes
Random Forest
Simple Neural Network
Deep Neural Network
     accuracy  sensitivity  specificity       ppv       npv   roc_auc  \
knn  0.966667     0.925373     0.986014  0.968750  0.965753  0.997234   
lr   0.952381     0.880597     0.986014  0.967213  0.946309  0.997704   
svm  0.961905     0.955224     0.965035  0.927536  0.978723  0.996608   
nb   0.961905     0.970149     0.958042  0.915493  0.985612  0.983718   
rf   0.961905     0.925373     0.979021  0.953846  0.965517  0.996973   
snn  0.961905     0.910448     0.986014  0.968254  0.959184  0.995669   
dnn  0.980952     0.985075     