In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import os

from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.metrics import roc_curve, roc_auc_score

import sklearn.discriminant_analysis as DA
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV

from data_mining_process import *


import warnings
warnings.filterwarnings('ignore')

# make directory for visualization
if not os.path.exists('visualization'):
    os.makedirs('visualization')

In [2]:
# load cases from case features folder
Ent_Case = pd.read_csv('case_features/Window_Entropy_Case.csv', header=None)
Slope_Case = pd.read_csv('case_features/Window_Slope_Case.csv', header=None)
Mf_Case = pd.read_csv('case_features/Window_Mf_Case.csv', header=None)

In [3]:
# load controls from control features folder
Ent_Control = pd.read_csv('control_features/Window_Entropy_Control.csv', header=None)
Slope_Control = pd.read_csv('control_features/Window_Slope_Control.csv', header=None)
Mf_Control = pd.read_csv('control_features/Window_Mf_Control.csv', header=None)

In [4]:
col_names = ["Entropy ", "Slope ", "ID", "Hurst Exponent ",
    "Left Slope ", "Right Slope ","Left Tangent ",
    "Right Tangent", "Broadness", "Left Tangent Point", "Right Tangent Point"]
# combine Ent and Slope and Mfcc dataframes for each class
Case = pd.concat([ Ent_Case.T, Slope_Case.T, Mf_Case ], axis=1)
Case.columns = col_names
Control = pd.concat([ Ent_Control.T, Slope_Control.T, Mf_Control ], axis=1)
Control.columns = col_names
Case['Case'] = 1
Control['Case'] = 0
Control

Unnamed: 0,Entropy,Slope,ID,Hurst Exponent,Left Slope,Right Slope,Left Tangent,Right Tangent,Broadness,Left Tangent Point,Right Tangent Point,Case
0,0.228845,-1.872362,14998,-0.491226,0.719788,-0.485864,1.7,-0.7,0.689497,-0.769086,-0.079589,0
1,0.280654,-1.697829,14998,-0.584685,0.632049,-0.627706,1.2,-0.9,0.635052,-0.901116,-0.266065,0
2,0.295749,-2.329581,14998,-0.576963,0.655232,-0.503350,1.5,-0.8,0.702573,-0.882198,-0.179625,0
3,0.278578,-1.303314,14998,-0.452763,0.665261,-0.556428,1.3,-0.8,0.660070,-0.753397,-0.093327,0
4,0.201263,-2.625640,23625,-0.375353,0.718251,-0.563209,1.4,-0.8,0.633562,-0.653807,-0.020245,0
...,...,...,...,...,...,...,...,...,...,...,...,...
2542,,,85349,-0.459627,0.843282,-0.538658,1.3,-0.8,0.608461,-0.696795,-0.088334,0
2543,,,9983,-0.445740,0.692655,-0.465482,1.8,-0.7,0.718406,-0.734484,-0.016078,0
2544,,,9983,-0.386403,0.686540,-0.513355,1.4,-0.8,0.680909,-0.677719,0.003190,0
2545,,,9983,-0.454157,0.815750,-0.461461,1.7,-0.7,0.678579,-0.699330,-0.020751,0


In [5]:
# combine Case and Control data
data = pd.concat([Case, Control], axis=0)

# check if any missing values
data.isna().sum()

Entropy                156
Slope                  156
ID                       0
Hurst Exponent           0
Left Slope               0
Right Slope              0
Left Tangent             0
Right Tangent            0
Broadness                0
Left Tangent Point       0
Right Tangent Point      0
Case                     0
dtype: int64

In [6]:
# suffle data
data = data.sample(frac=1, random_state= 42).reset_index(drop=True);
# replace NaN values with mean of the column
data = data.fillna(data.mean())

In [7]:
# a model using three features: entorpy, slope, and left tangent point
X = data.iloc[:, [0,1,9]]

# replace NaN values with mean of the column
y = data.iloc[:, 11]

In [8]:
target = data.columns[11]  # target variable name
features = [data.columns[0], data.columns[1], data.columns[9]]  # predictor features name

In [9]:
def build_knn(mod_params):
    # param: mod_params= a list of parameters(the order of elems matters)
    # the order is n_neighbors, weights, algorithm, metric 
    # return sklearn GridSearchCV object with given param
    return KNeighborsClassifier(n_neighbors=mod_params[0], weights=mod_params[1],
                               algorithm=mod_params[2], metric=mod_params[3])

parameters = {"n_neighbors": range(1, 30),"weights": ["uniform", "distance"],
              "algorithm" :[ "ball_tree", "kd_tree", "brute"],
               "metric": ["euclidean", "manhattan", "chebyshev", "minkowski"]}

# train and test a model with multiple samples
#mod_params = [5, 'uniform', 'auto', 'minkowski']  # default value
mod_params = [35, 'distance', 'auto', 'minkowski']
performance_measures_dict, max_cm = repeat_sampling_and_training(build_knn, mod_params, data,
                                    target, features, num_repeat=1000, doMinMaxScaling=False)

In [10]:
print_performance_metrics(performance_measures_dict)
max_cm

Mean of Training Accuracy: 1.0
Mean of Testing Accuracy: 0.637161943319838
Standard deviation of Testing Accuracy: 0.029816201434567258
Mean of Sensitivity: 0.5679898622267476
Standard deviation of Sensitivity: 0.04911965793197239
Mean of Speicificity: 0.7101846870437865
Standard deviation of Speicificity: 0.04805176283049268
Mean of Case Precision: 0.6697927135776364
Standard deviation of Case Precision: 0.047507874712352835
Mean of Control Precision: 0.614392422240116
Standard deviation of Control Precision: 0.03955580435774377
Mean of Case F1: 0.6129312864312819
Standard deviation of Case F1: 0.03614500771295051
Mean of Control F1: 0.6574305494978431
Standard deviation of Control F1: 0.0311823890946208
Mean of AUC: 0.7019393592072595
Standard deviation of AUC: 0.03077181163251468


Unnamed: 0,Predicted Negative(Absent),Predicted Positive(Present)
Actual Negative(Absent),107,12
Actual positive(Present),55,73


In [11]:
# Using all features in the data
target = data.columns[11]  # target variable name
features = [col_name for col_name in data.columns[1:11]]  # predictor features name
features.pop(features.index('ID'))  # remove patient ID from feature names list
features

['Slope ',
 'Hurst Exponent ',
 'Left Slope ',
 'Right Slope ',
 'Left Tangent ',
 'Right Tangent',
 'Broadness',
 'Left Tangent Point',
 'Right Tangent Point']

In [12]:
# train and test a model based on all features with multiple samples
parameters = {"n_neighbors": range(1, 30),"weights": ["uniform", "distance"],
              "algorithm" :[ "ball_tree", "kd_tree", "brute"],
               "metric": ["euclidean", "manhattan", "chebyshev", "minkowski"]}
#mod_params = [5, 'uniform', 'auto', 'minkowski']  # default value
mod_params = [25, 'distance', 'auto', 'manhattan']
performance_measures_dict, max_cm = repeat_sampling_and_training(build_knn, mod_params, data,
                                    target, features, num_repeat=1000, doMinMaxScaling=False)

In [13]:
print_performance_metrics(performance_measures_dict)
max_cm

Mean of Training Accuracy: 1.0
Mean of Testing Accuracy: 0.6343603238866397
Standard deviation of Testing Accuracy: 0.02961754470331144
Mean of Sensitivity: 0.5580329061374625
Standard deviation of Sensitivity: 0.04706882838120276
Mean of Speicificity: 0.715519117079407
Standard deviation of Speicificity: 0.04931026135734358
Mean of Case Precision: 0.6721096042998729
Standard deviation of Case Precision: 0.048141544063603194
Mean of Control Precision: 0.6085912999116048
Standard deviation of Control Precision: 0.03884105122790706
Mean of Case F1: 0.6079795694534343
Standard deviation of Case F1: 0.03496588785902675
Mean of Control F1: 0.656345382226896
Standard deviation of Control F1: 0.03165513283773619
Mean of AUC: 0.6940703916941239
Standard deviation of AUC: 0.030556136464650982


Unnamed: 0,Predicted Negative(Absent),Predicted Positive(Present)
Actual Negative(Absent),99,21
Actual positive(Present),51,76


No class weights in KNN