# Importing Libraries and Dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

import warnings
%matplotlib inline

In [2]:
# Load the three CKD tables in one go. `dataset` is the raw data and still
# has missing values; the mode_/mean_ files are presumably the same data with
# gaps pre-filled by column mode / mean -- confirm against how they were built.
dataset, mode_dataset, mean_dataset = map(
    pd.read_csv, ("raw_ckd.csv", "mode_ckd.csv", "mean_ckd.csv")
)

In [3]:
# Per-column mapping from categorical labels to 0/1 codes, in the nested
# {column: {old_value: new_value}} form that DataFrame.replace accepts.
# Columns sharing a vocabulary are grouped; each column still gets its own dict.
cleanup = {
    **{column: {"normal": 1, "abnormal": 0} for column in ("rbc", "pc")},
    **{column: {"present": 1, "notpresent": 0} for column in ("pcc", "ba")},
    **{column: {"yes": 1, "no": 0} for column in ("htn", "dm", "cad")},
    "appet": {"good": 1, "poor": 0},
    **{column: {"yes": 1, "no": 0} for column in ("pe", "ane")},
    "class": {"ckd": 1, "notckd": 0},
}

In [4]:
# Apply the `cleanup` mapping so every categorical column becomes 0/1.
# This mutates `dataset` in place.
dataset.replace(to_replace=cleanup, inplace=True)

In [5]:
# Preview the encoded rows: green gradient on values, missing cells in red.
cm = sns.light_palette("green", as_cmap=True)
# `Styler.set_precision` was deprecated in pandas 1.3 and removed in 2.0;
# `Styler.format(precision=...)` is the documented replacement.
dataset.head().style.background_gradient(cmap=cm).format(precision=2).highlight_null('red')

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.02,1.0,0.0,,1.0,0.0,0.0,121.0,36.0,1.2,,,15.4,44.0,7800.0,5.2,1.0,1.0,0.0,1.0,0.0,0.0,1
1,7.0,50.0,1.02,4.0,0.0,,1.0,0.0,0.0,,18.0,0.8,,,11.3,38.0,6000.0,,0.0,0.0,0.0,1.0,0.0,0.0,1
2,62.0,80.0,1.01,2.0,3.0,1.0,1.0,0.0,0.0,423.0,53.0,1.8,,,9.6,31.0,7500.0,,0.0,1.0,0.0,0.0,0.0,1.0,1
3,48.0,70.0,1.0,4.0,0.0,1.0,0.0,1.0,0.0,117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,3.9,1.0,0.0,0.0,0.0,1.0,1.0,1
4,51.0,80.0,1.01,2.0,0.0,1.0,1.0,0.0,0.0,106.0,26.0,1.4,,,11.6,35.0,7300.0,4.6,0.0,0.0,0.0,1.0,0.0,0.0,1


# KNN Imputation Process

In [6]:
from sklearn.impute import KNNImputer

In [7]:
# Every column except the target is treated as a feature.
features_list = [column for column in dataset.columns if column != 'class']

# Correlation of each feature with the target (Pearson, the Series.corr default).
corrs = {name: dataset['class'].corr(dataset[name]) for name in features_list}

In [8]:
def knnImputerDatasetGenerator(df, neighbors, weights='uniform'):
    """Return a new frame with the missing values of ``df`` filled by ``KNNImputer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute; passed whole to ``KNNImputer.fit_transform``.
    neighbors : int
        Forwarded to ``KNNImputer(n_neighbors=...)``.
    weights : str or callable, default 'uniform'
        Forwarded to ``KNNImputer(weights=...)``. The default reproduces the
        previously hard-coded behaviour.

    Returns
    -------
    pandas.DataFrame
        The imputed values, labelled with the columns and row index of ``df``.
    """
    # NOTE(review): every column of `df` takes part in the neighbour search, so
    # a target column left in `df` influences the imputed feature values, and
    # the imputer is fitted before any train/test split -- confirm intended.
    imputer = KNNImputer(n_neighbors=neighbors, weights=weights)
    imputed_values = imputer.fit_transform(df)

    # Keep the caller's row index as well as the column labels; without it the
    # result is renumbered 0..n-1 and no longer aligns with a filtered or
    # re-indexed input frame.
    return pd.DataFrame(imputed_values, columns=df.columns, index=df.index)

In [9]:
# One KNN-imputed copy of the raw data for each neighbourhood size compared below.
(
    df_knn_imputed_uniform_3,
    df_knn_imputed_uniform_5,
    df_knn_imputed_uniform_7,
    df_knn_imputed_uniform_9,
    df_knn_imputed_uniform_11,
) = (knnImputerDatasetGenerator(dataset, k) for k in (3, 5, 7, 9, 11))

In [10]:
# Fill what is still missing in the raw frame with each column's median,
# rounded to two decimals. This mutates `dataset` in place.
dataset.fillna(value=dataset.median().round(2), inplace=True)

In [11]:
# Preview the rows after the median fill; red would flag any cell still missing.
cm = sns.light_palette("green", as_cmap=True)
# `Styler.set_precision` was deprecated in pandas 1.3 and removed in 2.0;
# `Styler.format(precision=...)` is the documented replacement.
dataset.head().style.background_gradient(cmap=cm).format(precision=2).highlight_null('red')

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.02,1.0,0.0,1.0,1.0,0.0,0.0,121.0,36.0,1.2,138.0,4.4,15.4,44.0,7800.0,5.2,1.0,1.0,0.0,1.0,0.0,0.0,1
1,7.0,50.0,1.02,4.0,0.0,1.0,1.0,0.0,0.0,121.0,18.0,0.8,138.0,4.4,11.3,38.0,6000.0,4.8,0.0,0.0,0.0,1.0,0.0,0.0,1
2,62.0,80.0,1.01,2.0,3.0,1.0,1.0,0.0,0.0,423.0,53.0,1.8,138.0,4.4,9.6,31.0,7500.0,4.8,0.0,1.0,0.0,0.0,0.0,1.0,1
3,48.0,70.0,1.0,4.0,0.0,1.0,0.0,1.0,0.0,117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,3.9,1.0,0.0,0.0,0.0,1.0,1.0,1
4,51.0,80.0,1.01,2.0,0.0,1.0,1.0,0.0,0.0,106.0,26.0,1.4,138.0,4.4,11.6,35.0,7300.0,4.6,0.0,0.0,0.0,1.0,0.0,0.0,1


# KNN imputed MODELS for K = 3, 5, 7, 9 & 11

In [12]:
def models(x):
    """Fit one classifier on each KNN-imputed dataset and collect its scores.

    Parameters
    ----------
    x : int
        Classifier selector: 0 = logistic regression, 1 = k-nearest
        neighbours, 2 = random forest (500 trees), 3 = RBF-kernel SVM;
        any other value selects Gaussian naive Bayes.

    Returns
    -------
    dict
        Maps each dataset label (``"knn_imputed_3"`` ... ``"knn_imputed_11"``)
        to ``[train_acc, test_acc, tn, fp, fn, tp]``, where the confusion
        matrix counts are measured on the held-out test split.
    """
    # One seed for both the split and the forest, so repeated runs give the
    # same numbers and differences between rows come from the imputation only.
    seed = 2

    if x == 0:
        classifier = LogisticRegression()
    elif x == 1:
        classifier = KNeighborsClassifier()
    elif x == 2:
        classifier = RandomForestClassifier(n_estimators=500, random_state=seed)
    elif x == 3:
        classifier = SVC(kernel='rbf', random_state=None)
    else:
        classifier = GaussianNB()

    # Imputed frames and the labels they are reported under, in matching order.
    dfs = [df_knn_imputed_uniform_3, df_knn_imputed_uniform_5,
           df_knn_imputed_uniform_7, df_knn_imputed_uniform_9,
           df_knn_imputed_uniform_11]
    dfs_name = ["knn_imputed_3", "knn_imputed_5",
                "knn_imputed_7", "knn_imputed_9",
                "knn_imputed_11"]

    res = {}
    for name, df in zip(dfs_name, dfs):
        # Pick features by name, consistent with how the target is picked, so
        # the target can never slip into X if the column order changes.
        X = df.drop(columns='class')
        y = df['class']
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, shuffle=True, random_state=seed)

        # Scaling statistics come from the training split only.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Refitting the same estimator object discards the previous fit.
        classifier.fit(X_train, y_train)
        train_pred = classifier.predict(X_train)
        test_pred = classifier.predict(X_test)

        train_acc = accuracy_score(y_train, train_pred)
        test_acc = accuracy_score(y_test, test_pred)

        # For a two-class problem the 2x2 matrix flattens to tn, fp, fn, tp.
        tn, fp, fn, tp = confusion_matrix(y_test, test_pred).ravel()
        res[name] = [train_acc, test_acc, tn, fp, fn, tp]

    return res

In [13]:
# Display names, indexed by the selector value that `models` expects.
model_names = ["Logistic Regression", "KNN", "Random Forest", "SVM", "Naive Bayes"]
metric_columns = ['train_acc', 'test_acc', 'true_neg', 'false_pos', 'false_neg', 'true_pos']

for i, model_name in enumerate(model_names):
    result = models(i)
    res = pd.DataFrame.from_dict(data=result, orient='index', columns=metric_columns)

    # Bold the whole heading, number included, and reset with \033[0m so the
    # bold attribute does not bleed into the table printed next.
    print("\033[1m{}. PERFORMANCE INDICATORS for {}\033[0m\n".format(i + 1, model_name))
    print(res, '\n')

[1m 1 .PERFORMANCE INDICATORS for Logistic Regression 

                train_acc  test_acc  true_neg  false_pos  false_neg  true_pos
knn_imputed_3    0.993333      0.99        34          1          0        65
knn_imputed_5    1.000000      0.99        34          1          0        65
knn_imputed_7    1.000000      0.98        33          2          0        65
knn_imputed_9    1.000000      0.98        33          2          0        65
knn_imputed_11   1.000000      0.98        33          2          0        65 

2 [1m.PERFORMANCE INDICATORS for KNN 

                train_acc  test_acc  true_neg  false_pos  false_neg  true_pos
knn_imputed_3    0.973333      0.97        35          0          3        62
knn_imputed_5    0.976667      0.97        35          0          3        62
knn_imputed_7    0.983333      0.98        35          0          2        63
knn_imputed_9    0.983333      0.97        35          0          3        62
knn_imputed_11   0.983333      0.96        

# Mode Imputed MODELS

In [14]:
def fun1(x):
    """Train one classifier on the mode-imputed dataset and report its scores.

    Parameters
    ----------
    x : int
        Classifier selector: 0 = logistic regression, 1 = k-nearest
        neighbours, 2 = random forest (500 trees), 3 = RBF-kernel SVM;
        any other value selects Gaussian naive Bayes.

    Returns
    -------
    dict
        A single entry mapping the classifier's label to
        ``[train_acc, test_acc, tn, fp, fn, tp]``, where the confusion matrix
        counts are measured on the held-out test split.
    """
    # One seed for both the split and the forest so reruns give the same numbers.
    seed = 2

    # The label is chosen in the same branch as the estimator, so the two can
    # never disagree (indexing a separate name list by `x` mislabelled negative
    # selectors and raised IndexError for x >= 5 only after training).
    if x == 0:
        name, classifier = "Log_Regression", LogisticRegression()
    elif x == 1:
        name, classifier = "K-NN", KNeighborsClassifier()
    elif x == 2:
        name, classifier = "Random_Forest", RandomForestClassifier(
            n_estimators=500, random_state=seed)
    elif x == 3:
        name, classifier = "SVM", SVC(kernel='rbf', random_state=None)
    else:
        name, classifier = "Naive_Bayes", GaussianNB()

    # Pick features by name, consistent with how the target is picked, so the
    # target can never slip into X if the column order changes.
    X = mode_dataset.drop(columns='class')
    y = mode_dataset['class']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, shuffle=True, random_state=seed)

    # Scaling statistics come from the training split only.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    classifier.fit(X_train, y_train)
    train_pred = classifier.predict(X_train)
    test_pred = classifier.predict(X_test)

    train_acc = accuracy_score(y_train, train_pred)
    test_acc = accuracy_score(y_test, test_pred)

    # For a two-class problem the 2x2 matrix flattens to tn, fp, fn, tp.
    tn, fp, fn, tp = confusion_matrix(y_test, test_pred).ravel()

    return {name: [train_acc, test_acc, tn, fp, fn, tp]}

In [15]:
# Display names, indexed by the selector value that `fun1` expects.
model_names = ["Logistic Regression", "KNN", "Random Forest", "SVM", "Naive Bayes"]
metric_columns = ['train_acc', 'test_acc', 'true_neg', 'false_pos', 'false_neg', 'true_pos']

for i, model_name in enumerate(model_names):
    result = fun1(i)
    res = pd.DataFrame.from_dict(data=result, orient='index', columns=metric_columns)

    # Bold the whole heading, number included, and reset with \033[0m so the
    # bold attribute does not bleed into the table printed next.
    print("\033[1m{}. PERFORMANCE INDICATORS for {}\033[0m\n".format(i + 1, model_name))
    print(res, '\n\n')

[1m 1 .PERFORMANCE INDICATORS for Logistic Regression 

                train_acc  test_acc  true_neg  false_pos  false_neg  true_pos
Log_Regression        1.0       1.0        35          0          0        65 


2 [1m.PERFORMANCE INDICATORS for KNN 

      train_acc  test_acc  true_neg  false_pos  false_neg  true_pos
K-NN       0.99      0.98        35          0          2        63 


3 [1m.PERFORMANCE INDICATORS for Random Forest 

               train_acc  test_acc  true_neg  false_pos  false_neg  true_pos
Random_Forest        1.0       1.0        35          0          0        65 


4 [1m.PERFORMANCE INDICATORS for SVM 

     train_acc  test_acc  true_neg  false_pos  false_neg  true_pos
SVM        1.0       1.0        35          0          0        65 


5 [1m.PERFORMANCE INDICATORS for Naive Bayes 

             train_acc  test_acc  true_neg  false_pos  false_neg  true_pos
Naive_Bayes       0.95      0.95        33          2          3        62 




# Mean Imputed MODELS

In [18]:
def fun2(x):
    """Train one classifier on the mean-imputed dataset and report its scores.

    Parameters
    ----------
    x : int
        Classifier selector: 0 = logistic regression, 1 = k-nearest
        neighbours, 2 = random forest (500 trees), 3 = RBF-kernel SVM;
        any other value selects Gaussian naive Bayes.

    Returns
    -------
    dict
        A single entry mapping the classifier's label to
        ``[train_acc, test_acc, tn, fp, fn, tp]``, where the confusion matrix
        counts are measured on the held-out test split.
    """
    # One seed for both the split and the forest so reruns give the same numbers.
    seed = 2

    # The label is chosen in the same branch as the estimator, so the two can
    # never disagree (indexing a separate name list by `x` mislabelled negative
    # selectors and raised IndexError for x >= 5 only after training).
    if x == 0:
        name, classifier = "Log_Regression", LogisticRegression()
    elif x == 1:
        name, classifier = "K-NN", KNeighborsClassifier()
    elif x == 2:
        name, classifier = "Random_Forest", RandomForestClassifier(
            n_estimators=500, random_state=seed)
    elif x == 3:
        name, classifier = "SVM", SVC(kernel='rbf', random_state=None)
    else:
        name, classifier = "Naive_Bayes", GaussianNB()

    # Pick features by name, consistent with how the target is picked, so the
    # target can never slip into X if the column order changes.
    X = mean_dataset.drop(columns='class')
    y = mean_dataset['class']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, shuffle=True, random_state=seed)

    # Scaling statistics come from the training split only.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    classifier.fit(X_train, y_train)
    train_pred = classifier.predict(X_train)
    test_pred = classifier.predict(X_test)

    train_acc = accuracy_score(y_train, train_pred)
    test_acc = accuracy_score(y_test, test_pred)

    # For a two-class problem the 2x2 matrix flattens to tn, fp, fn, tp.
    tn, fp, fn, tp = confusion_matrix(y_test, test_pred).ravel()

    return {name: [train_acc, test_acc, tn, fp, fn, tp]}

In [19]:
# Display names, indexed by the selector value that `fun2` expects.
model_names = ["Logistic Regression", "KNN", "Random Forest", "SVM", "Naive Bayes"]
metric_columns = ['train_acc', 'test_acc', 'true_neg', 'false_pos', 'false_neg', 'true_pos']

for i, model_name in enumerate(model_names):
    result = fun2(i)
    res = pd.DataFrame.from_dict(data=result, orient='index', columns=metric_columns)

    # Bold the whole heading, number included, and reset with \033[0m so the
    # bold attribute does not bleed into the table printed next.
    print("\033[1m{}. PERFORMANCE INDICATORS for {}\033[0m\n".format(i + 1, model_name))
    print(res, '\n\n')

[1m 1 .PERFORMANCE INDICATORS for Logistic Regression 

                train_acc  test_acc  true_neg  false_pos  false_neg  true_pos
Log_Regression        1.0       1.0        35          0          0        65 


2 [1m.PERFORMANCE INDICATORS for KNN 

      train_acc  test_acc  true_neg  false_pos  false_neg  true_pos
K-NN       0.99      0.98        35          0          2        63 


3 [1m.PERFORMANCE INDICATORS for Random Forest 

               train_acc  test_acc  true_neg  false_pos  false_neg  true_pos
Random_Forest        1.0       1.0        35          0          0        65 


4 [1m.PERFORMANCE INDICATORS for SVM 

     train_acc  test_acc  true_neg  false_pos  false_neg  true_pos
SVM        1.0       1.0        35          0          0        65 


5 [1m.PERFORMANCE INDICATORS for Naive Bayes 

             train_acc  test_acc  true_neg  false_pos  false_neg  true_pos
Naive_Bayes       0.95      0.95        33          2          3        62 


