# Imputing data with KNN Imputation and Mean Imputation

In [1]:
import pandas as pd
import numpy as np
import datetime
import sklearn
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns

from sklearn.impute import KNNImputer

from sklearn import linear_model
from sklearn.metrics import *
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

Importing Raw Dataset

In [2]:
# Read the raw dataset from 'raw_ckd.csv' (path is relative to the working directory).
df=pd.read_csv('raw_ckd.csv')

In [3]:
df.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,...,38.0,6000.0,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31.0,7500.0,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35.0,7300.0,4.6,no,no,no,good,no,no,ckd


Let's find out whether our data contains any missing data or not.

In [4]:
df.isnull().sum()

age        9
bp        12
sg        47
al        46
su        49
rbc      152
pc        65
pcc        4
ba         4
bgr       44
bu        19
sc        17
sod       87
pot       88
hemo      52
pcv       71
wbcc     106
rbcc     131
htn        2
dm         2
cad        2
appet      1
pe         1
ane        1
class      0
dtype: int64

From the output above, we can see that almost every column has a large number of missing values. It is difficult to proceed with analyzing this data unless we handle the missing values first.

Let's find out the data type of each column in our dataset.

In [5]:
df.dtypes

age      float64
bp       float64
sg       float64
al       float64
su       float64
rbc       object
pc        object
pcc       object
ba        object
bgr      float64
bu       float64
sc       float64
sod      float64
pot      float64
hemo     float64
pcv      float64
wbcc     float64
rbcc     float64
htn       object
dm        object
cad       object
appet     object
pe        object
ane       object
class     object
dtype: object

We can see that many columns contain data that is in neither float nor integer format. This can hinder our analysis, so it is advisable to convert these columns to numeric data.

Checking columns of object types

In [6]:
df['class'].unique()

array(['ckd', 'notckd'], dtype=object)

In [7]:
df['ane'].unique()

array(['no', 'yes', nan], dtype=object)

In [8]:
df['pe'].unique()

array(['no', 'yes', nan], dtype=object)

In [9]:
df['appet'].unique()

array(['good', 'poor', nan], dtype=object)

In [10]:
df['cad'].unique()

array(['no', 'yes', nan], dtype=object)

In [11]:
df['dm'].unique()

array(['yes', 'no', nan], dtype=object)

In [12]:
df['htn'].unique()

array(['yes', 'no', nan], dtype=object)

In [13]:
df['rbcc'].unique()

array([5.2, nan, 3.9, 4.6, 4.4, 5. , 4. , 3.7, 3.8, 3.4, 2.6, 2.8, 4.3,
       3.2, 3.6, 4.1, 4.9, 2.5, 4.2, 4.5, 3.1, 4.7, 3.5, 6. , 2.1, 5.6,
       2.3, 2.9, 2.7, 8. , 3.3, 3. , 2.4, 4.8, 5.4, 6.1, 6.2, 6.3, 5.1,
       5.8, 5.5, 5.3, 6.4, 5.7, 5.9, 6.5])

In [14]:
df['wbcc'].unique()

array([ 7800.,  6000.,  7500.,  6700.,  7300.,    nan,  6900.,  9600.,
       12100.,  4500., 12200., 11000.,  3800., 11400.,  5300.,  9200.,
        6200.,  8300.,  8400., 10300.,  9800.,  9100.,  7900.,  6400.,
        8600., 18900., 21600.,  4300.,  8500., 11300.,  7200.,  7700.,
       14600.,  6300.,  7100., 11800.,  9400.,  5500.,  5800., 13200.,
       12500.,  5600.,  7000., 11900., 10400., 10700., 12700.,  6800.,
        6500., 13600., 10200.,  9000., 14900.,  8200., 15200.,  5000.,
       16300., 12400., 10500.,  4200.,  4700., 10900.,  8100.,  9500.,
        2200., 12800., 11200., 19100., 12300., 16700.,  2600., 26400.,
        8800.,  7400.,  4900.,  8000., 12000., 15700.,  4100.,  5700.,
       11500.,  5400., 10800.,  9900.,  5200.,  5900.,  9300.,  9700.,
        5100.,  6600.])

In [15]:
df['pcv'].unique()

array([44., 38., 31., 32., 35., 39., 36., 33., 29., 28., nan, 16., 24.,
       37., 30., 34., 40., 45., 27., 48., 52., 14., 22., 18., 42., 17.,
       46., 23., 19., 25., 41., 26., 15., 21., 43., 20., 47.,  9., 49.,
       50., 53., 51., 54.])

In [16]:
df['ba'].unique()

array(['notpresent', 'present', nan], dtype=object)

In [17]:
df['pcc'].unique()

array(['notpresent', 'present', nan], dtype=object)

In [18]:
df['pc'].unique()

array(['normal', 'abnormal', nan], dtype=object)

In [19]:
df['rbc'].unique()

array([nan, 'normal', 'abnormal'], dtype=object)

We have to handle the missing values. We analyzed our dataset thoroughly and came up with a few results.

1) In columns like age, bgr, bu, sc, sod, pot, hemo, pcv and rbcc, missing values will be replaced with the average/mean of that column's data.

2) In columns like rbc, pc, pcc, ba, pcv, wbcc, htn, dm, cad, appet, pe, ane and class, missing values will be replaced with the most frequently occurring value, as these contain string data.

In [20]:
# Binary encodings shared by several categorical columns; each maps the
# string label found in the raw data to a 0/1 code.
normal_abnormal = {"normal": 1, "abnormal": 0}
present_notpresent = {"present": 1, "notpresent": 0}
yes_no = {"yes": 1, "no": 0}
good_poor = {"good": 1, "poor": 0}
ckd_notckd = {"ckd": 1, "notckd": 0}

# Column name -> {label: code}, in the nested form accepted by DataFrame.replace.
cleanup = {
    "rbc": normal_abnormal,
    "pc": normal_abnormal,
    "pcc": present_notpresent,
    "ba": present_notpresent,
    "htn": yes_no,
    "dm": yes_no,
    "cad": yes_no,
    "appet": good_poor,
    "pe": yes_no,
    "ane": yes_no,
    "class": ckd_notckd,
}

In [21]:
# Apply the per-column label-to-code mapping held in `cleanup`; this modifies `df` in place.
df.replace(cleanup, inplace = True)

Type casting the columns into numeric data type

In [22]:
df

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.020,1.0,0.0,,1.0,0.0,0.0,121.0,...,44.0,7800.0,5.2,1.0,1.0,0.0,1.0,0.0,0.0,1
1,7.0,50.0,1.020,4.0,0.0,,1.0,0.0,0.0,,...,38.0,6000.0,,0.0,0.0,0.0,1.0,0.0,0.0,1
2,62.0,80.0,1.010,2.0,3.0,1.0,1.0,0.0,0.0,423.0,...,31.0,7500.0,,0.0,1.0,0.0,0.0,0.0,1.0,1
3,48.0,70.0,1.005,4.0,0.0,1.0,0.0,1.0,0.0,117.0,...,32.0,6700.0,3.9,1.0,0.0,0.0,0.0,1.0,1.0,1
4,51.0,80.0,1.010,2.0,0.0,1.0,1.0,0.0,0.0,106.0,...,35.0,7300.0,4.6,0.0,0.0,0.0,1.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,0.0,1.0,1.0,0.0,0.0,140.0,...,47.0,6700.0,4.9,0.0,0.0,0.0,1.0,0.0,0.0,0
396,42.0,70.0,1.025,0.0,0.0,1.0,1.0,0.0,0.0,75.0,...,54.0,7800.0,6.2,0.0,0.0,0.0,1.0,0.0,0.0,0
397,12.0,80.0,1.020,0.0,0.0,1.0,1.0,0.0,0.0,100.0,...,49.0,6600.0,5.4,0.0,0.0,0.0,1.0,0.0,0.0,0
398,17.0,60.0,1.025,0.0,0.0,1.0,1.0,0.0,0.0,114.0,...,51.0,7200.0,5.9,0.0,0.0,0.0,1.0,0.0,0.0,0


In [23]:
def knnImputerDatasetGenerator(df, neighbors, weights='uniform'):
    """Return a new frame in which missing values of ``df`` are filled by KNN imputation.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame handed unchanged to ``KNNImputer.fit_transform``; its columns
        therefore need to be numeric already. ``df`` itself is not modified.
    neighbors : int
        Forwarded to ``KNNImputer`` as ``n_neighbors``.
    weights : str or callable, default 'uniform'
        Forwarded to ``KNNImputer`` as ``weights``. The default reproduces the
        behaviour of the original two-argument call.

    Returns
    -------
    pandas.DataFrame
        The imputed values, carrying the column labels and the row index of ``df``.
    """
    imputer = KNNImputer(n_neighbors=neighbors, weights=weights)
    imputed_values = imputer.fit_transform(df)

    # fit_transform returns a bare array, so re-attach the labels. Passing the
    # index as well keeps the rows aligned with `df` when it does not use the
    # default 0..n-1 index (the original silently reset it).
    # NOTE(review): if a column of `df` is entirely NaN the imputer may drop it,
    # in which case the column count no longer matches `df.columns` - confirm
    # against the installed scikit-learn version.
    return pd.DataFrame(imputed_values, columns=df.columns, index=df.index)

In [28]:
# Build one KNN-imputed copy of `df` per neighbour count, in ascending order of k.
(
    df_knn_imputed_uniform_3,
    df_knn_imputed_uniform_5,
    df_knn_imputed_uniform_7,
    df_knn_imputed_uniform_9,
    df_knn_imputed_uniform_11,
) = [knnImputerDatasetGenerator(df, k) for k in (3, 5, 7, 9, 11)]

# Baseline: fill every missing entry with the mean of its column.
column_means = df.mean()
df_mean_imputed = df.fillna(column_means)

All NaN/missing values are now imputed in six ways: five KNN-imputed datasets (k = 3, 5, 7, 9, 11) and one mean-imputed dataset.

In [30]:
# Write each imputed dataset to its own CSV file in the working directory.
imputed_frames = {
    "df_knn_imputed_uniform_3.csv": df_knn_imputed_uniform_3,
    "df_knn_imputed_uniform_5.csv": df_knn_imputed_uniform_5,
    "df_knn_imputed_uniform_7.csv": df_knn_imputed_uniform_7,
    "df_knn_imputed_uniform_9.csv": df_knn_imputed_uniform_9,
    "df_knn_imputed_uniform_11.csv": df_knn_imputed_uniform_11,
    "df_mean_imputed.csv": df_mean_imputed,
}
for csv_name, imputed_frame in imputed_frames.items():
    imputed_frame.to_csv(csv_name)