In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
np.random.seed(1)

In [3]:
df = pd.read_csv('../kidney_disease_data.csv',sep= ',')

In [4]:
df = df.drop('id',axis=1)

There are 25 attributes: <br/>
1) age: Age (yeas) --> numerical<br/>
2) bp: Blood Pressure (mm/Hg) --> numerical<br/>
3) sg: Specific Gravity (1.005,1.010,1.015,1.020,1.025) --> numerical<br/>
4) al: Albumin(0,1,2,3,4,5) --> numerical<br/>
5) su: Sugar(0,1,2,3,4,5) --> numerical<br/>
6) rbc: Red Blood Cells (normal, abnormal) --> binary <br/>
7) pc: Pus Cell (normal, abnormal) --> binary <br/>
8) pcc: Pus Cell clumps (present, notpresent) --> binary<br/>
9) ba: Bacteria (present, notpresent) --> binary<br/>
10) bgr: Blood Glucose Random (mgs/dl) --> numerical<br/>
11) bu: Blood Urea (mgs/dl) --> numerical<br/>
12) sc: Serum Creatinine (mgs/dl) --> numerical<br/>
13) sod: Sodium (mEq/L) --> numerical<br/>
14) pot: Potassium (mEq/L) --> numerical<br/>
15) hemo: Hemoglobin (gms) --> numerical<br/>
16) pcv: Packed Cell Volume --> numerical<br/>
17) wc: White Blood Cell Count (cells/cumm) --> numerical<br/>
18) rc: Red Blood Cell Count (millions/cmm) --> numerical<br/>
19) htn: Hypertension (yes, no) --> binary <br/>
20) dm: Diabetes Mellitus (yes, no) --> binary <br/>
21) cad: Coronary Artery Disease (yes, no) --> binary<br/>
22) appet: Appetite (good, poor) --> binary <br/>
23) pe: Pedal Edema (yes, no) --> binary <br/>
24) ane: Anemia (yes, no) --> binary <br/>

and 1 class: ckd (chronic kidney disease) and notckd (not chronic kidney disease)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
age               391 non-null float64
bp                388 non-null float64
sg                353 non-null float64
al                354 non-null float64
su                351 non-null float64
rbc               248 non-null object
pc                335 non-null object
pcc               396 non-null object
ba                396 non-null object
bgr               356 non-null float64
bu                381 non-null float64
sc                383 non-null float64
sod               313 non-null float64
pot               312 non-null float64
hemo              348 non-null float64
pcv               330 non-null object
wc                295 non-null object
rc                270 non-null object
htn               398 non-null object
dm                398 non-null object
cad               398 non-null object
appet             399 non-null object
pe                399 non-null object
ane       

We will remove the attributes `rbc`, `wc`, and `rc` which contain so many missing values.

In [6]:
df = df.drop(['rbc', 'wc', 'rc'],axis=1)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 22 columns):
age               391 non-null float64
bp                388 non-null float64
sg                353 non-null float64
al                354 non-null float64
su                351 non-null float64
pc                335 non-null object
pcc               396 non-null object
ba                396 non-null object
bgr               356 non-null float64
bu                381 non-null float64
sc                383 non-null float64
sod               313 non-null float64
pot               312 non-null float64
hemo              348 non-null float64
pcv               330 non-null object
htn               398 non-null object
dm                398 non-null object
cad               398 non-null object
appet             399 non-null object
pe                399 non-null object
ane               399 non-null object
classification    400 non-null object
dtypes: float64(11), object(11)
memory usage: 68

In [8]:
df

Unnamed: 0,age,bp,sg,al,su,pc,pcc,ba,bgr,bu,...,pot,hemo,pcv,htn,dm,cad,appet,pe,ane,classification
0,48.0,80.0,1.020,1.0,0.0,normal,notpresent,notpresent,121.0,36.0,...,,15.4,44,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.020,4.0,0.0,normal,notpresent,notpresent,,18.0,...,,11.3,38,no,no,no,good,no,no,ckd
2,62.0,80.0,1.010,2.0,3.0,normal,notpresent,notpresent,423.0,53.0,...,,9.6,31,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,abnormal,present,notpresent,117.0,56.0,...,2.5,11.2,32,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.010,2.0,0.0,normal,notpresent,notpresent,106.0,26.0,...,,11.6,35,no,no,no,good,no,no,ckd
5,60.0,90.0,1.015,3.0,0.0,,notpresent,notpresent,74.0,25.0,...,3.2,12.2,39,yes,yes,no,good,yes,no,ckd
6,68.0,70.0,1.010,0.0,0.0,normal,notpresent,notpresent,100.0,54.0,...,4.0,12.4,36,no,no,no,good,no,no,ckd
7,24.0,,1.015,2.0,4.0,abnormal,notpresent,notpresent,410.0,31.0,...,,12.4,44,no,yes,no,good,yes,no,ckd
8,52.0,100.0,1.015,3.0,0.0,abnormal,present,notpresent,138.0,60.0,...,,10.8,33,yes,yes,no,good,no,yes,ckd
9,53.0,90.0,1.020,2.0,0.0,abnormal,present,notpresent,70.0,107.0,...,3.7,9.5,29,yes,yes,no,poor,no,yes,ckd


In [9]:
from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):
    def __init__(self):
        """Impute missing values.
        - Columns of dtype object are imputed with the most frequent value in column.
        - Columns of other types are imputed with mean of column.
        """
    def fit(self, X, y=None):
        self.fill = pd.Series([X[c].value_counts().index[0]
            # numerical --> mean, categorical --> median
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X], index=X.columns)  
                               
            # numerical, categorical --> median                   
            #if X[c].dtype == np.dtype('O') else X[c].median() for c in X], index=X.columns)
        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

In [10]:
df = DataFrameImputer().fit_transform(df)
df

Unnamed: 0,age,bp,sg,al,su,pc,pcc,ba,bgr,bu,...,pot,hemo,pcv,htn,dm,cad,appet,pe,ane,classification
0,48.0,80.000000,1.020000,1.000000,0.000000,normal,notpresent,notpresent,121.000000,36.000000,...,4.627244,15.400000,44,yes,yes,no,good,no,no,ckd
1,7.0,50.000000,1.020000,4.000000,0.000000,normal,notpresent,notpresent,148.036517,18.000000,...,4.627244,11.300000,38,no,no,no,good,no,no,ckd
2,62.0,80.000000,1.010000,2.000000,3.000000,normal,notpresent,notpresent,423.000000,53.000000,...,4.627244,9.600000,31,no,yes,no,poor,no,yes,ckd
3,48.0,70.000000,1.005000,4.000000,0.000000,abnormal,present,notpresent,117.000000,56.000000,...,2.500000,11.200000,32,yes,no,no,poor,yes,yes,ckd
4,51.0,80.000000,1.010000,2.000000,0.000000,normal,notpresent,notpresent,106.000000,26.000000,...,4.627244,11.600000,35,no,no,no,good,no,no,ckd
5,60.0,90.000000,1.015000,3.000000,0.000000,normal,notpresent,notpresent,74.000000,25.000000,...,3.200000,12.200000,39,yes,yes,no,good,yes,no,ckd
6,68.0,70.000000,1.010000,0.000000,0.000000,normal,notpresent,notpresent,100.000000,54.000000,...,4.000000,12.400000,36,no,no,no,good,no,no,ckd
7,24.0,76.469072,1.015000,2.000000,4.000000,abnormal,notpresent,notpresent,410.000000,31.000000,...,4.627244,12.400000,44,no,yes,no,good,yes,no,ckd
8,52.0,100.000000,1.015000,3.000000,0.000000,abnormal,present,notpresent,138.000000,60.000000,...,4.627244,10.800000,33,yes,yes,no,good,no,yes,ckd
9,53.0,90.000000,1.020000,2.000000,0.000000,abnormal,present,notpresent,70.000000,107.000000,...,3.700000,9.500000,29,yes,yes,no,poor,no,yes,ckd


In [19]:
index=df.columns

In [20]:
index

Index(['age', 'bp', 'sg', 'al', 'su', 'pc', 'pcc', 'ba', 'bgr', 'bu', 'sc',
       'sod', 'pot', 'hemo', 'pcv', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane',
       'classification'],
      dtype='object')

In [21]:
index[0]

'age'

In [28]:
np.dtype(df['ane'])

dtype('O')

In [29]:
np.dtype(df['age'])

dtype('float64')

In [35]:
X = df.drop('classification',axis=1)

In [37]:
index = X.columns

In [38]:
index[0]

'age'

In [39]:
n = len(index)
n

21

In [43]:
for i in range(n):   
    if np.dtype(X[index[i]]) == np.dtype('O'):
        print(i, np.unique(X[index[i]]))

5 ['abnormal' 'normal']
6 ['notpresent' 'present']
7 ['notpresent' 'present']
14 ['\t43' '\t?' '14' '15' '16' '17' '18' '19' '20' '21' '22' '23' '24' '25'
 '26' '27' '28' '29' '30' '31' '32' '33' '34' '35' '36' '37' '38' '39'
 '40' '41' '42' '43' '44' '45' '46' '47' '48' '49' '50' '51' '52' '53'
 '54' '9']
15 ['no' 'yes']
16 ['\tno' '\tyes' ' yes' 'no' 'yes']
17 ['\tno' 'no' 'yes']
18 ['good' 'poor']
19 ['no' 'yes']
20 ['no' 'yes']


In [44]:
X = X.replace('\t','',regex=True)

In [45]:
for i in range(n):   
    if np.dtype(X[index[i]]) == np.dtype('O'):
        print(i, np.unique(X[index[i]]))

5 ['abnormal' 'normal']
6 ['notpresent' 'present']
7 ['notpresent' 'present']
14 ['14' '15' '16' '17' '18' '19' '20' '21' '22' '23' '24' '25' '26' '27'
 '28' '29' '30' '31' '32' '33' '34' '35' '36' '37' '38' '39' '40' '41'
 '42' '43' '44' '45' '46' '47' '48' '49' '50' '51' '52' '53' '54' '9' '?']
15 ['no' 'yes']
16 [' yes' 'no' 'yes']
17 ['no' 'yes']
18 ['good' 'poor']
19 ['no' 'yes']
20 ['no' 'yes']


In [46]:
index[14]

'pcv'

In [48]:
X = X.replace(' ','',regex=True)

In [50]:
for i in range(n):   
    if np.dtype(X[index[i]]) == np.dtype('O'):
        print(i, np.unique(X[index[i]]))

5 ['abnormal' 'normal']
6 ['notpresent' 'present']
7 ['notpresent' 'present']
14 ['14' '15' '16' '17' '18' '19' '20' '21' '22' '23' '24' '25' '26' '27'
 '28' '29' '30' '31' '32' '33' '34' '35' '36' '37' '38' '39' '40' '41'
 '42' '43' '44' '45' '46' '47' '48' '49' '50' '51' '52' '53' '54' '9' '?']
15 ['no' 'yes']
16 ['no' 'yes']
17 ['no' 'yes']
18 ['good' 'poor']
19 ['no' 'yes']
20 ['no' 'yes']


In [53]:
a = index[14]

In [54]:
a

'pcv'

In [60]:
X[a] = X[a].replace('\?','np.nan',regex=True).astype(float)

ValueError: could not convert string to float: 'np.nan'