In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Seed NumPy's global random number generator so NumPy-based randomness is repeatable across runs.
np.random.seed(1)

In [3]:
# Read the comma-separated kidney-disease table into a DataFrame.
DATA_PATH = '../kidney_disease_data.csv'
df = pd.read_csv(DATA_PATH, sep=',')

In [4]:
# Remove the 'id' column from the frame.
df = df.drop(labels=['id'], axis='columns')

There are 24 attributes: <br/>
1) age: Age (years) --> numerical<br/>
2) bp: Blood Pressure (mm/Hg) --> numerical<br/>
3) sg: Specific Gravity (1.005,1.010,1.015,1.020,1.025) --> numerical<br/>
4) al: Albumin(0,1,2,3,4,5) --> numerical<br/>
5) su: Sugar(0,1,2,3,4,5) --> numerical<br/>
6) rbc: Red Blood Cells (normal, abnormal) --> binary <br/>
7) pc: Pus Cell (normal, abnormal) --> binary <br/>
8) pcc: Pus Cell clumps (present, notpresent) --> binary<br/>
9) ba: Bacteria (present, notpresent) --> binary<br/>
10) bgr: Blood Glucose Random (mgs/dl) --> numerical<br/>
11) bu: Blood Urea (mgs/dl) --> numerical<br/>
12) sc: Serum Creatinine (mgs/dl) --> numerical<br/>
13) sod: Sodium (mEq/L) --> numerical<br/>
14) pot: Potassium (mEq/L) --> numerical<br/>
15) hemo: Hemoglobin (gms) --> numerical<br/>
16) pcv: Packed Cell Volume --> numerical<br/>
17) wc: White Blood Cell Count (cells/cumm) --> numerical<br/>
18) rc: Red Blood Cell Count (millions/cmm) --> numerical<br/>
19) htn: Hypertension (yes, no) --> binary <br/>
20) dm: Diabetes Mellitus (yes, no) --> binary <br/>
21) cad: Coronary Artery Disease (yes, no) --> binary<br/>
22) appet: Appetite (good, poor) --> binary <br/>
23) pe: Pedal Edema (yes, no) --> binary <br/>
24) ane: Anemia (yes, no) --> binary <br/>

and 1 class label (`classification`) with two values: `ckd` (chronic kidney disease) and `notckd` (not chronic kidney disease).

In [5]:
# Show each column's dtype and non-null count to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
age               391 non-null float64
bp                388 non-null float64
sg                353 non-null float64
al                354 non-null float64
su                351 non-null float64
rbc               248 non-null object
pc                335 non-null object
pcc               396 non-null object
ba                396 non-null object
bgr               356 non-null float64
bu                381 non-null float64
sc                383 non-null float64
sod               313 non-null float64
pot               312 non-null float64
hemo              348 non-null float64
pcv               330 non-null object
wc                295 non-null object
rc                270 non-null object
htn               398 non-null object
dm                398 non-null object
cad               398 non-null object
appet             399 non-null object
pe                399 non-null object
ane       

We drop the attributes `rbc`, `wc`, and `rc` because they have the most missing values: only 248, 295, and 270 non-null entries out of 400, respectively.

In [6]:
# Columns removed from the frame: rbc, wc and rc.
sparse_columns = ['rbc', 'wc', 'rc']
df = df.drop(sparse_columns, axis='columns')

In [7]:
# Re-check dtypes and non-null counts now that rbc, wc and rc have been dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 22 columns):
age               391 non-null float64
bp                388 non-null float64
sg                353 non-null float64
al                354 non-null float64
su                351 non-null float64
pc                335 non-null object
pcc               396 non-null object
ba                396 non-null object
bgr               356 non-null float64
bu                381 non-null float64
sc                383 non-null float64
sod               313 non-null float64
pot               312 non-null float64
hemo              348 non-null float64
pcv               330 non-null object
htn               398 non-null object
dm                398 non-null object
cad               398 non-null object
appet             399 non-null object
pe                399 non-null object
ane               399 non-null object
classification    400 non-null object
dtypes: float64(11), object(11)
memory usage: 68

In [8]:
# Turn whitespace-only cells into NaN so they are counted as missing values.
# DataFrame.replace returns a new frame rather than modifying `df`, so the
# result has to be assigned back; otherwise `df` keeps the blank strings.
df = df.replace(r'^\s+$', np.nan, regex=True)
df.head(10)

Unnamed: 0,age,bp,sg,al,su,pc,pcc,ba,bgr,bu,...,pot,hemo,pcv,htn,dm,cad,appet,pe,ane,classification
0,48.0,80.0,1.020,1.0,0.0,normal,notpresent,notpresent,121.0,36.0,...,,15.4,44,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.020,4.0,0.0,normal,notpresent,notpresent,,18.0,...,,11.3,38,no,no,no,good,no,no,ckd
2,62.0,80.0,1.010,2.0,3.0,normal,notpresent,notpresent,423.0,53.0,...,,9.6,31,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,abnormal,present,notpresent,117.0,56.0,...,2.5,11.2,32,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.010,2.0,0.0,normal,notpresent,notpresent,106.0,26.0,...,,11.6,35,no,no,no,good,no,no,ckd
5,60.0,90.0,1.015,3.0,0.0,,notpresent,notpresent,74.0,25.0,...,3.2,12.2,39,yes,yes,no,good,yes,no,ckd
6,68.0,70.0,1.010,0.0,0.0,normal,notpresent,notpresent,100.0,54.0,...,4.0,12.4,36,no,no,no,good,no,no,ckd
7,24.0,,1.015,2.0,4.0,abnormal,notpresent,notpresent,410.0,31.0,...,,12.4,44,no,yes,no,good,yes,no,ckd
8,52.0,100.0,1.015,3.0,0.0,abnormal,present,notpresent,138.0,60.0,...,,10.8,33,yes,yes,no,good,no,yes,ckd
9,53.0,90.0,1.020,2.0,0.0,abnormal,present,notpresent,70.0,107.0,...,3.7,9.5,29,yes,yes,no,poor,no,yes,ckd


In [9]:
# Feature matrix: every column of df except the 'classification' label.
X = df.drop(['classification'], axis='columns')
# Keep the feature column labels for later lookups by position.
index = X.columns

In [10]:
# dtype of the 'ane' column, read straight from the Series.
X['ane'].dtype

dtype('O')

In [11]:
# Frequency of each non-missing value in 'ane' (value_counts leaves NaN out by default).
X['ane'].value_counts()

no     339
yes     60
Name: ane, dtype: int64

In [20]:
# Replace cells made up solely of whitespace with NaN in the feature matrix.
X = X.replace(to_replace=r'^\s+$', value=np.nan, regex=True)

In [22]:
# Re-tally 'ane' after the whitespace-to-NaN replacement to see whether the counts changed.
X['ane'].value_counts()

no     339
yes     60
Name: ane, dtype: int64

In [23]:
# Look up the 'ane' value stored under row label 294 with a single .loc access.
X.loc[294, 'ane']

nan

In [12]:
#n = len(index)

#for i in range(n):   
#    if np.dtype(X[index[i]]) == np.dtype('O'):
#        print(i, np.unique(X[index[i]]))

In [14]:
#for t in range(len(a)):
    #if a[t] != 'yes' and a[t] != 'no':
    #    print(t)
    

In [15]:
#for t in range(400):
#    if X['ane'][t].isnull():
#        print(t)

#null_data = df[df.isnull().any(axis=1)]

#null_data = df[df.isnull().any(axis=1)]

In [16]:
#X = X.replace('\t','',regex=True)
#X = X.replace(' ','',regex=True)

#X[index[14]] = X[index[14]].replace('\?','np.nan',regex=True).astype(float)

In [17]:
from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):
    """Column-wise imputer for pandas DataFrames.

    - Columns of dtype object are imputed with the most frequent value in
      the column (the mode).
    - Columns of any other dtype are imputed with the column mean
      (``strategy='mean'``, the default) or the column median
      (``strategy='median'``).
    """

    _STRATEGIES = ('mean', 'median')

    def __init__(self, strategy='mean'):
        """Store the statistic used to fill non-object columns.

        Parameters
        ----------
        strategy : {'mean', 'median'}, default 'mean'
            Statistic used for columns whose dtype is not object. Object
            columns always use the most frequent value.
        """
        self.strategy = strategy

    def _fill_value(self, column):
        """Return the replacement value for a single column (a Series)."""
        if column.dtype == np.dtype('O'):
            counts = column.value_counts()
            # value_counts() leaves NaN out, so a column with no observed
            # values yields an empty result; fall back to NaN rather than
            # raising IndexError on index[0].
            return counts.index[0] if len(counts) else np.nan
        if self.strategy == 'median':
            return column.median()
        return column.mean()

    def fit(self, X, y=None):
        """Compute one fill value per column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose columns provide the fill statistics.
        y : ignored
            Accepted so the signature matches the usual fit(X, y) form.

        Returns
        -------
        self
            With ``self.fill`` set to a Series indexed by ``X.columns``.

        Raises
        ------
        ValueError
            If ``strategy`` is neither 'mean' nor 'median'.
        """
        if self.strategy not in self._STRATEGIES:
            raise ValueError(
                "strategy must be one of %r, got %r" % (self._STRATEGIES, self.strategy)
            )
        self.fill = pd.Series([self._fill_value(X[c]) for c in X], index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing values replaced by the fitted fill values.

        ``fillna`` matches the entries of ``self.fill`` to columns of ``X``
        by column label.
        """
        return X.fillna(self.fill)

In [18]:
#df = DataFrameImputer().fit_transform(df)