In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
np.random.seed(1)

In [3]:
df = pd.read_csv('../kidney_disease_data.csv',sep= ',')

In [4]:
df = df.drop('id',axis=1)

There are 25 attributes: <br/>
1) age: Age (yeas) --> numerical<br/>
2) bp: Blood Pressure (mm/Hg) --> numerical<br/>
3) sg: Specific Gravity (1.005,1.010,1.015,1.020,1.025) --> numerical<br/>
4) al: Albumin(0,1,2,3,4,5) --> numerical<br/>
5) su: Sugar(0,1,2,3,4,5) --> numerical<br/>
6) rbc: Red Blood Cells (normal, abnormal) --> binary <br/>
7) pc: Pus Cell (normal, abnormal) --> binary <br/>
8) pcc: Pus Cell clumps (present, notpresent) --> binary<br/>
9) ba: Bacteria (present, notpresent) --> binary<br/>
10) bgr: Blood Glucose Random (mgs/dl) --> numerical<br/>
11) bu: Blood Urea (mgs/dl) --> numerical<br/>
12) sc: Serum Creatinine (mgs/dl) --> numerical<br/>
13) sod: Sodium (mEq/L) --> numerical<br/>
14) pot: Potassium (mEq/L) --> numerical<br/>
15) hemo: Hemoglobin (gms) --> numerical<br/>
16) pcv: Packed Cell Volume --> numerical<br/>
17) wc: White Blood Cell Count (cells/cumm) --> numerical<br/>
18) rc: Red Blood Cell Count (millions/cmm) --> numerical<br/>
19) htn: Hypertension (yes, no) --> binary <br/>
20) dm: Diabetes Mellitus (yes, no) --> binary <br/>
21) cad: Coronary Artery Disease (yes, no) --> binary<br/>
22) appet: Appetite (good, poor) --> binary <br/>
23) pe: Pedal Edema (yes, no) --> binary <br/>
24) ane: Anemia (yes, no) --> binary <br/>

and 1 class: ckd (chronic kidney disease) and notckd (not chronic kidney disease)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
age               391 non-null float64
bp                388 non-null float64
sg                353 non-null float64
al                354 non-null float64
su                351 non-null float64
rbc               248 non-null object
pc                335 non-null object
pcc               396 non-null object
ba                396 non-null object
bgr               356 non-null float64
bu                381 non-null float64
sc                383 non-null float64
sod               313 non-null float64
pot               312 non-null float64
hemo              348 non-null float64
pcv               330 non-null object
wc                295 non-null object
rc                270 non-null object
htn               398 non-null object
dm                398 non-null object
cad               398 non-null object
appet             399 non-null object
pe                399 non-null object
ane       

We will remove the attributes `rbc`, `wc`, and `rc` which contain so many missing values.

In [6]:
df = df.drop(['rbc', 'wc', 'rc','sod','pot'],axis=1)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 20 columns):
age               391 non-null float64
bp                388 non-null float64
sg                353 non-null float64
al                354 non-null float64
su                351 non-null float64
pc                335 non-null object
pcc               396 non-null object
ba                396 non-null object
bgr               356 non-null float64
bu                381 non-null float64
sc                383 non-null float64
hemo              348 non-null float64
pcv               330 non-null object
htn               398 non-null object
dm                398 non-null object
cad               398 non-null object
appet             399 non-null object
pe                399 non-null object
ane               399 non-null object
classification    400 non-null object
dtypes: float64(9), object(11)
memory usage: 62.6+ KB


In [8]:
# repalce empty by nan
df = df.replace(r'^\s+$', np.nan, regex=True)
df.head()

Unnamed: 0,age,bp,sg,al,su,pc,pcc,ba,bgr,bu,sc,hemo,pcv,htn,dm,cad,appet,pe,ane,classification
0,48.0,80.0,1.02,1.0,0.0,normal,notpresent,notpresent,121.0,36.0,1.2,15.4,44,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,normal,notpresent,notpresent,,18.0,0.8,11.3,38,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,notpresent,notpresent,423.0,53.0,1.8,9.6,31,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,abnormal,present,notpresent,117.0,56.0,3.8,11.2,32,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,notpresent,notpresent,106.0,26.0,1.4,11.6,35,no,no,no,good,no,no,ckd


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 20 columns):
age               391 non-null float64
bp                388 non-null float64
sg                353 non-null float64
al                354 non-null float64
su                351 non-null float64
pc                335 non-null object
pcc               396 non-null object
ba                396 non-null object
bgr               356 non-null float64
bu                381 non-null float64
sc                383 non-null float64
hemo              348 non-null float64
pcv               330 non-null object
htn               398 non-null object
dm                398 non-null object
cad               398 non-null object
appet             399 non-null object
pe                399 non-null object
ane               399 non-null object
classification    400 non-null object
dtypes: float64(9), object(11)
memory usage: 62.6+ KB


In [10]:
# find bad rows having too many missing values
n_null = np.array(df.isnull().sum(axis=1))
bad_row = np.array([])
for t in range(len(n_null)):
    if n_null[t] > 4:
        #print(t)
        bad_row = np.append(bad_row,t)
        
bad_row

array([ 13.,  17.,  21.,  23.,  30.,  57.,  59.,  82.,  86., 104., 109.,
       113., 122., 125., 132., 142., 148., 161., 165., 166., 188., 197.,
       203., 215., 222., 228., 232., 268.])

In [11]:
len(bad_row)

28

In [12]:
# delete bad rows
df = df.drop(bad_row)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 372 entries, 0 to 399
Data columns (total 20 columns):
age               366 non-null float64
bp                364 non-null float64
sg                347 non-null float64
al                348 non-null float64
su                345 non-null float64
pc                330 non-null object
pcc               368 non-null object
ba                368 non-null object
bgr               338 non-null float64
bu                359 non-null float64
sc                361 non-null float64
hemo              336 non-null float64
pcv               325 non-null object
htn               370 non-null object
dm                370 non-null object
cad               370 non-null object
appet             371 non-null object
pe                371 non-null object
ane               371 non-null object
classification    372 non-null object
dtypes: float64(9), object(11)
memory usage: 61.0+ KB


In [13]:
df = df.replace('\t','',regex=True)

In [14]:
df = df.replace(' ','',regex=True)

In [15]:
df = df.replace('\?','np.nan',regex=True)

In [16]:
np.dtype(df['pcv'])

dtype('O')

In [17]:
df["pcv"] = pd.to_numeric(df.pcv, errors='coerce')

In [18]:
#for t in range(400):
#    if X['ane'][t].isnull():
#        print(t)

#null_data = df[df.isnull().any(axis=1)]
#X = X.replace('\t','',regex=True)
#X = X.replace(' ','',regex=True)

#X[index[14]] = X[index[14]].replace('\?','np.nan',regex=True).astype(float)

In [19]:
X = df.drop('classification',axis=1)
y = df['classification']

In [20]:
from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):
    def __init__(self):
        """Impute missing values.
        - Columns of dtype object are imputed with the most frequent value in column.
        - Columns of other types are imputed with mean of column.
        """
    def fit(self, X, y=None):
        self.fill = pd.Series([X[c].value_counts().index[0]
            # numerical --> mean, categorical --> median
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X], index=X.columns)  
                               
            # numerical, categorical --> median                   
            #if X[c].dtype == np.dtype('O') else X[c].median() for c in X], index=X.columns)
        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

In [21]:
X = DataFrameImputer().fit_transform(X)

In [22]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 372 entries, 0 to 399
Data columns (total 19 columns):
age      372 non-null float64
bp       372 non-null float64
sg       372 non-null float64
al       372 non-null float64
su       372 non-null float64
pc       372 non-null object
pcc      372 non-null object
ba       372 non-null object
bgr      372 non-null float64
bu       372 non-null float64
sc       372 non-null float64
hemo     372 non-null float64
pcv      372 non-null float64
htn      372 non-null object
dm       372 non-null object
cad      372 non-null object
appet    372 non-null object
pe       372 non-null object
ane      372 non-null object
dtypes: float64(10), object(9)
memory usage: 58.1+ KB


In [23]:
X = np.array(X)

In [24]:
l,n = X.shape
ib_list = [5,6,7,13,14,15,16,17,18]
xb = np.zeros((l,n))
for ib in ib_list:
    xb[:,ib] = np.array([1. if t == np.unique(X[:,ib])[0] else -1. for t in X[:,ib]])

In [25]:
X1 = X[:,:5].astype(float)
X2 = X[:,8:13].astype(float)

In [26]:
X = np.hstack([X1,xb[:,5:8],X2,xb[:,13:]])

In [27]:
X.shape

(372, 19)

In [28]:
#np.savetxt('../kidney_X_cleaned.txt',X,fmt='%f')

In [33]:
y = np.array(y)
np.unique(y,return_counts=True)

(array(['ckd', 'notckd'], dtype=object), array([223, 149]))

In [34]:
from sklearn.preprocessing import OneHotEncoder
onehot_encoder = OneHotEncoder(sparse=False,categories='auto')

y = onehot_encoder.fit_transform(y.reshape(-1,1))
y = np.argmax(y,axis=1)
y.shape

(372,)

In [36]:
np.savetxt('../kidney_X_cleaned.txt',X,fmt='%f')
np.savetxt('../kidney_y_cleaned.txt',y,fmt='%i')