## Import necessary Python libraries

In [1]:
import pandas as pd
import numpy as np

from scipy.io import arff 

Relevant Information:
- age		-	age	
- bp		-	blood pressure
- sg		-	specific gravity
- al		-   	albumin
- su		-	sugar
- rbc		-	red blood cells
- pc		-	pus cell
- pcc		-	pus cell clumps
- ba		-	bacteria
- bgr		-	blood glucose random
- bu		-	blood urea
- sc		-	serum creatinine
- sod		-	sodium
- pot		-	potassium
- hemo		-	hemoglobin
- pcv		-	packed cell volume
- wbcc		-	white blood cell count
- rbcc		-	red blood cell count
- htn		-	hypertension
- dm		-	diabetes mellitus
- cad		-	coronary artery disease
- appet		-	appetite
- pe		-	pedal edema
- ane		-	anemia
- class		-	class

Attribute Information

- 'age' numeric
- 'bp'  numeric
- 'sg' {1.005,1.010,1.015,1.020,1.025}
- 'al' {0,1,2,3,4,5}  
- 'su' {0,1,2,3,4,5}  
- 'rbc' {normal,abnormal}
- 'pc' {normal,abnormal} 
- 'pcc' {present,notpresent}
- 'ba' {present,notpresent}
- 'bgr'  numeric
- 'bu' numeric
- 'sc' numeric
- 'sod' numeric
- 'pot' numeric
- 'hemo' numeric
- 'pcv' numeric
- 'wbcc' numeric
- 'rbcc' numeric
- 'htn' {yes,no}
- 'dm' {yes,no}
- 'cad' {yes,no}
- 'appet' {good,poor}
- 'pe' {yes,no} 
- 'ane' {yes,no}
- 'class' {ckd,notckd}

## Load Data from arff file

In [2]:
# Read the ARFF file; loadarff hands back the records together with their metadata.
arff_path = './data/original/chronic_kidney_disease.arff'
data, meta = arff.loadarff(arff_path)

In [3]:
# Convert the loaded records to a pandas DataFrame.
# Nominal values are still byte strings at this point (see the b'...' values in df.head() below).
df = pd.DataFrame.from_records(data)

## Data Information

In [4]:
# Show each column's dtype and non-null count to spot missing values
# and the object columns that still need recoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     391 non-null    float64
 1   bp      388 non-null    float64
 2   sg      400 non-null    object 
 3   al      400 non-null    object 
 4   su      400 non-null    object 
 5   rbc     400 non-null    object 
 6   pc      400 non-null    object 
 7   pcc     400 non-null    object 
 8   ba      400 non-null    object 
 9   bgr     356 non-null    float64
 10  bu      381 non-null    float64
 11  sc      383 non-null    float64
 12  sod     313 non-null    float64
 13  pot     312 non-null    float64
 14  hemo    348 non-null    float64
 15  pcv     329 non-null    float64
 16  wbcc    294 non-null    float64
 17  rbcc    269 non-null    float64
 18  htn     400 non-null    object 
 19  dm      400 non-null    object 
 20  cad     400 non-null    object 
 21  appet   400 non-null    object 
 22  pe

In [5]:
# Preview the first rows of the raw data; nominal values show up as byte strings (b'...').
df.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,b'1.020',b'1',b'0',b'?',b'normal',b'notpresent',b'notpresent',121.0,...,44.0,7800.0,5.2,b'yes',b'yes',b'no',b'good',b'no',b'no',b'ckd'
1,7.0,50.0,b'1.020',b'4',b'0',b'?',b'normal',b'notpresent',b'notpresent',,...,38.0,6000.0,,b'no',b'no',b'no',b'good',b'no',b'no',b'ckd'
2,62.0,80.0,b'1.010',b'2',b'3',b'normal',b'normal',b'notpresent',b'notpresent',423.0,...,31.0,7500.0,,b'no',b'yes',b'no',b'poor',b'no',b'yes',b'ckd'
3,48.0,70.0,b'1.005',b'4',b'0',b'normal',b'abnormal',b'present',b'notpresent',117.0,...,32.0,6700.0,3.9,b'yes',b'no',b'no',b'poor',b'yes',b'yes',b'ckd'
4,51.0,80.0,b'1.010',b'2',b'0',b'normal',b'normal',b'notpresent',b'notpresent',106.0,...,35.0,7300.0,4.6,b'no',b'no',b'no',b'good',b'no',b'no',b'ckd'


In [6]:
# Number of rows and columns in the raw data.
df.shape

(400, 25)

In [7]:
# Class balance of the raw data (the labels are still byte strings here).
df["class"].value_counts()

b'ckd'       250
b'notckd'    150
Name: class, dtype: int64

Let's look at the info and the first rows of the dataframe shown above.

When we look at the values of some features, like sugar `su`, we see that they are stored as byte strings (for example `b'0'`). These columns therefore need to be recoded to an appropriate datatype.

Some attributes, like `su`, can be converted to float, while other attributes, like `rbc`, should become regular strings.

## Recoding

First, let us write a function that decodes a byte string into a regular string. The function also replaces `?` with `np.nan`.

In [8]:
def recode_bytes(val):
    """
    Recode a single byte-string value to a regular string.
    And, replace ? with NaN.

    Parameters
    ----------
    val : bytes or any
        One cell value. Byte strings are decoded with latin1. Any other
        value (for example an already decoded string, or NaN) is passed
        through unchanged, so applying the function a second time to an
        already recoded column does not fail.

    Returns
    -------
    str or float
        The decoded string, or ``np.nan`` when the value is ``?``.
        Non-bytes inputs are returned as they are (except the string ``?``,
        which also becomes ``np.nan``).
    """
    # Only bytes-like values can be decoded; str and NaN have no usable .decode().
    if isinstance(val, (bytes, bytearray)):
        out = val.decode("latin1")
    else:
        out = val

    # Guard with isinstance so that NaN or other non-string values are never
    # compared against the `?` placeholder.
    if isinstance(out, str) and out == '?':
        out = np.nan
    return out

Now, let us iterate over each column and change its datatype.


In [9]:
for column_name, column_dtype in df.dtypes.items():
    # Only object columns hold the byte strings; all other columns are skipped.
    # Dtypes are compared by value (`!=`) rather than by identity, so the check
    # does not depend on dtype objects being singletons.
    if column_dtype != np.dtype(object):
        continue

    # Decode every value of the column; `?` becomes NaN.
    new_col = df[column_name].apply(recode_bytes)

    # Columns whose values all parse as numbers (e.g. `sg`, `al`, `su`) become
    # float. Columns holding text labels raise ValueError and deliberately
    # stay as decoded strings.
    try:
        new_col = new_col.astype(float)
    except ValueError:
        pass

    # Overwrite the original column in place with its recoded version.
    df[column_name] = new_col

In [10]:
# Preview the recoded data: byte strings are now plain strings or floats, and `?` shows up as NaN.
df.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,...,38.0,6000.0,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31.0,7500.0,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35.0,7300.0,4.6,no,no,no,good,no,no,ckd


## Cleaning

In [11]:
# Keep only the rows that have no missing value in any column.
# `df` itself is left untouched; the result is a separate dataframe.
cleaned_df = df.dropna(how='any')

In [12]:
# Preview the cleaned data; the original row labels are kept, so the index is no longer contiguous.
cleaned_df.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
9,53.0,90.0,1.02,2.0,0.0,abnormal,abnormal,present,notpresent,70.0,...,29.0,12100.0,3.7,yes,yes,no,poor,no,yes,ckd
11,63.0,70.0,1.01,3.0,0.0,abnormal,abnormal,present,notpresent,380.0,...,32.0,4500.0,3.8,yes,yes,no,poor,yes,no,ckd
14,68.0,80.0,1.01,3.0,2.0,normal,abnormal,present,present,157.0,...,16.0,11000.0,2.6,yes,yes,yes,poor,yes,no,ckd
20,61.0,80.0,1.015,2.0,0.0,abnormal,abnormal,notpresent,notpresent,173.0,...,24.0,9200.0,3.2,yes,yes,yes,poor,yes,yes,ckd


In [13]:
# Class balance after dropping incomplete rows. Compare with the raw counts above:
# most `ckd` rows contain at least one missing value and are therefore removed.
cleaned_df["class"].value_counts()

notckd    115
ckd        43
Name: class, dtype: int64

## Export to CSV file

Both the uncleaned and the cleaned dataframes are exported as CSV files.

In [14]:
# Write the full (recoded) frame first, then the frame without incomplete rows.
# NOTE(review): the index is not disabled here, so the row labels are presumably
# written as the first column — confirm this is what downstream readers expect.
for frame, csv_path in (
    (df, "./data/chronic_kidney_disease_full.csv"),
    (cleaned_df, "./data/chronic_kidney_disease_cleaned.csv"),
):
    frame.to_csv(csv_path)