## Import necessary python libraries

In [1]:
import pandas as pd
import numpy as np

from scipy.io import arff 

Relevant Information:
- age		-	age	
- bp		-	blood pressure
- sg		-	specific gravity
- al		-   	albumin
- su		-	sugar
- rbc		-	red blood cells
- pc		-	pus cell
- pcc		-	pus cell clumps
- ba		-	bacteria
- bgr		-	blood glucose random
- bu		-	blood urea
- sc		-	serum creatinine
- sod		-	sodium
- pot		-	potassium
- hemo		-	hemoglobin
- pcv		-	packed cell volume
- wc		-	white blood cell count
- rc		-	red blood cell count
- htn		-	hypertension
- dm		-	diabetes mellitus
- cad		-	coronary artery disease
- appet		-	appetite
- pe		-	pedal edema
- ane		-	anemia
- class		-	class

Attribute Informations

- 'age' numeric
- 'bp'  numeric
- 'sg' {1.005,1.010,1.015,1.020,1.025}
- 'al' {0,1,2,3,4,5}  
- 'su' {0,1,2,3,4,5}  
- 'rbc' {normal,abnormal}
- 'pc' {normal,abnormal} 
- 'pcc' {present,notpresent}
- 'ba' {present,notpresent}
- 'bgr'  numeric
- 'bu' numeric
- 'sc' numeric
- 'sod' numeric
- 'pot' numeric
- 'hemo' numeric
- 'pcv' numeric
- 'wbcc' numeric
- 'rbcc' numeric
- 'htn' {yes,no}
- 'dm' {yes,no}
- 'cad' {yes,no}
- 'appet' {good,poor}
- 'pe' {yes,no} 
- 'ane' {yes,no}
- 'class' {ckd,notckd}

## Load Data from arff file

In [2]:
# Load data from given arff file path
data, meta = arff.loadarff('./data/chronic_kidney_disease.arff')

In [3]:
# Convert data to pandas DataFrame
df = pd.DataFrame.from_records(data)

## Data Information

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     391 non-null    float64
 1   bp      388 non-null    float64
 2   sg      400 non-null    object 
 3   al      400 non-null    object 
 4   su      400 non-null    object 
 5   rbc     400 non-null    object 
 6   pc      400 non-null    object 
 7   pcc     400 non-null    object 
 8   ba      400 non-null    object 
 9   bgr     356 non-null    float64
 10  bu      381 non-null    float64
 11  sc      383 non-null    float64
 12  sod     313 non-null    float64
 13  pot     312 non-null    float64
 14  hemo    348 non-null    float64
 15  pcv     329 non-null    float64
 16  wbcc    294 non-null    float64
 17  rbcc    269 non-null    float64
 18  htn     400 non-null    object 
 19  dm      400 non-null    object 
 20  cad     400 non-null    object 
 21  appet   400 non-null    object 
 22  pe

In [5]:
df.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,b'1.020',b'1',b'0',b'?',b'normal',b'notpresent',b'notpresent',121.0,...,44.0,7800.0,5.2,b'yes',b'yes',b'no',b'good',b'no',b'no',b'ckd'
1,7.0,50.0,b'1.020',b'4',b'0',b'?',b'normal',b'notpresent',b'notpresent',,...,38.0,6000.0,,b'no',b'no',b'no',b'good',b'no',b'no',b'ckd'
2,62.0,80.0,b'1.010',b'2',b'3',b'normal',b'normal',b'notpresent',b'notpresent',423.0,...,31.0,7500.0,,b'no',b'yes',b'no',b'poor',b'no',b'yes',b'ckd'
3,48.0,70.0,b'1.005',b'4',b'0',b'normal',b'abnormal',b'present',b'notpresent',117.0,...,32.0,6700.0,3.9,b'yes',b'no',b'no',b'poor',b'yes',b'yes',b'ckd'
4,51.0,80.0,b'1.010',b'2',b'0',b'normal',b'normal',b'notpresent',b'notpresent',106.0,...,35.0,7300.0,4.6,b'no',b'no',b'no',b'good',b'no',b'no',b'ckd'


In [6]:
df.shape

(400, 25)

In [7]:
df["class"].value_counts()

b'ckd'       250
b'notckd'    150
Name: class, dtype: int64

## Data Cleaning

In [8]:
df.dropna(inplace=True)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 215 entries, 3 to 399
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     215 non-null    float64
 1   bp      215 non-null    float64
 2   sg      215 non-null    object 
 3   al      215 non-null    object 
 4   su      215 non-null    object 
 5   rbc     215 non-null    object 
 6   pc      215 non-null    object 
 7   pcc     215 non-null    object 
 8   ba      215 non-null    object 
 9   bgr     215 non-null    float64
 10  bu      215 non-null    float64
 11  sc      215 non-null    float64
 12  sod     215 non-null    float64
 13  pot     215 non-null    float64
 14  hemo    215 non-null    float64
 15  pcv     215 non-null    float64
 16  wbcc    215 non-null    float64
 17  rbcc    215 non-null    float64
 18  htn     215 non-null    object 
 19  dm      215 non-null    object 
 20  cad     215 non-null    object 
 21  appet   215 non-null    object 
 22  pe

In [10]:
df["class"].value_counts()

b'notckd'    128
b'ckd'        87
Name: class, dtype: int64

## Export to CSV file

In [11]:
df.to_csv("./data/chronic_kidney_disease.csv")