# UCI Machine Learning Repository: Chronic Kidney Disease (CKD) Dataset

### Dataset
| Variable Name | Role    | Type        | Description             | Units        | Missing Values |
| ------------- | ------- | ----------- | ----------------------- | ------------ | -------------- |
| age           | Feature | Integer     | age                     | year         | yes            |
| bp            | Feature | Integer     | blood pressure          | mmHg         | yes            |
| sg            | Feature | Categorical | specific gravity        |              | yes            |
| al            | Feature | Categorical | albumin                 |              | yes            |
| su            | Feature | Categorical | sugar                   |              | yes            |
| rbc           | Feature | Binary      | red blood cells         |              | yes            |
| pc            | Feature | Binary      | pus cell                |              | yes            |
| pcc           | Feature | Binary      | pus cell clumps         |              | yes            |
| ba            | Feature | Binary      | bacteria                |              | yes            |
| bgr           | Feature | Integer     | blood glucose random    | mg/dL        | yes            |
| bu            | Feature | Integer     | blood urea              | mg/dL        | yes            |
| sc            | Feature | Continuous  | serum creatinine        | mg/dL        | yes            |
| sod           | Feature | Integer     | sodium                  | mEq/L        | yes            |
| pot           | Feature | Continuous  | potassium               | mEq/L        | yes            |
| hemo          | Feature | Continuous  | hemoglobin              | gms          | yes            |
| pcv           | Feature | Integer     | packed cell volume      |              | yes            |
| wbcc          | Feature | Integer     | white blood cell count  | cells/cmm    | yes            |
| rbcc          | Feature | Continuous  | red blood cell count    | millions/cmm | yes            |
| htn           | Feature | Binary      | hypertension            |              | yes            |
| dm            | Feature | Binary      | diabetes mellitus       |              | yes            |
| cad           | Feature | Binary      | coronary artery disease |              | yes            |
| appet         | Feature | Binary      | appetite                |              | yes            |
| pe            | Feature | Binary      | pedal edema             |              | yes            |
| ane           | Feature | Binary      | anemia                  |              | yes            |
| class         | Target  | Binary      | ckd or not ckd          |              | no             |

In [7]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from src.config import RAW_DATA_DIR, TABLES_DIR

In [8]:
# Load the raw CKD table from the project's raw-data directory.
raw_csv_path = RAW_DATA_DIR / 'tabular_data/chronic_kidney_disease.csv'
df = pd.read_csv(raw_csv_path)

In [9]:
# Clean-up cell: normalise text values, impute missing feature values, and
# rename two columns.

# Separate numeric and categorical columns
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Strip surrounding whitespace from every text column *before* imputing, so
# padded variants of one label are not counted as separate categories when
# the mode is computed. Non-string entries (NaN) pass through untouched so
# the imputer still sees them as missing.
for col in categorical_cols:
    df[col] = df[col].map(lambda v: v.strip() if isinstance(v, str) else v)

# Median imputation for numeric features
# NOTE(review): sg, al and su are listed as Categorical in the dataset table
# but appear to load as floats, so they are median-imputed here - confirm
# that this is intended.
median_imputer = SimpleImputer(strategy='median')
df[numeric_cols] = median_imputer.fit_transform(df[numeric_cols])

# Mode imputation for categorical features.
# The target column is deliberately left out: filling a missing label with
# the most frequent class would fabricate ground truth.
target_col = "class"
feature_text_cols = [c for c in categorical_cols if c != target_col]
mode_imputer = SimpleImputer(strategy='most_frequent')
df[feature_text_cols] = mode_imputer.fit_transform(df[feature_text_cols])

# Rename columns
df = df.rename(columns={
    "bp": "d_bp",
    "appet": "appet_poor",
})

In [10]:
# Encoding
binary_map = {
    "yes": True,
    "no": False,
    "abnormal": True,
    "normal": False,
    "present": True,
    "notpresent": False,
    "poor": True,
    "good": False
}

df["htn"] = df["htn"].apply(lambda x: binary_map[x])
df["dm"] = df["dm"].apply(lambda x: binary_map[x])
df["cad"] = df["cad"].apply(lambda x: binary_map[x])
df["pe"] = df["pe"].apply(lambda x: binary_map[x])
df["ane"] = df["ane"].apply(lambda x: binary_map[x])

df["rbc"] = df["rbc"].apply(lambda x: binary_map[x])
df["pc"] = df["pc"].apply(lambda x: binary_map[x])
df["pcc"] = df["pcc"].apply(lambda x: binary_map[x])
df["ba"] = df["ba"].apply(lambda x: binary_map[x])
df["appet_poor"] = df["appet_poor"].apply(lambda x: binary_map[x])

In [11]:
# Write the processed table to the tables directory (index omitted), then
# leave the frame as the cell's last expression so the notebook renders it.
processed_csv_path = TABLES_DIR / 'ucickd.csv'
df.to_csv(processed_csv_path, index=False)
df

Unnamed: 0,age,d_bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet_poor,pe,ane,class
0,48.0,80.0,1.020,1.0,0.0,False,False,False,False,121.0,...,44.0,7800.0,5.2,True,True,False,False,False,False,ckd
1,7.0,50.0,1.020,4.0,0.0,False,False,False,False,121.0,...,38.0,6000.0,4.8,False,False,False,False,False,False,ckd
2,62.0,80.0,1.010,2.0,3.0,False,False,False,False,423.0,...,31.0,7500.0,4.8,False,True,False,True,False,True,ckd
3,48.0,70.0,1.005,4.0,0.0,False,True,True,False,117.0,...,32.0,6700.0,3.9,True,False,False,True,True,True,ckd
4,51.0,80.0,1.010,2.0,0.0,False,False,False,False,106.0,...,35.0,7300.0,4.6,False,False,False,False,False,False,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,0.0,False,False,False,False,140.0,...,47.0,6700.0,4.9,False,False,False,False,False,False,notckd
396,42.0,70.0,1.025,0.0,0.0,False,False,False,False,75.0,...,54.0,7800.0,6.2,False,False,False,False,False,False,notckd
397,12.0,80.0,1.020,0.0,0.0,False,False,False,False,100.0,...,49.0,6600.0,5.4,False,False,False,False,False,False,notckd
398,17.0,60.0,1.025,0.0,0.0,False,False,False,False,114.0,...,51.0,7200.0,5.9,False,False,False,False,False,False,notckd


In [12]:
# Show the processed frame's dimensions as (rows, columns).
df.shape

(400, 25)