In [43]:
import pandas as pd
import numpy as np
import re

FULL_PATH = "../data/raw/ckd_full.csv"
V2_PATH   = "../data/raw/ckd-dataset-v2.csv"

print("ckd_full exists:", FULL_PATH)
print("ckd-dataset-v2 exists:", V2_PATH)


ckd_full exists: ../data/raw/ckd_full.csv
ckd-dataset-v2 exists: ../data/raw/ckd-dataset-v2.csv


In [44]:
df_full = pd.read_csv(FULL_PATH)
df_v2   = pd.read_csv(V2_PATH)

print("Full dataset shape:", df_full.shape)
print("V2 dataset shape:", df_v2.shape)

df_v2.head(5)


Full dataset shape: (400, 25)
V2 dataset shape: (202, 29)


Unnamed: 0,bp (Diastolic),bp limit,sg,al,class,rbc,su,pc,pcc,ba,...,htn,dm,cad,appet,pe,ane,grf,stage,affected,age
0,discrete,discrete,discrete,discrete,discrete,discrete,discrete,discrete,discrete,discrete,...,discrete,discrete,discrete,discrete,discrete,discrete,discrete,discrete,discrete,discrete
1,,,,,,,,,,,...,,,,,,,,,class,meta
2,0,0,1.019 - 1.021,1 - 1,ckd,0,< 0,0,0,0,...,0,0,0,0,0,0,≥ 227.944,s1,1,< 12
3,0,0,1.009 - 1.011,< 0,ckd,0,< 0,0,0,0,...,0,0,0,0,0,0,≥ 227.944,s1,1,< 12
4,0,0,1.009 - 1.011,≥ 4,ckd,1,< 0,1,0,1,...,0,0,0,1,0,0,127.281 - 152.446,s1,1,< 12


In [45]:
# Drop row 0 (column type descriptors like "discrete")
# Drop row 1 (meta row)
df_v2_clean = df_v2.drop(index=[0, 1]).reset_index(drop=True)

df_v2_clean.head()


Unnamed: 0,bp (Diastolic),bp limit,sg,al,class,rbc,su,pc,pcc,ba,...,htn,dm,cad,appet,pe,ane,grf,stage,affected,age
0,0,0,1.019 - 1.021,1 - 1,ckd,0,< 0,0,0,0,...,0,0,0,0,0,0,≥ 227.944,s1,1,< 12
1,0,0,1.009 - 1.011,< 0,ckd,0,< 0,0,0,0,...,0,0,0,0,0,0,≥ 227.944,s1,1,< 12
2,0,0,1.009 - 1.011,≥ 4,ckd,1,< 0,1,0,1,...,0,0,0,1,0,0,127.281 - 152.446,s1,1,< 12
3,1,1,1.009 - 1.011,3 - 3,ckd,0,< 0,0,0,0,...,0,0,0,0,0,0,127.281 - 152.446,s1,1,< 12
4,0,0,1.015 - 1.017,< 0,ckd,0,< 0,0,0,0,...,0,1,0,1,1,0,127.281 - 152.446,s1,1,12 - 20


In [46]:
def convert_range_to_numeric(x):
    """
    Convert ranges like '1.019 - 1.021' → midpoint
    Convert '≥ 227.944' → 227.944
    Convert '< 12' → 6  (Option 1 selected)
    Convert 's1' → 1 etc.
    Convert simple numbers → float
    """
    if pd.isna(x):
        return np.nan
    
    s = str(x).strip().lower()

    # Stage values: s1 - s5
    if re.match(r"s[1-5]", s):
        return int(s[1:])  # "s3" → 3

    # Greater than equal: "≥ 3"
    if s.startswith("≥"):
        num = float(s.replace("≥", "").strip())
        return num

    # Less than: "< 12"
    if s.startswith("<"):
        num = float(s.replace("<", "").strip())
        return num / 2  # midpoint from 0 to X

    # Range: "a - b"
    if "-" in s:
        parts = s.split("-")
        try:
            a = float(parts[0])
            b = float(parts[1])
            return (a + b) / 2
        except:
            pass

    # Single numeric string
    try:
        return float(s)
    except:
        return np.nan


In [47]:
df_v2_numeric = df_v2_clean.copy()

for col in df_v2_numeric.columns:
    if col not in ["class", "affected"]:
        df_v2_numeric[col] = df_v2_numeric[col].apply(convert_range_to_numeric)

df_v2_numeric.head(10)


Unnamed: 0,bp (Diastolic),bp limit,sg,al,class,rbc,su,pc,pcc,ba,...,htn,dm,cad,appet,pe,ane,grf,stage,affected,age
0,0.0,0.0,1.02,1.0,ckd,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,227.944,1,1,6.0
1,0.0,0.0,1.01,0.0,ckd,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,227.944,1,1,6.0
2,0.0,0.0,1.01,4.0,ckd,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,139.8635,1,1,6.0
3,1.0,1.0,1.01,3.0,ckd,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,139.8635,1,1,6.0
4,0.0,0.0,1.016,0.0,ckd,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,139.8635,1,1,16.0
5,1.0,1.0,1.023,0.0,notckd,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,114.698,1,0,16.0
6,0.0,0.0,1.02,3.0,ckd,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,190.195,1,1,16.0
7,0.0,0.0,1.02,0.0,ckd,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,39.20035,4,1,16.0
8,0.0,0.0,1.023,0.0,notckd,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,39.20035,4,0,23.5
9,1.0,2.0,1.01,4.0,ckd,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,64.3661,3,1,23.5


In [48]:
def map_class(x):
    if pd.isna(x): return np.nan
    x = str(x).strip().lower()
    if x == "ckd": return 1
    if x == "notckd": return 0
    try:
        return int(x)
    except:
        return np.nan

df_v2_numeric["affected"] = df_v2_clean["class"].apply(map_class)

# Drop the raw class column (v2)
df_v2_numeric = df_v2_numeric.drop(columns=["class"])


In [49]:
df_full_clean = df_full.copy()

mapping = {
    "age": "age",
    "blood pressure": "bp limit",
    "specific gravity": "sg",
    "albumin": "al",
    "sugar": "su",
    "red blood cells": "rbc",
    "pus cell": "pc",
    "pus cell clumps": "pcc",
    "bacteria": "ba",
    "blood glucose random": "bgr",
    "blood urea": "bu",
    "serum creatinine": "sc",
    "sodium": "sod",
    "potassium": "pot",
    "hemoglobin": "hemo",
    "packed cell volume": "pcv",
    "red blood cell count": "rbcc",
    "white blood cell count": "wbcc",
    "hypertension": "htn",
    "diabetes mellitus": "dm",
    "coronary artery disease": "cad",
    "appetite": "appet",
    "pedal edema": "pe",
    "anemia": "ane",
    "class": "affected",
}

df_full_clean.columns = df_full_clean.columns.str.lower().str.strip()

df_full_clean = df_full_clean.rename(columns={col: mapping.get(col, col) 
                                              for col in df_full_clean.columns})


In [50]:
binary_map = {
    "yes": 1, "y": 1, "present": 1, "abnormal": 1, "good": 1,
    "no": 0, "n": 0, "notpresent": 0, "normal": 0, "poor": 0
}

def normalize_binary(val):
    if pd.isna(val): return np.nan
    s = str(val).strip().lower()
    return binary_map.get(s, val)

binary_cols = ["rbc","pc","pcc","ba","htn","dm","cad","appet","pe","ane"]

for c in binary_cols:
    if c in df_full_clean.columns:
        df_full_clean[c] = df_full_clean[c].apply(normalize_binary)


In [51]:
df_full_clean["affected"] = df_full_clean["affected"].apply(map_class)


In [52]:
v2_cols = df_v2_numeric.columns.tolist()
full_cols = df_full_clean.columns.tolist()

# Add missing cols to full
for col in v2_cols:
    if col not in df_full_clean.columns:
        df_full_clean[col] = np.nan

# Add missing cols to v2
for col in full_cols:
    if col not in df_v2_numeric.columns:
        df_v2_numeric[col] = np.nan

df_full_clean = df_full_clean[v2_cols]
df_v2_numeric = df_v2_numeric[v2_cols]


In [53]:
df_merged_final = pd.concat([df_v2_numeric, df_full_clean], ignore_index=True)
df_merged_final.shape


(600, 28)

In [54]:
OUTPUT = "../data/processed/ckd_merged_corrected.csv"
df_merged_final.to_csv(OUTPUT, index=False)
print("Saved:", OUTPUT)


Saved: ../data/processed/ckd_merged_corrected.csv


In [55]:
df_merged_final.head()

Unnamed: 0,bp (Diastolic),bp limit,sg,al,rbc,su,pc,pcc,ba,bgr,...,htn,dm,cad,appet,pe,ane,grf,stage,affected,age
0,0.0,0.0,1.02,1.0,0.0,0.0,0.0,0.0,0.0,56.0,...,0.0,0.0,0.0,0.0,0.0,0.0,227.944,1.0,1,6.0
1,0.0,0.0,1.01,0.0,0.0,0.0,0.0,0.0,0.0,133.0,...,0.0,0.0,0.0,0.0,0.0,0.0,227.944,1.0,1,6.0
2,0.0,0.0,1.01,4.0,1.0,0.0,1.0,0.0,1.0,56.0,...,0.0,0.0,0.0,1.0,0.0,0.0,139.8635,1.0,1,6.0
3,1.0,1.0,1.01,3.0,0.0,0.0,0.0,0.0,0.0,133.0,...,0.0,0.0,0.0,0.0,0.0,0.0,139.8635,1.0,1,6.0
4,0.0,0.0,1.016,0.0,0.0,0.0,0.0,0.0,0.0,175.0,...,0.0,1.0,0.0,1.0,1.0,0.0,139.8635,1.0,1,16.0
