In [19]:
import pandas as pd
from src.config import RAW_DATA_DIR, TABLES_DIR
from src.tabular.utils.dataset_utils import gfr_staging, acr_staging, ckd_status

In [20]:
# Load uacr.csv from the tabular_data folder under RAW_DATA_DIR.
df = pd.read_csv(filepath_or_buffer=RAW_DATA_DIR / 'tabular_data/uacr.csv')

In [21]:
# Normalise the raw column names, then derive the boolean flags and the
# staging columns. `assign` evaluates its keyword arguments in order, so each
# row-wise helper sees every column created before it.
# NOTE(review): the staging helpers presumably read the renamed columns
# (e.g. "egfr", "acr") -- confirm against src.tabular.utils.dataset_utils.
df = (
    df.rename(
        columns={
            "UACR": "acr",
            "BMI": "bmi",
            "urine_albumin": "al",
            "gfr": "egfr",
        }
    )
    .assign(
        male=lambda frame: frame["sex"] == "m",
        dm=lambda frame: frame["dm"] == "Y",
        gfr_stage=lambda frame: frame.apply(gfr_staging, axis=1),
        acr_stage=lambda frame: frame.apply(acr_staging, axis=1),
        ckd_status=lambda frame: frame.apply(ckd_status, axis=1),
    )
)

In [22]:
# Restrict the frame to these ten columns, in this order.
df = df.loc[
    :,
    [
        "male", "age", "bmi", "al", "dm",
        "acr", "egfr",
        "acr_stage", "gfr_stage", "ckd_status",
    ],
]

In [23]:
def remove_outliers_iqr(numeric_cols, multiplier=1.5, data=None):
    """Drop rows whose values fall outside the IQR fences of the given columns.

    For each column in turn, the fences are ``Q1 - multiplier * IQR`` and
    ``Q3 + multiplier * IQR`` (both inclusive). Columns are processed
    sequentially: the quartiles of each column are computed on the rows that
    survived the previous columns, so the result can depend on the order of
    ``numeric_cols``. Rows with a missing value in a listed column are dropped
    as well, because NaN fails the range test.

    Parameters
    ----------
    numeric_cols : iterable of str
        Names of the numeric columns to filter on.
    multiplier : float, default 1.5
        Width of the fences in units of the IQR.
    data : pandas.DataFrame, optional
        Frame to clean. When omitted, the notebook-level ``df`` is used, which
        keeps existing ``remove_outliers_iqr(cols)`` calls working.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows inside the fences; the input frame
        is not modified.

    Raises
    ------
    KeyError
        If a name in ``numeric_cols`` is not a column of the frame.
    """
    if data is None:
        # Backward-compatible fallback: earlier callers relied on the global frame.
        data = df
    clean_df = data.copy()
    for col in numeric_cols:
        q1, q3 = clean_df[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        lower = q1 - multiplier * iqr
        upper = q3 + multiplier * iqr
        # `between` is inclusive on both ends and evaluates to False for NaN.
        clean_df = clean_df[clean_df[col].between(lower, upper)]
    return clean_df

# Filter the numeric features with the IQR rule, then rename the
# "egfr" column to "gfr" on the filtered frame.
num_features = [
    "age",
    "bmi",
    "egfr",
    "acr",
]
df = remove_outliers_iqr(num_features).rename(columns={"egfr": "gfr"})

In [24]:
# Show the filtered frame; the notebook renders a truncated preview of it.
df

Unnamed: 0,male,age,bmi,al,dm,acr,gfr,acr_stage,gfr_stage,ckd_status
1,False,45.88,16.44,2,True,11.4,40.23,A2,G3b,ckd
2,False,52.65,17.33,2,False,12.4,91.64,A2,G1,ckd
3,False,77.96,18.37,2,True,22.9,108.19,A2,G1,ckd
4,False,53.41,19.03,2,True,8.0,48.66,A2,G3a,ckd
5,True,82.72,19.11,2,False,8.4,87.20,A2,G2,ckd
...,...,...,...,...,...,...,...,...,...,...
2915,True,61.61,34.41,3,True,34.0,56.66,A3,G3a,ckd
2916,True,72.61,34.64,3,False,54.6,75.54,A3,G2,ckd
2918,True,66.01,35.16,3,True,146.7,73.10,A3,G2,ckd
2919,False,64.93,35.56,3,True,168.2,102.65,A3,G1,ckd
