# Hidden CKD

#### Variables
- Date of event: Date of the screening
- Gender: Gender of the patient (M: Male, F: Female)
- Ethnicity: The ethnicity of the participant
- D.O.B.: The date of birth of the participant
- Age: Age of the patient (years)
- Height (cm): Height of the participant in cm
- Weight (kg): Weight of the participant in kg
- BMI: BMI of the participant
- BMI Category: Classification of the particpant BMI according to NICE guidelines
- Systolic, Diastolic: The systolic and diastolic of the partcipants
- BP Category: Classification of the particpant BP according to NICE guidelines
- Medical Conditions: Medical conditions the patient has (High blood pressure, Diabetes, Kidney disease, Heart disease and Other
- What medications/tablets are you currently taking?: The kinds of medication the participants are taking (Cholesterol, BP, Diabetes, Other)
- Name of blood pressure medication / Tablets
- Name of blood pressure medication / Tablets
- Name of blood pressure medication / Tablets
- Do you have a family history of kidney disease?: Whether or not the participant has a family history of kidney disease
- uACR: uACR level of the participant (Normal, Abnormal, High Abnormal)

In [1]:
import numpy as np
import pandas as pd
from src.config import RAW_DATA_DIR

In [2]:
raw_data=pd.read_csv(RAW_DATA_DIR / 'hiddenckd_01.csv')

# **Cleaning and Preprocessing Raw Data**

In [3]:
#filtering out all rows with uACR results that are not Normal, Abnormal or High Abnormal
raw_data = raw_data[(raw_data['uACR'] == "Normal") | (raw_data['uACR'] == "Abnormal") | (raw_data['uACR'] == "High abnormal")]

In [4]:
#renaming specific row values
raw_data['uACR']=raw_data['uACR'].replace({'High abnormal': 'High Abnormal'})
raw_data['Ethnicity'] = raw_data['Ethnicity'].replace({'Black African ' : 'Black African (unspecified)',
                                                       'Black African' : 'Black African (unspecified)'})

In [5]:
#adding an "Abnormal uACR" column
raw_data['Abnormal_uACR'] = raw_data['uACR'].map({'Normal' : 0, 'Abnormal' : 1, 'High Abnormal' : 1}).astype('bool')

In [6]:
#adding a "Simplified Ethnicity" column
raw_data['S_Ethnicity'] = raw_data['Ethnicity'].replace({
    'Black African' : 'Black',
    'Black African (Central Africa)' : 'Black',
    'Black African (East Africa)' : 'Black',
    'Black African (North Africa)' : 'Black',
    'Black African (South Africa)' : 'Black',
    'Black African (West Africa)' : 'Black',
    'Black African (unspecified)' : 'Black',
    'Black Caribbean' : 'Black',
    'Black other' : 'Black',
    'Indian' : 'Indian',
    'Mixed White/Asian' : 'Mixed',
    'Mixed White/Black African' : 'Mixed',
    'Mixed White/Black Caribbean' : 'Mixed',
    'Mixed other' : 'Mixed',
    'Pakistani' : 'Indian',
    'White British' : 'White',
    'White Gypsy/Traveller' : 'White',
    'White Irish' : 'White',
    'White other' : 'White',
    'Any other' : 'Other',
    'Asian other' : 'SE Asian',
    'Bangladeshi' : 'Indian'})

In [7]:
#dropping one gender outlier 'Prefer not to say' and another outlier whose date of birth was recorded as 10/09/2023
raw_data.drop(raw_data[raw_data.Gender == 'Prefer not to say'].index, inplace=True)
raw_data.drop(raw_data[raw_data['D.O.B.'] == "10/09/2023"].index, inplace=True)

In [8]:
#splitting the "Medical Conditions" column into the constituent medical conditions
raw_data['Has_HTN'] = raw_data['Medical Conditions'].apply(lambda x: 'High blood pressure' in x)
raw_data['Has_Diabetes'] = raw_data['Medical Conditions'].apply(lambda x: 'Diabetes' in x)
raw_data['Has_KD'] = raw_data['Medical Conditions'].apply(lambda x: 'Kidney disease' in x)
raw_data['Has_CVD'] = raw_data['Medical Conditions'].apply(lambda x: 'Heart disease (heart attack, angina, heart failure)' in x)
raw_data['Has_Other'] = raw_data['Medical Conditions'].apply(lambda x: 'Other' in x)

In [9]:
raw_data.head(5)

Unnamed: 0,PID,Gender,D.O.B.,Age,Height (cm),Weight(kg),Systolic,Diastolic,Ethnicity,Medical Conditions,...,eGFR,True positive or negative,Stage of CKD,Abnormal_uACR,S_Ethnicity,Has_HTN,Has_Diabetes,Has_KD,Has_CVD,Has_Other
0,1,Male,21/05/1946,79,161.0,64.0,118,63,Black Caribbean,None of the above,...,65,True positive,2,True,Black,False,False,False,False,False
1,2,Male,25/01/1970,55,163.0,78.0,128,69,Black African (West Africa),None of the above,...,82,True positive,2,True,Black,False,False,False,False,False
2,5,Male,25/04/1969,56,168.0,87.0,143,81,Black Caribbean,None of the above,...,>90,False positive,False Positive,True,Black,False,False,False,False,False
3,6,Female,03/11/1979,45,187.0,109.0,156,98,Black African (West Africa),None of the above,...,>90,False positive,False Positive,True,Black,False,False,False,False,False
4,7,Male,24/08/1969,55,174.0,114.0,120,70,Black African (West Africa),None of the above,...,64,True positive,2,True,Black,False,False,False,False,False


In [10]:
df = raw_data.rename({"Date of event": "Date",
                        "D.O.B.": "DOB", 
                        "Height (cm)": "Height",
                        "Weight(kg)": "Weight",
                        "BMI Category": "BMI_Category",
                        "BP Category": "BP_Category",
                        "Family history of CKD?":"Family_KD",
                        "True positive or negative": "Status",
                        "Stage of CKD": "CKD_Stage"},
                       axis=1)
df["eGFR"] = df["eGFR"].replace({">": "", "<": ""}, regex=True)
df["eGFR"] = pd.to_numeric(df["eGFR"])
df["CKD_Stage"] = df["CKD_Stage"].replace({"False Positive": None})
df["Status"] = df["Status"].replace({"True positive": True, "True negative": False, "False positive": False, "False negative": True})
df["Family_KD"] = df["Family_KD"].replace({"Definitely yes": "yes", "Definitely not": "no", "Not sure": "unsure"})
df["Male"] = df["Gender"].replace({"Male": True, "Female": False})

  df["Male"] = df["Gender"].replace({"Male": True, "Female": False})


In [11]:
#data['Age'] = (pd.to_datetime(data['Date'], dayfirst = True) - pd.to_datetime(data['DOB'], dayfirst = True)) / np.timedelta64(1, 'D') / 365
#data['Age'] = data['Age'].round(1)

In [12]:
df['Age_Category'] = pd.cut(df['Age'], bins=[0, 25, 40, 55, 70, float('inf')], labels=['<25', '25-40', '41-55', '56-70', '>70']).astype(str)

In [13]:
df = df[['Male', 'Ethnicity', 'S_Ethnicity', 'Age',
         'Height', 'Weight', 'Systolic', 'Diastolic',
         'Has_HTN', 'Has_Diabetes', 'Has_KD', 'Has_CVD', 'Family_KD',
         'uACR', 'eGFR', 'Status', 'CKD_Stage']]

In [14]:
df.columns = df.columns.str.lower()
df = df.dropna(subset=["egfr"])
df = df.reset_index(drop=True)

In [15]:
df.head(5)

Unnamed: 0,male,ethnicity,s_ethnicity,age,height,weight,systolic,diastolic,has_htn,has_diabetes,has_kd,has_cvd,family_kd,uacr,egfr,status,ckd_stage
0,True,Black Caribbean,Black,79,161.0,64.0,118,63,False,False,False,False,no,Abnormal,65.0,True,2.0
1,True,Black African (West Africa),Black,55,163.0,78.0,128,69,False,False,False,False,no,Abnormal,82.0,True,2.0
2,True,Black Caribbean,Black,56,168.0,87.0,143,81,False,False,False,False,no,Abnormal,90.0,False,
3,False,Black African (West Africa),Black,45,187.0,109.0,156,98,False,False,False,False,no,Abnormal,90.0,False,
4,True,Black African (West Africa),Black,55,174.0,114.0,120,70,False,False,False,False,no,Abnormal,64.0,True,2.0


# **ML**

In [16]:
# Import modules
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import PowerTransformer
from sklearn.impute import SimpleImputer
import joblib
from src.config import MODELS_DIR

In [None]:
# Creating preprocessing pipelines for both numeric and nominal and ordinal data.
num_features = ['age', 'height', 'weight', 'systolic', 'diastolic']
num_transformer = Pipeline(steps=[
    ('power_transform', PowerTransformer(method='yeo-johnson'))])

nom_features = ['s_ethnicity', 'family_kd', 'uacr']
nom_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

ord_features = ['male', 'has_htn', 'has_diabetes', 'has_kd', 'has_cvd']
ord_categories = [[False, True] for _ in ord_features]
ord_transformer = Pipeline(steps=[
    ('ord_enc', OrdinalEncoder(categories=ord_categories))
])

# Combining everything into one preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('nom', nom_transformer, nom_features),
        ('ord', ord_transformer, ord_features)])

# Import processed data
X = df[num_features + nom_features + ord_features]
y = df['status']
y_encoded = y.replace({True: "ckd", False: "no_ckd"})


In [18]:
# Fit preprocessor
preprocessor.fit(X)
X = preprocessor.transform(X)

In [19]:
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X, y_encoded) # type: ignore



In [20]:
import optuna
import numpy as np
from sklearn.metrics import average_precision_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from copy import deepcopy

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
def create_xgb_objective(X, y, n_splits=5, random_state=42):
    """
    Returns an Optuna objective function for XGBClassifier
    over a fixed feature set (columns of X).
    """

    def objective(trial):
        # Hyperparameter search space
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "max_depth": trial.suggest_int("max_depth", 2, 8),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "min_child_weight": trial.suggest_float("min_child_weight", 1.0, 10.0),
            "gamma": trial.suggest_float("gamma", 0.0, 10.0),
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True),
            "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 10.0, log=True),
            "scale_pos_weight": trial.suggest_float("scale_pos_weight", 0.5, 10.0),
            "objective": "binary:logistic",
            "eval_metric": "aucpr",
            "tree_method": "hist",
            "random_state": random_state,
        }

        # Stratified CV on the (possibly SMOTEENN‑ed) data
        cv = StratifiedKFold(
            n_splits=n_splits, shuffle=True, random_state=random_state
        )

        ap_scores = []

        for train_idx, valid_idx in cv.split(X, y):
            X_train, X_valid = X[train_idx], X[valid_idx]
            y_train, y_valid = y[train_idx], y[valid_idx]

            model = XGBClassifier(**params)
            model.fit(X_train, y_train)

            y_proba = model.predict_proba(X_valid)[:, 1]
            ap = average_precision_score(y_valid, y_proba)
            ap_scores.append(ap)

        # We maximise mean average precision (AUPRC surrogate)
        return float(np.mean(ap_scores))

    return objective


In [22]:
def boed_feature_selection(
    X,
    y,
    feature_names,
    max_features=None,
    n_initial_features=5,
    n_trials_per_step=20,
    patience=2,
    random_state=42,
):
    """
    BOED-style sequential feature selection using Optuna-based
    XGBoost optimisation as the utility function.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix (n_samples, n_features).
    y : np.ndarray
        Binary target array.
    feature_names : list[str]
        Names of features (len = n_features).
    max_features : int or None
        Maximum number of features to select. If None, can go up to all.
    n_initial_features : int
        Number of features to seed the process with.
    n_trials_per_step : int
        Number of Optuna trials per candidate subset evaluation.
    patience : int
        Stop after this many steps without improvement.
    random_state : int
        For reproducibility.

    Returns
    -------
    selected_features : list[str]
        Final selected feature names.
    history : list[dict]
        List of dicts with step results (subset, score, added_feature, etc.).
    """

    rng = np.random.RandomState(random_state)
    n_total_features = X.shape[1]

    if max_features is None:
        max_features = n_total_features

    # Index-based selection internally
    all_indices = np.arange(n_total_features)

    # Seed with random subset (or you can seed with clinically known features)
    initial_indices = rng.choice(all_indices, size=n_initial_features, replace=False)
    selected_indices = list(initial_indices)

    remaining_indices = [i for i in all_indices if i not in selected_indices]

    # Evaluate initial subset
    X_init = X[:, selected_indices]
    study_init = optuna.create_study(direction="maximize")
    objective_init = create_xgb_objective(X_init, y, random_state=random_state)
    study_init.optimize(objective_init, n_trials=n_trials_per_step)

    best_score = study_init.best_value
    best_subset = deepcopy(selected_indices)

    history = [{
        "step": 0,
        "selected_indices": deepcopy(selected_indices),
        "selected_features": [feature_names[i] for i in selected_indices],
        "score": best_score,
        "added_feature": None,
    }]

    no_improve_steps = 0
    step = 1

    while remaining_indices and len(selected_indices) < max_features:
        candidate_results = []

        # --- Acquisition step: treat each "add one feature" move as an experiment
        for idx in remaining_indices:
            trial_indices = selected_indices + [idx]
            X_candidate = X[:, trial_indices]

            study = optuna.create_study(direction="maximize")
            objective = create_xgb_objective(
                X_candidate, y, random_state=random_state
            )
            study.optimize(objective, n_trials=n_trials_per_step)

            score = study.best_value
            candidate_results.append((idx, score))

        # Pick the feature that maximises the score
        idx_best, score_best = max(candidate_results, key=lambda t: t[1])

        selected_indices.append(idx_best)
        remaining_indices.remove(idx_best)

        history.append({
            "step": step,
            "selected_indices": deepcopy(selected_indices),
            "selected_features": [feature_names[i] for i in selected_indices],
            "score": score_best,
            "added_feature": feature_names[idx_best],
        })

        # Check for global improvement
        if score_best > best_score:
            best_score = score_best
            best_subset = deepcopy(selected_indices)
            no_improve_steps = 0
        else:
            no_improve_steps += 1

        if no_improve_steps >= patience:
            break

        step += 1

    selected_features = [feature_names[i] for i in best_subset]

    return selected_features, history


In [23]:
feature_names = preprocessor.get_feature_names_out().tolist()
y_resampled = y_resampled.replace({"ckd": True, "no_ckd": False})

selected_features, history = boed_feature_selection(
    X_resampled,
    y_resampled,
    feature_names=feature_names,
    max_features=20,
    n_initial_features=5,
    n_trials_per_step=15,  # keep small to control runtime
    patience=2,
)

print("Selected features:", selected_features)


  y_resampled = y_resampled.replace({"ckd": True, "no_ckd": False})
[I 2025-12-13 05:12:32,092] A new study created in memory with name: no-name-fe0c9dcc-83b6-4a53-b3ab-bd01265055b7
[I 2025-12-13 05:12:33,810] Trial 0 finished with value: 0.7185395628909561 and parameters: {'n_estimators': 903, 'max_depth': 5, 'learning_rate': 0.27198619478863667, 'subsample': 0.724766166592661, 'colsample_bytree': 0.7071546803715438, 'min_child_weight': 6.274543957076846, 'gamma': 7.171141089683813, 'reg_lambda': 0.016846755532686622, 'reg_alpha': 0.0024655578968707133, 'scale_pos_weight': 8.65662991507268}. Best is trial 0 with value: 0.7185395628909561.
[I 2025-12-13 05:12:35,629] Trial 1 finished with value: 0.7281962245987014 and parameters: {'n_estimators': 993, 'max_depth': 6, 'learning_rate': 0.23204759864838057, 'subsample': 0.7669693663508133, 'colsample_bytree': 0.9625736779584824, 'min_child_weight': 5.057157747238393, 'gamma': 5.936060518809935, 'reg_lambda': 0.01293343482801883, 'reg_alph

Selected features: ['num__age', 'ord__has_diabetes', 'ord__male', 'num__height', 'nom__s_ethnicity_SE Asian', 'num__systolic', 'nom__uacr_High Abnormal', 'nom__family_kd_no']
