In [1]:
# Cell 1 - imports & config
import os
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from joblib import dump

# Optional Boruta
try:
    from boruta import BorutaPy
    HAS_BORUTA = True
except Exception:
    HAS_BORUTA = False

# PATHS
DATA_PATH = "../data/processed/ckd_merged_corrected.csv"
OUT_PIPELINE = "../models/ckd_knn_pipeline1.joblib"

# Config
RANDOM_STATE = 42
TEST_SIZE = 0.20
use_grid_search = True


In [2]:
# Cell 2 - load
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Dataset not found: {DATA_PATH}")

df = pd.read_csv(DATA_PATH)
print("Loaded:", DATA_PATH, "shape:", df.shape)
display(df.head())
print("\nNull counts:\n", df.isna().sum())


Loaded: ../data/processed/ckd_merged_corrected.csv shape: (600, 28)


Unnamed: 0,bp (Diastolic),bp limit,sg,al,rbc,su,pc,pcc,ba,bgr,...,htn,dm,cad,appet,pe,ane,grf,stage,affected,age
0,0.0,0.0,1.02,1.0,0.0,0.0,0.0,0.0,0.0,56.0,...,0.0,0.0,0.0,0.0,0.0,0.0,227.944,1.0,1,6.0
1,0.0,0.0,1.01,0.0,0.0,0.0,0.0,0.0,0.0,133.0,...,0.0,0.0,0.0,0.0,0.0,0.0,227.944,1.0,1,6.0
2,0.0,0.0,1.01,4.0,1.0,0.0,1.0,0.0,1.0,56.0,...,0.0,0.0,0.0,1.0,0.0,0.0,139.8635,1.0,1,6.0
3,1.0,1.0,1.01,3.0,0.0,0.0,0.0,0.0,0.0,133.0,...,0.0,0.0,0.0,0.0,0.0,0.0,139.8635,1.0,1,6.0
4,0.0,0.0,1.016,0.0,0.0,0.0,0.0,0.0,0.0,175.0,...,0.0,1.0,0.0,1.0,1.0,0.0,139.8635,1.0,1,16.0



Null counts:
 bp (Diastolic)    400
bp limit           12
sg                 47
al                 46
rbc               152
su                 49
pc                 65
pcc                 4
ba                  4
bgr                44
bu                 19
sod                87
sc                 17
pot                88
hemo               52
pcv                71
rbcc              131
wbcc              106
htn                 2
dm                  2
cad                 2
appet               1
pe                  1
ane                 1
grf               401
stage             400
affected            0
age                 9
dtype: int64


In [3]:
# Cell 3 - X/y
target_col = "affected"
if target_col not in df.columns:
    raise ValueError("target column 'affected' not found")

X_raw = df.drop(columns=[target_col]).copy()
y = df[target_col].astype(int)

# Ensure numeric
for c in X_raw.columns:
    X_raw[c] = pd.to_numeric(X_raw[c], errors="coerce")

# Drop columns that are entirely NaN (rare)
non_all_nan_mask = X_raw.notna().any(axis=0)
dropped_cols = X_raw.columns[~non_all_nan_mask].tolist()
if dropped_cols:
    print("Dropping entirely empty columns:", dropped_cols)
X_raw = X_raw.loc[:, non_all_nan_mask]

print("X shape:", X_raw.shape, "y shape:", y.shape)


X shape: (600, 27) y shape: (600,)


In [4]:
# Cell 4 - Iterative Imputer
imputer = IterativeImputer(max_iter=20, random_state=RANDOM_STATE)
X_imp = imputer.fit_transform(X_raw)
print("Imputation done. Remaining NaNs:", np.isnan(X_imp).sum())


Imputation done. Remaining NaNs: 0


In [5]:
# Cell 5 - scaling
robust = RobustScaler()
standard = StandardScaler()
minmax = MinMaxScaler()

X_robust = robust.fit_transform(X_imp)
X_std = standard.fit_transform(X_robust)
X_scaled = minmax.fit_transform(X_std)
print("Scaling done. Shape:", X_scaled.shape)


Scaling done. Shape: (600, 27)


In [6]:
# Cell 6 - Feature Selection (RandomForest median threshold)

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

print("Using RandomForest feature importances (median threshold)...")

rf_base = RandomForestClassifier(
    n_estimators=200,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    class_weight="balanced_subsample"
)

# Fit RF on scaled features
rf_base.fit(X_scaled, y)

# Select features above median importance
selector = SelectFromModel(rf_base, threshold="median", prefit=True)
support_mask = selector.get_support()

# Keep only selected columns
kept_columns = np.array(X_raw.columns)[support_mask].tolist()
X_sel = X_scaled[:, support_mask]

print("Selected features:", kept_columns)
print("Total selected:", len(kept_columns))


Using RandomForest feature importances (median threshold)...
Selected features: ['sg', 'al', 'rbc', 'pc', 'bgr', 'bu', 'sc', 'hemo', 'pcv', 'rbcc', 'htn', 'dm', 'grf', 'stage']
Total selected: 14


In [7]:
# Cell 7 - Train/Test Split

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_sel,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y
)

print("Train/Test shapes:", X_train.shape, X_test.shape)


Train/Test shapes: (480, 14) (120, 14)


In [8]:
# Cell 8 - KNN Grid Search

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

param_grid = {
    "n_neighbors": [3,5,7,9,11],
    "metric": ["euclidean", "manhattan"],
    "weights": ["uniform", "distance"],
    "leaf_size": [20, 30],
    "algorithm": ["auto", "kd_tree"]
}

knn = KNeighborsClassifier()

grid = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1
)

grid.fit(X_train, y_train)

best_knn = grid.best_estimator_
print("Best params found:", grid.best_params_)


Best params found: {'algorithm': 'auto', 'leaf_size': 20, 'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'distance'}


In [9]:
# Cell 9 - Evaluate Model Performance

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_pred = best_knn.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print(f"Test Accuracy: {acc*100:.2f}%")
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred, digits=4))


Test Accuracy: 97.50%
Confusion matrix:
 [[44  0]
 [ 3 73]]

Classification report:
               precision    recall  f1-score   support

           0     0.9362    1.0000    0.9670        44
           1     1.0000    0.9605    0.9799        76

    accuracy                         0.9750       120
   macro avg     0.9681    0.9803    0.9734       120
weighted avg     0.9766    0.9750    0.9752       120



In [10]:
# Cell 10 - Final Training on Full Dataset + Save Pipeline

from joblib import dump

final_model = KNeighborsClassifier(**best_knn.get_params())
final_model.fit(X_sel, y)

pipeline_path = "../models/ckd_knn_pipeline1.joblib"

dump({
    "imputer": imputer,
    "robust": robust,
    "standard": standard,
    "minmax": minmax,
    "feature_mask": support_mask,
    "kept_columns": kept_columns,
    "model": final_model
}, pipeline_path)

print("Saved final pipeline to:", pipeline_path)


Saved final pipeline to: ../models/ckd_knn_pipeline1.joblib
