In [1]:
import pandas as pd
from loguru import logger
from sklearn.model_selection import train_test_split
from src.config import TABLES_DIR

# --- Hidden CKD, table 01 ---
# Read the table, separate the `ckd_status` column from the remaining
# columns, and turn the held-out 20% of the remaining columns into a list
# of per-row dicts.
df1 = pd.read_csv(TABLES_DIR / "hiddenckd_01.csv")
logger.info(f"Loaded dataset with shape: {df1.shape}")

y1 = df1["ckd_status"]
X1 = df1.drop(columns=["ckd_status"])

# Stratified on y1 with a fixed random_state, so the split is repeatable.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X1,
    y1,
    test_size=0.2,
    stratify=y1,
    random_state=42,
)

# One dict per held-out row, keyed by column name.
entry_1 = X_test1.reset_index(drop=True).to_dict(orient="records")

# --- Hidden CKD, table 02 ---
# This table is used in full: no columns are dropped and no train/test
# split is applied before converting the rows to dicts.
df2 = pd.read_csv(TABLES_DIR / "hiddenckd_02.csv")
logger.info(f"Loaded dataset with shape: {df2.shape}")

# One dict per row, keyed by column name.
entry_2 = df2.reset_index(drop=True).to_dict(orient="records")

# Load UCI ML Repo CKD
# Read the table, separate the `class` column from the remaining columns,
# and turn the held-out 20% of the remaining columns into per-row dicts.
df3 = pd.read_csv(TABLES_DIR / "ucickd.csv")
logger.info(f"Loaded dataset with shape: {df3.shape}")

X2 = df3.drop(columns="class")
y2 = df3["class"]

# Stratified on y2 with a fixed random_state, so the split is repeatable.
# The fourth result is bound to `y_test2` so that all four names of this
# split carry the same "2" suffix as X_train2 / X_test2 / y_train2.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2, y2, test_size=0.2, stratify=y2, random_state=42
)
# Backward-compatible alias: this split's labels used to be bound to the
# unsuffixed name `y_test`; keep it so any code relying on it still runs.
y_test = y_test2

# One dict per held-out row, keyed by column name.
entry_3 = (
    X_test2
    .reset_index(drop=True)
    .to_dict(orient="records")
)

[32m2025-12-27 08:35:08.327[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mLoaded dataset with shape: (399, 26)[0m
[32m2025-12-27 08:35:08.337[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m24[0m - [1mLoaded dataset with shape: (565, 20)[0m
[32m2025-12-27 08:35:08.340[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m35[0m - [1mLoaded dataset with shape: (400, 25)[0m


In [2]:
import json
from bson import ObjectId
from datetime import date, datetime, time

# Pool the record dicts from all three sources into one list. A record's
# 1-based position in this list becomes its `_id` and `patient_id`.
docs = entry_1 + entry_2 + entry_3

# Whitelist of keys copied into the output documents, in output order.
# Keys of a source record that are not listed here are dropped; listed keys
# that a source record does not have are simply skipped for that record.
fields = [
    "email",
    "postcode",
    "male",
    "age",
    "dob",
    "eth",
    "height_cm",
    "weight_kg",
    "s_bp",
    "d_bp",
    "family_htn",
    "family_dm",
    "family_kd",
    "htn",
    "dm",
    "kd",
    "cvd",
    "cad",
    "rbc",
    "pc",
    "pcc",
    "ba",
    "bgr",
    "bu",
    "sc",
    "su",
    "al",
    "sg",
    "sod",
    "pot",
    "hemo",
    "pcv",
    "wbcc",
    "rbcc",
    "appet_poor",
    "pe",
    "ane",
    "screening_acr",
    "device",
    "acr",
    "egfr",
    "acr_stage",
    "egfr_stage",
]

filtered_docs = []

for patient_id, d in enumerate(docs, start=1):
    # Assign the identifiers once per record, before copying any field, so
    # every document carries `_id` and `patient_id` even when the source
    # record contains none of the whitelisted keys.
    new_doc = {"_id": patient_id, "patient_id": patient_id}
    # Copy the whitelisted keys that this record actually has, in the
    # order given by `fields`.
    new_doc.update({key: d[key] for key in fields if key in d})
    filtered_docs.append(new_doc)

# NOTE(review): values that pandas read as missing would reach this point
# as float NaN, which json.dump writes as the non-standard token `NaN` by
# default -- confirm that consumers of ckd_docs.json accept that.
with open("ckd_docs.json", "w", encoding="utf-8") as f:
    json.dump(filtered_docs, f, indent=2)

In [3]:
from pymongo import MongoClient

# Connect to the MongoDB server on localhost and select the target
# database and collection.
client = MongoClient("mongodb://localhost:27017")
db = client["ckd_cdss"]
collection = db["patients"]

# Pass 1: parse string `dob` values into datetime objects. Documents
# without a `dob` key, or whose `dob` is not a str, are left untouched.
for record in filtered_docs:
    dob_value = record.get("dob")
    if isinstance(dob_value, str):
        record["dob"] = datetime.fromisoformat(dob_value)

# Pass 2: give every document a freshly generated ObjectId as its `_id`,
# overwriting whatever `_id` it had before.
for record in filtered_docs:
    record.update({"_id": ObjectId()})

# Replace the collection contents: drop it, then insert all documents.
collection.drop()
result = collection.insert_many(filtered_docs)

In [4]:
# Sanity check: count the documents that have no "patient_id" key.
missing_patient_id = len(
    [doc for doc in filtered_docs if "patient_id" not in doc]
)
print(missing_patient_id)


0
