In [1]:
import json
import pandas as pd

from sklearn.model_selection import train_test_split

# Input artifacts: the model-ready table and the JSON spec that names its columns.
DATA_PATH = "../data/cleaned/diabetes_model_ready.parquet"
FEATURE_SPEC_PATH = "../data/processed/feature_spec.json"

df = pd.read_parquet(DATA_PATH)

with open(FEATURE_SPEC_PATH, "r", encoding="utf-8") as spec_file:
    feature_spec = json.load(spec_file)

# The spec supplies the target column name plus the numeric / categorical
# feature column lists that drive both the split and the preprocessing.
target = feature_spec["target"]
num_features = feature_spec["numeric_features"]
cat_features = feature_spec["categorical_features"]

# Design matrix keeps numeric columns first, then categorical ones.
feature_columns = [*num_features, *cat_features]
X = df[feature_columns]
y = df[target].astype(int)

# Stratify on the label so both partitions keep the same positive rate.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

train_pos_rate = y_train.mean().round(3)
test_pos_rate = y_test.mean().round(3)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
print("Train positive rate:", train_pos_rate, "Test positive rate:", test_pos_rate)


Train shape: (614, 12) Test shape: (154, 12)
Train positive rate: 0.349 Test positive rate: 0.351


In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Column-wise preprocessing: numeric columns go through StandardScaler,
# categorical columns through OneHotEncoder. Categories not seen during fit
# are ignored at transform time instead of raising.
numeric_step = ("num", StandardScaler(), num_features)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)

preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    remainder="drop",  # columns not listed in either feature group are discarded
)

# Baseline classifier; class_weight="balanced" is a simple handling for imbalance.
baseline_clf = LogisticRegression(
    solver="liblinear",
    class_weight="balanced",
    max_iter=2000,
    random_state=42,
)

# Step names matter: downstream hyper-parameter grids address "model__<param>".
baseline_pipeline = Pipeline(
    [
        ("preprocess", preprocessor),
        ("model", baseline_clf),
    ]
)

baseline_pipeline.fit(X_train, y_train)
print("Baseline pipeline trained.")


Baseline pipeline trained.


In [3]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameters to sweep, addressed through the pipeline's "model" step.
param_grid = {
    "model__C": [0.1, 1.0, 10.0],
    "model__penalty": ["l1", "l2"],
}

# 5-fold cross-validated search on the training split only, scored by F1
# (better than accuracy for imbalance). n_jobs=-1 uses all available cores.
grid = GridSearchCV(
    baseline_pipeline,
    param_grid,
    cv=5,
    scoring="f1",
    n_jobs=-1,
).fit(X_train, y_train)

best_cv_f1 = round(grid.best_score_, 4)

print("Best params:", grid.best_params_)
print("Best CV F1:", best_cv_f1)

# Pipeline refit on the full training split with the winning parameters.
best_model = grid.best_estimator_


Best params: {'model__C': 0.1, 'model__penalty': 'l2'}
Best CV F1: 0.7002




# NOTE:
The `penalty` parameter of `LogisticRegression` is deprecated in newer scikit-learn versions.
It is kept in the grid search above for clarity and for compatibility with the current environment.


In [4]:
import os
import joblib
from datetime import datetime

# Persist both fitted pipelines plus a JSON record describing how they were
# trained, all under a single output directory.
MODEL_DIR = "../results/models"
os.makedirs(MODEL_DIR, exist_ok=True)

joblib.dump(baseline_pipeline, f"{MODEL_DIR}/baseline_logreg.joblib")
joblib.dump(best_model, f"{MODEL_DIR}/final_logreg_grid.joblib")

train_meta = {
    "trained_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "data_path": DATA_PATH,
    "feature_spec_path": FEATURE_SPEC_PATH,
    # The split settings are not exposed by any object in scope, so they are
    # repeated here; keep them in sync with the train_test_split call.
    "test_size": 0.2,
    "random_state": 42,
    # Everything below is read from the fitted objects rather than re-typed,
    # so the record cannot drift from what was actually run.
    "model_family": type(baseline_clf).__name__,
    "baseline_params": baseline_clf.get_params(),
    "grid_best_params": grid.best_params_,
    "grid_best_cv_score": float(grid.best_score_),
    "grid_scoring": grid.scoring,
    "cv_folds": grid.cv,
}

with open(f"{MODEL_DIR}/training_metadata.json", "w", encoding="utf-8") as meta_file:
    # default=str: a non-JSON-native value (e.g. a callable scorer or a CV
    # splitter object) is stored as its string form instead of aborting the
    # write after the model files have already been saved.
    json.dump(train_meta, meta_file, indent=2, default=str)

print(f"Saved artifacts to {MODEL_DIR}/")


Saved artifacts to ../results/models/
