## Credit Risk — End-to-end Notebook

This notebook reproduces the Streamlit app workflow in one place:

- Dataset loading (bundled Excel)
- Basic analysis / EDA
- Cleaning (missing values / duplicates / outliers)
- Model training + evaluation
- Save / load a trained model artifact
- Predict on new inputs

> Tip: run this notebook from `streamlit-app-v2/` so paths resolve cleanly.


In [None]:
# --- Setup: imports + project path ---
from __future__ import annotations

import sys
from pathlib import Path

import numpy as np
import pandas as pd

# Put the app root on `sys.path` so the `src` package imports below resolve.
APP_DIR = Path.cwd()  # expected: .../streamlit-app-v2
if APP_DIR.joinpath("src").exists():
    # Prepend only when absent, so re-running the cell does not stack duplicates.
    if str(APP_DIR) not in sys.path:
        sys.path.insert(0, str(APP_DIR))

from src.constants import ARTIFACTS_DIR, DEFAULT_DATASET_PATH, TARGET_COL
from src.data_io import read_dataset_from_path
from src.cleaning import CleaningConfig, clean_dataframe
from src.modeling import MODEL_SPECS, TrainConfig, load_artifact, save_artifact, train_and_evaluate

# Echo the resolved paths/constants so a wrong working directory is obvious early.
for _caption, _value in (
    ("APP_DIR:", APP_DIR),
    ("DEFAULT_DATASET_PATH:", DEFAULT_DATASET_PATH),
    ("TARGET_COL:", TARGET_COL),
    ("ARTIFACTS_DIR:", ARTIFACTS_DIR),
):
    print(_caption, _value)


In [None]:
# --- Step 1: Load dataset ---

# Read the dataset at DEFAULT_DATASET_PATH through the project's loader.
# NOTE(review): the result is used as a pandas DataFrame below (`.shape`, `.head()`).
df = read_dataset_from_path(DEFAULT_DATASET_PATH)

# Report the dimensions, then show the first rows as the cell output.
print("shape:", df.shape)
df.head()


In [None]:
# --- Step 2: Quick analysis / EDA ---

# Column dtypes first: a quick view of the schema.
print(df.dtypes)

# Null count per column, largest first; only the top 20 are shown.
print("\nMissing values (top 20):")
na = df.isna().sum()
na = na.sort_values(ascending=False)
display(na.head(20))

# Value counts of the target, keeping missing labels as their own bucket.
print("\nTarget distribution:")
target_counts = df[TARGET_COL].value_counts(dropna=False)
display(target_counts)

# Summary statistics for the numeric columns (one row per column),
# with each column's null count appended as an extra "missing" column.
numeric_part = df.select_dtypes(include=[np.number])
num_desc = df.describe(include=[np.number]).T
num_desc["missing"] = numeric_part.isna().sum().to_numpy()
num_desc


In [None]:
# --- Step 3: Cleaning ---

# This mirrors the Streamlit Cleaning page defaults.
cleaning_options = {
    "target_col": TARGET_COL,
    "numeric_missing": "mean",
    "categorical_missing": "mode",
    "drop_duplicates": True,
    "outlier_method": "none",
    "outlier_cols": [],
    "zscore_threshold": 3.0,
    "mean_std_k": 3.0,
}
clean_cfg = CleaningConfig(**cleaning_options)

# `clean_dataframe` returns the cleaned frame together with a report object.
clean_df, clean_report = clean_dataframe(df, clean_cfg)

# Before/after row and null counts make the effect of cleaning visible.
print("Rows before:", len(df))
print("Rows after: ", len(clean_df))
print("Nulls before:", int(df.isna().sum().sum()))
print("Nulls after: ", int(clean_df.isna().sum().sum()))

clean_report


In [None]:
# --- Step 4: Train + evaluate models ---

# Split the features into numeric vs. everything else, similarly to the app.
feature_cols = [name for name in clean_df.columns if name != TARGET_COL]
numeric_cols = [name for name in feature_cols if pd.api.types.is_numeric_dtype(clean_df[name])]
categorical_cols = [name for name in feature_cols if name not in numeric_cols]

print("#numeric:", len(numeric_cols), "#categorical:", len(categorical_cols))

# Settings shared by every run; only `model_id` varies per model.
cfg_base = {"target_col": TARGET_COL, "test_size": 0.30, "random_state": 42}

# Metrics copied from each run into the comparison table, in column order.
metric_names = ("accuracy", "precision", "recall", "f1", "roc_auc")

runs = {}
rows = []
for model_id, spec in MODEL_SPECS.items():
    train_cfg = TrainConfig(model_id=model_id, **cfg_base)
    pipe, metrics = train_and_evaluate(
        clean_df,
        numeric_cols=numeric_cols,
        categorical_cols=categorical_cols,
        cfg=train_cfg,
    )
    # Keep the fitted pipeline and full metrics so the best run can be saved later.
    runs[model_id] = {"pipeline": pipe, "metrics": metrics}

    # One summary row per model; metrics absent from `metrics` become None.
    summary = {"model_id": model_id, "model_label": spec.label}
    summary.update((name, metrics.get(name)) for name in metric_names)
    rows.append(summary)

# Best F1 first.
results_df = pd.DataFrame(rows)
results_df = results_df.sort_values("f1", ascending=False)
results_df


In [None]:
# Pick the best model by F1
# The first row of `results_df` is taken as the winner.
best_id = str(results_df["model_id"].iloc[0])
best_label = MODEL_SPECS[best_id].label

print("Best model:", best_id, "(", best_label, ")")

# Look up the stored run for the winner and print its classification report.
best = runs[best_id]
report_text = best["metrics"]["classification_report"]
print(report_text)


In [None]:
# --- Step 5: Save + load the trained model artifact ---

artifact_path = ARTIFACTS_DIR / "credit_risk_model.joblib"

# Context stored next to the pipeline: which model won, its metrics, the
# cleaning settings used, and the feature columns it was trained on.
metadata = dict(
    model_id=best_id,
    model_label=best_label,
    metrics=best["metrics"],
    cleaning_config=clean_cfg.__dict__,
    numeric_cols=numeric_cols,
    categorical_cols=categorical_cols,
)

trained_pipeline = best["pipeline"]
save_artifact(artifact_path, pipeline=trained_pipeline, metadata=metadata)
print("Saved:", artifact_path)

# Read the artifact back and show which top-level keys it contains.
loaded = load_artifact(artifact_path)
loaded.keys()


In [None]:
# --- Step 6: Predict on a new sample ---


def _decode_label(value, names):
    """Return the display name for an encoded class value.

    Args:
        value: A class value as returned by ``predict`` or listed in ``classes_``.
        names: Sequence of class names indexed by encoded class value; may be empty.

    Returns:
        ``str(names[i])`` when ``value`` is an integer (or integral float) ``i``
        with ``0 <= i < len(names)``; otherwise ``str(value)``.
    """
    # NumPy integer scalars (e.g. np.int64) are not instances of Python `int`,
    # so they have to be listed explicitly or integer-encoded classes never decode.
    if isinstance(value, (int, np.integer)):
        idx = int(value)
    elif isinstance(value, (float, np.floating)) and float(value).is_integer():
        # Integral floats such as 1.0 decode too; NaN/inf/fractions fall through.
        idx = int(value)
    else:
        return str(value)
    # The lower bound keeps negative values from wrapping around to the end of `names`.
    return str(names[idx]) if 0 <= idx < len(names) else str(value)


pipe = loaded["pipeline"]
meta = loaded.get("metadata", {}) or {}

# Build a single-row payload using medians/modes from the cleaned dataset.
# Column lists come from the saved metadata, falling back to the notebook's own.
payload = {}
for c in meta.get("numeric_cols", numeric_cols):
    payload[c] = float(pd.to_numeric(clean_df[c], errors="coerce").median())
for c in meta.get("categorical_cols", categorical_cols):
    # pick the most frequent category (mode); "UNKNOWN" when the column is all-null
    mode = clean_df[c].dropna().astype(str).mode()
    payload[c] = str(mode.iloc[0]) if len(mode) else "UNKNOWN"

x = pd.DataFrame([payload])
pred = pipe.predict(x)[0]

# Decode prediction using target_names when available.
# A missing or None `target_names` is treated as "no names".
names = (meta.get("metrics") or {}).get("target_names")
names = [] if names is None else list(names)
label = _decode_label(pred, names)

print("prediction:", label)

if hasattr(pipe, "predict_proba"):
    proba = pipe.predict_proba(x)[0]

    # Find the class order matching the probability vector: prefer the "clf"
    # step's `classes_`, otherwise the pipeline's own `classes_` if it has one.
    classes = []
    if hasattr(pipe, "named_steps") and "clf" in pipe.named_steps and hasattr(pipe.named_steps["clf"], "classes_"):
        classes = list(pipe.named_steps["clf"].classes_)
    elif hasattr(pipe, "classes_"):
        classes = list(pipe.classes_)

    # Only show the table when classes and probabilities line up one-to-one.
    if classes and len(classes) == len(proba):
        display(
            pd.DataFrame(
                {
                    "class": [_decode_label(c, names) for c in classes],
                    "probability": [float(p) for p in proba],
                }
            )
            .sort_values("probability", ascending=False)
            .reset_index(drop=True)
        )
