# In [None]:

import os
import json
import time
import logging
from datetime import datetime

import pandas as pd
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix
import joblib


def pick_input_csv():
    """Return the first candidate dataset path that exists as a regular file.

    Candidates are checked in priority order, relative to the current
    working directory:

    1. ``data/bank.csv``
    2. ``data/raw/dataset.csv``
    3. ``data/dataset.csv``

    Returns:
        str: The relative path of the first candidate found.

    Raises:
        FileNotFoundError: If none of the candidates is an existing file.
            The message lists every path that was searched and the working
            directory they were resolved against.
    """
    cands = (
        os.path.join("data", "bank.csv"),
        os.path.join("data", "raw", "dataset.csv"),
        os.path.join("data", "dataset.csv"),
    )
    for p in cands:
        # isfile rather than exists: a directory that happens to carry a
        # candidate name must not be returned as the input CSV.
        if os.path.isfile(p):
            return p
    raise FileNotFoundError(
        "File Not Found: none of "
        + ", ".join(cands)
        + f" (searched relative to {os.getcwd()})"
    )


def setup_logger(log_dir="logs"):
    """Configure the ``capstone_min`` logger to write to a file and the console.

    The log directory is created if needed. Any handlers already attached to
    the logger are closed and removed first, so calling this function more
    than once (for example when re-running a notebook cell) neither
    duplicates output nor leaves the previous log file handle open.

    Args:
        log_dir: Directory in which ``pipeline.log`` is created.

    Returns:
        tuple: ``(logger, log_path)`` where ``logger`` is the configured
        ``logging.Logger`` and ``log_path`` is the path of the log file.
    """
    os.makedirs(log_dir, exist_ok=True)
    log_path = os.path.join(log_dir, "pipeline.log")

    logger = logging.getLogger("capstone_min")
    logger.setLevel(logging.INFO)

    # Detach AND close stale handlers; merely resetting the list would leak
    # the FileHandler's open stream from an earlier call.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()

    fmt = logging.Formatter("%(asctime)s | %(levelname)s | %(message)s", "%Y-%m-%d %H:%M:%S")
    fh = logging.FileHandler(log_path, encoding="utf-8")
    fh.setFormatter(fmt)
    sh = logging.StreamHandler()
    sh.setFormatter(fmt)

    logger.addHandler(fh)
    logger.addHandler(sh)
    return logger, log_path


def save_charts(y_test, pred, proba, df_all, out_dir, logger):
    """
    Render the run's diagnostic figures as PNG files inside ``out_dir``.

    Figures written:
    1) 01_target_distribution.png - bar chart of the ``deposit`` value counts
       (only when ``df_all`` has a ``deposit`` column)
    2) 02_confusion_matrix.png - 2x2 confusion matrix of ``y_test`` vs ``pred``
       over the labels 0 and 1
    3) 03_probability_distribution.png - overlaid histograms of ``proba``,
       split by true label (only when ``proba`` is not None)

    Every saved path is reported through ``logger.info``.
    """
    os.makedirs(out_dir, exist_ok=True)

    def _emit(filename):
        # Finalize the current pyplot figure: tighten layout, write it to
        # out_dir at 160 dpi, close it, and log the destination.
        target_path = os.path.join(out_dir, filename)
        plt.tight_layout()
        plt.savefig(target_path, dpi=160)
        plt.close()
        logger.info(f"Saved chart: {target_path}")

    # Figure 1: class balance of the target column.
    if "deposit" in df_all.columns:
        class_counts = df_all["deposit"].value_counts()
        plt.figure()
        class_counts.plot(kind="bar")
        plt.title("Target Distribution (deposit)")
        plt.xlabel("deposit")
        plt.ylabel("count")
        _emit("01_target_distribution.png")

    # Figure 2: confusion matrix with the count drawn in each cell.
    matrix = confusion_matrix(y_test, pred, labels=[0, 1])
    tick_positions = [0, 1]
    plt.figure()
    plt.imshow(matrix, interpolation="nearest")
    plt.title("Confusion Matrix")
    plt.xticks(tick_positions, ["pred=0", "pred=1"])
    plt.yticks(tick_positions, ["true=0", "true=1"])
    for row_idx, row in enumerate(matrix):
        for col_idx, count in enumerate(row):
            # x is the predicted label (column), y is the true label (row).
            plt.text(col_idx, row_idx, str(count), ha="center", va="center")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    _emit("02_confusion_matrix.png")

    # Figure 3: predicted probabilities, one histogram per true label.
    if proba is None:
        return

    true_labels = pd.Series(y_test).to_numpy()
    scores = pd.Series(proba).to_numpy()

    plt.figure()
    for cls, legend_tag in ((0, "true=0"), (1, "true=1")):
        plt.hist(scores[true_labels == cls], bins=30, alpha=0.7, label=legend_tag)
    plt.title("Predicted Probability Distribution (by True Label)")
    plt.xlabel("P(deposit=yes)")
    plt.ylabel("count")
    plt.legend()
    _emit("03_probability_distribution.png")


def _load_clean_frame(in_path, target, logger):
    """Read the CSV at ``in_path`` and return ``(df, y)`` ready for modelling.

    Cleaning steps: column names are stripped, lower-cased and have spaces and
    hyphens replaced by underscores; duplicate rows and fully empty rows are
    dropped; rows whose target is not ``yes``/``no`` (case-insensitive, after
    stripping) are dropped.

    Args:
        in_path: Path of the CSV file to read.
        target: Name of the target column (after column-name normalization).
        logger: Logger used to report the raw shape and dropped rows.

    Returns:
        tuple: ``(df, y)`` where ``df`` is the cleaned frame (still containing
        the target column) and ``y`` is the integer target (yes -> 1, no -> 0)
        aligned with ``df``'s index.

    Raises:
        ValueError: If the target column is missing, or if no row has a valid
            target value.
    """
    df = pd.read_csv(in_path)
    logger.info(f"Raw shape: {df.shape}")

    # Cleaning: standardize column names, remove duplicates, remove blank rows
    df.columns = [str(c).strip().lower().replace(" ", "_").replace("-", "_") for c in df.columns]
    df = df.drop_duplicates()
    df = df.dropna(how="all")

    if target not in df.columns:
        raise ValueError(
            f"Target column not found: {target!r} (available columns: {list(df.columns)})"
        )

    # Target mapping yes/no -> 1/0 via map, which avoids the downcasting
    # warning that Series.replace can emit. Anything else (including missing
    # values, which become the string "nan") maps to NaN and is dropped.
    y_raw = df[target].astype(str).str.strip().str.lower()
    y_mapped = y_raw.map({"yes": 1, "no": 0})
    valid_mask = y_mapped.notna()
    bad = int((~valid_mask).sum())
    if bad > 0:
        logger.info(f"Drop rows with invalid target: {bad}")

    # Fail early with a clear message instead of an obscure split error later.
    if not valid_mask.any():
        raise ValueError(
            f"No rows with a valid {target!r} value (expected 'yes' or 'no') in {in_path}"
        )

    df = df.loc[valid_mask].copy()
    y = y_mapped.loc[valid_mask].astype(int)
    return df, y


def _build_pipeline(feature_names):
    """Build the impute -> one-hot -> logistic-regression pipeline.

    NOTE(review): every feature column, numeric ones included, is treated as
    categorical and one-hot encoded; unseen values at predict time are
    ignored by the encoder. Confirm this is intended for numeric columns.
    """
    pre = ColumnTransformer(
        transformers=[
            ("cat", Pipeline(steps=[
                ("imputer", SimpleImputer(strategy="most_frequent")),
                ("onehot", OneHotEncoder(handle_unknown="ignore")),
            ]), list(feature_names))
        ],
        remainder="drop"
    )
    return Pipeline(steps=[
        ("preprocess", pre),
        ("model", LogisticRegression(max_iter=200))
    ])


def main():
    """Run the end-to-end pipeline: load, clean, train, evaluate, export.

    Side effects (all relative to the current working directory):
    - writes ``outputs/metrics.json`` and ``outputs/predictions.csv``
    - writes ``models/model.joblib``
    - writes charts into ``dashboards/``
    - logs progress through the logger returned by ``setup_logger``

    Raises:
        FileNotFoundError: If no input CSV is found (from ``pick_input_csv``).
        ValueError: If the target column is missing or has no valid rows.
    """
    t0 = time.time()
    logger, log_path = setup_logger()

    # Best-effort opt-in: pandas builds without this option raise an
    # OptionError (a KeyError subclass). The pipeline does not depend on it,
    # so an unavailable option must not abort the run.
    try:
        pd.set_option("future.no_silent_downcasting", True)
    except KeyError:
        logger.info("pandas option future.no_silent_downcasting unavailable; skipping")

    os.makedirs("outputs", exist_ok=True)
    os.makedirs("models", exist_ok=True)
    os.makedirs("dashboards", exist_ok=True)

    in_path = pick_input_csv()
    logger.info(f"Input: {in_path}")

    target = "deposit"
    df, y = _load_clean_frame(in_path, target, logger)
    X = df.drop(columns=[target])

    # Split; stratify only when both classes are present
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y if y.nunique() > 1 else None
    )

    pipe = _build_pipeline(X.columns.tolist())
    pipe.fit(X_train, y_train)

    pred = pipe.predict(X_test)
    proba = pipe.predict_proba(X_test)[:, 1] if hasattr(pipe, "predict_proba") else None

    acc = float(accuracy_score(y_test, pred))
    f1 = float(f1_score(y_test, pred, zero_division=0))
    # ROC AUC is only computed when the test split contains both classes.
    auc = float(roc_auc_score(y_test, proba)) if proba is not None and y_test.nunique() > 1 else None

    metrics = {
        "input": in_path,
        "rows": int(df.shape[0]),
        "cols": int(df.shape[1]),
        "train_rows": int(len(X_train)),
        "test_rows": int(len(X_test)),
        "accuracy": acc,
        "f1": f1,
        "roc_auc": auc,
        "run_time_sec": round(time.time() - t0, 3),
        "timestamp": datetime.now().isoformat(timespec="seconds"),
    }
    logger.info(f"Metrics: acc={acc:.4f}, f1={f1:.4f}, auc={auc if auc is not None else 'NA'}")

    # Export metrics
    metrics_path = os.path.join("outputs", "metrics.json")
    with open(metrics_path, "w", encoding="utf-8") as f:
        json.dump(metrics, f, ensure_ascii=False, indent=2)

    # Export predictions: test features plus true label, predicted label and
    # (when available) the predicted probability of the positive class
    out_pred = X_test.copy()
    out_pred["y_true"] = list(y_test.values)
    out_pred["y_pred"] = list(pred)
    if proba is not None:
        out_pred["proba_yes"] = list(proba)
    pred_path = os.path.join("outputs", "predictions.csv")
    out_pred.to_csv(pred_path, index=False)

    # Save model
    model_path = os.path.join("models", "model.joblib")
    joblib.dump(pipe, model_path)

    # Visualization
    save_charts(
        y_test=y_test,
        pred=pred,
        proba=proba,
        df_all=df,
        out_dir="dashboards",
        logger=logger
    )

    logger.info(f"Saved: {metrics_path}")
    logger.info(f"Saved: {pred_path}")
    logger.info(f"Saved: {model_path}")
    logger.info("Charts saved in: dashboards/")
    logger.info(f"Log: {log_path}")
    logger.info("DONE")


# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
