In [0]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before executing each cell so edits to imported source files take effect.
# NOTE(review): the next cell calls dbutils.library.restartPython(); if that
# resets the interpreter state, this extension presumably has to be loaded
# after the restart rather than before it — confirm the intended cell order.
%load_ext autoreload
%autoreload 2

In [0]:
# Restart the notebook's Python process through the Databricks utilities.
# NOTE(review): presumably done so that libraries installed for this session
# become importable in the cells below — confirm this is still required.
dbutils.library.restartPython()

In [0]:
import json
import multiprocessing
# Explicit submodule import: `import multiprocessing` alone does not guarantee
# that `multiprocessing.pool` (used for ThreadPool below) is loaded.
import multiprocessing.pool
import os
from datetime import datetime
from itertools import product

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from ucimlrepo import fetch_ucirepo

from src.python.domain.Algorithms import TemperatureScaler
from src.python.domain.domain_services import is_valid_dataset
from src.python.domain.domain_services import get_nn_model, get_nn_inference
from src.python.domain.domain_services import classification_diagnostics
from src.python.domain.domain_services import create_bootstrap_dataset
from src.python.domain.domain_services import white_list_uci_ids
from src.python.service.utils import (
    test_dataset_binary_numeric,
    convert_to_binary_dataset,
)

# Load the dataset catalogue and keep only the entries whose "uci_id" is in
# the project whitelist. The context manager closes the file handle as soon
# as parsing is done instead of leaving it open for the kernel's lifetime.
with open("uci_datasets_meta1.json", "rb") as meta_file:
    dataset_meta_ls = [
        x for x in json.load(meta_file) if x["uci_id"] in white_list_uci_ids
    ]



In [0]:
# The run configuration arrives as a JSON string in the "config" widget.
conf_dict = json.loads(dbutils.widgets.get("config"))
print(conf_dict)

# Fail fast: later cells read the globals `model_params` and
# `run_params["run_output"]`. Without this check a malformed config would only
# surface after all the expensive runs have finished.
missing_keys = [k for k in ("model_params", "run_params") if k not in conf_dict]
if missing_keys:
    raise KeyError(f"config widget is missing required keys: {missing_keys}")
if "run_output" not in conf_dict["run_params"]:
    raise KeyError("config widget is missing required key: run_params['run_output']")

# Publish every top-level config entry as a notebook-level global variable.
for key, value in conf_dict.items():
    globals()[key] = value

In [0]:
def _fit_calibrators(calib_train_df, model_params) -> list:
    """Fit every calibration model on the calibration-training split.

    Args:
        calib_train_df: DataFrame holding at least the columns
            ``"predicted_prob"`` and ``"label"``.
        model_params: Mapping providing ``"calibration_forecal_n_bins"`` and
            ``"calibration_forecal_n_samples"``.

    Returns:
        A list of dicts with keys ``"model"``, ``"name"`` and ``"features"``,
        in the order: forecal, isotonic, platt, tempscaler.
    """
    calibration_models = []

    # NOTE: a "forecal-plus" variant (random forest fitted on every bootstrap
    # column except "label_mean", with a monotonic constraint on
    # "predicted_prob_mean" only) used to be sketched here but is disabled.

    # forecal based calibration: a random forest regressing the bootstrapped
    # mean label on the bootstrapped mean predicted probability.
    bootstrap_df = create_bootstrap_dataset(
        calib_train_df,
        model_params["calibration_forecal_n_bins"],
        model_params["calibration_forecal_n_samples"],
        random_state=42,
    )
    forecal_X = bootstrap_df[["predicted_prob_mean"]]
    forecal_y = bootstrap_df["label_mean"]

    fc = RandomForestRegressor(
        n_estimators=100,
        random_state=42,
        # single feature, constrained to be monotonically increasing
        monotonic_cst=[1],
    )
    fc.fit(forecal_X, forecal_y)
    print(fc)
    calibration_models.append(
        {"model": fc, "name": "forecal", "features": ["predicted_prob_mean"]}
    )

    # The remaining calibrators all train on the same raw probabilities and
    # labels, so those arrays are built once and shared.
    raw_probs = calib_train_df["predicted_prob"].values.reshape(-1, 1)
    labels = calib_train_df["label"].values

    # isotonic regression based calibration (expects a 1-D input)
    ir = IsotonicRegression(out_of_bounds="clip")
    ir.fit(raw_probs.flatten(), labels)
    calibration_models.append(
        {"model": ir, "name": "isotonic", "features": ["predicted_prob"]}
    )

    # logistic regression based calibration called platt scaling
    lr = LogisticRegression()
    lr.fit(raw_probs, labels)
    calibration_models.append(
        {"model": lr, "name": "platt", "features": ["predicted_prob"]}
    )

    # temperature scaling
    temp_scaler = TemperatureScaler()
    temp_scaler.fit(raw_probs, labels)
    calibration_models.append(
        {"model": temp_scaler, "name": "tempscaler", "features": ["predicted_prob"]}
    )

    return calibration_models


def _apply_calibrator(calibrator, eval_features):
    """Return calibrated probabilities of one calibrator for ``eval_features``.

    Args:
        calibrator: Dict with keys ``"model"``, ``"name"`` and ``"features"``.
            The name ``"baseline"`` returns the raw ``"predicted_prob"`` column.
        eval_features: DataFrame containing a ``"predicted_prob"`` column.

    Raises:
        ValueError: If the calibrator name is not recognised.
    """
    name = calibrator["name"]
    if name == "baseline":
        return eval_features["predicted_prob"]
    if name.startswith("fore"):
        # The forest was fitted on "<column>_mean" names, so map the
        # evaluation columns onto the same naming before selecting features.
        renamed = eval_features.rename(columns=lambda col: col + "_mean")
        return calibrator["model"].predict(renamed[calibrator["features"]])
    if name == "platt":
        # column 1 of predict_proba is the probability of the positive class
        return calibrator["model"].predict_proba(
            eval_features[calibrator["features"]]
        )[:, 1]
    if name == "isotonic":
        return calibrator["model"].predict(eval_features[calibrator["features"]])
    if name == "tempscaler":
        return calibrator["model"].predict(eval_features["predicted_prob"])
    raise ValueError(f"Unknown calibration model: {name}")


def get_results(input_tuple) -> "dict | None":
    """Run one calibration experiment for a (dataset, model-params) pair.

    The dataset is fetched with ``fetch_ucirepo``, converted with
    ``convert_to_binary_dataset`` and split 60/40 into train/validation. The
    base model comes from ``get_nn_model`` and its validation probabilities
    from ``get_nn_inference``. Those probabilities are split again: 40% to fit
    the calibrators and 60% to evaluate them, next to an uncalibrated
    "baseline", using ``classification_diagnostics``.

    Args:
        input_tuple: Sequence whose first element is the dataset metadata dict
            (reads ``"name"`` and ``"uci_id"``) and whose second element is the
            model-parameter dict (reads the ``"base_nn_*"`` and
            ``"calibration_forecal_*"`` keys).

    Returns:
        A dict with keys ``"dataset"``, ``"results"``, ``"uri_id"``,
        ``"dataset_meta"`` and ``"model_params"``, or ``None`` when
        ``is_valid_dataset`` rejects the dataset.
    """
    dataset_meta, model_params = input_tuple[0], input_tuple[1]
    if not is_valid_dataset(dataset_meta):
        return None

    print(f"Processing dataset: {dataset_meta['name']}")
    dataset = fetch_ucirepo(id=dataset_meta["uci_id"])
    X, _feat_names, y = convert_to_binary_dataset(
        dataset.data.features, dataset.data.targets.iloc[:, 0]
    )
    test_dataset_binary_numeric(X, y)
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.4, random_state=42
    )

    model, scaler = get_nn_model(
        X_train,
        y_train,
        X_val,
        y_val,
        model_params["base_nn_hidden_size"],
        model_params["base_nn_hidden_layers"],
        model_params["base_nn_n_epochs"],
    )
    probs = get_nn_inference(model, X_val, y_val, scaler)

    # Validation features, base-model probability and true label side by side;
    # the index is reset so the three frames align row by row.
    calib_df = pd.concat(
        [
            X_val.reset_index(drop=True),
            pd.DataFrame(probs, columns=["predicted_prob"]),
            pd.DataFrame(y_val.values, columns=["label"]),
        ],
        axis=1,
    )

    # 40% of the rows fit the calibrators, the remaining 60% evaluate them.
    calib_train_df, calib_val_df = train_test_split(
        calib_df, test_size=0.6, random_state=42
    )

    calibration_models = _fit_calibrators(calib_train_df, model_params)

    eval_features = calib_val_df.drop(columns=["label"])
    eval_labels = calib_val_df["label"].values

    baseline = {"model": None, "name": "baseline", "features": None}
    result_ls = []
    for calibrator in [baseline] + calibration_models:
        calibrated_probs = _apply_calibrator(calibrator, eval_features)

        # The first returned element (the plot handle) is not used here.
        _, ece, auc = classification_diagnostics(
            eval_labels, calibrated_probs, n_bins=10
        )
        result_ls.append({"name": calibrator["name"], "ece": ece, "auc": auc})
        print(
            f"Expected Calibration Error (ECE) for {calibrator['name']}: {ece:.4f} AUC: {auc:.4f}"
        )

    return {
        "dataset": dataset_meta["name"],
        "results": result_ls,
        # key is spelled "uri_id" (sic); kept unchanged so the output schema
        # stays compatible with whatever already reads these result files
        "uri_id": dataset_meta["uci_id"],
        "dataset_meta": dataset_meta,
        "model_params": model_params,
    }


## NN Model training

In [0]:
# One run per (dataset, model configuration) pair: the datasets form the outer
# loop and the model configurations the inner loop.
arg_list = [
    (dataset_meta, params)
    for dataset_meta in dataset_meta_ls
    for params in model_params
]
total_ = len(arg_list)
print(f"Total number of runs: {total_}")

In [0]:
# Run every experiment on a thread pool with one worker per run, collecting
# results in completion order while the progress bar tracks finished runs.
# ThreadPool rejects a size of 0, so keep at least one worker: an empty
# `arg_list` then produces an empty `result_list` instead of a ValueError.
with multiprocessing.pool.ThreadPool(max(total_, 1)) as pool, tqdm(
    total=total_
) as pbar:
    result_list = []
    for result in pool.imap_unordered(get_results, arg_list):
        # `result` is None for datasets that get_results skipped as invalid
        result_list.append(result)
        pbar.update()

In [0]:
# Serialize first: if any result is not JSON-serializable the error is raised
# before the output path is opened (and truncated), so a failed dump never
# leaves a half-written or empty results file behind.
serialized_run = json.dumps({"meta": run_params, "data": result_list})
with open(run_params["run_output"], "w") as f:
    f.write(serialized_run)
# Report success only once the file handle has been closed.
print(f"Results saved at: {run_params['run_output']}")