In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [5]:
# All inputs and the combined output live in one directory; change it here
# instead of editing every path below.
RESULTS_DIR = Path("results")

# Input paths
oracle_time_path = RESULTS_DIR / "oracle_time.csv"
automl_runtime_path = RESULTS_DIR / "automl_runtime_results.csv"
cvi_times_path = RESULTS_DIR / "cvi_times.csv"
deep_model_time_path = RESULTS_DIR / "DAPINet_inference_time.csv"
mapped_table_path = RESULTS_DIR / "method_oracle_mapped_table.csv"

# Output
output_path = RESULTS_DIR / "full_results.csv"

# Echo the resolved paths as the cell output so a reader can verify them.
(
    oracle_time_path,
    automl_runtime_path,
    cvi_times_path,
    deep_model_time_path,
    mapped_table_path,
    output_path,
)

(WindowsPath('results/oracle_time.csv'),
 WindowsPath('results/automl_runtime_results.csv'),
 WindowsPath('results/cvi_times.csv'),
 WindowsPath('results/DAPINet_inference_time.csv'),
 WindowsPath('results/method_oracle_mapped_table.csv'),
 WindowsPath('results/full_results.csv'))

In [6]:
# Read the five CSV inputs into DataFrames.
mapped = pd.read_csv(mapped_table_path)
oracle_time = pd.read_csv(oracle_time_path)
cvi_times = pd.read_csv(cvi_times_path)
automl_runtime = pd.read_csv(automl_runtime_path)
deep_model_time = pd.read_csv(deep_model_time_path)

# Every oracle column other than the dataset key is treated as a timing column.
oracle_time_cols = [name for name in oracle_time.columns if name != "dataset"]

# Coerce the timings to numbers (cells that cannot be parsed become NaN) and
# scale them by 1000 to go from seconds to milliseconds.
oracle_time[oracle_time_cols] = (
    oracle_time[oracle_time_cols].apply(pd.to_numeric, errors="coerce").mul(1000.0)
)

# Cost of running all algorithms on a dataset: the row-wise sum of the timing
# columns. NaN cells are skipped by the sum rather than propagated.
oracle_time["oracle_total_time_ms"] = oracle_time[oracle_time_cols].sum(axis=1)

oracle_time.head()

Unnamed: 0,dataset,k-means,k-medians,spectral_clustering,ward,agglomerative,dbscan,hdbscan,optics,birch,gaussian,mean_shift,affinity_propagation,oracle_total_time_ms
0,BreastTissue,3385.61524,6541.783777,1257.88207,421.593547,439.76418,171.741381,203.490962,2662.695869,543.809507,438.987908,4821.374566,283.681242,21172.420249
1,BreastWisconsin,2729.940403,12285.473916,1555.949,1836.820812,2112.10293,250.642674,1024.125081,11659.775571,1514.692398,441.377467,33158.943862,5933.494961,74503.339075
2,Ecoli,5479.108339,15633.884101,1399.169299,1309.251392,1209.291186,264.436781,317.011985,7644.9975,504.48019,630.778279,20831.806201,1417.678951,56641.894204
3,Glass,4654.451582,9918.541519,1102.301955,826.348139,733.414874,208.130892,251.173928,4920.204752,365.686926,830.235008,9809.554015,846.314223,34466.357813
4,Haberman,2234.226658,2441.518599,800.003935,736.120278,966.87027,212.668546,275.416966,6908.336785,404.572794,479.673532,23836.151713,1031.344819,40326.904895


In [7]:
# Merge datasets: start from the method/oracle mapping table and attach each
# timing table by dataset name. Left joins keep every mapped row even when a
# timing source has no entry for that dataset (those cells become NaN).
# validate="many_to_one" makes pandas raise a MergeError if a timing table
# lists the same dataset more than once, instead of silently duplicating rows.
df = mapped.merge(
    oracle_time[["dataset", "oracle_total_time_ms", *oracle_time_cols]],
    on="dataset",
    how="left",
    validate="many_to_one",
)
for timing_table in (cvi_times, automl_runtime, deep_model_time):
    df = df.merge(timing_table, on="dataset", how="left", validate="many_to_one")

df.head()

Unnamed: 0,dataset,oracle_best_algorithm,oracle_best_ari,DeepModel_algorithm,DeepModel_oracle_ari,ML2DAC_statistical+info-theory+general_algorithm,ML2DAC_statistical+info-theory+general_oracle_ari,AML4C_DBCV_algorithm,AML4C_DBCV_oracle_ari,AML4C_COP_algorithm,...,dunn_time_ms,AML4C_COP_runtime_ms,AML4C_DBCV_runtime_ms,AutoClust_runtime_ms,AutoCluster_CH_runtime_ms,AutoCluster_DBI_runtime_ms,AutoCluster_SIL_runtime_ms,ML2DAC_statistical+general_runtime_ms,ML2DAC_statistical+info-theory+general_runtime_ms,inference_time_ms
0,BreastTissue,spectral_clustering,0.415214,spectral_clustering,0.415214,k-means,0.259222,k-means,0.259222,k-means,...,9.2809,370.30983,381.003141,144.170761,132.81846,154.639244,395.386934,6.690502,453.813314,428.947
1,BreastWisconsin,spectral_clustering,0.78591,spectral_clustering,0.78591,k-means,0.670721,gaussian,0.774016,birch,...,192.4592,950.505972,842.879534,2157.743454,2411.898613,2401.755333,2342.360973,109.139204,845.066786,16.9042
2,Ecoli,birch,0.795428,spectral_clustering,0.720195,gaussian,0.650539,gaussian,0.650539,ward,...,82.8905,224.824667,230.48234,665.667295,247.705221,249.371052,1331.945658,6.528139,360.613108,17.9686
3,Glass,birch,0.291949,spectral_clustering,0.225414,gaussian,0.225683,gaussian,0.225683,ward,...,32.5396,390.702724,379.758596,341.969728,186.126947,208.799839,1222.349167,9.271145,399.825335,13.0419
4,Haberman,mean_shift,0.183416,mean_shift,0.183416,dbscan,0.157664,dbscan,0.157664,ward,...,49.716,33.637047,33.964157,906.628609,193.393707,242.584467,1473.00601,4.646301,33.579826,11.3663


In [8]:
# Lookup table for individual algorithm runtimes: one row per dataset (index),
# one column per oracle timing column.
oracle_time_lookup = df[["dataset", *oracle_time_cols]].set_index("dataset")


def algo_time_ms(dataset: str, algo: str | float | None) -> float:
    if algo is None or (isinstance(algo, float) and np.isnan(algo)):
        return np.nan
    if algo not in oracle_time_lookup.columns:
        return np.nan
    return float(oracle_time_lookup.loc[dataset, algo])


# CVI timing: total oracle time (all algorithms run) plus the time spent
# computing the index itself.
cvi_methods = ["silhouette", "calinski_harabasz", "davies_bouldin", "dunn"]
for cvi in cvi_methods:
    cvi_time_col = f"{cvi}_time_ms"
    out_col = f"{cvi}_time_ms_total"
    if cvi_time_col in df.columns:
        df[out_col] = df["oracle_total_time_ms"] + df[cvi_time_col]

# Deep model timing: inference time only. The runtime of the algorithm the
# model selects is NOT added here.
# Guard on both columns so a missing inference table skips this step instead
# of raising KeyError.
if {"DeepModel_algorithm", "inference_time_ms"}.issubset(df.columns):
    df["DeepModel_time_ms"] = df["inference_time_ms"]

# AutoML timing: expose each "<method>_runtime_ms" column as "<method>_time_ms".
# removesuffix strips only the trailing marker; str.replace would also remove
# any earlier occurrence inside the method name.
runtime_suffix = "_runtime_ms"
automl_runtime_cols = [c for c in df.columns if c.endswith(runtime_suffix)]
for col in automl_runtime_cols:
    method = col.removesuffix(runtime_suffix)
    df[f"{method}_time_ms"] = df[col]

# Oracle timing: the oracle's cost is the time to run every algorithm.
df["oracle_time_ms"] = df["oracle_total_time_ms"]

df.filter(regex="_time_ms").head()

Unnamed: 0,oracle_total_time_ms,silhouette_time_ms,calinski_harabasz_time_ms,davies_bouldin_time_ms,dunn_time_ms,inference_time_ms,silhouette_time_ms_total,calinski_harabasz_time_ms_total,davies_bouldin_time_ms_total,dunn_time_ms_total,DeepModel_time_ms,AML4C_COP_time_ms,AML4C_DBCV_time_ms,AutoClust_time_ms,AutoCluster_CH_time_ms,AutoCluster_DBI_time_ms,AutoCluster_SIL_time_ms,ML2DAC_statistical+general_time_ms,ML2DAC_statistical+info-theory+general_time_ms,oracle_time_ms
0,21172.420249,6.2504,2.1395,10.0493,9.2809,428.947,21178.670649,21174.559749,21182.469549,21181.701149,428.947,370.30983,381.003141,144.170761,132.81846,154.639244,395.386934,6.690502,453.813314,21172.420249
1,74503.339075,39.4106,3.1417,11.397,192.4592,16.9042,74542.749675,74506.480775,74514.736075,74695.798275,16.9042,950.505972,842.879534,2157.743454,2411.898613,2401.755333,2342.360973,109.139204,845.066786,74503.339075
2,56641.894204,15.5992,2.8413,12.1839,82.8905,17.9686,56657.493404,56644.735504,56654.078104,56724.784704,17.9686,224.824667,230.48234,665.667295,247.705221,249.371052,1331.945658,6.528139,360.613108,56641.894204
3,34466.357813,9.4182,2.3837,11.2938,32.5396,13.0419,34475.776013,34468.741513,34477.651613,34498.897413,13.0419,390.702724,379.758596,341.969728,186.126947,208.799839,1222.349167,9.271145,399.825335,34466.357813
4,40326.904895,11.1749,1.7929,5.3678,49.716,11.3663,40338.079795,40328.697795,40332.272695,40376.620895,11.3663,33.637047,33.964157,906.628609,193.393707,242.584467,1473.00601,4.646301,33.579826,40326.904895


In [9]:
# Arrange the columns so that each method's chosen algorithm, its ARI and its
# timing sit next to each other, then write the table to disk.

# Leading block: dataset key and oracle summary, followed by the DeepModel
# triple. Only columns actually present in df are kept.
deep_model_cols = ["DeepModel_algorithm", "DeepModel_oracle_ari", "DeepModel_time_ms"]
ordered_cols: list[str] = [
    name
    for name in (
        "dataset",
        "oracle_best_algorithm",
        "oracle_best_ari",
        "oracle_time_ms",
        *deep_model_cols,
    )
    if name in df.columns
]

# Remaining methods are discovered from their "<prefix>_algorithm" columns, in
# the order those columns appear in df. dict.fromkeys removes repeated
# prefixes while keeping first-seen order.
method_prefixes: list[str] = list(
    dict.fromkeys(
        name[: -len("_algorithm")]
        for name in df.columns
        if name.endswith("_algorithm")
        and name not in {"oracle_best_algorithm", "DeepModel_algorithm"}
    )
)

for prefix in method_prefixes:
    # CVI-based selectors use their "_time_ms_total" column; every other
    # method uses its plain "_time_ms" column.
    time_col = f"{prefix}_time_ms_total" if prefix in cvi_methods else f"{prefix}_time_ms"
    for name in (f"{prefix}_algorithm", f"{prefix}_oracle_ari", time_col):
        if name in df.columns and name not in ordered_cols:
            ordered_cols.append(name)

# Raw timing inputs that must not reappear in the tail of the table.
raw_time_cols = {
    *oracle_time_cols,
    *automl_runtime_cols,
    "oracle_total_time_ms",
    "inference_time_ms",
    *(f"{cvi}_time_ms" for cvi in cvi_methods),
}

# Whatever is left (extras, diagnostics) follows the ordered block.
remaining = [
    name for name in df.columns if not (name in ordered_cols or name in raw_time_cols)
]
df = df[[*ordered_cols, *remaining]]

# Save full results
df.to_csv(output_path, index=False)
print(f"Saved full results to: {output_path}")

df.head()

Saved full results to: results\full_results.csv


Unnamed: 0,dataset,oracle_best_algorithm,oracle_best_ari,oracle_time_ms,DeepModel_algorithm,DeepModel_oracle_ari,DeepModel_time_ms,ML2DAC_statistical+info-theory+general_algorithm,ML2DAC_statistical+info-theory+general_oracle_ari,ML2DAC_statistical+info-theory+general_time_ms,...,davies_bouldin_time_ms_total,calinski_harabasz_algorithm,calinski_harabasz_oracle_ari,calinski_harabasz_time_ms_total,silhouette_algorithm,silhouette_oracle_ari,silhouette_time_ms_total,dunn_algorithm,dunn_oracle_ari,dunn_time_ms_total
0,BreastTissue,spectral_clustering,0.415214,21172.420249,spectral_clustering,0.415214,428.947,k-means,0.259222,453.813314,...,21182.469549,k-means,0.259222,21174.559749,hdbscan,0.181302,21178.670649,hdbscan,0.181302,21181.701149
1,BreastWisconsin,spectral_clustering,0.78591,74503.339075,spectral_clustering,0.78591,16.9042,k-means,0.670721,845.066786,...,74514.736075,k-means,0.670721,74506.480775,hdbscan,0.155987,74542.749675,hdbscan,0.155987,74695.798275
2,Ecoli,birch,0.795428,56641.894204,spectral_clustering,0.720195,17.9686,gaussian,0.650539,360.613108,...,56654.078104,optics,0.519053,56644.735504,optics,0.519053,56657.493404,hdbscan,0.40425,56724.784704
3,Glass,birch,0.291949,34466.357813,spectral_clustering,0.225414,13.0419,gaussian,0.225683,399.825335,...,34477.651613,dbscan,0.250715,34468.741513,dbscan,0.250715,34475.776013,dbscan,0.250715,34498.897413
4,Haberman,mean_shift,0.183416,40326.904895,mean_shift,0.183416,11.3663,dbscan,0.157664,33.579826,...,40332.272695,k-means,-0.001102,40328.697795,affinity_propagation,0.029962,40338.079795,spectral_clustering,0.079278,40376.620895
