In [None]:
import glob
import logging
from pathlib import Path

import pandas as pd
from tqdm import tqdm

from dapinet.analysis import (
    CVI_METRICS,
    create_per_cvi_matrix,
    evaluate_dataset,
    find_best_algorithms,
)

# Configure root logging once for the notebook: INFO and above, each record
# rendered as "<timestamp> <LEVEL>: <message>".
_LOG_FORMAT = "%(asctime)s %(levelname)s: %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Named logger used by the evaluation cells.
logger = logging.getLogger("cvi_evaluation")

In [2]:
# Input directory holding the .npz datasets and the directory that receives
# the CSV reports.
data_path = Path("datasets/real_world")
output_path = Path("results")

# Find dataset files: every *.npz directly under data_path, in sorted order so
# runs are deterministic, skipping files whose stem starts with "benchmark".
dataset_files = [
    f
    for f in sorted(glob.glob(str(data_path / "*.npz")))
    if not Path(f).stem.startswith("benchmark")
]

# Fail fast with an actionable message. With zero datasets the results frame
# built from `all_results` would be empty, and the downstream column selection
# would stop with an unrelated-looking KeyError instead.
if not dataset_files:
    raise FileNotFoundError(
        f"No non-benchmark .npz datasets found in {data_path.resolve()}"
    )

# Lazy %-style arguments: the message is only formatted if the record is emitted.
logger.info("Found %d datasets", len(dataset_files))
logger.info("Calculating CVIs: %s", list(CVI_METRICS.keys()))

# Calculate CVIs for all datasets, collecting one evaluate_dataset result per file.
all_results = []
for ds_file in tqdm(dataset_files, desc="Calculating CVIs"):
    result = evaluate_dataset(Path(ds_file), scale=True)
    all_results.append(result)
    logger.debug("Processed %s", Path(ds_file).stem)

# Create full results DataFrame from the collected evaluate_dataset results
df_full = pd.DataFrame(all_results)

# Create per-CVI matrices (datasets × algorithms)
cvi_matrices = create_per_cvi_matrix(df_full)

# Ensure the output directory exists before the first write; without this,
# to_csv raises OSError on a fresh checkout that has no results directory yet.
output_path.mkdir(parents=True, exist_ok=True)

# Save per-CVI CSVs, one file per entry of cvi_matrices
for cvi_name, df_cvi in cvi_matrices.items():
    output_file = output_path / f"cvi_{cvi_name}.csv"
    # NOTE(review): index=False drops the frame's index; confirm that
    # create_per_cvi_matrix keeps the dataset name as a regular column.
    df_cvi.to_csv(output_file, index=False)
    logger.info("Saved %s matrix to %s", cvi_name, output_file)

# Save per-CVI timing (datasets × CVIs): the "dataset" column plus one
# "<cvi>_time_ms" column for each key of CVI_METRICS.
time_columns = [f"{cvi_name}_time_ms" for cvi_name in CVI_METRICS]
df_time = df_full[["dataset", *time_columns]]
output_time = output_path / "cvi_times.csv"
df_time.to_csv(output_time, index=False)
logger.info("Saved CVI timing matrix to %s", output_time)

# Find best algorithms per CVI
df_best = find_best_algorithms(df_full)
output_best = output_path / "cvi_results.csv"
df_best.to_csv(output_best, index=False)
logger.info("Saved best algorithms to %s", output_best)

# Print summaries
banner = "=" * 60  # horizontal rule reused for every separator below

print("\n" + banner)
print("CVI EVALUATION COMPLETE")
print(banner)

# Each CVI_METRICS value is unpacked as a pair whose second item says whether
# larger scores are better; only that flag is needed here.
for cvi_name, (_, higher_is_better) in CVI_METRICS.items():
    direction = "higher" if higher_is_better else "lower"
    print(f"\n=== {cvi_name.upper()} ({direction} is better) ===")

    # Count wins per algorithm; skipped when df_best has no
    # "<cvi>_algorithm" column for this CVI.
    if f"{cvi_name}_algorithm" in df_best.columns:
        wins = df_best[f"{cvi_name}_algorithm"].value_counts()
        print(wins.to_string())

print("\n" + banner)
print("Output files:")
for cvi_name in CVI_METRICS:
    print(f"  - cvi_{cvi_name}.csv")
print("  - cvi_times.csv")
# Previously printed "cvi_results .csv" (stray space), which did not match the
# file name actually written.
print("  - cvi_results.csv")
print(banner)

2026-01-19 12:47:52,014 INFO: Found 20 datasets
2026-01-19 12:47:52,016 INFO: Calculating CVIs: ['silhouette', 'calinski_harabasz', 'davies_bouldin', 'dunn']
Calculating CVIs:   0%|          | 0/20 [00:00<?, ?it/s]

Calculating CVIs: 100%|██████████| 20/20 [00:29<00:00,  1.48s/it]
2026-01-19 12:48:21,700 INFO: Saved silhouette matrix to results\cvi_silhouette.csv
2026-01-19 12:48:21,706 INFO: Saved calinski_harabasz matrix to results\cvi_calinski_harabasz.csv
2026-01-19 12:48:21,710 INFO: Saved davies_bouldin matrix to results\cvi_davies_bouldin.csv
2026-01-19 12:48:21,714 INFO: Saved dunn matrix to results\cvi_dunn.csv
2026-01-19 12:48:21,723 INFO: Saved CVI timing matrix to results\cvi_times.csv
2026-01-19 12:48:21,735 INFO: Saved best algorithms to results\cvi_results.csv



CVI EVALUATION COMPLETE

=== SILHOUETTE (higher is better) ===
silhouette_algorithm
hdbscan                 10
optics                   4
affinity_propagation     2
dbscan                   1
k-means                  1
spectral_clustering      1
agglomerative            1

=== CALINSKI_HARABASZ (higher is better) ===
calinski_harabasz_algorithm
k-means    14
optics      3
hdbscan     2
dbscan      1

=== DAVIES_BOULDIN (lower is better) ===
davies_bouldin_algorithm
mean_shift              6
hdbscan                 5
affinity_propagation    2
agglomerative           2
spectral_clustering     2
dbscan                  2
optics                  1

=== DUNN (higher is better) ===
dunn_algorithm
hdbscan                12
spectral_clustering     3
optics                  3
dbscan                  1
agglomerative           1

Output files:
  - cvi_silhouette.csv
  - cvi_calinski_harabasz.csv
  - cvi_davies_bouldin.csv
  - cvi_dunn.csv
  - cvi_times.csv
  - cvi_results .csv
