In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score
from typing import List
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform
from sklearn.metrics import silhouette_score

import utils_HardnessMDL as utils_MDL

In [None]:
def plot_distribution_by_error(dfs: List[pd.DataFrame], measure_cols: List[str], output_path: str = None):
    """Draw one violin plot per measure, split by the ``is_error`` column.

    The frames in ``dfs`` are stacked row-wise and every column named in
    ``measure_cols`` gets its own panel in a two-row grid. The figure is
    written to ``output_path`` when one is given and is always closed
    afterwards.

    Args:
        dfs: Frames to pool row-wise; the pooled frame is expected to provide
            ``is_error`` and every column listed in ``measure_cols``.
        measure_cols: Names of the columns to plot, one panel each.
        output_path: Destination image file. When falsy the figure is built
            and discarded without being saved.

    Raises:
        ValueError: If ``measure_cols`` is empty (there is nothing to plot).
    """
    if not measure_cols:
        raise ValueError("measure_cols must contain at least one column name")

    df_all = pd.concat(dfs, ignore_index=True)

    n_measures = len(measure_cols)
    n_rows = 2
    n_cols = (n_measures + 1) // 2  # ceil(n_measures / 2)

    # Size the figure from the actual number of grid columns. Deriving the
    # width from n_measures // 2 gave a zero-width figure for one measure and
    # an under-sized one for odd counts.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(4 * n_cols, 8))
    axes = axes.flatten()

    try:
        for ax, col in zip(axes, measure_cols):
            sns.violinplot(
                data=df_all, x="is_error", y=col,
                ax=ax, palette="Set2", inner="box", hue="is_error"
            )
            ax.set_title(f"{col} distribution")
            ax.set_xlabel("is_error (0=ok, 1=error)")
            ax.set_ylabel(col)

        # With an odd number of measures the last grid slot has no data;
        # hide it instead of leaving an empty frame in the image.
        for spare_ax in axes[n_measures:]:
            spare_ax.set_visible(False)

        fig.tight_layout()
        if output_path:
            fig.savefig(output_path)
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)

In [None]:
def plot_correlation_heatmap(df: pd.DataFrame, title: str = "corr", method: str = 'kendall', output_path: str = None, figsize=(10, 8)):
    """Render the pairwise correlation matrix of ``df`` as an annotated heatmap.

    Args:
        df: Frame whose columns are correlated against each other.
        title: Text placed above the heatmap.
        method: Correlation method forwarded to ``DataFrame.corr``.
        output_path: Destination image file; nothing is written when falsy.
        figsize: Figure size forwarded to matplotlib.

    The figure is closed before returning, whether or not it was saved.
    """
    corr_matrix = df.corr(method=method)

    fig, ax = plt.subplots(figsize=figsize)
    sns.heatmap(
        corr_matrix,
        ax=ax,
        annot=True,
        fmt=".2f",
        cmap="coolwarm",
        center=0,  # diverging palette centred on zero correlation
        square=True,
        cbar_kws={"shrink": 0.8},
    )
    ax.set_title(title, fontsize=14)
    fig.tight_layout()

    if output_path:
        fig.savefig(output_path)
    plt.close(fig)

In [None]:
# Measure columns taken from the per-dataset `*_hardness_mdl.csv` frames.
hardness_mdl_cols = [
    'r_min',
    'r_med',
    'relative_position',
    'pseudo_probability',
    'normalized_entropy',
    'description_length_margin',
    'description_length_true_cost',
    'kullback_leibler_divergence',
]

# Measure columns compared against the ones above; presumably the PyHard
# output columns of the `*_pyhard.csv` frames (verify against the CSV schema).
pyhard_cols = [
    'feature_kDN',
    'feature_DS',
    'feature_DCP',
    'feature_TD_P',
    'feature_TD_U',
    'feature_CL',
    'feature_CLD',
    'feature_MV',
    'feature_CB',
    'feature_N1',
    'feature_N2',
    'feature_LSC',
    'feature_LSR',
    'feature_Harmfulness',
    'feature_Usefulness',
    'feature_F1',
    'feature_F2',
    'feature_F3',
    'feature_F4',
    'instance_hardness',
]

In [None]:
# Short dataset name -> numeric id (presumably the UCI ML Repository id;
# only the keys are used by this notebook). Insertion order sets the order
# in which the datasets are processed below.
uci_ids = {
    "fertility": 244,
    "iris": 53,
    "wine": 109,
    "libras": 181,
    "wdbc": 17,  # Wisconsin Diagnostic Breast Cancer
    "contrac": 23,  # Contraceptive Method Choice
    "wine-red": 186,
    "wine-white": 187,
    "letter": 59,
}

datasets = list(uci_ids)
dataset_config = utils_MDL.load_dataset_config('dataset.yaml')

# Cell output: one {dataset: n_instances} mapping per dataset.
[{name: dataset_config[name]['n_instances']} for name in datasets]

In [None]:
# File-name suffixes of the two per-dataset result CSVs.
pyhard_suffix = '_pyhard.csv'
hardness_mdl_suffix = '_hardness_mdl.csv'

In [None]:
def compare_hardnessmdl_pyhard(dataset_name: str):
    """Write the comparison figures for one dataset under ``results/``.

    Reads ``results/<dataset_name>_pyhard.csv`` and
    ``results/<dataset_name>_hardness_mdl.csv`` and then saves:

    * the violin plots of the Hardness-MDL measures split by ``is_error``;
    * Kendall and Spearman heatmaps over all measure columns;
    * Kendall and Spearman heatmaps restricted to the Hardness-MDL measures
      plus ``instance_hardness``.

    Args:
        dataset_name: Prefix of the result files to read and of the images
            to write.
    """
    pyhard_frame = pd.read_csv(f'results/{dataset_name}_pyhard.csv')
    mdl_frame = pd.read_csv(f'results/{dataset_name}_hardness_mdl.csv')

    plot_distribution_by_error(
        dfs=[mdl_frame],
        measure_cols=hardness_mdl_cols,
        output_path=f'results/{dataset_name}_distribution_by_error.png',
    )

    # Column-wise join of both result files, reduced to the measure columns.
    all_measures = pd.concat([mdl_frame, pyhard_frame], axis=1)[hardness_mdl_cols + pyhard_cols]
    mdl_vs_ih = all_measures[hardness_mdl_cols+['instance_hardness']]

    plot_correlation_heatmap(
        df=all_measures, method='kendall', figsize=(20, 16),
        output_path=f'results/{dataset_name}_kendall_correlation.png',
    )
    plot_correlation_heatmap(
        df=mdl_vs_ih, method='kendall', figsize=(10, 9),
        output_path=f'results/{dataset_name}_kendall_ih_correlation.png',
    )
    plot_correlation_heatmap(
        df=all_measures, method='spearman', figsize=(20, 16),
        output_path=f'results/{dataset_name}_spearman_correlation.png',
    )
    plot_correlation_heatmap(
        df=mdl_vs_ih, method='spearman', figsize=(10, 8),
        output_path=f'results/{dataset_name}_spearman_ih_correlation.png',
    )

## AUC

In [None]:
# Build one pooled frame over all datasets: the measure columns plus the
# committee vote and a 0/1 flag telling whether that vote differs from the label.
committee_cols = ["comittee_vote", "is_comittee_wrong"]

dfs = []
for dataset_name in datasets:
    label_col = dataset_config[dataset_name]['class_attribute']

    df_pyhard = pd.read_csv(f'results/{dataset_name}_pyhard.csv')
    df_hardness_mdl = pd.read_csv(f'results/{dataset_name}_hardness_mdl.csv')

    vote_differs = df_pyhard['comittee_vote'] != df_pyhard[label_col]
    df_pyhard['is_comittee_wrong'] = np.where(vote_differs, 1, 0)

    side_by_side = pd.concat([df_hardness_mdl, df_pyhard], axis=1)
    df_concat = side_by_side[hardness_mdl_cols + pyhard_cols + committee_cols]
    dfs.append(df_concat)

    # Side effect: also (re)writes the per-dataset figures under results/.
    compare_hardnessmdl_pyhard(dataset_name)

df_all = pd.concat(dfs, ignore_index=True)
y_true__is_error = df_all["is_comittee_wrong"]

# AUC of each Hardness-MDL measure as a score for "the committee was wrong",
# computed over the pooled rows.
auc_scores = {}
for col in hardness_mdl_cols:
    auc_scores[col] = roc_auc_score(y_true__is_error, df_all[col])
    print(f"AUC para '{col}': {auc_scores[col]:.4f}")


In [None]:
# Per-dataset AUC table: one row per dataset, one column per Hardness-MDL
# measure, scoring how well the measure ranks the rows where the committee
# vote differs from the label.
#
# Loop-local names (`committee_wrong`, `dataset_aucs`) are deliberately
# distinct from the pooled `y_true__is_error` / `auc_scores` of the previous
# cell so that running this cell does not overwrite the pooled results.
lst = []
for dataset_name in datasets:
    label_col = dataset_config[dataset_name]['class_attribute']
    df_pyhard = pd.read_csv(f'results/{dataset_name}_pyhard.csv')
    df_hardness_mdl = pd.read_csv(f'results/{dataset_name}_hardness_mdl.csv')
    # Assumes both CSVs list the instances in the same row order - TODO confirm.
    committee_wrong = np.where(df_pyhard['comittee_vote'] != df_pyhard[label_col], 1, 0)

    # ROC AUC is undefined when the target holds a single class (e.g. the
    # committee is never wrong on this dataset); report NaN for that dataset
    # instead of letting one dataset abort the whole table.
    auc_is_defined = np.unique(committee_wrong).size == 2

    dataset_aucs = {'dataset': dataset_name}
    for col in hardness_mdl_cols:
        if auc_is_defined:
            dataset_aucs[col] = roc_auc_score(committee_wrong, df_hardness_mdl[col])
        else:
            dataset_aucs[col] = np.nan
    lst.append(dataset_aucs)

pd.DataFrame(lst).sort_values(by='dataset').reset_index(drop=True)

## Spearman Correlation

In [None]:
# Spearman correlation between every pair of measures over the pooled rows.
# `corr` is reused by the clustering cells further down.
corr = df_all[hardness_mdl_cols + pyhard_cols].corr(method='spearman')

fig, ax = plt.subplots(figsize=(20, 16))
sns.heatmap(
    corr,
    ax=ax,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    center=0,
    square=True,
    cbar_kws={"shrink": 0.8},
)
ax.set_title("Correlação", fontsize=14)
fig.tight_layout()

fig.savefig('results/uci_all_correlation_spearman.png')
plt.close(fig)

### Spearman Correlation Clustering

In [None]:
sns.set_theme(style="white", font_scale=0.9)

# Keyword arguments shared by the clustered draw and the masked redraw.
shared_heat_opts = dict(cmap="coolwarm", center=0, annot=True, linewidths=0.3)

# First pass: let clustermap order the rows/columns (average linkage,
# Euclidean metric) and lay out the dendrograms and colour bar.
g = sns.clustermap(
    corr,
    method="average",
    metric="euclidean",
    figsize=(20, 16),
    cbar_kws={"label": "Correlação de Spearman"},
    **shared_heat_opts,
)
g.ax_row_dendrogram.set_visible(False)

# Second pass: wipe the heatmap axes and redraw the reordered matrix with the
# lower triangle (diagonal included) masked out.
mask = np.tril(np.ones_like(g.data2d, dtype=bool))
g.ax_heatmap.clear()
sns.heatmap(
    g.data2d,
    mask=mask,
    ax=g.ax_heatmap,
    fmt=".2f",
    cbar=False,
    **shared_heat_opts,
)

#plt.suptitle("Agrupamento hierárquico das medidas de dificuldade", fontsize=14, y=1.02)
g.savefig("results/uci_all_clustering_correlation_spearman.png", dpi=300, bbox_inches="tight")

plt.show()

In [None]:
# Turn correlation into a dissimilarity: strongly (anti-)correlated measures
# end up close to each other.
dist = 1 - abs(corr)
# linkage takes the condensed (upper-triangle vector) form of the matrix.
flat_dist = squareform(dist, checks=False)
Z = linkage(flat_dist, method='average')

# Silhouette score of the flat clustering for each candidate cluster count.
candidate_ks = range(4, 25)
scores = [
    silhouette_score(dist, fcluster(Z, k, criterion='maxclust'), metric='precomputed')
    for k in candidate_ks
]

plt.plot(candidate_ks, scores, marker='o')
plt.xlabel("Número de clusters")
plt.ylabel("Silhouette Score")
plt.show()

In [None]:
# linkage interprets a 2-D array as observation vectors, not as a distance
# matrix, so the square dissimilarity matrix has to be condensed first
# (as the silhouette cell already does).
Z = linkage(squareform(1 - abs(corr), checks=False), method='average')

# Merge heights, one per agglomeration step.
distances = Z[:, 2]

# Differences between consecutive merge heights, walking from the last
# (highest) merge downwards. The heights are descending in that direction,
# so every entry is <= 0 and the most negative one is the largest gap.
diffs = np.diff(distances[::-1])
# Gap i lies between the (i+1)-th and (i+2)-th highest merges; cutting the
# tree inside it undoes i + 1 merges and therefore leaves i + 2 clusters.
k_opt = np.argmin(diffs) + 2
print("Elbow Melhod:", k_opt)


In [None]:
# Cell output: the consecutive merge-height differences the elbow estimate was derived from.
diffs

In [None]:
# Leaf order of the clustermap's row dendrogram and the measure names in
# that order.
order = g.dendrogram_row.reordered_ind
labels = corr.columns[order]

# fcluster returns one cluster id per ORIGINAL row of `corr` (entry i belongs
# to corr.index[i]), not per dendrogram leaf. Reorder the ids with the same
# permutation as the labels so that labels[i] and clusters[i] describe the
# same measure.
clusters_by_row = fcluster(g.dendrogram_row.linkage, t=8, criterion='maxclust')
clusters = clusters_by_row[order]

# Stable sort keeps the dendrogram order inside each cluster.
grouped_measures = pd.DataFrame({'measure': labels, 'cluster': clusters}).sort_values('cluster', kind='stable')
print(grouped_measures)

In [None]:
# One heatmap per cluster showing the correlations among its member measures.
for cluster_id, members in grouped_measures.groupby('cluster', sort=False):
    cols = members['measure']
    sub_corr = corr.loc[cols, cols]

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(sub_corr, ax=ax, annot=True, fmt=".2f", cmap="coolwarm", center=0)
    ax.set_title(f"Cluster {cluster_id} – Correlação interna")
    fig.tight_layout()
    plt.show()


In [None]:
corr = df_all[hardness_mdl_cols + pyhard_cols].corr(method="spearman")


def _mean_pairwise_corr(corr_matrix, row_labels, col_labels):
    """Mean correlation between two sets of measures.

    Self-pairs (a measure against itself) are skipped: their correlation is
    always 1 and would inflate the intra-group averages relative to the
    inter-group one, which has no such pairs. NaN entries are ignored.
    """
    block = corr_matrix.loc[row_labels, col_labels]
    is_self_pair = block.index.to_numpy()[:, None] == block.columns.to_numpy()[None, :]
    return np.nanmean(block.to_numpy(dtype=float)[~is_self_pair])


intra_mdl = _mean_pairwise_corr(corr, hardness_mdl_cols, hardness_mdl_cols)
intra_pyhard = _mean_pairwise_corr(corr, pyhard_cols, pyhard_cols)
inter = _mean_pairwise_corr(corr, hardness_mdl_cols, pyhard_cols)

# Signed correlations are averaged, so opposite-sign pairs cancel out.
summary = pd.DataFrame({
    "Grupo": ["MDL × MDL", "PyHard × PyHard", "MDL × PyHard"],
    "Correlação média": [intra_mdl, intra_pyhard, inter]
})
print(summary.round(3))