In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score
from typing import List

import utils_HardnessMDL as utils_MDL

In [2]:
def plot_distribution_by_error(dfs: List[pd.DataFrame], measure_cols: List[str], output_path: str = None):
    """Draw one violin plot per measure, split by the ``is_error`` column.

    The frames in ``dfs`` are stacked row-wise into a single frame, and every
    column named in ``measure_cols`` gets its own panel in a two-row grid.

    Args:
        dfs: Frames to stack. Each must provide an ``is_error`` column and
            every column listed in ``measure_cols``.
        measure_cols: Names of the columns to plot, one panel each.
        output_path: Where to save the figure. When falsy the figure is
            built and closed without being written.

    Raises:
        ValueError: If ``measure_cols`` is empty.
    """
    n_measures = len(measure_cols)
    if n_measures == 0:
        # A zero-column grid cannot be created; fail with a clear message
        # instead of an opaque matplotlib error.
        raise ValueError("measure_cols must contain at least one column name")

    df_all = pd.concat(dfs, ignore_index=True)

    n_cols = (n_measures + 1) // 2
    # Size the canvas from the actual number of grid columns. Deriving it from
    # n_measures // 2 made the figure too narrow for odd counts and produced a
    # zero-width (invalid) figure for a single measure.
    fig, axes = plt.subplots(2, n_cols, figsize=(4 * n_cols, 8))
    axes = axes.flatten()

    for ax, col in zip(axes, measure_cols):
        sns.violinplot(
            data=df_all, x="is_error", y=col,
            ax=ax, palette="Set2", inner="box", hue="is_error"
        )
        ax.set_title(f"{col} distribution")
        ax.set_xlabel("is_error (0=ok, 1=error)")
        ax.set_ylabel(col)

    # With an odd number of measures the last grid cell has nothing to show;
    # hide it rather than leaving an empty framed axes in the output.
    for unused_ax in axes[n_measures:]:
        unused_ax.set_visible(False)

    fig.tight_layout()
    if output_path:
        fig.savefig(output_path)
    # Close this specific figure so repeated calls do not accumulate open figures.
    plt.close(fig)

In [3]:
def plot_correlation_heatmap(df: pd.DataFrame, title: str = "corr", method: str = 'kendall', output_path: str = None, figsize=(10, 8)):
    """Render the pairwise correlation matrix of ``df`` as an annotated heatmap.

    Args:
        df: Frame whose columns are correlated against each other.
        title: Text placed above the heatmap.
        method: Correlation method forwarded to ``DataFrame.corr``.
        output_path: Where to save the figure. When falsy the figure is
            built and closed without being written.
        figsize: Figure size passed to matplotlib.
    """
    correlation_matrix = df.corr(method=method)

    fig, ax = plt.subplots(figsize=figsize)
    sns.heatmap(
        correlation_matrix,
        ax=ax,
        annot=True,
        fmt=".2f",
        cmap="coolwarm",
        center=0,  # keep zero correlation at the midpoint of the diverging colormap
        square=True,
        cbar_kws={"shrink": 0.8},
    )
    ax.set_title(title, fontsize=14)
    fig.tight_layout()

    if output_path:
        fig.savefig(output_path)
    plt.close(fig)

In [4]:
# Columns read from the *_hardness_mdl.csv result files; these are the scores
# evaluated with ROC AUC further down in the notebook.
hardness_mdl_cols = [
    'r_min',
    'r_med',
    'relative_position',
    'pseudo_probability',
    'normalized_entropy',
    'description_length_margin',
    'description_length_true_cost',
    'kullback_leibler_divergence',
]

# Columns selected alongside the ones above when both result frames are joined.
# NOTE(review): presumably these all come from the *_pyhard.csv files - confirm.
pyhard_cols = [
    'feature_kDN',
    'feature_DS',
    'feature_DCP',
    'feature_TD_P',
    'feature_TD_U',
    'feature_CL',
    'feature_CLD',
    'feature_MV',
    'feature_CB',
    'feature_N1',
    'feature_N2',
    'feature_LSC',
    'feature_LSR',
    'feature_Harmfulness',
    'feature_Usefulness',
    'feature_F1',
    'feature_F2',
    'feature_F3',
    'feature_F4',
    'instance_hardness',
]

In [5]:
# Numeric id per dataset short name (presumably UCI repository ids, going by
# the variable name). Commented-out entries are excluded from the current run.
uci_ids = {
    "fertility": 244,
    "iris": 53,
    "wine": 109,
    "libras": 181,
    "wdbc": 17,  # Wisconsin Diagnostic Breast Cancer
    "contrac": 23,  # Contraceptive Method Choice
    "wine-red": 186,
    #"wine-white": 187,
    #"letter": 59,
    #"adult": 2,
    # "miniboone": 199,
    # "skin": 229,
    # "covertype": 31,
    # "hill-valey": 606,
    # "susy": 279,
    # "ht-sensor": 362,  # Gas sensors for home activity monitoring
}

# Only the names are needed from here on; iteration follows the dict's insertion order.
datasets = list(uci_ids)
dataset_config = utils_MDL.load_dataset_config('dataset.yaml')

# Show the configured instance count of every selected dataset.
[{name: dataset_config[name]['n_instances']} for name in datasets]

[{'fertility': 100},
 {'iris': 150},
 {'wine': 178},
 {'libras': 360},
 {'wdbc': 569},
 {'contrac': 1473},
 {'wine-red': 1599}]

In [6]:
# File-name suffixes of the per-dataset result CSVs stored under results/.
# NOTE(review): the loading code visible in this notebook spells these
# suffixes inline instead of referencing these names.
hardness_mdl_suffix = "_hardness_mdl.csv"
pyhard_suffix = "_pyhard.csv"

In [7]:
def compare_hardnessmdl_pyhard(dataset_name: str):
    """Write the comparison figures for one dataset into ``results/``.

    Reads ``results/{dataset_name}_pyhard.csv`` and
    ``results/{dataset_name}_hardness_mdl.csv`` and saves five PNG files:
    the violin plots of the hardness-MDL measures by ``is_error``, and, for
    both Kendall and Spearman correlation, a heatmap over all measures plus a
    reduced heatmap of the hardness-MDL measures against ``instance_hardness``.

    Args:
        dataset_name: Short dataset name used as the file-name prefix.
    """
    df_pyhard = pd.read_csv(f'results/{dataset_name}_pyhard.csv')
    df_hardness_mdl = pd.read_csv(f'results/{dataset_name}_hardness_mdl.csv')

    plot_distribution_by_error(
        dfs=[df_hardness_mdl],
        measure_cols=hardness_mdl_cols,
        output_path=f'results/{dataset_name}_distribution_by_error.png',
    )

    # Join the two result frames column-wise and keep only the measures of interest.
    df_concat = pd.concat([df_hardness_mdl, df_pyhard], axis=1)[hardness_mdl_cols + pyhard_cols]
    df_vs_instance_hardness = df_concat[hardness_mdl_cols + ['instance_hardness']]

    plot_correlation_heatmap(
        df=df_concat,
        method='kendall',
        figsize=(20, 16),
        output_path=f'results/{dataset_name}_kendall_correlation.png',
    )
    # NOTE(review): this reduced heatmap uses (10, 9) while the Spearman one
    # below uses (10, 8) - confirm whether the difference is intentional.
    plot_correlation_heatmap(
        df=df_vs_instance_hardness,
        method='kendall',
        figsize=(10, 9),
        output_path=f'results/{dataset_name}_kendall_ih_correlation.png',
    )
    plot_correlation_heatmap(
        df=df_concat,
        method='spearman',
        figsize=(20, 16),
        output_path=f'results/{dataset_name}_spearman_correlation.png',
    )
    plot_correlation_heatmap(
        df=df_vs_instance_hardness,
        method='spearman',
        figsize=(10, 8),
        output_path=f'results/{dataset_name}_spearman_ih_correlation.png',
    )

In [8]:
# Pool every selected dataset into one frame, then score each hardness-MDL
# measure by how well it ranks the instances the committee got wrong.
dfs = []
for dataset_name in datasets:
    label_col = dataset_config[dataset_name]['class_attribute']

    df_pyhard = pd.read_csv(f'results/{dataset_name}_pyhard.csv')
    df_hardness_mdl = pd.read_csv(f'results/{dataset_name}_hardness_mdl.csv')

    # 1 where the committee vote differs from the true label, 0 otherwise.
    committee_disagrees = df_pyhard['comittee_vote'] != df_pyhard[label_col]
    df_pyhard['is_comittee_wrong'] = np.where(committee_disagrees, 1, 0)

    selected_cols = hardness_mdl_cols + pyhard_cols + ["comittee_vote", "is_comittee_wrong"]
    df_concat = pd.concat([df_hardness_mdl, df_pyhard], axis=1)[selected_cols]
    dfs.append(df_concat)
    # Per-dataset figures are currently disabled:
    # compare_hardnessmdl_pyhard(dataset_name)

df_all = pd.concat(dfs, ignore_index=True)
y_true__is_error = df_all["is_comittee_wrong"]

# ROC AUC of each measure against the pooled committee-error flag.
auc_scores = {}
for col in hardness_mdl_cols:
    y_score = df_all[col]
    auc = roc_auc_score(y_true__is_error, y_score)
    auc_scores[col] = auc
    print(f"AUC para '{col}': {auc:.4f}")


AUC para 'r_min': 0.7468
AUC para 'r_med': 0.7586
AUC para 'relative_position': 0.7273
AUC para 'pseudo_probability': 0.7785
AUC para 'normalized_entropy': 0.7043
AUC para 'description_length_margin': 0.6076
AUC para 'description_length_true_cost': 0.6155
AUC para 'kullback_leibler_divergence': 0.7654


In [15]:
# Same ROC AUC evaluation as the pooled cell, but computed separately for each
# dataset; one result row (dict) is collected per dataset.
lst = []
for dataset in datasets:
    dataset_name = dataset  # a leftover comment here named 'wine-red' as a one-off value
    label_col = dataset_config[dataset_name]['class_attribute']

    df_pyhard = pd.read_csv(f'results/{dataset_name}_pyhard.csv')
    df_hardness_mdl = pd.read_csv(f'results/{dataset_name}_hardness_mdl.csv')

    # 1 where the committee vote differs from the true label, 0 otherwise.
    committee_disagrees = df_pyhard['comittee_vote'] != df_pyhard[label_col]
    y_true__is_error = np.where(committee_disagrees, 1, 0)

    auc_scores = {'dataset': dataset}
    for col in hardness_mdl_cols:
        y_score = df_hardness_mdl[col]
        auc = roc_auc_score(y_true__is_error, y_score)
        auc_scores[col] = auc
    lst.append(auc_scores)

In [16]:
# Tabulate the collected per-dataset rows: one row per dataset, one column per measure.
pd.DataFrame(data=lst)

Unnamed: 0,dataset,r_min,r_med,relative_position,pseudo_probability,normalized_entropy,description_length_margin,description_length_true_cost,kullback_leibler_divergence
0,fertility,0.937255,0.937255,0.882745,0.955294,0.551373,0.529412,0.955294,0.955294
1,iris,0.983796,0.953704,0.914352,0.994213,0.814815,0.827546,0.953704,0.994213
2,wine,0.99435,0.983051,0.99435,0.99435,0.943503,0.943503,0.983051,0.99435
3,libras,0.829015,0.698739,0.824519,0.832355,0.648914,0.675762,0.698739,0.741446
4,wdbc,0.95031,0.95031,0.840094,0.9152,0.84262,0.854676,0.957641,0.931818
5,contrac,0.616669,0.638878,0.630597,0.663644,0.549412,0.532031,0.653005,0.663644
6,wine-red,0.653515,0.647068,0.689831,0.727913,0.506928,0.513614,0.647485,0.727908


'diagnosis'