In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from typing import List

import utils_HardnessMDL as utils_MDL

In [None]:
def plot_distribution_by_error(dfs: List[pd.DataFrame], measure_cols: List[str], output_path: str = None):
    """Draw one violin plot per measure, split by the ``is_error`` column.

    The frames in ``dfs`` are stacked row-wise and every column listed in
    ``measure_cols`` gets its own subplot in a two-row grid.

    Args:
        dfs: Frames to pool. The pooled frame is expected to contain an
            ``is_error`` column plus every column in ``measure_cols``.
        measure_cols: Names of the columns to plot, one subplot each.
        output_path: Optional. When truthy, the figure is saved to this path;
            otherwise nothing is written.

    Raises:
        ValueError: If ``measure_cols`` is empty.
    """
    if not measure_cols:
        raise ValueError("measure_cols must contain at least one column name")

    df_all = pd.concat(dfs, ignore_index=True)

    n_rows = 2
    n_cols = (len(measure_cols) + 1) // 2  # ceil(n / 2) so odd counts still fit
    # Size the figure from the real column count so it is never zero-width
    # (a single measure) or too narrow (an odd number of measures).
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(4 * n_cols, 8), squeeze=False)
    axes = axes.flatten()

    try:
        for ax, col in zip(axes, measure_cols):
            sns.violinplot(
                data=df_all, x="is_error", y=col,
                ax=ax, palette="Set2", inner="box", hue="is_error"
            )
            ax.set_title(f"{col} distribution")
            ax.set_xlabel("is_error (0=ok, 1=error)")
            ax.set_ylabel(col)

        # An odd number of measures leaves one grid cell unused; hide it
        # instead of rendering an empty frame.
        for ax in axes[len(measure_cols):]:
            ax.set_visible(False)

        fig.tight_layout()
        if output_path:
            fig.savefig(output_path)
    finally:
        # Close this specific figure even if plotting failed, so repeated
        # calls in a loop do not accumulate open figures.
        plt.close(fig)

In [None]:
def plot_correlation_heatmap(df: pd.DataFrame, title: str = "corr", method: str = 'kendall', output_path: str = None, figsize=(10, 8)):
    """Render the pairwise column correlations of ``df`` as an annotated heatmap.

    Args:
        df: Frame whose columns are correlated with each other via
            ``DataFrame.corr``.
        title: Text placed above the heatmap.
        method: Correlation method forwarded to ``DataFrame.corr``.
        output_path: When truthy, the figure is written to this path.
        figsize: Figure size handed to matplotlib.
    """
    corr_matrix = df.corr(method=method)
    heatmap_options = dict(
        annot=True,
        fmt=".2f",
        cmap="coolwarm",
        center=0,
        square=True,
        cbar_kws={"shrink": 0.8},
    )

    fig, ax = plt.subplots(figsize=figsize)
    sns.heatmap(corr_matrix, ax=ax, **heatmap_options)
    ax.set_title(title, fontsize=14)
    fig.tight_layout()
    if output_path:
        fig.savefig(output_path)
    plt.close(fig)

In [None]:
# Measure columns plotted from the hardness-MDL results and correlated below.
hardness_mdl_cols = [
    'r_min',
    'r_med',
    'relative_position',
    'pseudo_probability',
    'normalized_entropy',
    'description_length_margin',
    'description_length_true_cost',
    'kullback_leibler_divergence',
]

# Columns correlated against the hardness-MDL measures.
# NOTE(review): presumably produced by PyHard, going by the variable name — confirm.
pyhard_cols = [
    'feature_kDN',
    'feature_DS',
    'feature_DCP',
    'feature_TD_P',
    'feature_TD_U',
    'feature_CL',
    'feature_CLD',
    'feature_MV',
    'feature_CB',
    'feature_N1',
    'feature_N2',
    'feature_LSC',
    'feature_LSR',
    'feature_Harmfulness',
    'feature_Usefulness',
    'feature_F1',
    'feature_F2',
    'feature_F3',
    'feature_F4',
    'instance_hardness',
]

In [None]:
# Dataset name -> numeric id.
# NOTE(review): presumably UCI ML Repository ids, going by the variable name — confirm.
uci_ids = {
    "fertility": 244,
    "iris": 53,
    "wine": 109,
    "libras": 181,
    "wdbc": 17,  # Wisconsin Diagnostic Breast Cancer
    "contrac": 23,  # Contraceptive Method Choice
    "wine-red": 186,
    "wine-white": 187,
    "letter": 59,
    "adult": 2,
    # "miniboone": 199,
    # "skin": 229,
    # "covertype": 31,
    # "hill-valey": 606,
    # "susy": 279,
    # "ht-sensor": 362,  # Gas sensors for home activity monitoring
}

# Names of the enabled datasets, in declaration order.
datasets = list(uci_ids)
dataset_config = utils_MDL.load_dataset_config('dataset.yaml')
# Cell output: one {name: n_instances} mapping per enabled dataset.
[{name: dataset_config[name]['n_instances']} for name in datasets]

In [None]:
# File-name endings of the per-dataset result CSVs (appended to the dataset name).
hardness_mdl_suffix = "_hardness_mdl.csv"
pyhard_suffix = "_pyhard.csv"

In [None]:
def compare_hardnessmdl_pyhard(dataset_name: str, results_dir: str = 'results'):
    """Generate the comparison figures for one dataset.

    Reads ``<results_dir>/<dataset_name>`` + ``pyhard_suffix`` and
    ``<results_dir>/<dataset_name>`` + ``hardness_mdl_suffix``, then writes
    these PNG files into ``results_dir``:

    * ``<dataset_name>_distribution_by_error.png`` — violin plots of the
      ``hardness_mdl_cols`` measures split by ``is_error``.
    * ``<dataset_name>_<method>_correlation.png`` — correlation heatmap over
      ``hardness_mdl_cols + pyhard_cols``.
    * ``<dataset_name>_<method>_ih_correlation.png`` — correlation heatmap over
      ``hardness_mdl_cols`` plus ``instance_hardness``.

    ``<method>`` is ``kendall`` and ``spearman``.

    Args:
        dataset_name: Prefix of the result files to read and of the figures
            to write.
        results_dir: Directory that holds the CSV inputs and receives the
            figures. Defaults to ``'results'``.
    """
    # Use the shared suffix constants so the file names are defined in one place.
    df_pyhard = pd.read_csv(f'{results_dir}/{dataset_name}{pyhard_suffix}')
    df_hardness_mdl = pd.read_csv(f'{results_dir}/{dataset_name}{hardness_mdl_suffix}')

    plot_distribution_by_error(
        dfs=[df_hardness_mdl],
        measure_cols=hardness_mdl_cols,
        output_path=f'{results_dir}/{dataset_name}_distribution_by_error.png',
    )

    # Column-wise join aligned on the row index.
    # NOTE(review): assumes both CSVs list the same instances in the same order — confirm.
    df_concat = pd.concat([df_hardness_mdl, df_pyhard], axis=1)[hardness_mdl_cols + pyhard_cols]
    ih_cols = hardness_mdl_cols + ['instance_hardness']

    # NOTE(review): the instance-hardness heatmap height differs by method
    # (9 for kendall, 8 for spearman) — confirm whether that is deliberate.
    for method, ih_figsize in (('kendall', (10, 9)), ('spearman', (10, 8))):
        plot_correlation_heatmap(
            df=df_concat,
            title=f'{dataset_name}: {method} correlation',
            method=method,
            figsize=(20, 16),
            output_path=f'{results_dir}/{dataset_name}_{method}_correlation.png',
        )
        plot_correlation_heatmap(
            df=df_concat[ih_cols],
            title=f'{dataset_name}: {method} correlation (vs. instance_hardness)',
            method=method,
            figsize=ih_figsize,
            output_path=f'{results_dir}/{dataset_name}_{method}_ih_correlation.png',
        )

In [None]:
# Produce the comparison figures for every enabled dataset.
for dataset in datasets:
    compare_hardnessmdl_pyhard(dataset)