In [None]:
!pip install matplotlib
!pip install seaborn
!pip install plotly

In [None]:
import os
import sys

# Resolve the directory two levels above the current working directory.
path_to_research = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, os.pardir)
)

# Put it at the front of the import path, but only if it is not listed yet,
# so re-running this cell does not add duplicate entries.
if sys.path.count(path_to_research) == 0:
    sys.path.insert(0, path_to_research)

print(f"sys.path: {path_to_research}")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from typing import Any, Dict, List
from joblib import Parallel, delayed
from sklearn.preprocessing import StandardScaler

import hardnessmdl


In [None]:
# Load the four two-class test sets, one DataFrame per CSV file.
df1, df2, df3, df4 = [
    pd.read_csv(csv_path)
    for csv_path in (
        'two_classes/test1.csv',
        'two_classes/test2.csv',
        'two_classes/test3.csv',
        'two_classes/test4.csv',
    )
]

In [None]:

# One row of four panels sharing the y axis: each test set plotted as X vs Y,
# coloured by its 'class' column.
fig, axes = plt.subplots(1, 4, figsize=(20, 5), sharey=True)
fig.suptitle('Scatter Plots of DataFrames')

panels = (
    (df1, 'DataFrame 1'),
    (df2, 'DataFrame 2'),
    (df3, 'DataFrame 3'),
    (df4, 'DataFrame 4'),
)
for panel_ax, (panel_df, panel_title) in zip(axes, panels):
    sns.scatterplot(ax=panel_ax, data=panel_df, x='X', y='Y', hue='class')
    panel_ax.set_title(panel_title)

# Leave room at the top of the figure so the suptitle does not overlap the panels.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

In [None]:
def _compute_single(
    test_index: int,
    df: pd.DataFrame,
    feature_cols: List[str],
    label_col: str,
    class_map: Dict[str, int],
    n_classes: int,
    n_dims: int,
    kwargs: Dict[str, Any],
):
    """Run one leave-one-out fold and return the held-out row's hardness.

    The row labelled ``test_index`` is removed from ``df``, a fresh
    ``hardnessmdl.HardnessMDL`` model is trained sample by sample on the
    remaining rows, and ``model.hardness`` is queried for the held-out sample
    together with its true label.

    Args:
        test_index: Index label of the row to hold out.
        df: Full dataframe with the feature and label columns.
        feature_cols: Names of the feature columns.
        label_col: Name of the class-label column.
        class_map: Maps each class name to the integer handed to the model.
        n_classes: Number of classes passed to the model constructor.
        n_dims: Number of feature dimensions passed to the model constructor.
        kwargs: Optional hyperparameter overrides. Recognised keys are
            ``learning_rate``, ``momentum``, ``tau``, ``omega``,
            ``forgetting_factor`` and ``sigma``.

    Returns:
        A dict holding ``"index"``, the unscaled feature values keyed by
        column name, and every entry of the mapping returned by
        ``model.hardness``.
    """
    held_out = df.loc[[test_index]]
    remaining = df.drop(index=test_index)

    X_train = remaining[feature_cols].to_numpy()
    y_train_names = remaining[label_col].to_numpy()
    X_test = held_out[feature_cols].to_numpy()
    true_label = class_map[held_out[label_col].to_numpy()[0]]

    model = hardnessmdl.HardnessMDL(n_classes=n_classes, n_dims=n_dims)

    # Each hyperparameter falls back to its default when missing from kwargs.
    for setter, key, fallback in (
        (model.set_learning_rate, "learning_rate", 0.01),
        (model.set_momentum, "momentum", 0.9),
        (model.set_tau, "tau", 0),
        (model.set_omega, "omega", 32.0),
        (model.set_forgetting_factor, "forgetting_factor", 1.0),
        (model.set_sigma, "sigma", 1.0),
    ):
        setter(kwargs.get(key, fallback))

    # The scaler is fitted on the training rows only; the held-out row is
    # transformed with those statistics.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # The model is trained one sample at a time, in dataframe order.
    for features, class_name in zip(X_train_scaled, y_train_names):
        model.train(features, class_map[class_name])

    hardness_dict = model.hardness(X_test_scaled[0], true_label)

    # Report the original (unscaled) feature values next to the measures.
    result = {"index": test_index}
    result.update(zip(feature_cols, X_test[0]))
    result.update(hardness_dict)
    return result


def compute_loo_hardness(
    df: pd.DataFrame,
    feature_cols: List[str],
    label_col: str,
    # os.cpu_count() may return None, and halving a single CPU gives 0, which
    # joblib rejects; clamp so the default is always a usable worker count.
    n_jobs: int = max(1, (os.cpu_count() or 1) // 2),
    **kwargs: Any,
) -> List[Dict[str, Any]]:
    """
    Parallel Leave-One-Out hardness computation using joblib.

    For every row of ``df``, ``_compute_single`` holds that row out, trains a
    model on the remaining rows and computes the hardness measures of the
    held-out sample. Class names are mapped to consecutive integers in
    sorted order.

    Args:
        df: The full dataframe containing all samples.
        feature_cols: A list of column names to be used as features.
        label_col: The name of the column containing the class label.
        n_jobs: Number of joblib workers. Defaults to half of the detected
            CPUs, but never less than one.
        **kwargs: Model hyperparameters forwarded to ``_compute_single``
            (``learning_rate``, ``momentum``, ``tau``, ``omega``,
            ``forgetting_factor``, ``sigma``).

    Returns:
        A list with one result dict per row of ``df``, in index order.
    """
    class_names = sorted(df[label_col].unique().tolist())
    class_map = {name: i for i, name in enumerate(class_names)}
    n_classes = len(class_names)
    n_dims = len(feature_cols)

    results = Parallel(n_jobs=n_jobs, batch_size="auto")(
        delayed(_compute_single)(
            test_index,
            df,
            feature_cols,
            label_col,
            class_map,
            n_classes,
            n_dims,
            kwargs,
        )
        for test_index in df.index
    )

    return results



In [None]:
def _compute_description_lenght_single_instance(
    test_index: int,
    df: pd.DataFrame,
    feature_cols: List[str],
    label_col: str,
    class_map: Dict[str, int],
    n_classes: int,
    n_dims: int,
    kwargs: Dict[str, Any],
):
    """Run one leave-one-out fold and return the held-out row's prediction.

    The row labelled ``test_index`` is removed from ``df``, a fresh
    ``hardnessmdl.HardnessMDL`` model is trained sample by sample on the
    remaining rows, and ``model.predict`` is called on the held-out sample.

    Args:
        test_index: Index label of the row to hold out.
        df: Full dataframe with the feature and label columns.
        feature_cols: Names of the feature columns.
        label_col: Name of the class-label column.
        class_map: Maps each class name to the integer handed to the model.
        n_classes: Number of classes passed to the model constructor.
        n_dims: Number of feature dimensions passed to the model constructor.
        kwargs: Optional hyperparameter overrides. Recognised keys are
            ``learning_rate``, ``momentum``, ``tau``, ``omega``,
            ``forgetting_factor`` and ``sigma``.

    Returns:
        A dict holding ``"index"``, ``"true_label"`` (the integer class of the
        held-out row), the unscaled feature values keyed by column name, and
        every entry of the mapping returned by ``model.predict``.
    """
    held_out = df.loc[[test_index]]
    remaining = df.drop(index=test_index)

    X_train = remaining[feature_cols].to_numpy()
    y_train_names = remaining[label_col].to_numpy()
    X_test = held_out[feature_cols].to_numpy()
    true_label = class_map[held_out[label_col].to_numpy()[0]]

    model = hardnessmdl.HardnessMDL(n_classes=n_classes, n_dims=n_dims)

    # Each hyperparameter falls back to its default when missing from kwargs.
    for setter, key, fallback in (
        (model.set_learning_rate, "learning_rate", 0.01),
        (model.set_momentum, "momentum", 0.9),
        (model.set_tau, "tau", 0),
        (model.set_omega, "omega", 32.0),
        (model.set_forgetting_factor, "forgetting_factor", 1.0),
        (model.set_sigma, "sigma", 1.0),
    ):
        setter(kwargs.get(key, fallback))

    # The scaler is fitted on the training rows only; the held-out row is
    # transformed with those statistics.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # The model is trained one sample at a time, in dataframe order.
    for features, class_name in zip(X_train_scaled, y_train_names):
        model.train(features, class_map[class_name])

    prediction_dict = model.predict(X_test_scaled[0])

    # Report the original (unscaled) feature values next to the prediction.
    result = {"index": test_index, "true_label": true_label}
    result.update(zip(feature_cols, X_test[0]))
    result.update(prediction_dict)
    return result


def compute_loo_description_lenght(
    df: pd.DataFrame,
    feature_cols: List[str],
    label_col: str,
    # os.cpu_count() may return None, and halving a single CPU gives 0, which
    # joblib rejects; clamp so the default is always a usable worker count.
    n_jobs: int = max(1, (os.cpu_count() or 1) // 2),
    **kwargs: Any,
) -> List[Dict[str, Any]]:
    """
    Parallel Leave-One-Out description length computation using joblib.

    For every row of ``df``, ``_compute_description_lenght_single_instance``
    holds that row out, trains a model on the remaining rows and records the
    model's prediction output for the held-out sample. Class names are mapped
    to consecutive integers in sorted order.

    Args:
        df: The full dataframe containing all samples.
        feature_cols: A list of column names to be used as features.
        label_col: The name of the column containing the class label.
        n_jobs: Number of joblib workers. Defaults to half of the detected
            CPUs, but never less than one.
        **kwargs: Model hyperparameters forwarded to the per-sample helper
            (``learning_rate``, ``momentum``, ``tau``, ``omega``,
            ``forgetting_factor``, ``sigma``).

    Returns:
        A list with one result dict per row of ``df``, in index order.
    """
    class_names = sorted(df[label_col].unique().tolist())
    class_map = {name: i for i, name in enumerate(class_names)}
    n_classes = len(class_names)
    n_dims = len(feature_cols)

    results = Parallel(n_jobs=n_jobs, batch_size="auto")(
        delayed(_compute_description_lenght_single_instance)(
            test_index,
            df,
            feature_cols,
            label_col,
            class_map,
            n_classes,
            n_dims,
            kwargs,
        )
        for test_index in df.index
    )

    return results


In [None]:
# Columns passed to the leave-one-out routines: the feature columns and the
# column that holds the class label.
feature_columns: List[str] = ['X', 'Y']
label_column: str = 'class'

In [None]:
# Leave-one-out description-length results for test set 1, saved as CSV.
# To compute hardness measures instead, swap in:
#   measures1 = compute_loo_hardness(df1, feature_columns, label_column)
measures1 = compute_loo_description_lenght(df1, feature_columns, label_column)
df1_result = pd.DataFrame(measures1)

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("two_classes/results", exist_ok=True)
df1_result.to_csv("two_classes/results/test1_description_lenght.csv", index=False)


In [None]:
# Leave-one-out description-length results for test set 2, saved as CSV.
# To compute hardness measures instead, swap in:
#   measures2 = compute_loo_hardness(df2, feature_columns, label_column)
measures2 = compute_loo_description_lenght(df2, feature_columns, label_column)
df2_result = pd.DataFrame(measures2)

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("two_classes/results", exist_ok=True)
df2_result.to_csv("two_classes/results/test2_description_lenght.csv", index=False)

In [None]:
# Leave-one-out description-length results for test set 3, saved as CSV.
# To compute hardness measures instead, swap in:
#   measures3 = compute_loo_hardness(df3, feature_columns, label_column)
measures3 = compute_loo_description_lenght(df3, feature_columns, label_column)
df3_result = pd.DataFrame(measures3)

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("two_classes/results", exist_ok=True)
df3_result.to_csv("two_classes/results/test3_description_lenght.csv", index=False)

In [None]:
# Leave-one-out description-length results for test set 4, saved as CSV.
# To compute hardness measures instead, swap in:
#   measures4 = compute_loo_hardness(df4, feature_columns, label_column)
measures4 = compute_loo_description_lenght(df4, feature_columns, label_column)
df4_result = pd.DataFrame(measures4)

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("two_classes/results", exist_ok=True)
df4_result.to_csv("two_classes/results/test4_description_lenght.csv", index=False)

In [None]:
# Preview the first five rows of the test-set-1 results.
df1_result.head(5)

In [None]:
# Preview the first five rows of the test-set-2 results.
df2_result.head(5)

In [None]:
# Preview the first five rows of the test-set-3 results.
df3_result.head(5)

In [None]:
# Preview the first five rows of the test-set-4 results.
df4_result.head(5)

In [None]:
# Sum of absolute differences between the `label` and `true_label` columns of
# the test-set-1 results (presumably `label` is the model's predicted class;
# TODO confirm against hardnessmdl's predict output).
(df1_result["label"] - df1_result["true_label"]).abs().sum()

In [None]:
# Deliberate stop so that "Run All" halts before the plotting cells below.
# A bare `raise` outside an `except` block only yields the cryptic
# "No active exception to reraise"; raise the same exception type explicitly.
raise RuntimeError(
    "Intentional stop: the cells below plot columns such as 'r_min' from the "
    "result frames; run them manually once those columns are available."
)

In [None]:
# Result frames in test-set order, plus a lookup keyed by panel title
# ('DataFrame 1' .. 'DataFrame 4').
dfs = [df1_result, df2_result, df3_result, df4_result]

df_meta_feats_dict = {f'DataFrame {i+1}': df for i, df in enumerate(dfs)}

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# One panel per result frame, with points coloured by the 'r_min' column.
panel_titles = ("DataFrame 1", "DataFrame 2", "DataFrame 3", "DataFrame 4")
fig = make_subplots(rows=1, cols=4, subplot_titles=panel_titles)

for i, df in enumerate(dfs):
    df_name = f'DataFrame {i+1}'
    marker_style = dict(
        color=df_meta_feats_dict[df_name]['r_min'],
        colorscale='viridis',
        cmin=0,
        cmax=1,
        showscale=(i == 3),  # only the fourth panel draws the colour bar
        colorbar=dict(title='r_min', x=1.02),
    )
    trace = go.Scattergl(
        x=df["X"],
        y=df["Y"],
        mode='markers',
        marker=marker_style,
        name=df_name,
    )
    fig.add_trace(trace, row=1, col=i+1)

fig.update_layout(title_text="Scatter Plots of DataFrames 1 to 4", showlegend=False)
fig.show()

In [None]:
# One panel per result frame, with points coloured by the 'r_med' column.
panel_titles = ("DataFrame 1", "DataFrame 2", "DataFrame 3", "DataFrame 4")
fig = make_subplots(rows=1, cols=4, subplot_titles=panel_titles)

for i, df in enumerate(dfs):
    df_name = f'DataFrame {i+1}'
    marker_style = dict(
        color=df_meta_feats_dict[df_name]['r_med'],
        colorscale='viridis',
        cmin=0,
        cmax=1,
        showscale=(i == 3),  # only the fourth panel draws the colour bar
        colorbar=dict(title='r_med', x=1.02),
    )
    trace = go.Scattergl(
        x=df["X"],
        y=df["Y"],
        mode='markers',
        marker=marker_style,
        name=df_name,
    )
    fig.add_trace(trace, row=1, col=i+1)

fig.update_layout(title_text="Scatter Plots of DataFrames 1 to 4", showlegend=False)
fig.show()

In [None]:
# One panel per result frame, with points coloured by the 'relative_position' column.
panel_titles = ("DataFrame 1", "DataFrame 2", "DataFrame 3", "DataFrame 4")
fig = make_subplots(rows=1, cols=4, subplot_titles=panel_titles)

for i, df in enumerate(dfs):
    df_name = f'DataFrame {i+1}'
    marker_style = dict(
        color=df_meta_feats_dict[df_name]['relative_position'],
        colorscale='viridis',
        cmin=0,
        cmax=1,
        showscale=(i == 3),  # only the fourth panel draws the colour bar
        colorbar=dict(title='relative_position', x=1.02),
    )
    trace = go.Scattergl(
        x=df["X"],
        y=df["Y"],
        mode='markers',
        marker=marker_style,
        name=df_name,
    )
    fig.add_trace(trace, row=1, col=i+1)

fig.update_layout(title_text="Scatter Plots of DataFrames 1 to 4", showlegend=False)
fig.show()

In [None]:
# One panel per result frame, with points coloured by the 'pseudo_probability' column.
panel_titles = ("DataFrame 1", "DataFrame 2", "DataFrame 3", "DataFrame 4")
fig = make_subplots(rows=1, cols=4, subplot_titles=panel_titles)

for i, df in enumerate(dfs):
    df_name = f'DataFrame {i+1}'
    marker_style = dict(
        color=df_meta_feats_dict[df_name]['pseudo_probability'],
        colorscale='viridis',
        cmin=0,
        cmax=1,
        showscale=(i == 3),  # only the fourth panel draws the colour bar
        colorbar=dict(title='pseudo_probability', x=1.02),
    )
    trace = go.Scattergl(
        x=df["X"],
        y=df["Y"],
        mode='markers',
        marker=marker_style,
        name=df_name,
    )
    fig.add_trace(trace, row=1, col=i+1)

fig.update_layout(title_text="Scatter Plots of DataFrames 1 to 4", showlegend=False)
fig.show()

In [None]:
# One panel per result frame, with points coloured by the 'normalized_entropy' column.
panel_titles = ("DataFrame 1", "DataFrame 2", "DataFrame 3", "DataFrame 4")
fig = make_subplots(rows=1, cols=4, subplot_titles=panel_titles)

for i, df in enumerate(dfs):
    df_name = f'DataFrame {i+1}'
    marker_style = dict(
        color=df_meta_feats_dict[df_name]['normalized_entropy'],
        colorscale='viridis',
        cmin=0,
        cmax=1,
        showscale=(i == 3),  # only the fourth panel draws the colour bar
        colorbar=dict(title='normalized_entropy', x=1.02),
    )
    trace = go.Scattergl(
        x=df["X"],
        y=df["Y"],
        mode='markers',
        marker=marker_style,
        name=df_name,
    )
    fig.add_trace(trace, row=1, col=i+1)

fig.update_layout(title_text="Scatter Plots of DataFrames 1 to 4", showlegend=False)
fig.show()