## Computing Description Lengths using PyGMDL


In [7]:
import os
import sys

# Absolute path of the directory two levels above the current working directory.
# NOTE(review): presumably the project root holding locally importable packages — confirm.
path_to_research = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, os.pardir)
)

# Give that directory top priority on the import search path, but only once,
# so re-running this cell does not add duplicate entries.
if path_to_research not in sys.path:
    sys.path.insert(0, path_to_research)

print(f"sys.path: {path_to_research}")


sys.path: /home/emanu/ubuntu/ita/HardnessMDL/research


In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from typing import Any, Dict, List
from joblib import Parallel, delayed
from sklearn.preprocessing import StandardScaler

import hardnessmdl

In [9]:
def _compute_description_lenght_single_instance(
    test_index: int,
    df: pd.DataFrame,
    feature_cols: List[str],
    label_col: str,
    class_map: Dict[str, int],
    n_classes: int,
    n_dims: int,
    kwargs: Dict[str, Any],
):
    """Run a single leave-one-out fold for the row labelled ``test_index``.

    A fresh ``hardnessmdl.HardnessMDL`` model is trained, one sample at a
    time, on every row of ``df`` except ``test_index`` and is then asked to
    predict the held-out row. Features are standardized with a
    ``StandardScaler`` fitted on the training rows only.

    Args:
        test_index: Index label (in ``df.index``) of the row to hold out.
        df: The full dataframe containing all samples.
        feature_cols: Names of the columns used as features.
        label_col: Name of the column containing the class label.
        class_map: Mapping from class name to the integer label given to the
            model.
        n_classes: Number of classes passed to the model constructor.
        n_dims: Number of feature dimensions passed to the model constructor.
        kwargs: Optional model hyperparameters. Recognized keys and their
            defaults: ``learning_rate`` (0.01), ``momentum`` (0.9),
            ``tau`` (0), ``omega`` (32.0), ``forgetting_factor`` (1.0),
            ``sigma`` (1.0).

    Returns:
        A dict holding ``"index"``, ``"true_label"``, the held-out row's
        original (unscaled) feature values keyed by column name, and every
        entry of the mapping returned by ``model.predict``.
    """
    held_out = df.loc[[test_index]]
    remaining = df.drop(index=test_index)

    model = hardnessmdl.HardnessMDL(n_classes=n_classes, n_dims=n_dims)

    # (setter, kwargs key, fallback value) — applied in this order.
    hyperparameters = (
        (model.set_learning_rate, "learning_rate", 0.01),
        (model.set_momentum, "momentum", 0.9),
        (model.set_tau, "tau", 0),
        (model.set_omega, "omega", 32.0),
        (model.set_forgetting_factor, "forgetting_factor", 1.0),
        (model.set_sigma, "sigma", 1.0),
    )
    for apply_setting, key, fallback in hyperparameters:
        apply_setting(kwargs.get(key, fallback))

    train_features = remaining[feature_cols].to_numpy()
    train_label_names = remaining[label_col].to_numpy()

    test_features = held_out[feature_cols].to_numpy()
    true_label = class_map[held_out[label_col].to_numpy()[0]]

    # Fit the scaler on the training rows only, so the held-out row does not
    # influence the statistics used to standardize it.
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(train_features)
    test_scaled = scaler.transform(test_features)

    # The model is trained incrementally, one (sample, label) pair per call.
    for sample, label_name in zip(train_scaled, train_label_names):
        model.train(sample, class_map[label_name])

    prediction = model.predict(test_scaled[0])

    # Report the feature values as they appear in the input, not the scaled ones.
    unscaled_features = dict(zip(feature_cols, test_features[0]))

    return {
        "index": test_index,
        "true_label": true_label,
        **unscaled_features,
        **prediction,
    }


def compute_loo_description_lenght(
    df: pd.DataFrame,
    feature_cols: List[str],
    label_col: str,
    # os.cpu_count() may return None, and on a single-core machine
    # "count - 1" would be 0, which joblib rejects; always keep >= 1 worker.
    n_jobs: int = max(1, (os.cpu_count() or 1) - 1),
    **kwargs: Any,
) -> List[Dict[str, Any]]:
    """Compute leave-one-out description lengths for every sample, in parallel.

    For each index label in ``df.index`` one fold is dispatched through
    joblib: the model is trained on all other rows and evaluated on the
    held-out row (see ``_compute_description_lenght_single_instance``).

    Class names found in ``label_col`` are sorted and mapped to consecutive
    integers starting at 0; that mapping is shared by all folds.

    Note:
        Each fold selects its held-out row by index label, so ``df.index``
        is expected to contain unique labels.

    Args:
        df: The full dataframe containing all samples.
        feature_cols: A list of column names to be used as features.
        label_col: The name of the column containing the class label.
        n_jobs: Number of joblib workers. Defaults to the number of CPUs
            minus one, but never less than 1.
        **kwargs: Hyperparameters for the model (``learning_rate``,
            ``momentum``, ``tau``, ``omega``, ``forgetting_factor``,
            ``sigma``); missing ones fall back to the per-fold defaults.

    Returns:
        A list with one dict per sample of ``df``, in the order of
        ``df.index``. Each dict holds the sample's index, its integer true
        label, its feature values and the model's prediction output.
    """
    class_names = sorted(df[label_col].unique().tolist())
    class_map = {name: i for i, name in enumerate(class_names)}
    n_classes = len(class_names)
    n_dims = len(feature_cols)

    # kwargs is forwarded as a plain dict (not unpacked) because the
    # per-fold helper takes the hyperparameters as a single positional dict.
    results = Parallel(n_jobs=n_jobs, batch_size="auto")(
        delayed(_compute_description_lenght_single_instance)(
            test_index,
            df,
            feature_cols,
            label_col,
            class_map,
            n_classes,
            n_dims,
            kwargs,
        )
        for test_index in df.index
    )

    return results


In [10]:
# Directory (relative to the working directory) holding the input CSV files;
# the results of the cells below are written underneath it as well.
folder = 'five_classes'

# Read test1.csv ... test4.csv from that directory into df1 ... df4.
df1, df2, df3, df4 = (
    pd.read_csv(f'{folder}/test{number}.csv') for number in range(1, 5)
)

In [11]:
# Column holding the class name of each sample.
label_column = 'class'

# Columns passed to the model as features.
feature_columns = ['X', 'Y']

In [12]:
# Alternative call kept for reference (compute_loo_hardness is not defined in this section):
# measures1 = compute_loo_hardness(df1, feature_columns, label_column)

# Leave-one-out evaluation of every row of df1; one result record per row.
measures1 = compute_loo_description_lenght(
    df=df1, feature_cols=feature_columns, label_col=label_column
)

# Tabulate the records and save them under the results folder, without the
# dataframe index column.
df1_result = pd.DataFrame(measures1)
df1_result.to_csv(
    f"{folder}/results/test1_description_lenght_unnormalized.csv",
    index=False,
)

# Durations noted by the author (presumably run times of this cell — confirm):
# 116m 6.6s
# 214m 35.8s
# 386m 27.2s

In [13]:
# Alternative call kept for reference (compute_loo_hardness is not defined in this section):
# measures2 = compute_loo_hardness(df2, feature_columns, label_column)

# Leave-one-out evaluation of every row of df2; one result record per row.
measures2 = compute_loo_description_lenght(
    df=df2, feature_cols=feature_columns, label_col=label_column
)

# Tabulate the records and save them under the results folder, without the
# dataframe index column.
df2_result = pd.DataFrame(measures2)
df2_result.to_csv(
    f"{folder}/results/test2_description_lenght_unnormalized.csv",
    index=False,
)

# Durations noted by the author (presumably run times of this cell — confirm):
# 109m 7.3s
# 200m 49.1s
# 388m 48.2s

In [14]:
# Alternative call kept for reference (compute_loo_hardness is not defined in this section):
# measures3 = compute_loo_hardness(df3, feature_columns, label_column)

# Leave-one-out evaluation of every row of df3; one result record per row.
measures3 = compute_loo_description_lenght(
    df=df3, feature_cols=feature_columns, label_col=label_column
)

# Tabulate the records and save them under the results folder, without the
# dataframe index column.
df3_result = pd.DataFrame(measures3)
df3_result.to_csv(
    f"{folder}/results/test3_description_lenght_unnormalized.csv",
    index=False,
)

# Durations noted by the author (presumably run times of this cell — confirm):
# 101m 2.0s
# 194m 12.0s
# 358m 43.8s

In [15]:
# Alternative call kept for reference (compute_loo_hardness is not defined in this section):
# measures4 = compute_loo_hardness(df4, feature_columns, label_column)

# Leave-one-out evaluation of every row of df4; one result record per row.
measures4 = compute_loo_description_lenght(
    df=df4, feature_cols=feature_columns, label_col=label_column
)

# Tabulate the records and save them under the results folder, without the
# dataframe index column.
df4_result = pd.DataFrame(measures4)
df4_result.to_csv(
    f"{folder}/results/test4_description_lenght_unnormalized.csv",
    index=False,
)

# Durations noted by the author (presumably run times of this cell — confirm):
# 96m 53.3s
# 187m 53.8s
# 331m 34.6s