# `crowd-hpo`: Results

This notebook allows us to query tabular results via [`mlflow`](https://mlflow.org/). As a prerequisite, the experiments must have been run beforehand. Once this is the case, we can load the results for the different tables in the accompanying article. Update
- `MLRUNS_PATH` to the path, where the results are stored according to your config file [`experiment.yaml`](../conf/experiment.yaml), 
- `FIGURES_PATH` to the path, where any figures are to be saved,
- `CACHE_PATH` to the path, where any intermediate results loaded via mlflow are to be cached.

In [None]:
import os
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from matplotlib.colors import LinearSegmentedColormap
from typing import Optional
from hydra.utils import to_absolute_path
from mlflow import set_tracking_uri, get_experiment_by_name, search_runs
from scipy.stats import percentileofscore

# User-specific locations; adjust these to your environment.
MLRUNS_PATH = "/mnt/work/mherde/maml/crowd_hpo/"
FIGURES_PATH = "/mnt/home/mherde/projects/github/multi-annotator-machine-learning/empirical_evaluation/figures/"
CACHE_PATH = "/mnt/home/mherde/projects/github/multi-annotator-machine-learning/empirical_evaluation/python_scripts/results_hpo_backup"

# Silence all warnings for the remainder of the notebook.
warnings.filterwarnings("ignore")

# Global plot settings: serif fonts and a diverging teal -> white -> purple colormap.
plt.rcParams.update(
    {
        "font.family": "serif",
        "mathtext.fontset": "dejavuserif",
    }
)
colors = [
    (0.0, (0, 128 / 255, 128 / 255)),
    (0.5, (1.0, 1.0, 1.0)),
    (1.0, (127 / 255, 0, 127 / 255)),
]
custom_cmap = LinearSegmentedColormap.from_list("DivergingTealPurple", colors)

# Allow pandas to display up to 1000 rows.
pd.set_option("display.max_rows", 1000)

# Output formatting switches.
SHOW_LATEX = True
SHOW_INDIVIDUAL_SCORES = False

# Mapping from logged mlflow parameter columns to the short names used in result tables.
UPDATE_COLUMNS = {
    "params.data.optimizer.gt_params.lr": "gt_lr",
    "params.data.optimizer.gt_params.weight_decay": "gt_wd",
    "params.data.optimizer.ap_params.lr": "ap_lr",
    "params.data.optimizer.ap_params.weight_decay": "ap_wd",
    "params.data.train_batch_size": "bs",
    "params.data.max_epochs": "me",
    "params.data.lr_scheduler.params.T_max": "T_max",
    "params.architecture.params.dropout_rate": "dr",
    "params.classifier.params.lmbda": "lmbda",
    "params.classifier.params.eta": "eta",
    "params.classifier.params.alpha": "alpha",
    "params.classifier.params.beta": "beta",
    "params.classifier.embed_size": "dim",
    "params.classifier.params.epsilon": "epsilon",
}
# Reverse lookup: short name -> original mlflow column.
INV_UPDATE_COLUMNS = dict(zip(UPDATE_COLUMNS.values(), UPDATE_COLUMNS.keys()))

# Approach names in the order in which they are listed in tables and plots.
APPROACHES = [
    "ground-truth",
    "majority-vote",
    "dawid-skene",
    "crowd-layer",
    "trace-reg",
    "conal",
    "union-net-a",
    "union-net-b",
    "geo-reg-w",
    "geo-reg-f",
    "madl",
    "crowd-ar",
    "annot-mix",
    "coin-net",
]

# Every dataset is combined with every annotation variant; the variant is the outer loop,
# so all datasets of one variant are listed before the next variant starts.
_DATASET_VARIANTS = ["worst-1", "worst-2", "worst-var", "rand-1", "rand-2", "rand-var", "full"]
_DATASET_BASES = ["music_genres", "label_me", "dopanim", "reuters", "spc"]
DATASETS = [f"{base}_{variant}" for variant in _DATASET_VARIANTS for base in _DATASET_BASES]

LOSS_FUNC = "zero_one_loss"

def process_df(df, clf_col='clf', agg_col='agg'):
    """
    Return a view of `df` whose first column is the classifier column.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing at least the column named by `clf_col`.
    clf_col : str, optional
        Name of the classifier column that is moved to the front. Default is 'clf'.
    agg_col : str, optional
        Name of the aggregator column. It is accepted as part of the signature but is not
        used by this function. Default is 'agg'.

    Returns
    -------
    pandas.DataFrame
        The columns of `df` with `clf_col` first, followed by all remaining columns in
        their original order.
    """
    ordered = [clf_col]
    ordered.extend(name for name in df.columns if name != clf_col)
    return df[ordered]

def evaluate(
    mlruns_path: str,
    experiment_name: str,
    update_columns: Optional[dict] = None,
    perf_type: str = "gt",
    version: str = "test",
    metric: str = "acc",
    epoch: str = "best",
    loss_func: str = "zero_one_loss",
    print_number_of_runs: bool = True,
    cache_path: Optional[str] = None,
):
    """
    Queries the evaluation results via mlflow and aggregates them over seeds.

    Parameters
    ----------
    mlruns_path : str
        Path to the results saved via mlflow.
    experiment_name : str
        Name of the mlflow experiment.
    update_columns : dict, default=None
        Optional mapping from run columns to short names; columns present in the runs are
        used as additional grouping keys. If None, the global UPDATE_COLUMNS is used.
    perf_type : str, default="gt"
        Prefix of the aggregated test metric, i.e., the column
        `metrics.{perf_type}_true_{loss_func}_{epoch}_test`.
    version : str, default="test"
        Accepted for backward compatibility; not used by the current implementation
        (the aggregated target metric always carries the `_test` suffix).
    metric : str, default="acc"
        Accepted for backward compatibility; not used by the current implementation.
    epoch : str, default="best"
        Epoch identifier that is part of the aggregated test metric name.
    loss_func : str, default="zero_one_loss"
        Loss function name that is part of the aggregated test metric name.
    print_number_of_runs : bool, default=True
        Whether to print the total number of (de-duplicated) runs.
    cache_path : Optional[str], default=None
        If provided, results are cached/read from `<cache_path>/<experiment_name>.csv`.

    Returns
    -------
    runs: pd.DataFrame or None
        Table of results or None if no experiment/runs found.
    """
    runs = None

    # Use the provided update_columns or fall back to the default global constant.
    if update_columns is None:
        update_columns = UPDATE_COLUMNS.copy()

    # Attempt to load from cache if available.
    cache_file = None
    if cache_path:
        cache_file = os.path.join(cache_path, f"{experiment_name}.csv")
        if os.path.isfile(cache_file):
            runs = pd.read_csv(cache_file)
            if "hyperparameter_search" in experiment_name:
                print(cache_file)
                # NOTE(review): the runs of the matching "default" experiment are only merged
                # in when reading from the cache, not when querying mlflow directly.
                default_cache_file = cache_file.replace("hyperparameter_search", "default")
                if os.path.isfile(default_cache_file):
                    # `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the replacement.
                    runs = pd.concat([runs, pd.read_csv(default_cache_file)])
                else:
                    print(f"Warning: Cached default results not found: {default_cache_file}")

    if runs is None:
        # Set mlflow tracking URI.
        abs_mlruns_path = to_absolute_path(mlruns_path)
        set_tracking_uri(uri=f"file://{abs_mlruns_path}")

        exp = get_experiment_by_name(experiment_name)
        if exp is None:
            return None

        query = "status = 'FINISHED'"

        runs = search_runs(experiment_ids=exp.experiment_id, filter_string=query, output_format="pandas")
        if runs.empty:
            return None
        if cache_file is not None:
            # Caching is best effort: a failure is reported but does not abort the evaluation.
            try:
                runs.to_csv(cache_file, index=False)
            except Exception as e:
                print(f"Warning: Unable to cache results due to error: {e}")

    # Base column renaming.
    rename_columns = {
        "params.data.class_definition._target_": "data",
        "params.classifier.name": "clf",
        "params.classifier.aggregation_method": "agg",
    }
    # Update renaming mapping if provided columns exist in runs.
    for col, new_name in update_columns.items():
        if col in runs.columns:
            rename_columns[col] = new_name

    # Build aggregation dictionary: the test target plus every non-"bayes" validation metric.
    agg_dict = {
        f"metrics.{perf_type}_true_{loss_func}_{epoch}_test": ["mean", "std"],
        "params.seed": ["sum"],
    }
    for col in runs.columns:
        if col.startswith("metrics.") and col.endswith("_valid") and "bayes" not in col:
            agg_dict[col] = ["mean", "std"]

    # Drop duplicates using selected columns.
    # Note: Ensure that 'params.data.class_definition.realistic_split' exists in runs.
    dup_keys = list(rename_columns.keys()) + ["params.seed", "params.data.class_definition.realistic_split"]
    runs = runs.drop_duplicates(subset=dup_keys)

    if print_number_of_runs:
        # Prints the total number of remaining runs that have a classifier name.
        print(runs["params.classifier.name"].value_counts().sum())

    runs = runs.sort_values(by="params.seed")
    runs = runs.groupby(list(rename_columns.keys()), as_index=False, dropna=False).agg(agg_dict, skipna=False)

    # Flatten multi-index columns if necessary, e.g., ("params.seed", "sum") -> "params.seedsum".
    if isinstance(runs.columns, pd.MultiIndex):
        runs.columns = ["".join(map(str, col)).strip() for col in runs.columns.values]

    # Scale zero-one-loss metrics by 100 and round them to three decimals.
    for col in runs.columns:
        if col.startswith("metrics") and "zero_one_loss" in col:
            runs[col] = np.round(runs[col] * 100, 3)

    # The aggregated seed column is exposed as "n_runs".
    rename_columns["params.seedsum"] = "n_runs"

    # Build a combined renaming mapping for any remaining columns.
    additional_rename = {
        col: col.replace("mean", "")
        .replace("max", "")
        .replace("std", "_std")
        .replace("-", "_")
        .replace("metrics.", "")
        for col in runs.columns
        if col not in rename_columns
    }
    # Merge both dictionaries. (Note: The "|" operator requires Python 3.9+)
    full_rename = rename_columns | additional_rename
    runs = runs.rename(columns=full_rename)
    return runs
    
def plot_rankings(ranking_df, file_path, approaches_key="clf", mean_axis="columns"):
    """
    Plot a table of rankings as an annotated heatmap, extended by a "Mean" row or column.

    Parameters
    ----------
    ranking_df : pandas.DataFrame
        Table whose first column holds the approach names and whose remaining columns hold
        numeric ranking values. Values are colored on a fixed scale from 0 to 1.
    file_path : str
        Path passed to `plt.savefig` for storing the figure.
    approaches_key : str, default="clf"
        Name of the column holding the approach names.
    mean_axis : {"columns", "rows"}, default="columns"
        "columns" appends one mean per numeric column (shown as an extra "Mean" approach),
        "rows" appends one mean per approach (shown as an extra "Mean" metric).

    Raises
    ------
    ValueError
        If `mean_axis` is neither "rows" nor "columns".
    """
    if mean_axis not in ("columns", "rows"):
        raise ValueError("mean_axis must be either 'rows' or 'columns'.")

    # Metric labels come from all columns but the first one; approach labels from `approaches_key`.
    metrics = [c.replace('_', '-').replace('-target', '') for c in ranking_df.columns[1:]]
    approaches = ranking_df[approaches_key].values
    ranking_matrix = ranking_df.iloc[:, 1:].values.round(2)

    if mean_axis == "columns":
        # NOTE(review): the column means skip the first data row (`iloc[1:]`) — confirm that
        # excluding the first approach from the mean is intended.
        means = ranking_df.iloc[1:, 1:].mean(axis=0).values.round(2)
        # A NaN spacer row visually separates the aggregated row from the individual ones.
        spacer = np.full((1, means.shape[0]), np.nan)
        ranking_matrix = np.vstack([ranking_matrix, spacer, means])
        approaches = np.append(approaches, ['', 'Mean'])
    else:
        means = ranking_df.iloc[:, 1:].mean(axis=1).values.round(2)
        # A NaN spacer column visually separates the aggregated column from the individual ones.
        spacer = np.full((means.shape[0], 1), np.nan)
        ranking_matrix = np.column_stack([ranking_matrix, spacer, means])
        metrics.extend(['', 'Mean'])

    n_rows, n_cols = ranking_matrix.shape

    # The matrix is shown transposed: approaches along the x-axis, metrics along the y-axis.
    plt.figure(figsize=(0.8 * n_rows, 0.6 * n_cols))
    plt.imshow(ranking_matrix.T, cmap=custom_cmap, vmin=0, vmax=1, alpha=1, aspect="auto")

    # Annotate every non-spacer cell; because of the transpose, (i, j) maps to (x, y).
    for (i, j), val in np.ndenumerate(ranking_matrix):
        if np.isnan(val):
            continue
        plt.text(i, j, f"{val:.2f}", ha="center", va="center", color='black', fontsize=16)

    # Approach labels are placed at the top of the plot, metric labels on the left.
    plt.xticks(np.arange(n_rows), approaches, rotation=90)
    plt.tick_params(top=True, labeltop=True, bottom=False, labelbottom=False)
    plt.yticks(np.arange(n_cols), metrics)

    plt.tight_layout()
    plt.savefig(file_path)
    plt.show()

## Hyperparameter Study

Print the results across all dataset variants, *learning from crowds* (LFC) approaches, and *hyperparameter selection* (HPS) criteria.

In [None]:
def _format_two_decimals(x):
    """Format numbers with two decimals and pass every other value through unchanged."""
    return f"{x:.2f}" if isinstance(x, (float, int)) else x


# Short approach names used for the LaTeX table index.
approaches_replace = {
    "ground-truth": "gt",
    "majority-vote": "mv",
    "dawid-skene": "ds",
    "crowd-layer": "cl",
    "trace-reg": "trace",
    "conal": "conal",
    "union-net-a": "union-a",
    "union-net-b": "union-b",
    "geo-reg-w": "geo-w",
    "geo-reg-f": "geo-f",
    "madl": "madl",
    "crowd-ar": "crowd-ar",
    "annot-mix": "annot-mix",
    "coin-net": "coin",
}

df_dict = {}
approach_rankings = None
metric_rankings = None
for ds in DATASETS:
    print(ds)
    results_df_dict = {}
    missing_experiment = None
    for experiment_type in ["hyperparameter_search", "default", "default_data"]:
        print(experiment_type)
        # Load results per dataset.
        exp_name = f"{experiment_type}_{ds}"
        runs_df = evaluate(
            mlruns_path=os.path.join(MLRUNS_PATH, exp_name),
            experiment_name=exp_name,
            update_columns=UPDATE_COLUMNS,
            perf_type="class",
            version="valid",
            epoch="full",
            loss_func=LOSS_FUNC,
            cache_path=CACHE_PATH,
        )
        if runs_df is None:
            # Remember which experiment is missing; the dataset is skipped after this loop.
            missing_experiment = exp_name
            break

        # Preprocess columns: merge classifier and aggregation method into a single key.
        df = runs_df.drop(columns=["data"])
        df['clf'] = df['clf'].astype(str) + '-' + df['agg'].astype(str)
        df.drop(columns=['agg'], inplace=True)

        # Identify '_valid' columns & rank them within each group
        target_col = f"class_true_{LOSS_FUNC}_full_test"
        group_cols = ["clf"]
        valid_cols = [c for c in df.columns if c.endswith("valid")]

        if SHOW_INDIVIDUAL_SCORES:

            # Identify non-valid and group columns
            other_cols = [col for col in df.columns if not col.endswith("valid") and col not in group_cols]

            # Function that, for each group, returns the rows with the minimal (i.e., best)
            # value of each "valid" column.
            def get_max_rows(grp):
                d = {}
                for col in valid_cols:
                    idx = grp[col].idxmin()
                    if not np.isnan(idx):
                        d[col] = grp.loc[idx, other_cols]
                return pd.DataFrame(d).T

            # Apply to each group
            df_max = df.groupby(group_cols).apply(get_max_rows)

            # df_max now has a multi-index with the group keys + the valid column name;
            # flatten it into regular columns.
            df_max = df_max.reset_index()
            df_max.columns = group_cols + ["valid_col"] + other_cols
            df_max = df_max.sort_values(by=["clf", f"class_true_{LOSS_FUNC}_full_test"])
            df_max["rank"] = df_max.groupby("clf")[f"class_true_{LOSS_FUNC}_full_test"].rank(
                ascending=True, pct=True, method="average"
            )
            df_max = df_max[(df_max["clf"] == "aggregate-ground-truth") & (df_max["valid_col"] == f"class_true_{LOSS_FUNC}_cv_valid")]
            df_max = df_max.rename(columns=INV_UPDATE_COLUMNS)
            drop_col = [c for c in df_max.columns if "params" not in c]
            df_max = df_max.drop(columns=drop_col)
            df_max = df_max.dropna(axis=1)
            df_max = df_max.to_dict("list")
            # `k[7:]` strips the leading "params." from the column names.
            df_max = {k[7:]: v for k, v in df_max.items() if "alpha" not in k}
            print(df_max)
            print()
            continue

        for col in valid_cols:
            df[col + "_rank"] = df.groupby(group_cols)[col].rank(method="min")

        # Define subsets & compute mean rank per subset
        subsets = {
            # Lower and upper baseline scores.
            "TRUE": [f"class_true_{LOSS_FUNC}_cv_valid"],

            "AEU": [f"class_mv_unif_{LOSS_FUNC}_cv_valid"],
            "AEC": [f"class_mv_unif_weighted_{LOSS_FUNC}_cv_valid"],
            "ALU": [f"class_mv_log_odds_1_{LOSS_FUNC}_cv_valid"],
            "ALC": [f"class_mv_log_odds_1_weighted_{LOSS_FUNC}_cv_valid"],

            "CXU": [f"annot_unif_{LOSS_FUNC}_cv_valid"],
            "CEC": [f"annot_unif_weighted_{LOSS_FUNC}_cv_valid"],
            "CLC": [f"annot_log_odds_1_weighted_{LOSS_FUNC}_cv_valid"],

            # Ensemble of criteria: any of the criteria columns listed above may be added
            # to this list to average their ranks.
            "ENS": [
                f"class_mv_log_odds_1_{LOSS_FUNC}_cv_valid",
            ],
        }
        for subset_name, subset_cols in subsets.items():
            rank_cols = [c + "_rank" for c in subset_cols]
            mean_rank_col = f"{subset_name}_mean_rank"
            try:
                subset_ranks = df[rank_cols]
            except KeyError:
                # At least one criterion column is not available for this experiment:
                # fall back to a constant placeholder rank.
                df[mean_rank_col] = 100.0
                continue
            mean_rank = subset_ranks.mean(axis=1, skipna=True)
            if len(subset_cols) > 1:
                # Slightly lower the mean rank by the spread of the individual ranks.
                mean_rank = mean_rank - 0.001 * subset_ranks.std(axis=1, skipna=True)
            df[mean_rank_col] = mean_rank

        # Start with a DataFrame of unique group columns.
        results_df_dict[experiment_type] = {
            "mean": df.drop_duplicates(group_cols)[group_cols].sort_values(group_cols).reset_index(drop=True)
        }
        results_df_dict[experiment_type]["std"] = results_df_dict[experiment_type]["mean"].copy()

        # Loop over subsets to find best row -> then attach the 'target'.
        for subset_name in subsets:
            mean_rank_col = f"{subset_name}_mean_rank"
            best_idx = df.groupby(group_cols)[mean_rank_col].idxmin().dropna()

            # Extract columns for merging, rename 'target' -> subset-specific
            best_for_subset = df.loc[best_idx, group_cols + [target_col]].copy()
            best_for_subset.rename(columns={target_col: f"{subset_name}_target"}, inplace=True)
            results_df_dict[experiment_type]["mean"] = results_df_dict[experiment_type]["mean"].merge(best_for_subset, on=group_cols, how="left")

            best_for_subset = df.loc[best_idx, group_cols + [target_col + "_std"]].copy()
            best_for_subset.rename(columns={target_col + "_std": f"{subset_name}_target"}, inplace=True)
            results_df_dict[experiment_type]["std"] = results_df_dict[experiment_type]["std"].merge(best_for_subset, on=group_cols, how="left")

        # Reformat resulting data frames: clean names, order by APPROACHES, index by approach.
        for stat, stat_df in results_df_dict[experiment_type].items():
            stat_df.columns = [f"{c.replace('_', '-').replace('-target', '')}" for c in stat_df.columns]
            stat_df['clf'] = stat_df['clf'].replace({"aggregate-": "", "-None": "", "_": "-"}, regex=True)
            stat_df['sort_key'] = stat_df['clf'].apply(lambda x: APPROACHES.index(x))
            stat_df = stat_df.sort_values('sort_key').drop(columns='sort_key')
            results_df_dict[experiment_type][stat] = stat_df.set_index(["clf"])
    if SHOW_INDIVIDUAL_SCORES:
        continue
    if missing_experiment is not None:
        # Without all three experiment types the comparison table cannot be built.
        print(f"Skipping {ds}: no results found for experiment '{missing_experiment}'.")
        continue

    # Column order: oracle selection, both default configurations, then the remaining criteria.
    perf_cols = ["TRUE", "DEF-DATA", "DEF"] + list(subsets.keys())[1:]
    results_df_dict["hyperparameter_search"]["mean"]["DEF"] = results_df_dict["default"]["mean"]["TRUE"].values
    results_df_dict["hyperparameter_search"]["std"]["DEF"] = results_df_dict["default"]["std"]["TRUE"].values
    results_df_dict["hyperparameter_search"]["mean"]["DEF-DATA"] = results_df_dict["default_data"]["mean"]["TRUE"].values
    results_df_dict["hyperparameter_search"]["std"]["DEF-DATA"] = results_df_dict["default_data"]["std"]["TRUE"].values
    results_df_dict = {
        "mean": results_df_dict["hyperparameter_search"]["mean"].reindex(perf_cols, axis=1),
        "std": results_df_dict["hyperparameter_search"]["std"].reindex(perf_cols, axis=1),
    }
    for stat in ["mean", "std"]:
        # Single `.loc[row, col]` call: chained indexing may write to a temporary copy.
        stat_df = results_df_dict[stat]
        stat_df.loc["ground-truth", "DEF-DATA"] = stat_df.loc["ground-truth", "TRUE"]
    df_dict[ds] = results_df_dict

    # Print results as markdown.
    # Column-wise `Series.map` is used instead of `DataFrame.applymap`, which newer pandas
    # versions deprecate.
    mean_str = results_df_dict["mean"].apply(lambda column: column.map(_format_two_decimals))
    std_str = results_df_dict["std"].apply(lambda column: column.map(_format_two_decimals))
    df_markdown = mean_str + " ± " + std_str
    print(df_markdown.to_markdown(tablefmt="github", floatfmt=".4f"))

    if SHOW_LATEX:
        # Get the underlying DataFrames for means and standard deviations
        mean_df = results_df_dict["mean"]
        std_df = results_df_dict["std"]

        # Column-wise minimum over all rows except the first one
        # (presumably the ground-truth row after sorting by APPROACHES — verify).
        col_min = mean_df[1:].min()

        # Row-wise minimum over all columns except the first one.
        row_min = mean_df.iloc[:, 1:].min(axis=1)

        # Object-typed copy, so that formatted strings can be stored in numeric columns.
        df_latex = mean_df.astype(object)

        # Iterate over every cell to apply the formatting
        for i in mean_df.index:
            for j in mean_df.columns:
                m_val = mean_df.loc[i, j]
                s_val = std_df.loc[i, j]
                if isinstance(m_val, (int, float)):
                    # Format the mean value with two decimals
                    formatted_mean = f"{m_val:.2f}"
                    # Bold if it is the minimum in its column
                    is_bold = m_val == col_min[j]
                    # Underline the row minimum, except in the first column and the ground-truth row
                    col_index = mean_df.columns.get_loc(j)
                    is_underline = (col_index >= 1) and (m_val == row_min[i]) and i != "ground-truth"

                    # Apply nested formatting if both conditions hold
                    if is_bold and is_underline:
                        formatted_mean = f"\\underline{{\\textBF{{{formatted_mean}}}}}"
                    elif is_bold:
                        formatted_mean = f"\\textBF{{{formatted_mean}}}"
                    elif is_underline:
                        formatted_mean = f"\\underline{{{formatted_mean}}}"

                    # Format the standard deviation value
                    if isinstance(s_val, (int, float)):
                        formatted_std = f"{s_val:.2f}" if s_val < 10 else f"{s_val:.1f}"
                    else:
                        formatted_std = s_val
                    # Combine into one LaTeX math mode string
                    df_latex.loc[i, j] = f"${formatted_mean}_{{\\pm {formatted_std}}}$"
                else:
                    df_latex.loc[i, j] = m_val

        # Format the index using typewriter font
        df_latex.index = df_latex.index.map(
            lambda x: f"\\texttt{{{approaches_replace[x]}}}"
        )

        # Replace any problematic nan strings with a placeholder
        df_latex = df_latex.replace("$nan_{\\pm nan}$", "N/A")

        # Print the LaTeX output from the styled DataFrame
        print(df_latex.style.to_latex())

    # For each performance metric, compute the normalized rank of every approach:
    # (rank - 1) / (maximum rank), so the best approach gets 0.
    if approach_rankings is None:
        approach_rankings = {metric: {} for metric in perf_cols}
    for metric in perf_cols:
        metric_rank = results_df_dict["mean"][metric].rank(method='average', pct=False)
        results_df_dict["mean"][f'{metric}_rank'] = (metric_rank - 1) / metric_rank.max()

    # For each metric, collect the rank of every approach on this dataset.
    for key, row in results_df_dict["mean"].iterrows():
        for metric in perf_cols:
            if key not in approach_rankings[metric]:
                approach_rankings[metric][key] = []
            approach_rankings[metric][key].append(row[f'{metric}_rank'])

    # For each approach, rank its performance metrics.
    if metric_rankings is None:
        metric_rankings = {}
    grouped = results_df_dict["mean"].groupby(group_cols)[perf_cols].mean().reset_index()
    grouped['sort_key'] = grouped['clf'].apply(lambda x: APPROACHES.index(x))
    grouped = grouped.sort_values('sort_key').drop(columns='sort_key')
    for _, row in grouped.iterrows():
        approach = row["clf"]
        perf_values = row[perf_cols]
        perf_rank = perf_values.rank(method='average', pct=False)
        metric_ranks = (perf_rank - 1) / perf_rank.max()
        if approach not in metric_rankings:
            metric_rankings[approach] = {metric: [] for metric in perf_cols}
        for metric in perf_cols:
            metric_rankings[approach][metric].append(metric_ranks[metric])
                
# NOTE(review): both figures are written relative to the current working directory;
# FIGURES_PATH is not used here — confirm that this is intended.

# Average the per-dataset ranks of every approach for each selection criterion
# (the ground-truth baseline is left out) and plot them with a "Mean" column.
final_rankings = []
for key in approach_rankings[perf_cols[0]].keys():
    if key == "ground-truth":
        continue
    averaged = {'clf': key}
    for metric in perf_cols:
        averaged[metric] = np.mean(approach_rankings[metric][key])
    final_rankings.append(averaged)
final_rank_df = pd.DataFrame(final_rankings)
print("Average approach ranking (from approach_rankings)")
plot_rankings(final_rank_df, mean_axis="rows", file_path=f"ranking_metrics_{LOSS_FUNC}.pdf")

# Average the per-dataset ranks of every selection criterion for each approach
# (again without the ground-truth baseline) and plot them with a "Mean" row.
final_data = []
for key, metric_dict in metric_rankings.items():
    if key == "ground-truth":
        continue
    averaged = {'clf': key}
    for metric in perf_cols:
        averaged[metric] = np.mean(metric_dict[metric])
    final_data.append(averaged)
final_rank_df = pd.DataFrame(final_data)
print("Average approach ranking (from metric_rankings)")
plot_rankings(final_rank_df, file_path=f"ranking_approaches_{LOSS_FUNC}.pdf")

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Dumbbell plot for the graphical abstract: hard-coded zero-one losses [%],
# one value per approach in the order of `APPROACHES[1:]`. `def_performance`
# is drawn as 'Default' and `opt_performance` as 'Optimized'.
# NOTE(review): nothing in this cell draws random numbers; the seed is kept
# only so that the global NumPy RNG state after the cell is unchanged.
np.random.seed(0)
approaches = APPROACHES[1:]

# reuters
def_performance = np.array([24.32, 19.88, 14.90, 16.61, 17.07, 23.36, 15.79, 20.16, 15.10, 14.99, 16.11, 27.09, 20.50])
opt_performance = np.array([16.71, 11.64, 10.52, 11.50, 11.53, 15.53, 12.13, 9.45, 12.11, 10.22, 11.78, 10.33, 10.11])

# Fail loudly instead of silently attaching losses to the wrong approach names
# when `APPROACHES` and the hard-coded values get out of sync.
if not (len(approaches) == len(def_performance) == len(opt_performance)):
    raise ValueError(
        f"Expected one loss value per approach, got {len(approaches)} approaches, "
        f"{len(def_performance)} default and {len(opt_performance)} optimized values."
    )

# Compute rankings based on the zero-one loss (lower is better), i.e., rank 1
# is assigned to the approach with the lowest loss.
default_rank = np.argsort(def_performance).argsort() + 1
optimized_rank = np.argsort(opt_performance).argsort() + 1

# Sort approaches by default performance for clearer visualization.
order = np.argsort(def_performance)
approaches_sorted = np.array(approaches)[order]
default_sorted = def_performance[order]
optimized_sorted = opt_performance[order]
default_rank_sorted = default_rank[order]
optimized_rank_sorted = optimized_rank[order]

fig, ax = plt.subplots(figsize=(11, 4.25))

# Plot dumbbell lines connecting the default and the optimized loss.
ax.hlines(y=approaches_sorted, xmin=default_sorted, xmax=optimized_sorted, color='gray', alpha=0.7, zorder=-3)

# Annotate each marker with its ranking.
for i, approach in enumerate(approaches_sorted):
    # Rank under the default configuration, written to the right of its marker.
    ax.text(default_sorted[i] + 0.2, approach,
            f'$\\#{default_rank_sorted[i]}$', color=(127/255, 0, 127/255), va='center', ha='left', fontsize=12)
    # Rank under the optimized configuration, written to the left of its marker.
    ax.text(optimized_sorted[i] - 0.2, approach,
            f'$\\#{optimized_rank_sorted[i]}$', color=(0, 127/255, 127/255), va='center', ha='right', fontsize=12)

# Plot markers for default and optimized performance.
ax.scatter(default_sorted, approaches_sorted, color=(100/255, 0, 100/255), s=100, label='Default')
ax.scatter(optimized_sorted, approaches_sorted, color=(0, 127/255, 127/255), s=100, label='Optimized')

ax.set_xlabel('Zero-one Loss [%]')
ax.set_xticks(np.arange(8, 30, 1))
ax.set_xticklabels(np.arange(8, 30, 1), fontsize=12)
ax.legend(loc='lower right')
plt.tight_layout()
plt.savefig("graphical_abstract.pdf")
plt.show()


In [None]:
# Pairwise "winning matrix" of the criteria: entry (i, j) is the percentage of
# (dataset, row) pairs in which column i has a strictly lower value than
# column j, counted only where both values are present.
win_counts = None
total_counts = None

ordered_criteria = list(subsets.keys())
ordered_criteria.insert(1, "DEF-DATA")
ordered_criteria.insert(2, "DEF")

# Loop over each dataset; the first row of every table is skipped.
for ds_name, df in df_dict.items():
    df = df["mean"][ordered_criteria][1:]
    if win_counts is None or total_counts is None:
        # Initialize DataFrames to hold win counts and total comparisons for each pair (i,j)
        win_counts = pd.DataFrame(0, index=df.columns, columns=df.columns, dtype=float)
        total_counts = pd.DataFrame(0, index=df.columns, columns=df.columns, dtype=float)
    values = df.to_numpy(dtype=float)
    observed = (~np.isnan(values)).astype(float)
    # Number of rows in which both column i and column j are not NaN.
    total_counts += observed.T @ observed
    # Comparisons involving NaN evaluate to False, so only rows with both
    # values present can count as a win. Ties count for neither side.
    win_counts += (values[:, :, None] < values[:, None, :]).sum(axis=0)

# Compute winning percentage matrix (expressed as a percentage)
winning_percentage = win_counts / total_counts * 100

# Background indicator: cell (i, j) is 1 if j wins against i more often than
# i wins against j, otherwise 0; the diagonal is left blank.
winning_diffs = ((winning_percentage.values.T - winning_percentage.values) > 0).astype(float)
np.fill_diagonal(winning_diffs, np.nan)

# Blank the diagonal on an explicit copy: writing through `.values` depends on
# it being a writable view, which pandas does not guarantee (Copy-on-Write).
pct_values = winning_percentage.to_numpy(dtype=float, copy=True)
np.fill_diagonal(pct_values, np.nan)
winning_percentage = pd.DataFrame(
    pct_values, index=winning_percentage.index, columns=winning_percentage.columns
)

fig, ax = plt.subplots(figsize=(15, 5.25))
ax.pcolormesh(winning_diffs, cmap=custom_cmap, vmin=0, vmax=1, alpha=0.3, edgecolors="white", linewidth=2, shading="flat")
ax.set_aspect("auto")
for (i, j), pct in np.ndenumerate(pct_values):
    formatted_text = r'${:.2f}$'.format(pct).replace("$nan$", "N/A")
    ax.text(j+0.5, i+0.56, formatted_text, ha="center", va="center", color="black", fontsize=18)
ax.set_xticks(np.arange(len(winning_percentage))+0.5, winning_percentage.index.values, fontsize=15)
ax.set_yticks(np.arange(len(winning_percentage))+0.5, winning_percentage.index.values, fontsize=15)
ax.invert_yaxis()
plt.tight_layout()
plt.savefig(f"criteria_winning_matrix_{LOSS_FUNC}.pdf")
plt.show()

In [None]:
# Pairwise "winning matrix" of the table rows for the "ENS" column: entry
# (i, j) is the percentage of datasets in which row i has a strictly lower
# "ENS" value than row j, counted only where both values are present.
win_counts = None
total_counts = None

# Loop over each dataset; the first row of every table is skipped.
for ds_name, df in df_dict.items():
    df = df["mean"][["ENS"]][1:]
    if win_counts is None or total_counts is None:
        # Initialize DataFrames to hold win counts and total comparisons for each pair (i,j)
        win_counts = pd.DataFrame(0, index=df.index, columns=df.index, dtype=float)
        total_counts = pd.DataFrame(0, index=df.index, columns=df.index, dtype=float)
    # Align on the row labels of the first dataset so that counts are matched
    # by label; labels missing in this dataset become NaN and are not counted,
    # labels unknown to the first dataset are ignored.
    losses = df["ENS"].reindex(win_counts.index).to_numpy(dtype=float)
    observed = ~np.isnan(losses)
    total_counts += np.outer(observed, observed).astype(float)
    # Comparisons involving NaN evaluate to False; ties count for neither side.
    win_counts += (losses[:, None] < losses[None, :]).astype(float)

# Compute winning percentage matrix (expressed as a percentage)
winning_percentage = win_counts / total_counts * 100

# Background indicator: cell (i, j) is 1 if j wins against i more often than
# i wins against j, otherwise 0; the diagonal is left blank.
winning_diffs = ((winning_percentage.values.T - winning_percentage.values) > 0).astype(float)
np.fill_diagonal(winning_diffs, np.nan)

# Blank the diagonal on an explicit copy: writing through `.values` depends on
# it being a writable view, which pandas does not guarantee (Copy-on-Write).
pct_values = winning_percentage.to_numpy(dtype=float, copy=True)
np.fill_diagonal(pct_values, np.nan)
winning_percentage = pd.DataFrame(
    pct_values, index=winning_percentage.index, columns=winning_percentage.columns
)

fig, ax = plt.subplots(figsize=(15, 6))
ax.pcolormesh(winning_diffs, cmap=custom_cmap, vmin=0, vmax=1, alpha=0.3, edgecolors="white", linewidth=2, shading="flat")
ax.set_aspect("auto")
for (i, j), pct in np.ndenumerate(pct_values):
    formatted_text = r'${:.2f}$'.format(pct).replace("$nan$", "N/A")
    ax.text(j+0.5, i+0.55, formatted_text, ha="center", va="center", color="black", fontsize=18)
ax.set_xticks(np.arange(len(winning_percentage))+0.5, winning_percentage.index.values, fontsize=15)
ax.set_yticks(np.arange(len(winning_percentage))+0.5, winning_percentage.index.values, fontsize=15)
ax.invert_yaxis()
plt.tight_layout()
plt.savefig(f"approaches_winning_matrix_{LOSS_FUNC}.pdf")
plt.show()

In [None]:
import scipy.stats as stats
import scikit_posthocs as sp
import re
from itertools import combinations
from scipy.stats import wilcoxon
from statsmodels.stats.multitest import multipletests 

# Specify test case, which is either "all" or one of the LFC approaches.
test_approach = "all"
alpha = 0.05
ordered_criteria = list(subsets.keys())
ordered_criteria.insert(1, "DEF-DATA")
ordered_criteria.insert(2, "DEF")

# Combine all datasets into one DataFrame.
df_mean_dict = {k: v["mean"][["DEF", "DEF-DATA"] + list(subsets.keys())] for k, v in df_dict.items()}
data_all = pd.concat(df_mean_dict, names=['dataset', 'variant'])
if test_approach != "all":
    is_approach = [test_approach in idx_tuple[1] for idx_tuple in data_all.index]
    data_all = data_all[is_approach]
# Keep only (dataset, variant) rows for which every criterion has a value.
data_all = data_all.dropna()
data_all = data_all.reset_index()
data_long = data_all.melt(id_vars=['dataset', 'variant'], var_name='approach', value_name='loss')
# A block is one (dataset, variant) pair; the criteria are compared within blocks.
data_long['block'] = data_long['dataset'].astype(str) + "_" + data_long['variant'].astype(str)
data_long = data_long[["block", "approach", "loss"]]
data_long['block_id'] = data_long.groupby('block').ngroup()
wide = data_long.pivot(index='block', columns='approach', values='loss')

# Perform paired Wilcoxon signed rank test.
approaches = wide.columns
n_approaches = len(approaches)
p_matrix = pd.DataFrame(np.ones((n_approaches, n_approaches)), index=approaches, columns=approaches)
sign_matrix = pd.DataFrame(np.zeros((n_approaches, n_approaches)), index=approaches, columns=approaches)
p_list, approach_pair_list = [], []
for i, j in combinations(approaches, 2):
    _, p = wilcoxon(wide[i].values, wide[j].values, alternative="two-sided")
    p_list.append(p)
    approach_pair_list.append((i, j))
    # Direction of the difference: who has the lower loss in more blocks.
    diffs = wide[i].values - wide[j].values
    wins = (diffs < 0).sum()
    losses = (diffs > 0).sum()
    if wins > losses:
        sign_matrix.loc[i, j] = 1
        sign_matrix.loc[j, i] = -1
    elif wins < losses:
        sign_matrix.loc[i, j] = -1
        sign_matrix.loc[j, i] = 1
# Holm correction over all pairwise comparisons.
p_adj_list = multipletests(p_list, method="holm")[1]
for (i, j), p_adj in zip(approach_pair_list, p_adj_list):
    p_matrix.loc[i, j] = p_matrix.loc[j, i] = p_adj
# Keep the (negated) direction only for significant pairs. A new frame is built
# because writing through `.values` is not guaranteed to reach the DataFrame.
sign_matrix = pd.DataFrame(
    np.where(p_matrix.to_numpy() < alpha, -1 * sign_matrix.to_numpy(), 0),
    index=approaches,
    columns=approaches,
)


# Print critical difference diagram.
block_ranks = data_long.groupby('block')["loss"].rank(pct=False, method="average")
ranks_per_criterion = block_ranks.groupby(data_long["approach"])
avg_rank = ranks_per_criterion.mean()[ordered_criteria]
# NOTE: despite its name, `std_rank` holds the standard error of the mean rank.
std_rank = ranks_per_criterion.sem()[ordered_criteria]
print("Average ranks:")
print(avg_rank.round(2))
print("Standard error of ranks:")
print(std_rank.round(2))
plt.figure(figsize=(10, 3), dpi=100)
plt.title('Critical difference diagram of average score ranks')
ax = sp.critical_difference_diagram(avg_rank, (p_matrix >= alpha).astype(float), text_h_margin=0.3, label_fmt_left='{label} ({rank:.2f})', label_fmt_right='({rank:.2f}) {label}')
plt.tight_layout()
plt.savefig(f"cd_plot_criteria_{test_approach}.pdf")
plt.show()

# Plot matrix of p-values.
plt.figure(figsize=(10, 5))
custom_cmap = LinearSegmentedColormap.from_list("bone", colors)
plt.imshow(sign_matrix, cmap=custom_cmap, vmin=-1, vmax=1, alpha=0.5, aspect="auto")
n_rows, n_cols = p_matrix.shape
for i in range(n_rows):
    for j in range(n_cols):
        val = p_matrix.iloc[i, j]
        if np.isnan(val):
            continue
        # Drop the leading zero, e.g. "0.0123" -> ".0123".
        txt = f"{val:.4f}"
        txt = re.sub(r"^(-?)0\.", r"\1.", txt)
        # x is the column index, y the row index of the image.
        plt.text(j, i, txt, ha="center", va="center", color='black', fontsize=16)

plt.xticks(np.arange(n_cols), p_matrix.columns, rotation=0)
plt.tick_params(top=False, labeltop=False, bottom=True, labelbottom=True)
plt.yticks(np.arange(n_rows), p_matrix.index)
plt.tight_layout()
plt.savefig("p_matrix_criteria.pdf")
plt.show()

# Absolute gain of every criterion over "DEF" per block (positive = lower loss).
gains = wide.copy()
gains.iloc[:] = ((wide["DEF"].values[:, None] - wide.values))# / wide["DEF"].values[:, None]) * 100
print(gains[ordered_criteria].mean(axis=0).round(2))
print(gains[ordered_criteria].sem(axis=0).round(2))

In [None]:
import re
from itertools import combinations

import pandas as pd
import numpy as np
import scipy.stats as stats
import scikit_posthocs as sp
import matplotlib.pyplot as plt
from scipy.stats import wilcoxon
from statsmodels.stats.multitest import multipletests

# Specify test case, which is either "all" or one of the model selection criteria.
test_approach = "ENS"
alpha = 0.05
ordered_approaches = APPROACHES[1:]

# Combine all datasets into one long DataFrame with the columns
# "variant" (table row), "approach" (table column), "loss", and "dataset".
rows = []
for ds_name, v in df_dict.items():
    wide = v["mean"][["DEF", "DEF-DATA"] + list(subsets.keys())].copy()
    # Fill missing "majority-vote" entries with its "AEU" value. The result is
    # assigned back explicitly because an in-place `fillna` on `wide.loc[...]`
    # is not guaranteed to modify `wide`.
    wide.loc["majority-vote"] = wide.loc["majority-vote"].fillna(wide.loc["majority-vote", "AEU"])
    wide = wide.assign(variant=wide.index)
    # Extra reference row that carries the "DEF" loss of "majority-vote" in every column.
    wide.loc["majority-vote-def"] = v["mean"]["DEF"].loc["majority-vote"]
    wide.loc["majority-vote-def", "variant"] = "majority-vote-def"
    long = wide.melt(id_vars="variant",
                     var_name="approach",
                     value_name="loss")

    long["dataset"] = ds_name
    rows.append(long)
data_long = pd.concat(rows, ignore_index=True)
data_long = data_long[data_long["variant"] != "ground-truth"].copy()
if test_approach != "all":
    # Only one criterion is tested, so each dataset forms one block.
    is_approach = data_long["approach"] == test_approach
    data_long = data_long[is_approach].copy()
    data_long["block"] = data_long["dataset"]
else:
    data_long["block"] = data_long["dataset"] + "_" + data_long["approach"]
data_long = data_long.dropna(subset=["loss"])
data_long["block_id"] = data_long.groupby("block").ngroup()
wide = data_long.pivot(index='block', columns='variant', values='loss')

# Perform paired Wilcoxon signed rank test.
approaches = wide.columns
n_approaches = len(approaches)
p_matrix = pd.DataFrame(np.ones((n_approaches, n_approaches)), index=approaches, columns=approaches)
sign_matrix = pd.DataFrame(np.zeros((n_approaches, n_approaches)), index=approaches, columns=approaches)
p_list, approach_pair_list = [], []
for i, j in combinations(approaches, 2):
    _, p = wilcoxon(wide[i].values, wide[j].values, alternative="two-sided")
    p_list.append(p)
    approach_pair_list.append((i, j))
    # Direction of the difference: who has the lower loss in more blocks.
    diffs = wide[i].values - wide[j].values
    wins = (diffs < 0).sum()
    losses = (diffs > 0).sum()
    if wins > losses:
        sign_matrix.loc[i, j] = 1
        sign_matrix.loc[j, i] = -1
    elif wins < losses:
        sign_matrix.loc[i, j] = -1
        sign_matrix.loc[j, i] = 1
# Holm correction over all pairwise comparisons.
p_adj_list = multipletests(p_list, method="holm")[1]
for (i, j), p_adj in zip(approach_pair_list, p_adj_list):
    p_matrix.loc[i, j] = p_matrix.loc[j, i] = p_adj

# Keep the (negated) direction only for significant pairs. A new frame is built
# because writing through `.values` is not guaranteed to reach the DataFrame.
sign_matrix = pd.DataFrame(
    np.where(p_matrix.to_numpy() < alpha, -1 * sign_matrix.to_numpy(), 0),
    index=approaches,
    columns=approaches,
)

# Print critical difference diagram.
block_ranks = data_long.groupby('block')["loss"].rank(pct=False, method="average")
ranks_per_variant = block_ranks.groupby(data_long["variant"])
avg_rank = ranks_per_variant.mean()
# NOTE: despite its name, `std_rank` holds the standard error of the mean rank.
std_rank = ranks_per_variant.sem()
print("Average ranks:")
print(avg_rank[ordered_approaches].round(2))
print("Standard error of ranks:")
print(std_rank[ordered_approaches].round(2))
plt.figure(figsize=(10, 3), dpi=100)
plt.title('Critical difference diagram of average score ranks')
ax = sp.critical_difference_diagram(avg_rank, (p_matrix >= alpha).astype(float), text_h_margin=0.3, label_fmt_left='{label} ({rank:.2f})', label_fmt_right='({rank:.2f}) {label}')
plt.tight_layout()
plt.savefig(f"cd_plot_approaches_{test_approach}.pdf")
plt.show()

# Plot matrix of p-values.
plt.figure(figsize=(12, 5))
custom_cmap = LinearSegmentedColormap.from_list("bone", colors)
plt.imshow(sign_matrix, cmap=custom_cmap, vmin=-1, vmax=1, alpha=0.5, aspect="auto")
n_rows, n_cols = p_matrix.shape
for i in range(n_rows):
    for j in range(n_cols):
        val = p_matrix.iloc[i, j]
        if np.isnan(val):
            continue
        # Drop the leading zero, e.g. "0.0123" -> ".0123".
        txt = f"{val:.4f}"
        txt = re.sub(r"^(-?)0\.", r"\1.", txt)
        # x is the column index, y the row index of the image.
        plt.text(j, i, txt, ha="center", va="center", color='black', fontsize=16)

plt.xticks(np.arange(n_cols), p_matrix.columns, rotation=0)
plt.tick_params(top=False, labeltop=False, bottom=True, labelbottom=True)
plt.yticks(np.arange(n_rows), p_matrix.index)
plt.tight_layout()
plt.savefig("p_matrix_approaches.pdf")
plt.show()

# Relative gain [%] of every variant over "majority-vote-def" per block
# (positive = lower loss).
gains = wide.copy()
gains.iloc[:] = ((wide["majority-vote-def"].values[:, None] - wide.values) / wide["majority-vote-def"].values[:, None]) * 100
print(gains[ordered_approaches].mean(axis=0).round(2))
print(gains[ordered_approaches].sem(axis=0).round(2))

In [None]:
import itertools
from scipy.stats import kendalltau, spearmanr
import seaborn as sns, matplotlib.pyplot as plt

# Rank agreement between selection criteria: for every criterion, rank the
# table rows ("variants") within each dataset by loss, then compare the
# rankings of two criteria per dataset via Kendall's tau.
rank_matrices = []
criteria = ["TRUE", "DEF", "DEF-DATA", "ENS"]
alpha = 0.05
ordered_approaches = APPROACHES[1:]

# Combine all datasets into one long DataFrame. This does not depend on the
# criterion, so it is built once instead of once per criterion.
rows = []
for ds_name, v in df_dict.items():
    wide = v["mean"][["DEF", "DEF-DATA"] + list(subsets.keys())].copy()
    # Fill missing "majority-vote" entries with its "AEU" value. The result is
    # assigned back explicitly because an in-place `fillna` on `wide.loc[...]`
    # is not guaranteed to modify `wide`.
    wide.loc["majority-vote"] = wide.loc["majority-vote"].fillna(wide.loc["majority-vote", "AEU"])
    wide = wide.assign(variant=wide.index)
    long = wide.melt(id_vars="variant",
                     var_name="approach",
                     value_name="loss")

    long["dataset"] = ds_name
    rows.append(long)
all_long = pd.concat(rows, ignore_index=True)
all_long = all_long[all_long["variant"] != "ground-truth"]

for test_approach in criteria:
    data_long = all_long
    if test_approach != "all":
        is_approach = data_long["approach"] == test_approach
        data_long = data_long[is_approach]
    data_long = data_long.copy()

    if test_approach != "all":
        data_long["block"] = data_long["dataset"]
    else:
        data_long["block"] = data_long["dataset"] + "_" + data_long["approach"]
    data_long = data_long.dropna(subset=["loss"])
    data_long["block_id"] = data_long.groupby("block").ngroup()
    wide = data_long.pivot(index='block', columns='variant', values='loss')
    # Rank the variants within every block (row) for this criterion.
    rank_matrices.append(wide.rank(pct=False, method="average", axis="columns"))
# Stacked ranks with axes (block, criterion, variant).
R = np.stack(rank_matrices, axis=1)

pairs  = list(itertools.combinations(range(len(criteria)), 2))
# Use the actual number of blocks instead of a hard-coded dataset count.
n_blocks = R.shape[0]
tau    = np.zeros((n_blocks, len(pairs)))
for d in range(n_blocks):
    for k, (c1, c2) in enumerate(pairs):
        tau[d, k], _ = kendalltau(R[d, c1], R[d, c2])

# Heat-map of mean τ
mean_tau = tau.mean(axis=0)
# Standard error of the mean: sample standard deviation divided by sqrt(n).
mean_sem = tau.std(axis=0, ddof=1) / np.sqrt(tau.shape[0])
M = np.full((len(criteria), len(criteria)), 1.0)
for (k,(i,j)) in enumerate(pairs):
    M[i,j] = M[j,i] = mean_tau[k]
sns.heatmap(M, annot=True, vmin=-1, vmax=1, cmap=custom_cmap,
            xticklabels=criteria,
            yticklabels=criteria)
plt.title("Mean Kendall τ between criteria");
plt.show()

# Violin plot of τ distributions, with the individual values as dots.
plt.figure(figsize=(5, 6))
plt.axvline(0, color='grey', linestyle=':')
for (k,(i,j)) in enumerate(pairs):
    plt.scatter(tau[:, k], np.full_like(tau[:, k], fill_value=k+1), c="blue", s=8)
plt.violinplot(tau, showmeans=True, bw_method=0.3, vert=False)
# Tick labels are the 1-based positions of the two criteria in `criteria`.
plt.yticks(range(1, len(pairs)+1), [f"{i+1}–{j+1}" for i,j in pairs])
plt.xticks(np.arange(-1, 1.25, 0.25))
plt.xlabel("Kendall τ")
plt.tight_layout()
plt.savefig("kendall_violinplot.pdf")
plt.show()

In [None]:
import pandas as pd
import numpy as np

# ------------------------------------------------------------------
# SETTINGS
# ------------------------------------------------------------------
scatter_type = "relative"
approaches      = list(subsets.keys())
approaches.insert(1, "DEF-DATA")
approaches.insert(2, "DEF")
baseline        = "DEF"
comp_approaches = [a for a in approaches if a != baseline]


def _mean_and_sem(vals):
    """Return (mean, standard error of the mean) of `vals`, or (nan, nan) if empty."""
    if not vals:
        return np.nan, np.nan
    return np.mean(vals), np.std(vals, ddof=1) / np.sqrt(len(vals))


# ------------------------------------------------------------------
# 1 · COLLECT GAPS PER GROUP
# ------------------------------------------------------------------
abs_by_group = {}   # {group: {approach: [gaps]}}
rel_by_group = {}   # same for % gaps

for ds_name, df in df_dict.items():
    # sub: DataFrame whose *rows are groups* and columns are the approaches
    sub = df["mean"][approaches]

    for group_name, row in sub.iterrows():
        base_val = row[baseline]
        if pd.isnull(base_val):
            continue

        # make sure the inner dicts exist
        abs_gaps = abs_by_group.setdefault(group_name, {a: [] for a in comp_approaches})
        rel_gaps = rel_by_group.setdefault(group_name, {a: [] for a in comp_approaches})

        for a in comp_approaches:
            val = row[a]
            if pd.notnull(val):
                gap = base_val - val                      # +ve ⇒ better than DEF
                abs_gaps[a].append(gap)
                rel_gaps[a].append((gap / base_val) * 100)

# ------------------------------------------------------------------
# 2 · BUILD SUMMARY TABLE  (mean ± standard error)
# ------------------------------------------------------------------
rows = []
for group in sorted(abs_by_group):                      # one row per group
    row_dict = {"group": group}
    for a in comp_approaches:
        # The spread is the standard error of the mean (std / sqrt(n)), the
        # same quantity that is drawn as error bar below; (nan, nan) is used
        # if the group lacks observations for an approach.
        abs_mean, abs_sd = _mean_and_sem(abs_by_group[group][a])
        rel_mean, rel_sd = _mean_and_sem(rel_by_group[group][a])

        # column names like "AEU_abs_m" (mean) and "AEU_abs_s" (standard error)
        row_dict[f"{a}_abs_m"] = abs_mean
        row_dict[f"{a}_abs_s"] = abs_sd
        row_dict[f"{a}_rel_m"] = rel_mean
        row_dict[f"{a}_rel_s"] = rel_sd

    rows.append(row_dict)

summary_df = pd.DataFrame(rows)

# nicer column order: group first, then each approach’s stats
ordered_cols = (
    ["group"] +
    [f"{a}_{suffix}"
     for a in comp_approaches
     for suffix in ("abs_m", "abs_s", "rel_m", "rel_s")]
)
summary_df = summary_df[ordered_cols]

# ------------------------------------------------------------------
# 3 · DISPLAY OR SAVE
# ------------------------------------------------------------------
pd.set_option("display.float_format", "{:,.4f}".format)
#print(summary_df.to_markdown(tablefmt="github", floatfmt=".2f"))            # to screen


categories      = list(abs_by_group.keys())[1:]       # all groups except the first one
n_groups        = len(categories)
n_approaches    = len(comp_approaches)

x_base          = np.arange(n_groups)                # positions on x‑axis
offset          = 0.8 / n_approaches                  # spread approaches across the group slot
markers         = ["o", "v", "s"]#, "^", "v", "P", "X"] # enough unique markers
# Marker and color per criterion.
# NOTE(review): another cell of this notebook spells four criteria as
# "AEA"/"ALA"/"CEA"/"CLA" instead of "AEC"/"ALC"/"CEC"/"CLC". Which spelling
# `subsets` uses cannot be told from here, so both are accepted — confirm and
# drop the stale one.
legend = {
    "TRUE": ["o", "#949494ff"],
    "DEF": ["v", "#949494ff"],
    "DEF-DATA": ["s", "#949494ff"],

    "AEU": ["o", "#4d4da6ff"],
    "AEC": ["v", "#4d4da6ff"],
    "AEA": ["v", "#4d4da6ff"],
    "ALU": ["s", "#4d4da6ff"],
    "ALC": ["D", "#4d4da6ff"],
    "ALA": ["D", "#4d4da6ff"],

    "CXU": ["o", "#a64da6ff"],
    "CEC": ["v", "#a64da6ff"],
    "CEA": ["v", "#a64da6ff"],
    "CLC": ["s", "#a64da6ff"],
    "CLA": ["s", "#a64da6ff"],

    "ENS": ["s", "#4da6a6ff"],
}

# Gap type and y-axis ticks depend only on `scatter_type`.
# (`x_ticks` keeps its original name although it holds the y-axis ticks.)
if scatter_type == "relative":
    gaps_by_group = rel_by_group
    x_ticks = np.arange(-10, 30, 5)
else:
    gaps_by_group = abs_by_group
    x_ticks = np.arange(-4, 10, 2)

plt.figure(figsize=(max(6, n_groups * 1.5), 5))

for idx, approach in enumerate(comp_approaches):
    means, ses = [], []
    for g in categories:
        mean, se = _mean_and_sem(gaps_by_group[g][approach])
        means.append(mean)
        ses.append(se)

    # x positions shifted per approach
    x_pos = x_base - 0.4 + offset/2 + idx * offset
    plt.errorbar(x_pos, means, yerr=ses, fmt=legend[approach][0], color=legend[approach][1],
                 capsize=3, label=approach, linestyle='None')

# Aesthetics
plt.axhline(0, color='grey', linestyle=':')
plt.xlim(-0.5, n_groups-0.5)
plt.xticks(x_base, categories, ha='center', fontsize=15)
plt.yticks(x_ticks, fontsize=15)
plt.legend(ncol=min(n_approaches, 11), fontsize=15)
plt.tight_layout()
plt.savefig(f"criteria_{scatter_type}_diff_per_approach.pdf")
plt.show()

In [None]:
import pandas as pd
import numpy as np
data_set_meta_features = pd.read_csv("../python_scripts/metafeatures.csv", index_col="dataset")

# ------------------------------------------------------------------
# SETTINGS
# ------------------------------------------------------------------
scatter_type = "absolute"
criteria      = list(subsets.keys())
criteria.insert(1, "DEF-DATA")
criteria.insert(2, "DEF")
baseline        = "DEF"
comp_criteria = [c for c in criteria if c != baseline]
x_label = "aggregation_noise"
keep_indices = []
# {criterion: [one list of gaps per kept dataset]}
abs_by_group = {c: [] for c in comp_criteria}
rel_by_group = {c: [] for c in comp_criteria}

# ------------------------------------------------------------------
# 1 · COLLECT GAPS PER GROUP
# ------------------------------------------------------------------
for ds_idx, (ds_name, ds_features) in enumerate(data_set_meta_features.iterrows()):
    if ds_name not in df_dict:
        continue
    keep_indices.append(ds_idx)
    # Only rows that have a value for every criterion are compared.
    sub = df_dict[ds_name]["mean"][criteria]
    sub = sub.dropna(how='any')
    x_val = ds_features[x_label]

    # Gap of every criterion to the baseline (positive = lower than baseline).
    abs_diffs = sub[baseline].values[:, None] - sub[comp_criteria].values
    rel_diffs = (abs_diffs / sub[baseline].values[:, None]) * 100
    for criterion_idx, criterion in enumerate(comp_criteria):
        abs_by_group[criterion].append(abs_diffs[:, criterion_idx].ravel().tolist())
        rel_by_group[criterion].append(rel_diffs[:, criterion_idx].ravel().tolist())

# Marker and color per criterion.
# NOTE(review): another cell of this notebook spells four criteria as
# "AEC"/"ALC"/"CEC"/"CLC" instead of "AEA"/"ALA"/"CEA"/"CLA". Which spelling
# `subsets` uses cannot be told from here, so both are accepted — confirm and
# drop the stale one.
legend = {
    "TRUE": ["o", "#949494ff"],
    "DEF": ["v", "#949494ff"],
    "DEF-DATA": ["s", "#949494ff"],

    "AEU": ["o", "#4d4da6ff"],
    "AEA": ["v", "#4d4da6ff"],
    "AEC": ["v", "#4d4da6ff"],
    "ALU": ["s", "#4d4da6ff"],
    "ALA": ["D", "#4d4da6ff"],
    "ALC": ["D", "#4d4da6ff"],

    "CXU": ["o", "#a64da6ff"],
    "CEA": ["v", "#a64da6ff"],
    "CEC": ["v", "#a64da6ff"],
    "CLA": ["s", "#a64da6ff"],
    "CLC": ["s", "#a64da6ff"],

    "ENS": ["s", "#4da6a6ff"],
}


# Compute bin edges and indices
bins = np.array([0, 10, 20, 30, 40, 50, 60, 70, 80, 90])
num_bins = len(bins)-1
x_values = data_set_meta_features[x_label].values[keep_indices]
bin_indices = np.digitize(x_values, bins) - 1  # subtract 1 to get 0-based index
bin_centers = (bins[:-1] + bins[1:]) / 2
print(f"Bins: {np.unique(bin_indices, return_counts=True)}")


x_base          = np.arange(num_bins)
offset          = 0.8 / len(comp_criteria)
y_ticks         = np.arange(-4, 10, 2)
gaps_by_criterion = abs_by_group if scatter_type == "absolute" else rel_by_group
plt.figure(figsize=(max(6, len(comp_criteria) * 1.5), 5))
for criterion_idx, criterion in enumerate(comp_criteria):
    means, ses = [], []
    per_dataset_gaps = gaps_by_criterion[criterion]
    for bin_idx in range(num_bins):
        # Pool the gaps of all datasets that fall into this bin. The lists are
        # concatenated rather than stacked because datasets may keep a
        # different number of rows after `dropna`.
        in_bin = [gaps for gaps, b in zip(per_dataset_gaps, bin_indices) if b == bin_idx]
        y_ravel = np.concatenate(in_bin) if in_bin else np.array([])
        if y_ravel.size:
            means.append(y_ravel.mean())
            ses.append(y_ravel.std() / np.sqrt(y_ravel.size))
        else:
            # Empty bin: nothing to draw.
            means.append(np.nan)
            ses.append(np.nan)

    # x positions shifted per criterion
    x_pos = x_base - 0.4 + offset/2 + criterion_idx * offset
    plt.errorbar(x_pos, means, yerr=ses, fmt=legend[criterion][0], color=legend[criterion][1],
                 capsize=3, label=criterion, linestyle='None')

# Aesthetics
plt.axhline(0, color='grey', linestyle=':')
plt.xlim(-0.5, num_bins-0.5)
plt.xticks(x_base, bin_centers, ha='center', fontsize=15)
plt.yticks(y_ticks, fontsize=15)
plt.legend(ncol=2, fontsize=15)
plt.tight_layout()
plt.savefig(f"criteria_{scatter_type}_diff_per_noise_level.pdf")
plt.show()


In [None]:
import pandas as pd
import numpy as np

# ------------------------------------------------------------------
# SETTINGS
# ------------------------------------------------------------------
scatter_type = "absolute"
approaches      = APPROACHES[1:]
baseline        = "majority-vote"
comp_approaches = approaches
criteria        = list(subsets.keys())
criteria.insert(1, "DEF-DATA")
criteria.insert(2, "DEF")
print(criteria)


def _mean_and_sem(vals):
    """Return (mean, standard error of the mean) of `vals`, or (nan, nan) if empty."""
    if not vals:
        return np.nan, np.nan
    return np.mean(vals), np.std(vals, ddof=1) / np.sqrt(len(vals))


# ------------------------------------------------------------------
# 1 · COLLECT GAPS PER GROUP
# ------------------------------------------------------------------
abs_by_group = {}   # {group: {approach: [gaps]}}
rel_by_group = {}   # same for % gaps

for ds_name, df in df_dict.items():
    # sub: DataFrame whose *rows are groups (criteria)* and columns are the approaches
    sub = df["mean"][criteria].T[approaches]
    # Every gap of this dataset is measured against "majority-vote" under "DEF".
    base_val = sub["majority-vote"]["DEF"]
    if pd.isnull(base_val):
        continue

    for group_name, row in sub.iterrows():
        # make sure the inner dicts exist
        abs_gaps = abs_by_group.setdefault(group_name, {a: [] for a in comp_approaches})
        rel_gaps = rel_by_group.setdefault(group_name, {a: [] for a in comp_approaches})

        for a in comp_approaches:
            val = row[a]
            if pd.notnull(val):
                gap = base_val - val                      # +ve ⇒ better than the baseline
                abs_gaps[a].append(gap)
                rel_gaps[a].append((gap / base_val) * 100)

# ------------------------------------------------------------------
# 2 · BUILD SUMMARY TABLE  (mean ± standard error)
# ------------------------------------------------------------------
rows = []
for group in sorted(abs_by_group):                      # one row per group
    row_dict = {"group": group}
    for a in comp_approaches:
        # The spread is the standard error of the mean (std / sqrt(n)), the
        # same quantity that is drawn as error bar below; (nan, nan) is used
        # if the group lacks observations for an approach.
        abs_mean, abs_sd = _mean_and_sem(abs_by_group[group][a])
        rel_mean, rel_sd = _mean_and_sem(rel_by_group[group][a])

        # column names like "madl_abs_m" (mean) and "madl_abs_s" (standard error)
        row_dict[f"{a}_abs_m"] = abs_mean
        row_dict[f"{a}_abs_s"] = abs_sd
        row_dict[f"{a}_rel_m"] = rel_mean
        row_dict[f"{a}_rel_s"] = rel_sd

    rows.append(row_dict)

summary_df = pd.DataFrame(rows)

# nicer column order: group first, then each approach’s stats
ordered_cols = (
    ["group"] +
    [f"{a}_{suffix}"
     for a in comp_approaches
     for suffix in ("abs_m", "abs_s", "rel_m", "rel_s")]
)
summary_df = summary_df[ordered_cols]

# ------------------------------------------------------------------
# 3 · DISPLAY OR SAVE
# ------------------------------------------------------------------
pd.set_option("display.float_format", "{:,.4f}".format)
#print(summary_df.to_markdown(tablefmt="github", floatfmt=".2f"))            # to screen



# ------------------------------------------------------------------
# 4 · COLLAPSE mean & sd  →  "$μ_{\pm σ}$"
# ------------------------------------------------------------------
def make_cell(mu, sd):
    """Format a mean and its spread as a LaTeX cell; "--" if either is NaN."""
    if np.isnan(mu) or np.isnan(sd):
        return "--"
    if mu > 0:
        return fr"$+{mu:.4f}_{{\pm {sd:.4f}}}$"
    else:
        return fr"${mu:.4f}_{{\pm {sd:.4f}}}$"

# Groups in insertion order, which is the row order of `sub`, i.e. the order
# of `criteria`. The same list drives both the plotted values and the tick
# labels, so they cannot get out of sync.
categories      = list(abs_by_group.keys())
n_groups        = len(categories)
print(n_groups)
n_approaches    = len(comp_approaches)

x_base          = np.arange(n_groups)                # positions on x‑axis
offset          = 0.8 / n_approaches                  # spread approaches across the group slot
markers         = ["o", "v", "s"]#, "^", "v", "P", "X"] # enough unique markers
legend = {
    "majority-vote": ["o", "#949494ff"],
    "dawid-skene": ["v", "#949494ff"],

    "crowd-layer": ["o", "#4d4da6ff"],
    "trace-reg": ["v", "#4d4da6ff"],
    "conal": ["s", "#4d4da6ff"],
    "union-net-a": ["D", "#4d4da6ff"],
    "union-net-b": ["X", "#4d4da6ff"],
    "geo-reg-w": ["P", "#4d4da6ff"],
    "geo-reg-f": ["^", "#4d4da6ff"],

    "crowd-ar": ["o", "#a64da6ff"],
    "madl": ["v", "#a64da6ff"],
    "annot-mix": ["s", "#a64da6ff"],
    "coin-net": ["D", "#a64da6ff"],
}

# Gap type and y-axis ticks depend only on `scatter_type`.
if scatter_type == "relative":
    gaps_by_group = rel_by_group
    yticks = np.arange(-5, 40, 5)
else:
    gaps_by_group = abs_by_group
    yticks = np.arange(-2, 14, 2)

plt.figure(figsize=(max(6, n_groups * 1.5), 5))

for idx, approach in enumerate(comp_approaches):
    means, ses = [], []
    for g in categories:
        mean, se = _mean_and_sem(gaps_by_group[g][approach])
        means.append(mean)
        ses.append(se)

    # x positions shifted per approach
    x_pos = x_base - 0.4 + offset/2 + idx * offset
    plt.errorbar(x_pos, means, yerr=ses, fmt=legend[approach][0], color=legend[approach][1],
                 capsize=3, label=approach, linestyle='None')

# Aesthetics
plt.axhline(0, color='grey', linestyle=':')
plt.xlim(-0.5, n_groups-0.5)
plt.xticks(x_base, categories, ha='center', fontsize=15)
plt.yticks(yticks, fontsize=15)
plt.legend(ncol=min(n_approaches, 6), fontsize=15)
plt.tight_layout()
plt.savefig(f"approaches_{scatter_type}_diff_per_criterion.pdf")
plt.show()

In [None]:
# Hard-coded timings of five repetitions each; the bar plot compares their means.
training_times = np.array([154.3700, 153.3405, 154.2420, 154.5105, 154.6445])
risk_measurement_times = np.array([1.7215, 1.6655, 1.6910, 1.6225, 1.6180])

# Calculate means
means = [training_times.mean(), risk_measurement_times.mean()]
x_labels = ['Training Times', 'Risk Measurement Times']

# Create vertical bar plot
fig, ax = plt.subplots(figsize=(5, 3))
# One bar per label; the positions are derived from `x_labels`, the list
# defined in this cell.
x_pos = range(len(x_labels))
ax.bar(x_pos, means)
ax.set_xticks(x_pos)
ax.set_xticklabels(x_labels)
ax.set_yticks(np.arange(0, 200, 30))
ax.set_ylabel('Mean Time')

# Annotate bars with mean values
for i, v in enumerate(means):
    ax.text(i, v, f'${v:.2f}$s', ha='center', va='bottom')

plt.tight_layout()
plt.savefig("computation_time_comparison.pdf")
plt.show()

In [None]:
import numpy as np
import pandas as pd
from itertools import product
from typing import List, Sequence, Tuple, Iterable

import numpy as np
import pandas as pd
from itertools import product
from typing import List, Tuple

# Collect, for every approach and every dataset variant, the rows of the
# hyperparameter-search results that belong to that approach:
#     df_dict[approach][dataset] -> DataFrame
approach_rankings = None
metric_rankings = None
approach_list = [
    #"ground_truth",
    #"majority_vote",
    "dawid-skene",
    "crowd_layer",
    "trace_reg",
    "conal",
    "union_net_a",
    "union_net_b",
    "madl",
    "geo_reg_w",
    "geo_reg_f",
    "crowd_ar",
    "annot_mix",
    "coin_net",
]

# Variant-major order: all base datasets for the first variant, then the next.
dataset_list = [
    f"{name}_{v}"
    for v in ["worst-1", "worst-2", "worst-var", "rand-1", "rand-2", "rand-var", "full"]
    for name in ["spc", "reuters", "music_genres", "label_me", "dopanim"]
]

df_dict = {approach: {} for approach in approach_list}
for ds in dataset_list:
    # Load results per dataset. The call does not depend on the approach, so
    # it is done once per dataset instead of once per (approach, dataset).
    exp_name = f"hyperparameter_search_{ds}"
    runs_df = evaluate(
        mlruns_path=os.path.join(MLRUNS_PATH, exp_name),
        experiment_name=exp_name,
        update_columns=UPDATE_COLUMNS,
        perf_type="class",
        version="valid",
        epoch="full",
        loss_func=LOSS_FUNC,
        cache_path=CACHE_PATH,
    )
    if runs_df is None:
        # No results available for this dataset: it is left out of every
        # df_dict[approach].
        continue

    # Preprocess columns: merge classifier and aggregation name into `clf`.
    df = runs_df.drop(columns=["data"])
    df["clf"] = df["clf"].astype(str) + "-" + df["agg"].astype(str)
    df = df.drop(columns=["agg"])

    # Keep only the rows of each approach. "dawid-skene" is stored as an
    # aggregation ("aggregate-dawid-skene"); all other approaches carry the
    # aggregation "None".
    for approach in approach_list:
        app = f"aggregate-{approach}" if approach == "dawid-skene" else f"{approach}-None"
        df_dict[approach][ds] = df[df["clf"] == app]

print(df_dict.keys())

In [None]:
from math import ceil, floor
# -------------------------------------------------------------------
# 1.  utilities
# -------------------------------------------------------------------
def simplex_grid(
        K: int,
        step: float      = 0.05,
        w_min: float     = 0.1,
        w_max: float     = 0.9,
) -> Iterable[np.ndarray]:
    """
    Evenly–spaced points on the K-simplex  Σ w_k = 1  with

        w_min ≤ w_k ≤ w_max   for every k.

    Parameters
    ----------
    K      : int      number of dimensions
    step   : float    grid resolution in (0, 1]; the grid denominator is
                      round(1 / step), so weights are multiples of
                      1 / round(1 / step)
    w_min  : float    lower bound  (0 ≤ w_min ≤ 1/K)
    w_max  : float    upper bound  (1/K ≤ w_max ≤ 1)

    Yields
    ------
    ndarray shape (K,) – one feasible weight vector per iteration, in
    lexicographic order of the underlying integer counts

    Raises
    ------
    ValueError  if `step` is outside (0, 1] or the bounds are inconsistent
                with Σ w_k = 1 (raised on first iteration, as this is a
                generator)
    """
    if not (0.0 < step <= 1.0):
        raise ValueError("Require 0 < step ≤ 1.")
    if not (0.0 <= w_min <= w_max <= 1.0):
        raise ValueError("Require 0 ≤ w_min ≤ w_max ≤ 1.")
    if K * w_min - 1 > 1e-12 or K * w_max + 1e-12 < 1:
        raise ValueError(
            "Bounds incompatible with Σw_k=1: need  K·w_min ≤ 1 ≤ K·w_max."
        )

    m        = int(round(1 / step))              # denominator of the grid
    # Tolerance against float noise: e.g. 0.07 * 100 == 7.000000000000001,
    # which a plain ceil() would turn into 8 and silently drop the boundary.
    eps      = 1e-9
    lo       = ceil(w_min * m - eps)             # min count per dim
    hi       = floor(w_max * m + eps)            # max count per dim

    # The last count is fixed by the simplex condition, so only the first
    # K-1 counts are enumerated. The yield order is the same as filtering
    # the full K-dimensional product, at a fraction of the iterations.
    for head in product(range(lo, hi + 1), repeat=K - 1):
        last = m - sum(head)
        if lo <= last <= hi:
            yield np.array(head + (last,), dtype=float) / m


def mean_test_loss(
    w: np.ndarray,
    R: np.ndarray,      # (n_datasets, B, K)  - ranks (1 = best)
    L: np.ndarray,      # (n_datasets, B)     - 0-1 loss (lower = better)
) -> float:
    """
    Mean loss of the configurations picked by the weighted-rank rule `w`.

    For every dataset the K ranks of each configuration are combined into a
    single score with the weights `w`; the configuration with the smallest
    score is selected and its loss is looked up in `L`. The returned value
    is the mean of these selected losses over all datasets.
    """
    # Weighted rank score per configuration, shape (n_datasets, B).
    weighted_rank = np.tensordot(R, w, axes=(2, 0))
    # Index of the winning configuration per dataset (first one on ties).
    winner = np.argmin(weighted_rank, axis=1)
    dataset_idx = np.arange(L.shape[0])
    selected_losses = L[dataset_idx, winner]
    return selected_losses.mean()

def mean_rank_error(
    w: np.ndarray,
    R: np.ndarray,     # shape = (n_datasets, B, K)  – predicted ranks (1 = best)
    L: np.ndarray,     # shape = (n_datasets, B)     – ground-truth loss (lower = better)
    metric: str = "kendall"      # "kendall" | "spearman"
) -> float:
    """
    Average *ranking* error of the weighted rule w over all datasets.

    For every dataset, the weighted rank score of each configuration is
    correlated with its ground-truth loss. The correlation is negated so
    that smaller is better: -1 means the weighted score orders the
    configurations exactly like the true loss, +1 means exactly reversed.

    Parameters
    ----------
    w      : ndarray (K,)               weights of the K rank columns
    R      : ndarray (n_datasets, B, K) ranks per configuration (1 = best)
    L      : ndarray (n_datasets, B)    ground-truth loss (lower = better)
    metric : "kendall" (τ-b) or "spearman"

    Returns
    -------
    float – mean negated correlation; datasets whose correlation is NaN
    (e.g. constant scores or losses) are ignored by the mean.

    Raises
    ------
    ValueError  if `metric` is neither "kendall" nor "spearman"
    """
    # Imported locally because no cell visible here brings `stats` into scope.
    from scipy import stats

    if metric == "kendall":
        def correlation(a, b):
            return stats.kendalltau(a, b, variant="b").correlation
    elif metric == "spearman":
        def correlation(a, b):
            return stats.spearmanr(a, b).correlation
    else:
        raise ValueError("metric must be 'kendall' or 'spearman'")

    # 1. score each config with the weighted-Borda rule
    score = np.tensordot(R, w, axes=(2, 0))          # (n_datasets, B)

    # 2. correlate scores with true losses per dataset. Both statistics are
    #    rank-based, so the raw values are passed directly. The previous
    #    version correlated the `argsort` permutations instead, which list
    #    *which config sits at each position* rather than the rank of each
    #    config, and therefore do not yield the rank correlation.
    errs = [-correlation(s, t) for s, t in zip(score, np.asarray(L))]
    return float(np.nanmean(errs))     # mean across datasets


# -------------------------------------------------------------------
# 2.  main optimiser
# -------------------------------------------------------------------
def optimise_weights_cv(
    dfs: List[pd.DataFrame],
    valid_cols: Sequence[str],
    cv_splits: List[Tuple[Sequence[int], Sequence[int]]],
    test_col: str                       = "test_loss",
    grid_step: float                    = 0.05,
    ascending_rank: bool                = True,
) -> Tuple[np.ndarray, float, List[float]]:
    """
    Grid-search rank-aggregation weights with cross-validation over datasets.

    Per fold, every weight vector of the simplex grid is scored by
    `mean_test_loss` on the training datasets; the best one is then
    evaluated with `mean_rank_error` on the held-out datasets. Ties in the
    training objective are broken in favour of the weight vector closest
    (squared Euclidean distance) to the uniform weights.

    Parameters
    ----------
    dfs         : list of length N; every DataFrame must contain `valid_cols`
                  and `test_col` and have the same number of rows B
    valid_cols  : names of the K validation-metric columns
    test_col    : column containing **loss** (lower = better)
    cv_splits   : list of (train_idx, valid_idx) tuples indexing into `dfs`
                  – exactly what sklearn splitters yield
    grid_step   : mesh size of the simplex grid; the grid uses the default
                  bounds of `simplex_grid` (0.1 ≤ w_k ≤ 0.9), which gives
                  1 001 points for K=5 at step 0.05
    ascending_rank : True  → rank 1 = *smallest* validation score (loss)
                     False → rank 1 = *largest*  validation score (accuracy)

    Returns
    -------
    w_bar       : ndarray (K,)   – averaged weight vector over folds
    cv_mean     : float          – mean of `fold_scores`
    fold_scores : list[float]    – held-out `mean_rank_error` per fold
                                   (negated Kendall τ-b, lower = better)

    Raises
    ------
    ValueError  if `cv_splits` is empty or the weight grid has no points
    """
    K = len(valid_cols)
    if len(cv_splits) == 0:
        raise ValueError("cv_splits must contain at least one (train_idx, valid_idx) pair.")

    # 1.  convert validation scores → ranks (1 = best)
    ranks = np.stack([
        df[valid_cols].rank(ascending=ascending_rank, method="min").to_numpy(dtype=int)
        for df in dfs
    ])                                                        # (N, B, K)
    losses = np.stack([df[test_col].to_numpy() for df in dfs])  # (N, B)

    # 2. prepare weight grid once, together with each point's squared
    #    distance to the uniform weights (needed for tie-breaking only)
    grid = list(simplex_grid(K, step=grid_step))
    if not grid:
        raise ValueError("The weight grid is empty for the given K and grid_step.")
    uniform = np.full(K, 1.0 / K)
    dist_to_uniform = [((uniform - w) ** 2).sum() for w in grid]

    # 3. cross-validation loop
    w_sum, fold_scores = np.zeros(K), []
    for train_idx, valid_idx in cv_splits:
        R_tr, L_tr = ranks[train_idx], losses[train_idx]

        # grid search: lowest training objective wins; among equal
        # objectives the point nearest to uniform (first one found) wins
        best_w, best_obj, best_dist = None, np.inf, np.inf
        for w, dist in zip(grid, dist_to_uniform):
            obj = mean_test_loss(w, R_tr, L_tr)
            if obj < best_obj or (obj == best_obj and dist < best_dist):
                best_w, best_obj, best_dist = w, obj, dist

        # evaluate on validation datasets of this fold
        R_val, L_val = ranks[valid_idx], losses[valid_idx]
        fold_scores.append(mean_rank_error(best_w, R_val, L_val))
        w_sum += best_w

    w_bar   = w_sum / len(cv_splits)
    cv_mean = float(np.mean(fold_scores))
    return w_bar, cv_mean, fold_scores

# Grid-search the rank-aggregation weights per approach while leaving one
# base dataset (all of its variants) out of the optimisation:
#     w_dict[approach][held_out_dataset] -> averaged weight vector
w_dict = {}
valid_scores = subsets["ENS"]
test_score = f"class_true_{LOSS_FUNC}_full_test"

# Four folds of seven consecutive frames each. This assumes that exactly 28
# frames remain after removing the held-out dataset and that, after sorting
# by key, the seven variants of one base dataset are contiguous — TODO confirm.
cv_splits = [
    (range(7, 28), range(0, 7)),
    (list(range(0, 7)) + list(range(14, 28)), range(7, 14)),
    (list(range(0, 14)) + list(range(21, 28)), range(14, 21)),
    (range(0, 21), range(21, 28)),
]
# Alternative (leave-one-frame-out):
#cv_splits=[([j for j in range(28) if j != i], [i]) for i in range(28)]

for approach in approach_list:
    print(approach)
    w_dict[approach] = {}
    for ds in ["spc", "reuters", "music_genres", "label_me", "dopanim"]:
        print(ds)
        w, cv_acc, fold_acc = optimise_weights_cv(
            [v for k, v in sorted(df_dict[approach].items()) if ds not in k],
            valid_cols=valid_scores,
            test_col=test_score,
            cv_splits=cv_splits,
            grid_step=0.05,
        )
        w_dict[approach][ds] = w

# Summary of the *last* (approach, dataset) combination only. The second
# value is the mean held-out rank error returned by `optimise_weights_cv`
# (lower = better), not an accuracy, hence the corrected label.
print("Optimal averaged weights :", np.round(w, 3))
print("LODO mean rank error     :", cv_acc)

In [None]:
import numpy as np
import torch, torch.nn as nn, torch.nn.functional as F
from torch.optim import RAdam
from sklearn.preprocessing import StandardScaler

class Meta2Weights(nn.Module):
    """
    Map meta-features to a weight vector on the simplex.

    meta (batch, P)  ─►  [meta, 1]  ─►  linear (no bias)  ─►  softmax  ─►  w (batch, K)

    Every output row is non-negative and sums to one.
    """

    def __init__(self, P: int, K: int):
        super().__init__()
        # The bias is modelled as the weight of a constant-one input column
        # that `forward` appends, hence P + 1 inputs and `bias=False`.
        self.linear = nn.Linear(in_features=P + 1, out_features=K, bias=False)

    def forward(self, meta: torch.Tensor) -> torch.Tensor:
        # Constant-one column with the dtype and device of `meta`.
        ones_column = torch.ones_like(meta[:, :1])
        augmented = torch.cat((meta, ones_column), dim=1)
        logits = self.linear(augmented)
        return torch.softmax(logits, dim=1)                     # (batch, K)

def expected_test_loss(weights,    # (n,K)
                       V_batch,    # (n,B,K)
                       T_batch,    # (n,B)
                       tau: float = 0.08) -> torch.Tensor:
    """
    Differentiable surrogate of the test loss of the selected configuration.

    Each configuration is scored by the weighted sum of its K validation
    values. A softmin with temperature `tau` over the B scores of a dataset
    gives selection probabilities (lower score → higher probability), and
    the test losses are averaged under these probabilities. The result is
    the mean over the n datasets, returned as a scalar tensor.
    """
    # Weighted validation score per configuration, shape (n, B).
    weighted = weights.unsqueeze(1) * V_batch
    score = weighted.sum(dim=2)
    # Softmin: probability of picking each configuration.
    pick_prob = torch.softmax(-score / tau, dim=1)
    per_dataset = (pick_prob * T_batch).sum(dim=1)
    return per_dataset.mean()

def learn_weights(M, V, T,
                  splits,          # list of (train_idx, valid_idx)
                  n_iter = 3500,
                  lr     = 0.001,
                  l2     = 1,
                  ent    = 5e-3):
    """
    Cross-validate a `Meta2Weights` network that maps meta-features to weights.

    For every (train_idx, valid_idx) pair a fresh network is trained for
    `n_iter` RAdam steps (learning rate `lr`, weight decay `l2`) on
    `expected_test_loss` plus an entropy term scaled by `ent`, and is then
    evaluated on the held-out rows.

    Returns:
        w_bar      (K,)  – mean over folds of the mean held-out weight vector
        cv_loss          – mean held-out `expected_test_loss`
        fold_losses      – list per fold
    """
    device = "cpu"   # "cuda:0" if you have a GPU

    # Shapes are read from the inputs: V is (N, B, K), M is (N, D).
    _, _, n_metrics = V.shape
    n_meta = M.shape[1]

    # tensors
    meta_t = torch.as_tensor(M, dtype=torch.float32, device=device)
    valid_t = torch.as_tensor(V, dtype=torch.float32, device=device)
    test_t = torch.as_tensor(T, dtype=torch.float32, device=device)

    fold_losses = []
    weight_total = torch.zeros(n_metrics, device=device)

    for train_idx, held_out_idx in splits:
        model = Meta2Weights(n_meta, n_metrics).to(device)
        optimizer = RAdam(model.parameters(), lr=lr, weight_decay=l2)

        # The training rows are the same in every step, so slice them once.
        meta_tr, valid_tr, test_tr = meta_t[train_idx], valid_t[train_idx], test_t[train_idx]

        for _ in range(n_iter):
            weights_tr = model(meta_tr)                           # (n_tr,K)
            fit_term = expected_test_loss(weights_tr, valid_tr, test_tr)

            # Negative entropy scaled by `ent`: minimising it favours
            # spread-out weights.
            log_weights = weights_tr.clamp_min(1e-9).log()
            entropy_term = ent * (weights_tr * log_weights).sum(1).mean()

            objective = fit_term + entropy_term
            objective.backward()
            optimizer.step()
            optimizer.zero_grad()

        # ---------- validation ----------
        with torch.no_grad():
            weights_va = model(meta_t[held_out_idx])              # (n_va,K)
            held_out_loss = expected_test_loss(
                weights_va, valid_t[held_out_idx], test_t[held_out_idx]
            )
            fold_losses.append(held_out_loss.item())
            weight_total += weights_va.mean(0)

    w_bar = (weight_total / len(splits)).cpu().numpy()
    cv_loss = float(np.mean(fold_losses))
    return w_bar, cv_loss, fold_losses

def fit_final_network(M, V, T,
                      n_iter = 3000, lr = 1e-1,
                      l2 = 0, ent = 1e-3, tau = 1):
    """
    Train one `Meta2Weights` network on all rows of M, V and T.

    The network is optimised for `n_iter` Adam steps (learning rate `lr`,
    weight decay `l2`) on `expected_test_loss` with temperature `tau` plus
    an entropy term scaled by `ent`. The loss curve (without the entropy
    term) is plotted afterwards, and the trained network is returned.
    """
    dev = "cpu"

    # Shapes are read from the inputs: V is (N, B, K), M is (N, D).
    _, _, n_metrics = V.shape
    n_meta = M.shape[1]

    model = Meta2Weights(n_meta, n_metrics).to(dev)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=l2)

    meta_t = torch.as_tensor(M, dtype=torch.float32, device=dev)
    valid_t = torch.as_tensor(V, dtype=torch.float32, device=dev)
    test_t = torch.as_tensor(T, dtype=torch.float32, device=dev)

    loss_curve = []
    for _ in range(n_iter):
        weights = model(meta_t)                              # (N,K)
        fit_term = expected_test_loss(weights, valid_t, test_t, tau)
        # Negative entropy scaled by `ent`: minimising it favours
        # spread-out weights.
        entropy_term = ent * (weights * weights.clamp_min(1e-9).log()).sum(1).mean()

        objective = fit_term + entropy_term
        objective.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Only the loss without the entropy term is tracked for the plot.
        loss_curve.append(fit_term.detach().cpu())

    plt.plot(loss_curve)
    plt.show()

    return model

def predict_weights(net, M_new):
    """
    Evaluate `net` on new meta-features without tracking gradients.

    M_new : (n,D)  meta-features of n datasets
    returns  the output of `net` as a NumPy array; for a `Meta2Weights`
             network this is (n,K) with rows summing to 1

    Note: `net` is switched to evaluation mode and left in that mode.
    """
    net.eval()
    features = torch.as_tensor(M_new, dtype=torch.float32)
    with torch.no_grad():
        predicted = net(features)
    return predicted.cpu().numpy()



# Learn, per approach, a network that maps dataset meta-features to
# aggregation weights, and predict the weights of each base dataset with a
# network that has NOT seen that dataset:
#     w_dict[approach][dataset_variant] -> weight vector (K,)
meta_features = pd.read_csv("~/projects/github/multi-annotator-machine-learning/empirical_evaluation/python_scripts/metafeatures.csv")
# Four folds of seven consecutive rows; not used below, kept for interactive
# calls such as `learn_weights(M, V, T, splits)`.
splits = [(range(7, 28), range(0, 7)), (list(range(0, 7)) + list(range(14, 28)), range(7, 14)), (list(range(0, 14)) + list(range(21, 28)), range(14, 21)), (range(0, 21), range(21, 28)),]
w_dict = {}
valid_cols = [
            f"class_true_{LOSS_FUNC}_cv_valid",
            f"class_mv_unif_{LOSS_FUNC}_cv_valid",
            f"annot_unif_{LOSS_FUNC}_cv_valid",
            f"class_mv_perf_{LOSS_FUNC}_cv_valid",
            f"class_mv_perf_weights_{LOSS_FUNC}_cv_valid",
            f"annot_perf_weights_{LOSS_FUNC}_cv_valid",
]
test_col = f"class_true_{LOSS_FUNC}_full_test"

# Meta-features do not depend on the approach: first column is the dataset
# name, the remaining columns are used as features.
dataset_names = meta_features["dataset"].values
M_all = meta_features.values[:, 1:].astype(float)        # (N, D)

for approach in approach_list:
    print(approach)
    w_dict[approach] = {}

    # Stack validation scores and test losses once per approach.
    # Assumes the frames sorted by key are in the same order as the rows of
    # `meta_features` — TODO confirm.
    dfs = [v for _, v in sorted(df_dict[approach].items())]
    if len(dfs) != len(dataset_names):
        raise ValueError(
            f"{approach}: {len(dfs)} result frames but {len(dataset_names)} meta-feature rows."
        )
    V_all = np.stack([df[valid_cols] for df in dfs]).astype(float)            # (N, B, K)
    T_all = np.stack([df[test_col].to_numpy() for df in dfs]).astype(float)   # (N, B)

    for ds in ["spc", "reuters", "music_genres", "label_me", "dopanim"]:
        print(ds)
        is_not_ds = np.array([ds not in ds_name for ds_name in dataset_names])

        # Fit scaler and network on the remaining datasets only. The
        # previous version fitted both on *all* rows, including the held-out
        # dataset whose weights are predicted below (information leak).
        sc = StandardScaler().fit(M_all[is_not_ds])
        M_train = sc.transform(M_all[is_not_ds])
        net_final = fit_final_network(
            M=torch.from_numpy(M_train).float(),
            V=torch.from_numpy(V_all[is_not_ds]).float(),
            T=torch.from_numpy(T_all[is_not_ds]).float(),
        )

        # Predict weights for every variant of the held-out dataset.
        M_new = sc.transform(M_all[~is_not_ds])
        W_new = predict_weights(net_final, M_new)
        for ds_new, w_new in zip(dataset_names[~is_not_ds], W_new):
            w_dict[approach][ds_new] = w_new
        print(W_new.round(2))

In [None]:
import numpy as np
import torch, torch.nn as nn, torch.nn.functional as F
from torch.optim import RAdam
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
from lightgbm import early_stopping, log_evaluation, LGBMClassifier
from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.svm import SVR

# Leave-one-dataset-out evaluation of a learned configuration selector:
# a model is fitted on the configurations of all other base datasets and then
# picks, for every variant of the held-out dataset, the configuration with
# the lowest predicted score. The true test loss of that pick is stored in
#     test_losses[approach][dataset_variant]
import lightgbm as lgb  # `lgb.Dataset` / `lgb.train` are used below

# False → pick by the linear-regression prediction of the test loss (this is
#         what the original cell effectively did, because the ranker
#         prediction was overwritten right away);
# True  → pick by the LightGBM lambdarank model instead.
SELECT_WITH_RANKER = False


def _stack_features_and_losses(dfs, feature_cols, loss_col):
    """Return V (N, B, K) and T (N, B) as float arrays; non-numeric or
    missing feature entries become NaN."""
    V = np.stack([
        df[feature_cols].apply(pd.to_numeric, errors="coerce").to_numpy(dtype=float)
        for df in dfs
    ])
    T = np.stack([df[loss_col] for df in dfs]).astype(float)
    return V, T


def _design_matrix(V, M):
    """Return one row per configuration: negated features of V followed by
    the meta-features of its dataset; NaNs are replaced by 0."""
    N, B, K = V.shape
    X = np.hstack([-V.reshape(N * B, K), np.repeat(M, B, axis=0)])
    X[np.isnan(X)] = 0
    return X


def _relevance_labels(T):
    """Return integer relevance labels (higher = lower loss), flattened to
    (N * B,), obtained by binning the within-dataset rank of each loss."""
    B = T.shape[1]
    # argsort of argsort gives the rank of every entry (0 = lowest loss);
    # a single argsort only gives the sorting permutation.
    ranks = np.argsort(np.argsort(T, axis=1), axis=1)
    grades = (B - 1) - ranks
    q = np.quantile(grades, np.arange(0.1, 1.0, 0.05))
    return np.digitize(grades, q).reshape(-1)


meta_features = pd.read_csv("~/projects/github/multi-annotator-machine-learning/empirical_evaluation/python_scripts/metafeatures.csv")
test_losses = {}
valid_cols = [
            #f"class_true_{LOSS_FUNC}_full_test",
            f"class_smv_unif_{LOSS_FUNC}_cv_valid",
            f"annot_unif_{LOSS_FUNC}_cv_valid",
            f"class_smv_perf_{LOSS_FUNC}_cv_valid",
            f"class_smv_perf_weights_{LOSS_FUNC}_cv_valid",
            f"annot_perf_weights_{LOSS_FUNC}_cv_valid",
            'gt_lr', 'gt_wd', 'ap_lr', 'ap_wd', 'bs', 'dr', 
            'lmbda', 'eta', 'alpha', 'beta', 'dim', 'epsilon',
]
test_col = f"class_true_{LOSS_FUNC}_full_test"
dataset_names = meta_features["dataset"].values
for approach in approach_list:
    print(approach)
    test_losses[approach] = {}
    frames = sorted(df_dict[approach].items())
    for ds in ["music_genres", "label_me", "dopanim", "reuters", "spc"]:

        # Training data: every frame / meta-feature row not belonging to `ds`.
        # Assumes the sorted frames and the meta-feature rows are in the
        # same order — TODO confirm.
        dfs_train = [v for k, v in frames if ds not in k]
        is_train = np.array([ds not in ds_name for ds_name in dataset_names])
        M_train = meta_features[is_train].values[:, 1:-1].astype(float)
        V_train, T_train = _stack_features_and_losses(dfs_train, valid_cols, test_col)
        N_train, B_train, K_train = V_train.shape
        # Missing entries are zeroed inside `_design_matrix`, for training
        # and test data alike, so both keep the same feature columns. The
        # former training-only NaN/None filter could not run (np.isnan on an
        # object array, 2-D mask on a 3-D array) and was removed.
        X_train = _design_matrix(V_train, M_train)
        sc = StandardScaler().fit(X_train)
        X_train = sc.transform(X_train)

        # Test data: all variants of the held-out dataset.
        dfs_test = [v for k, v in frames if ds in k]
        test_sets = [k for k, v in frames if ds in k]
        is_test = np.array([ds in ds_name for ds_name in dataset_names])
        M_test = meta_features[is_test].values[:, 1:-1].astype(float)
        V_test, T_test = _stack_features_and_losses(dfs_test, valid_cols, test_col)
        N_test, B_test, K_test = V_test.shape
        X_test = sc.transform(_design_matrix(V_test, M_test))

        # Train model.
        if SELECT_WITH_RANKER:
            y_train = _relevance_labels(T_train)
            group_sizes_full_train = np.repeat(B_train, N_train)  # one query per dataset
            dtrain = lgb.Dataset(X_train, label=y_train, group=group_sizes_full_train)
            params = {
                "objective": "lambdarank",
                "metric": "ndcg",
                "learning_rate": 0.001,
                "num_leaves": 31,
                "min_data_in_leaf": 1,
                "verbose": -1
            }
            model = lgb.train(params, dtrain, num_boost_round=10)
            #lgb.plot_importance(model, importance_type="gain", figsize=(7,6), title="LightGBM Feature Importance (Gain)")
            #plt.show()
        else:
            reg = LinearRegression().fit(X_train, T_train.reshape(-1))

        # Make predictions: rows g*B .. (g+1)*B of X_test belong to test set g.
        for g, test_set in enumerate(test_sets):
            X_g = X_test[g * B_test:(g + 1) * B_test]
            if SELECT_WITH_RANKER:
                # Higher predicted relevance is better, so negate for argmin.
                preds_d = -model.predict(X_g)
            else:
                preds_d = reg.predict(X_g)
            selected_loss = T_test[g, preds_d.argmin()]
            print(f"{test_set}: {selected_loss}")
            test_losses[approach][test_set] = selected_loss

In [None]:
# Display the nested dict of selected test losses (approach -> dataset -> loss).
test_losses

In [None]:
# Inspect the columns available in one of the loaded result frames.
df_dict["dawid-skene"]["spc_worst-1"].columns