# ==== INTERACTIVE CLUSTERING : ANNOTATION ERROR STUDY ====
> ### Stage 3 : Model the impact of annotation errors on Interactive Clustering and plot some figures.

------------------------------
## READ-ME BEFORE RUNNING

### Quick Description

This notebook **aims to model the impact of annotation errors during interactive clustering and to plot overviews of the experiments**.
- Environments are represented by subdirectories in the `/experiments` folder. A full path to an experiment environment is `/experiments/[DATASET]/[ALGORITHM]/[ERROR]/[CONSTRAINTS_SELECTION]`.

Before running, **run the notebook `2_Simulate_errors_and_run_clustering.ipynb` on each algorithm you have set**.

### Description of each step

First of all, **load experiments** that have been computed with the last notebook.
- A config file contains parameters used for each experiment and annotation error simulations to analyze.
- For each algorithm and constraints selection method, print performance evolution according to constraints number and error rate.

------------------------------

## 1. IMPORT PYTHON DEPENDENCIES

In [None]:
from typing import Dict, List, Optional, Tuple, Union
import json
import listing_envs
import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
from matplotlib.figure import Figure
import matplotlib.cm as cm
from matplotlib.colors import Normalize, LinearSegmentedColormap
from scipy import stats as scipystats
import statistics
from sklearn import metrics

------------------------------

## 2. Load data

In [None]:
# Collect the experiment environment paths returned by the `listing_envs` helper module.
LIST_OF_EXPERIMENT_ENVIRONMENTS: List[str] = (
    listing_envs.get_list_of_constraints_selection_env_paths()
)
# Report how many environments have been collected.
print(
    "There are",
    "`{0}`".format(len(LIST_OF_EXPERIMENT_ENVIRONMENTS)),
    "created experiment environments in `../experiments`",
)
# Last expression of the cell: the notebook displays the list itself.
LIST_OF_EXPERIMENT_ENVIRONMENTS

------------------------------

## 3. Display evolution of performances according to errors simulations and dataset sizes

Define main functions.

In [None]:
def get_MAX_constraints_number(
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS: List[str],
) -> str:
    """
    Get the maximum number of constraints across experiments.

    For each experiment, the highest key of its `dict_of_clustering_performances.json` file is read,
    and the highest of these values over all experiments is returned.
    Keys are compared as integers, so the result does not depend on keys being zero-padded to the same width.

    Args:
        local_LIST_OF_EXPERIMENT_ENVIRONMENTS (List[str]): The list of experiments to consider. Each path is concatenated as-is with the file name, so it has to end with a path separator.

    Raises:
        ValueError: If the list of experiments is empty, if an experiment has no recorded performance, or if a key is not an integer.

    Returns:
        str: Maximum constraints number, zero-padded to 6 characters.
    """

    # Fail early with an explicit message rather than on a bare `max()` of an empty list.
    if not local_LIST_OF_EXPERIMENT_ENVIRONMENTS:
        raise ValueError(
            "Cannot get the maximum constraints number: the list of experiment environments is empty."
        )

    # Highest constraints number reached by each experiment.
    list_of_stop_case_constraints: List[int] = []

    # For each environment...
    for env in local_LIST_OF_EXPERIMENT_ENVIRONMENTS:

        # Load clustering performance for the current experiment.
        with open(
            env + "dict_of_clustering_performances.json", "r"
        ) as evaluations_file:
            dict_of_clustering_evaluations: Dict[
                str, Dict[str, float]
            ] = json.load(evaluations_file)

        # Keep the last constraints number of this experiment.
        # Numeric comparison: a lexicographic `max` would rank "250" above "1000".
        list_of_stop_case_constraints.append(
            max(int(constraints_number) for constraints_number in dict_of_clustering_evaluations)
        )

    # Return the overall maximum, formatted like the keys used in the rest of the notebook.
    return str(max(list_of_stop_case_constraints)).zfill(6)

In [None]:
def get_MEAN_SEM_of_performance_evolution_per_constraints_number(
    local_LIST_OF_CONSTRAINTS_NUMBERS: List[str],
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS: List[str],
) -> Tuple[Dict[str, float], Dict[str, float]]:
    """
    Compute evolution of performance across constraints number.
    Return Mean and Standard error of the mean evolutions.

    For each experiment, the `v_measure` stored in `dict_of_clustering_performances.json` is read for every requested constraints number.
    When an experiment has no entry for a requested constraints number, the value of the last requested constraints number it did reach is duplicated.

    Args:
        local_LIST_OF_CONSTRAINTS_NUMBERS (List[str]): The list of constraints number to consider, in the order in which they have to be read. An empty list gives two empty dictionaries.
        local_LIST_OF_EXPERIMENT_ENVIRONMENTS (List[str]): The list of experiments to consider. Each path is concatenated as-is with the file name, so it has to end with a path separator.

    Raises:
        KeyError: If an experiment has no entry for a requested constraints number and for none of the requested constraints numbers before it.

    Returns:
        Tuple[Dict[str, float], Dict[str, float]]: Evolutions of Mean and Standard error of the mean across constraints number.
    """

    # Initialize storage of experiment performances for all constraints number.
    dict_of_global_performances_evolution_per_constraints_number: Dict[str, List[float]] = {
        constraints_number: [] for constraints_number in local_LIST_OF_CONSTRAINTS_NUMBERS
    }

    # For each experiment...
    for env_a in local_LIST_OF_EXPERIMENT_ENVIRONMENTS:

        # Load clustering evaluations.
        with open(
            env_a + "dict_of_clustering_performances.json", "r"
        ) as evaluation_file:
            dict_of_clustering_performances: Dict[str, Dict[str, float]] = json.load(evaluation_file)

        # Last requested constraints number reached by this experiment (none yet).
        max_constraints_a: Optional[str] = None

        # For each requested constraints number...
        for nb_constraints_a in local_LIST_OF_CONSTRAINTS_NUMBERS:

            if nb_constraints_a in dict_of_clustering_performances:
                # The experiment reached this constraints number: use its own performance.
                max_constraints_a = nb_constraints_a
            elif max_constraints_a is None:
                # Nothing to duplicate: the experiment reached none of the constraints numbers requested so far.
                raise KeyError(
                    "No clustering performance for constraints number `{0}` (nor any previous one) in `{1}`.".format(
                        nb_constraints_a, env_a
                    )
                )
            # Otherwise the iteration isn't reached by this experiment: duplicate the last known results.
            # Most of the time: the experiment has reached annotation completeness and there is no more iteration because clustering is "perfect" (v-measure==1.0).

            dict_of_global_performances_evolution_per_constraints_number[nb_constraints_a].append(
                dict_of_clustering_performances[max_constraints_a]["v_measure"]
            )

    # Compute mean of performance for each constraints number.
    dict_of_global_performances_evolution_per_constraints_number_MEAN: Dict[str, float] = {
        nb_constraints_b: np.mean(list_of_performances)
        for nb_constraints_b, list_of_performances in dict_of_global_performances_evolution_per_constraints_number.items()
    }
    # Compute standard error of the mean of performance for each constraints number.
    dict_of_global_performances_evolution_per_constraints_number_SEM: Dict[str, float] = {
        nb_constraints_b: scipystats.sem(list_of_performances)
        for nb_constraints_b, list_of_performances in dict_of_global_performances_evolution_per_constraints_number.items()
    }

    # Return.
    return (
        dict_of_global_performances_evolution_per_constraints_number_MEAN,
        dict_of_global_performances_evolution_per_constraints_number_SEM,
    )

In [None]:
def add_plot_of_performance_evolution_per_constraints_number_to_graph(
    axis,
    list_of_x: List[str],
    dict_of_y: Dict[str, float],
    dict_of_y_err: Optional[Dict[str, float]] = None,
    label: str = "",
    label_in_curve: Optional[str] = None,
    marker: str = "",
    markersize: int = 5,
    color: Union[str, Tuple[float, ...], np.ndarray] = "black",
    linewidth: int = 2,
    linestyle: str = "-",
    alpha: float = 0.2,
) -> None:
    """
    Add a curve to an axis of a graph, with an optional name at its end and an optional error band.

    Args:
        axis (): The axis to draw on. Its `plot`, `text` and `fill_between` methods are called (presumably a matplotlib `Axes`).
        list_of_x (List[str]): The ordered abscissas of the curve, as strings convertible to `int`. They are also the keys used to read `dict_of_y` and `dict_of_y_err`.
        dict_of_y (Dict[str, float]): The ordinate of the curve for each abscissa.
        dict_of_y_err (Optional[Dict[str, float]]): The error for each abscissa, drawn as a band from `y - error` to `y + error`. No band is drawn if `None`. Defaults to `None`.
        label (str): The label of the curve, forwarded to `axis.plot`. Defaults to `""`.
        label_in_curve (Optional[str]): The text written at the last point of the curve. Nothing is written if `None` or if `list_of_x` is empty. Defaults to `None`.
        marker (str): The marker of the curve points. Defaults to `""`.
        markersize (int): The size of the markers. Defaults to `5`.
        color (Union[str, Tuple[float, ...], np.ndarray]): The color of the curve, of the marker faces and of the error band. Defaults to `"black"`.
        linewidth (int): The width of the curve. Defaults to `2`.
        linestyle (str): The style of the curve. Defaults to `"-"`.
        alpha (float): The opacity of the error band only. Defaults to `0.2`.
    """
    # Abscissas are stored as strings: convert them once for all the drawings.
    list_of_x_values: List[int] = [int(x) for x in list_of_x]
    list_of_y_values: List[float] = [dict_of_y[x] for x in list_of_x]

    # Add curve.
    axis.plot(
        list_of_x_values,  # x
        list_of_y_values,  # y
        label=label,
        marker=marker,
        markerfacecolor=color,
        markersize=markersize,
        color=color,
        linewidth=linewidth,
        linestyle=linestyle,
    )
    # Add curve name at the last point (there is no last point on an empty curve).
    if label_in_curve is not None and len(list_of_x) > 0:
        axis.text(
            x=list_of_x_values[-1],
            y=list_of_y_values[-1],
            s=label_in_curve,
        )
    # Add curve error band.
    if dict_of_y_err is not None:
        axis.fill_between(
            list_of_x_values,  # x
            y1=[(dict_of_y[x] - dict_of_y_err[x]) for x in list_of_x],  # y1
            y2=[(dict_of_y[x] + dict_of_y_err[x]) for x in list_of_x],  # y2
            color=color,
            alpha=alpha,
        )

In [None]:
def get_MEAN_SEM_accordance_between_clusterings(
    constraints_number_id: str,
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS: List[str],
) -> Tuple[float, float]:
    """
    Measure how much the clusterings of some experiments agree with the clusterings of their references.

    The reference of an experiment is the environment with the same path, except that the first part of
    the error directory name (path component at index 4, parts separated by `-`) is replaced by `rate_0.00`.
    The accordance of an experiment is the v-measure between its clustering and the clustering of its reference.

    Args:
        constraints_number_id (str): The iteration to analyze. For a results file with no such key, the entry with the last key in sorted (string) order is used instead.
        local_LIST_OF_EXPERIMENT_ENVIRONMENTS (List[str]): The list of experiments to consider.
    Returns:
        Tuple[float, float]: Mean and Standard error of the mean of the accordances.
    """

    def _load_clustering(env_path: str) -> Dict[str, int]:
        # Read the clustering of the analyzed iteration, or the last stored one as fallback.
        with open(env_path + "dict_of_clustering_results.json", "r") as results_file:
            results_per_iteration: Dict[str, Dict[str, int]] = json.load(results_file)
        if constraints_number_id in results_per_iteration:
            return results_per_iteration[constraints_number_id]
        return results_per_iteration[sorted(results_per_iteration.keys())[-1]]

    # One v-measure per compared experiment.
    accordances: List[float] = []

    for env_comparison in local_LIST_OF_EXPERIMENT_ENVIRONMENTS:

        # Clustering of the experiment to compare.
        clustering_comparison: Dict[str, int] = _load_clustering(env_comparison)

        # Derive the path of the reference environment from the path of the compared one.
        path_parts: List[str] = env_comparison.split("/")
        leading_parts: List[str] = path_parts[:4]
        error_parts: List[str] = path_parts[4].split("-")
        env_reference: str = "/".join(
            leading_parts
            + [
                "rate_0.00-{0}-{1}".format(error_parts[1], error_parts[2]),
                path_parts[5],
                path_parts[6],
            ]
        )

        # Clustering of the reference experiment.
        clustering_reference: Dict[str, int] = _load_clustering(env_reference)

        # Align both clusterings on the data ids of the reference, then score their agreement.
        reference_labels: List[int] = []
        comparison_labels: List[int] = []
        for data_id, reference_cluster in clustering_reference.items():
            reference_labels.append(reference_cluster)
        for data_id in clustering_reference:
            comparison_labels.append(clustering_comparison[data_id])
        accordances.append(
            metrics.v_measure_score(reference_labels, comparison_labels)
        )

    # Aggregate over all compared experiments.
    accordance_mean = np.mean(accordances)
    accordance_sem = scipystats.sem(accordances)
    return accordance_mean, accordance_sem

Define configuration for graphs.

In [None]:
# Dataset sizes to analyze: from 1000 to 5000 (included), by steps of 500.
LIST_OF_DATASET_SIZES: List[int] = list(range(1000, 5001, 500))
# NOTE: these two ratios are not used by any active code in the visible part of this notebook.
RATIO_CONSTRAINTS_NEEDED: float = 3.15
RATIO_STD_CONSTRAINTS_NEEDED: float = 0.016

In [None]:
# Simulated error rates: from 0% to 50% (included), by steps of 5%.
LIST_OF_ERROR_RATES: List[float] = [percent / 100 for percent in range(0, 51, 5)]
# Upper bound on the error rates to analyze.
MAX_ERROR_RATE: float = 0.25

In [None]:
# Build a custom color map going from dark red (position 0) to green (position 1).
# NOTE: the disabled alternative was to sample the built-in map `matplotlib.colormaps["RdYlGn"]` in the same way.
cmap = LinearSegmentedColormap.from_list(
    "RedOrangeGreen",
    ["darkred", "orangered", "orange", "gold", "yellowgreen", "green"],
    N=256,
    gamma=1.0,
)
# Sample one color per error rate, from position 1 down to position 0:
# the first error rate gets the green end of the map, the last one the dark red end.
list_of_colors = cmap(
    np.linspace(start=1, stop=0, num=len(LIST_OF_ERROR_RATES))
)

In [None]:
# Marker of each curve: a dot for every error rate, a triangle for the error-free curve.
markers: Dict[float, str] = dict.fromkeys(LIST_OF_ERROR_RATES, ".")
markers[0.00] = "^"
# Disabled override: markers[0.50] = "v"

# Marker size of each curve.
markersizes: Dict[float, int] = dict.fromkeys(LIST_OF_ERROR_RATES, 5)
markersizes[0.00] = 5
# Disabled override: markersizes[0.50] = 5

# Line width of each curve.
linewidths: Dict[float, int] = dict.fromkeys(LIST_OF_ERROR_RATES, 1)
linewidths[0.00] = 1
# Disabled override: linewidths[0.50] = 1

# Line style of each curve.
linestyles: Dict[float, str] = dict.fromkeys(LIST_OF_ERROR_RATES, "-")
linestyles[0.00] = "-"
# Disabled override: linestyles[0.50] = "-"

Compute evolution of performances per dataset size.

In [None]:
# Initialize review of performances: one entry per dataset size ("size:N"), holding the number of
# constraints needed to reach 90% of v-measure and one formatted "mean (+/-sem)" per error rate ("diff:X%").
df_performances_evolution: Dict[str, Dict[str, Union[int, str]]] = {}

# Initialize review of clustering accordance (same layout as the review of performances).
df_accordance_evolution: Dict[str, Dict[str, Union[int, str]]] = {}

# For all dataset size...
for DATASET_SIZE in LIST_OF_DATASET_SIZES:
    print("-----")
    print("ANALYZE DATASET WITH SIZE {0}".format(DATASET_SIZE))

    # Get list of environments to analyse.
    LIST_OF_EXPERIMENT_ENVIRONMENTS_WITH_SPECIFIC_SIZE = [
        env for env in LIST_OF_EXPERIMENT_ENVIRONMENTS
        if ("size_{size}".format(size=DATASET_SIZE) in env.split("/")[2])
    ]

    # Nothing to analyze for this size: skip it instead of failing on the computation of a maximum over no experiment.
    if not LIST_OF_EXPERIMENT_ENVIRONMENTS_WITH_SPECIFIC_SIZE:
        print("   ", "No experiment environment found for this dataset size: skipped.")
        continue

    # Key of this dataset size in both reviews.
    SIZE_KEY: str = "size:{0}".format(DATASET_SIZE)

    ###
    ### ANALYZE PERFORMANCE EVOLUTION AND ERROR IMPACT
    ###

    # Get maximum constraints number for these experiments (bounded by 10 times the dataset size).
    MAX_CONSTRAINTS_NUMBER: str = get_MAX_constraints_number(
        LIST_OF_EXPERIMENT_ENVIRONMENTS_WITH_SPECIFIC_SIZE
    )
    MAX_CONSTRAINTS_NUMBER = str(min(int(MAX_CONSTRAINTS_NUMBER), 10*DATASET_SIZE)).zfill(6)
    # Constraints numbers to analyze: every 250 constraints, formatted like the keys of the result files.
    LIST_OF_CONSTRAINTS_NUMBERS: List[str] = [
        str(i).zfill(6)
        for i in range(0, int(MAX_CONSTRAINTS_NUMBER)+250, 250)
    ]

    # Compute default performance (experiments without simulated error).
    performances_MEAN_default, performances_SEM_default = get_MEAN_SEM_of_performance_evolution_per_constraints_number(
        local_LIST_OF_CONSTRAINTS_NUMBERS=LIST_OF_CONSTRAINTS_NUMBERS,
        local_LIST_OF_EXPERIMENT_ENVIRONMENTS=[
            env for env in LIST_OF_EXPERIMENT_ENVIRONMENTS_WITH_SPECIFIC_SIZE
            if ("rate_0.00" in env.split("/")[4])
        ],
    )

    # Get theorical number of constraints needed to reach 90% of v-measure.
    #THEORICAL_CONSTRAINTS_NEEDED_TO_REACH_90VMEASURE: str = str(round(RATIO_CONSTRAINTS_NEEDED * DATASET_SIZE/250)*250).zfill(6)
    #print("   ", "Theorique:", THEORICAL_CONSTRAINTS_NEEDED_TO_REACH_90VMEASURE)

    # Get number of constraints needed to reach 90% of v-measure:
    # the first constraints number whose mean default performance is at least 0.90, or the last one if it is never reached.
    CONSTRAINTS_NEEDED_TO_REACH_90VMEASURE: str = sorted(performances_MEAN_default.keys())[-1]
    for constraint_number, performance in performances_MEAN_default.items():
        if 0.90 <= performance:
            CONSTRAINTS_NEEDED_TO_REACH_90VMEASURE = constraint_number
            break
    #print("   ", "Reel:", CONSTRAINTS_NEEDED_TO_REACH_90VMEASURE)

    # Initialize review of performances for this dataset size.
    df_performances_evolution[SIZE_KEY] = {
        "constraints_needed": int(CONSTRAINTS_NEEDED_TO_REACH_90VMEASURE)
    }

    # Initialize review of clustering accordance for this dataset size.
    df_accordance_evolution[SIZE_KEY] = {
        "constraints_needed": int(CONSTRAINTS_NEEDED_TO_REACH_90VMEASURE)
    }

    ###
    ### PLOT FIGURE WITH PERFORMANCE EVOLUTION
    ###

    # Create a new figure.
    fig_plot_error_simulation: Figure = plt.figure(figsize=(15, 7.5), dpi=300)
    axis_plot_error_simulation = fig_plot_error_simulation.gca()

    # Set range of axis.
    axis_plot_error_simulation.set_xlim(xmin=-250, xmax=int(MAX_CONSTRAINTS_NUMBER)+250)
    axis_plot_error_simulation.set_ylim(ymin=-0.01, ymax=1.01)

    # Plot error simulation.
    for k, error_rate_k in enumerate(LIST_OF_ERROR_RATES):
        if error_rate_k > MAX_ERROR_RATE:
            continue

        # Get the experiments with this error rate (used for both the performance and the accordance).
        LIST_OF_EXPERIMENT_ENVIRONMENTS_WITH_SPECIFIC_SIZE_AND_RATE = [
            env for env in LIST_OF_EXPERIMENT_ENVIRONMENTS_WITH_SPECIFIC_SIZE
            if ("rate_{rate:.2f}".format(rate=error_rate_k) in env.split("/")[4])
        ]

        # Key of this error rate in both reviews.
        RATE_KEY: str = "diff:{0:.2f}%".format(error_rate_k*100)

        # Error rate as an integer percentage (rounded: `0.15*100` is slightly above 15, other rates could be slightly below).
        error_percentage_k: int = round(error_rate_k*100)

        # Compute performance MEAN and SEM for this error rate.
        performances_MEAN, performances_SEM = get_MEAN_SEM_of_performance_evolution_per_constraints_number(
            local_LIST_OF_CONSTRAINTS_NUMBERS=LIST_OF_CONSTRAINTS_NUMBERS,
            local_LIST_OF_EXPERIMENT_ENVIRONMENTS=LIST_OF_EXPERIMENT_ENVIRONMENTS_WITH_SPECIFIC_SIZE_AND_RATE,
        )

        # Complete review of performances for this dataset size.
        df_performances_evolution[SIZE_KEY][RATE_KEY] = (
            "{0:.2f} (+/-{1:.2f})".format(
                performances_MEAN[CONSTRAINTS_NEEDED_TO_REACH_90VMEASURE]*100,
                performances_SEM[CONSTRAINTS_NEEDED_TO_REACH_90VMEASURE]*100,
            )
        )

        # Complete review of clustering accordance for this dataset size.
        accordance_MEAN, accordance_SEM = get_MEAN_SEM_accordance_between_clusterings(
            constraints_number_id=CONSTRAINTS_NEEDED_TO_REACH_90VMEASURE,
            local_LIST_OF_EXPERIMENT_ENVIRONMENTS=LIST_OF_EXPERIMENT_ENVIRONMENTS_WITH_SPECIFIC_SIZE_AND_RATE,
        )
        df_accordance_evolution[SIZE_KEY][RATE_KEY] = (
            "{0:.2f} (+/-{1:.2f})".format(
                accordance_MEAN*100,
                accordance_SEM*100
            )
        )

        # Add plot.
        add_plot_of_performance_evolution_per_constraints_number_to_graph(
            axis=axis_plot_error_simulation,
            list_of_x=LIST_OF_CONSTRAINTS_NUMBERS,
            dict_of_y=performances_MEAN,
            dict_of_y_err=performances_SEM,
            label="{rate:2d}% de différences".format(rate=error_percentage_k),
            label_in_curve="{rate:2d}%".format(rate=error_percentage_k),
            marker=markers[error_rate_k],
            markersize=markersizes[error_rate_k],
            color=list_of_colors[k],
            linewidth=linewidths[error_rate_k],
            linestyle=linestyles[error_rate_k],
            alpha=0.2,
        )

    # Plot number of constraints needed to reach 90% of v-measure.
    axis_plot_error_simulation.vlines(
        x=int(CONSTRAINTS_NEEDED_TO_REACH_90VMEASURE),
        ymin=-0.00,
        ymax=1.00,
        label="seuil 90% v-measure",
        colors="black",
        linewidth=2,
        linestyle="--"
    )

    # Set axis name.
    # NOTE(review): the y label announces percents while the plotted values and the y range are in [0, 1] - confirm which one is intended.
    axis_plot_error_simulation.set_xlabel("nombre de contraintes [#]", fontsize=18,)
    axis_plot_error_simulation.set_ylabel("v-measure [%]", fontsize=18,)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)

    # Plot the legend.
    axis_plot_error_simulation.legend(ncol=2, loc="lower right", fontsize=12,)

    # Plot the grid.
    axis_plot_error_simulation.grid(True)

    # Store the graph.
    fig_plot_error_simulation.savefig(
        "../results/etude-erreur-simulation-impact-size-{size}.png".format(size=DATASET_SIZE),
        dpi=300,
        transparent=True,
        bbox_inches="tight",
    )
    plt.show()
    # Release the figure: one figure is created per dataset size, and clearing it would not close it.
    plt.close(fig_plot_error_simulation)

Display performance delay of error environments.

In [None]:
# Build a dataframe from the review of performances; as the only expression of the cell, it is displayed by the notebook.
pd.DataFrame.from_dict(df_performances_evolution)

In [None]:
# Export the review of performances to an Excel file, in a sheet named "performance".
with pd.ExcelWriter(
    "../results/etude-erreur-simulation-evolution-performances.xlsx"
) as excel_file:
    performances_review = pd.DataFrame.from_dict(df_performances_evolution)
    performances_review.to_excel(excel_writer=excel_file, sheet_name="performance")

Display clustering accordance (between non-error and error environments).

In [None]:
# Build a dataframe from the review of clustering accordance; as the only expression of the cell, it is displayed by the notebook.
pd.DataFrame.from_dict(df_accordance_evolution)

In [None]:
# Export the review of clustering accordance to an Excel file, in a sheet named "clustering_accordance".
with pd.ExcelWriter(
    "../results/etude-erreur-simulation-evolution-accord-entre-clusterings.xlsx"
) as excel_file:
    accordance_review = pd.DataFrame.from_dict(df_accordance_evolution)
    accordance_review.to_excel(excel_writer=excel_file, sheet_name="clustering_accordance")