# ==== INTERACTIVE CLUSTERING : EFFICIENCY STUDY ====
> ### Stage 4 : Plot some figures according to previous stages

-----

## READ-ME BEFORE RUNNING

### Quick Description

This notebook is **aimed at plotting several figures according to previous analyses**.
- Environments are represented by subdirectories in the `/experiments` folder. A full path to an experiment environment is `/experiments/[DATASET]/[PREPROCESSING]/[VECTORIZATION]/[SAMPLING]/[CLUSTERING]/[EXPERIMENT]`.
- An experiment run is composed of iterations of _interactive clustering_.
- An experiment evaluation looks at each _interactive clustering_ iteration of the experiment.

Before running, **run the notebook `3_Analyze_main_effects_and_post_hoc.ipynb` to perform the main effects and post-hoc analyses of interactive clustering convergence speed over experiments**.

### Description of each step

1. The first section is aimed at comparing the performance of unconstrained clustering.
2. The second section is aimed at displaying the estimated performance across iterations for several combinations of algorithms, parameters, or experiments.
3. The third section is aimed at displaying the iterations required to reach performance goals for several combinations of algorithms, parameters, or experiments.

-----

## 1. Compute average performance of unconstrained clustering (iteration 0)

***WARNING***: _Start by launching the experiment runs, evaluations and synthesis, and launching main effects analysis before this section !_

Import Python dependencies.

In [None]:
from typing import List
import listing_envs
import json
import numpy as np
from scipy import stats as scipystats

Find all experiment environments.

In [None]:
# Collect the paths of all experiment environments through the `listing_envs` helper module.
LIST_OF_EXPERIMENT_ENVIRONMENTS: List[str] = (
    listing_envs.get_list_of_experiment_env_paths()
)

# Report how many environments were found.
print(
    f"There are `{len(LIST_OF_EXPERIMENT_ENVIRONMENTS)}` "
    "experiment environments in `../experiments`"
)

Get clustering performances at iteration 0.

In [None]:
# Collect, for each environment, the v-measure stored under iteration "0000".
list_of_clustering_performances_at_iteration_0: List[float] = []

# For each environment...
for ENV_PATH in LIST_OF_EXPERIMENT_ENVIRONMENTS:

    # Read the dictionary of clustering performances of this environment.
    # NOTE: the path is built by plain concatenation, so ENV_PATH is expected
    # to end with a path separator.
    performances_path = ENV_PATH + "dict_of_clustering_performances.json"
    with open(performances_path, "r") as file_clustering_performances:
        performances_of_env = json.load(file_clustering_performances)

    # Keep only the v-measure of the first iteration.
    list_of_clustering_performances_at_iteration_0.append(
        performances_of_env["0000"]["v_measure"]
    )

Compute statistics on clustering performances at iteration 0.

In [None]:
# Minimum of clustering performances at iteration 0.
perf_min: float = min(list_of_clustering_performances_at_iteration_0)
print(f"perf_min  : {perf_min}")

# Maximum of clustering performances at iteration 0.
perf_max: float = max(list_of_clustering_performances_at_iteration_0)
print(f"perf_max  : {perf_max}")

# Mean of clustering performances at iteration 0.
perf_mean: float = np.mean(list_of_clustering_performances_at_iteration_0)
print(f"perf_mean : {perf_mean}")

# Standard deviation of clustering performances at iteration 0
# (`np.std` with its default settings).
perf_std: float = np.std(list_of_clustering_performances_at_iteration_0)
print(f"perf_std  : {perf_std}")

# Standard error of the mean of clustering performances at iteration 0
# (`scipy.stats.sem` with its default settings).
perf_sem: float = scipystats.sem(list_of_clustering_performances_at_iteration_0)
print(f"perf_sem  : {perf_sem}")

-----

## 2. Plot some graphs and statistics

***WARNING***: _Start by launching the experiment runs, evaluations and synthesis, and launching main effects analysis before this section !_

### 2.1. Load Python dependencies and Experiments data

Import Python dependencies.

In [None]:
from typing import Any, Dict, List, Optional, Tuple, Union
import listing_envs
import json
import math
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib.figure import Figure
from scipy import stats as scipystats

Find all experiment environments.

In [None]:
# Collect the paths of all experiment environments through the `listing_envs` helper module.
LIST_OF_EXPERIMENT_ENVIRONMENTS: List[str] = (
    listing_envs.get_list_of_experiment_env_paths()
)

# Report how many environments were found.
print(
    f"There are `{len(LIST_OF_EXPERIMENT_ENVIRONMENTS)}` "
    "experiment environments in `../experiments`"
)

### 2.2. Display iteration needed to annotate the groundtruth (_efficiency_)

Define the list of iterations to analyze.

In [None]:
# Prepare histogram data: one value per experiment environment.
# A value of -1 means the experiment ended below the corresponding v-measure threshold.
list_of_090vmeasure_iterations = []
list_of_100vmeasure_iterations = []
list_of_convergence_iterations = []

# For each environment...
for env_1 in LIST_OF_EXPERIMENT_ENVIRONMENTS:

    # Load annotations for the current experiment.
    with open(env_1 + "dict_of_constraints_annotations.json", "r") as annotation_file:
        dict_of_constraints_annotations: Dict[
            str, List[Tuple[str, str, Optional[str]]]
        ] = json.load(annotation_file)

    # Load clustering performance for the current experiment.
    with open(env_1 + "dict_of_clustering_performances.json", "r") as evaluations_file:
        dict_of_clustering_evaluations: Dict[str, Dict[str, float]] = json.load(
            evaluations_file
        )

    # Convergence: the greatest annotated iteration id.
    # NOTE(review): ids are compared as strings, which presumably relies on
    # them being zero-padded to the same width; confirm.
    current_max_iteration: str = max(dict_of_constraints_annotations)
    list_of_convergence_iterations.append(int(current_max_iteration))

    # V-measure at the last annotated iteration, and evaluated iteration ids
    # from the latest to the earliest.
    final_vmeasure = dict_of_clustering_evaluations[current_max_iteration]["v_measure"]
    iterations_from_last = sorted(dict_of_clustering_evaluations, reverse=True)

    # Update histogram for vmeasure=100.
    # The recorded value is the latest iteration still below the threshold,
    # or the earliest evaluated iteration when no iteration is below it.
    # NOTE(review): this may be one iteration before the threshold is actually
    # reached; confirm that this is the intended convention.
    if final_vmeasure < 1.00:
        list_of_100vmeasure_iterations.append(-1)
    else:
        iteration = next(
            (
                candidate
                for candidate in iterations_from_last
                if dict_of_clustering_evaluations[candidate]["v_measure"] < 1.00
            ),
            iterations_from_last[-1],
        )
        list_of_100vmeasure_iterations.append(int(iteration))

    # Update histogram for vmeasure=090 (same convention as above).
    if final_vmeasure < 0.90:
        list_of_090vmeasure_iterations.append(-1)
    else:
        iteration = next(
            (
                candidate
                for candidate in iterations_from_last
                if dict_of_clustering_evaluations[candidate]["v_measure"] < 0.90
            ),
            iterations_from_last[-1],
        )
        list_of_090vmeasure_iterations.append(int(iteration))

# Get maximum iteration over all experiments, as a zero-padded id.
MAX_ITER: str = f"{max(list_of_convergence_iterations):04d}"
# If set, force maximum iteration.
####if forced_max_iter is not None:
####    MAX_ITER = min(MAX_ITER, forced_max_iter)
print("MAX_ITER:", MAX_ITER)

# Set list of iterations to analyze.
# NOTE(review): the range stops just before MAX_ITER, so the last iteration
# itself is not analyzed; confirm that this exclusion is intended.
LIST_OF_ITERATIONS: List[str] = [
    f"{iteration_number:04d}" for iteration_number in range(int(MAX_ITER))
]

In [None]:
# Create a new figure for the "90% of v-measure" histogram.
fig_hist0: Figure = plt.figure(figsize=(15, 7.5), dpi=300)
axis_hist0 = fig_hist0.gca()

# Set range of the x axis from the list of analyzed iterations.
last_iteration_hist0: int = int(max(LIST_OF_ITERATIONS))
axis_hist0.set_xlim(xmin=-1, xmax=last_iteration_hist0 + 1)

# Number of bins: twice the (truncated) square root of the number of experiments.
nb_bins_hist0: int = 2 * int(math.sqrt(len(list_of_090vmeasure_iterations)))

# Plot histogram of iterations recorded for the 90% v-measure threshold.
axis_hist0.hist(
    list_of_090vmeasure_iterations,
    bins=nb_bins_hist0,
    label="Tentatives ayant atteint une annotation\npartielle (90% de v-measure)",
    color="green",
)
print("bins:", nb_bins_hist0)

# Set axis names and tick sizes.
axis_hist0.set_xlabel("itération [#]", fontsize=18)
axis_hist0.set_ylabel("tentative [#]", fontsize=18)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

# Plot the legend and the grid.
axis_hist0.legend(loc="upper right", fontsize=15)
axis_hist0.grid(True)

# Store the graph.
fig_hist0.savefig(
    "../results/etude-efficience-histogramme-annotation-partielle.png",
    dpi=300,
    transparent=True,
    bbox_inches="tight",
)

In [None]:
# Create a new figure for the "100% of v-measure" histogram.
fig_hist1: Figure = plt.figure(figsize=(15, 7.5), dpi=300)
axis_hist1 = fig_hist1.gca()

# Set range of the x axis from the list of analyzed iterations.
last_iteration_hist1: int = int(max(LIST_OF_ITERATIONS))
axis_hist1.set_xlim(xmin=-1, xmax=last_iteration_hist1 + 1)

# Number of bins: twice the (truncated) square root of the number of experiments.
nb_bins_hist1: int = 2 * int(math.sqrt(len(list_of_100vmeasure_iterations)))

# Plot histogram of iterations recorded for the 100% v-measure threshold.
axis_hist1.hist(
    list_of_100vmeasure_iterations,
    bins=nb_bins_hist1,
    label="Tentatives ayant atteint une annotation\nsuffisante (100% de v-measure)",
    color="blue",
)
print("bins:", nb_bins_hist1)

# Set axis names and tick sizes.
axis_hist1.set_xlabel("itération [#]", fontsize=18)
axis_hist1.set_ylabel("tentative [#]", fontsize=18)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

# Plot the legend and the grid.
axis_hist1.legend(loc="upper right", fontsize=15)
axis_hist1.grid(True)

# Store the graph.
fig_hist1.savefig(
    "../results/etude-efficience-histogramme-annotation-suffisante.png",
    dpi=300,
    transparent=True,
    bbox_inches="tight",
)

In [None]:
# Create a new figure for the convergence histogram.
fig_hist2: Figure = plt.figure(figsize=(15, 7.5), dpi=300)
axis_hist2 = fig_hist2.gca()

# Set range of the x axis from the list of analyzed iterations.
last_iteration_hist2: int = int(max(LIST_OF_ITERATIONS))
axis_hist2.set_xlim(xmin=-1, xmax=last_iteration_hist2 + 1)

# Number of bins: twice the (truncated) square root of the number of experiments.
nb_bins_hist2: int = 2 * int(math.sqrt(len(list_of_convergence_iterations)))

# Plot histogram of the last annotated iteration of each experiment.
axis_hist2.hist(
    list_of_convergence_iterations,
    bins=nb_bins_hist2,
    label="Tentatives ayant atteint une annotation\nexhaustive (toutes les contraintes)",
    color="red",
)
print("bins:", nb_bins_hist2)

# Set axis names and tick sizes.
axis_hist2.set_xlabel("itération [#]", fontsize=18)
axis_hist2.set_ylabel("tentative [#]", fontsize=18)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

# Plot the legend and the grid.
axis_hist2.legend(loc="upper right", fontsize=15)
axis_hist2.grid(True)

# Store the graph.
fig_hist2.savefig(
    "../results/etude-efficience-histogramme-annotation-exhaustive.png",
    dpi=300,
    transparent=True,
    bbox_inches="tight",
)

### 2.3. Display mean of performance per iteration (_efficiency_)

Define main functions.

In [None]:
def get_MEAN_STDEV_SEM_of_performance_evolution_per_iteration(
    local_LIST_OF_ITERATIONS: List[str],
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS: List[str],
) -> Tuple[Dict[str, float], Dict[str, float], Dict[str, float]]:
    """
    Compute the evolution of clustering performance (v-measure) across iterations.
    Return the mean, the standard deviation and the standard error of the mean
    of the v-measure over experiments, for each requested iteration.

    Args:
        local_LIST_OF_ITERATIONS (List[str]): The list of iteration ids to consider (keys of the performance files, e.g. `"0000"`).
        local_LIST_OF_EXPERIMENT_ENVIRONMENTS (List[str]): The list of experiment environment paths to consider. Each one must contain a `dict_of_clustering_performances.json` file. A trailing path separator is accepted but not required.

    Returns:
        Tuple[Dict[str, float], Dict[str, float], Dict[str, float]]: Evolutions of Mean, Standard deviation and Standard error of the mean across iterations, each one indexed by iteration id.
    """
    # Local import: `os` is only needed here to build file paths.
    import os

    # Initialize storage of experiment performances for all iterations.
    dict_of_global_performances_evolution_per_iteration: Dict[str, List[float]] = {
        iter_perf: [] for iter_perf in local_LIST_OF_ITERATIONS
    }

    # For each experiment...
    for env_a in local_LIST_OF_EXPERIMENT_ENVIRONMENTS:

        # Load clustering evaluations.
        # `os.path.join` gives the same path whether or not `env_a` ends with a separator.
        with open(
            os.path.join(env_a, "dict_of_clustering_performances.json"), "r"
        ) as evaluation_file:
            dict_of_clustering_performances: Dict[
                str, Dict[str, float]
            ] = json.load(evaluation_file)

        # For each requested iteration...
        for iter_a in local_LIST_OF_ITERATIONS:

            # Append the clustering performance for the current experiment and for this iteration.
            if iter_a in dict_of_clustering_performances:
                dict_of_global_performances_evolution_per_iteration[iter_a].append(
                    dict_of_clustering_performances[iter_a]["v_measure"]
                )
            # If the iteration isn't reached by this experiment, count it as a perfect clustering (v-measure==1.0).
            # NOTE(review): this assumes the experiment stopped because annotation was complete;
            # an experiment that stopped below 1.0 is still counted as 1.0. Confirm that this is intended.
            else:
                dict_of_global_performances_evolution_per_iteration[iter_a].append(1.0)

    # Compute mean of performance for each iteration.
    dict_of_global_performances_evolution_per_iteration_MEAN: Dict[str, float] = {
        iter_b: np.mean(list_of_performances)
        for iter_b, list_of_performances in dict_of_global_performances_evolution_per_iteration.items()
    }

    # Compute standard deviation of performance for each iteration.
    dict_of_global_performances_evolution_per_iteration_STDEV: Dict[str, float] = {
        iter_b: np.std(list_of_performances)
        for iter_b, list_of_performances in dict_of_global_performances_evolution_per_iteration.items()
    }

    # Compute standard error of the mean of performance for each iteration.
    dict_of_global_performances_evolution_per_iteration_SEM: Dict[str, float] = {
        iter_b: scipystats.sem(list_of_performances)
        for iter_b, list_of_performances in dict_of_global_performances_evolution_per_iteration.items()
    }

    # Return
    return (
        dict_of_global_performances_evolution_per_iteration_MEAN,
        dict_of_global_performances_evolution_per_iteration_STDEV,
        dict_of_global_performances_evolution_per_iteration_SEM,
    )

In [None]:
def add_plot_of_performance_evolution_per_iteration_to_graph(
    axis,
    list_of_x: List[str],
    dict_of_y: Dict[str, float],
    dict_of_y_err: Optional[Dict[str, float]] = None,
    label: str = "",
    marker: str = "",
    markersize: int = 5,
    color: str = "black",
    linewidth: int = 2,
    linestyle: str = "-",
    alpha: float = 0.2,
):
    """
    Draw one curve, and optionally its error band, on an axis of a graph.

    Args:
        axis (): The axis to draw on. It must provide `plot` and `fill_between` (presumably a matplotlib axis).
        list_of_x (List[str]): The abscissa labels, in drawing order. Each one is converted with `float` and used as key of `dict_of_y` and `dict_of_y_err`.
        dict_of_y (Dict[str, float]): The ordinate of the curve for each abscissa label.
        dict_of_y_err (Optional[Dict[str, float]]): The half-height of the error band for each abscissa label. If `None`, no band is drawn. Defaults to `None`.
        label (str): The label of the curve. Defaults to `""`.
        marker (str): The marker of the curve points. Defaults to `""`.
        markersize (int): The size of the markers. Defaults to `5`.
        color (str): The color of the curve, of the marker faces and of the error band. Defaults to `"black"`.
        linewidth (int): The width of the curve. Defaults to `2`.
        linestyle (str): The style of the curve. Defaults to `"-"`.
        alpha (float): The opacity of the error band. Defaults to `0.2`.
    """
    # Resolve abscissas and ordinates once, following the order of `list_of_x`.
    x_values = [float(x_label) for x_label in list_of_x]
    y_values = [dict_of_y[x_label] for x_label in list_of_x]

    # Add curve.
    axis.plot(
        x_values,
        y_values,
        label=label,
        marker=marker,
        markerfacecolor=color,
        markersize=markersize,
        color=color,
        linewidth=linewidth,
        linestyle=linestyle,
    )

    # Without error values, there is nothing more to draw.
    if dict_of_y_err is None:
        return

    # Add the band between `y - err` and `y + err`.
    lower_bounds = [
        y_value - dict_of_y_err[x_label]
        for x_label, y_value in zip(list_of_x, y_values)
    ]
    upper_bounds = [
        y_value + dict_of_y_err[x_label]
        for x_label, y_value in zip(list_of_x, y_values)
    ]
    axis.fill_between(
        x_values,
        y1=lower_bounds,
        y2=upper_bounds,
        color=color,
        alpha=alpha,
    )

Evolution of performance per iteration for MEAN EXPERIMENT + FASTEST + SLOWEST.

In [None]:
# Create a new figure.
fig_plot_average1: Figure = plt.figure(figsize=(15, 7.5), dpi=300)
axis_plot_average1 = fig_plot_average1.gca()

# Set range of axis.
last_iteration_average1: int = int(max(LIST_OF_ITERATIONS))
axis_plot_average1.set_xlim(xmin=-2, xmax=last_iteration_average1 + 2)
axis_plot_average1.set_ylim(ymin=-0.01, ymax=1.01)

# Load clustering evaluations of the attempt hard-coded as the fastest one (to reach vmeasure=100%).
env_fastest_100 = "../experiments/bank_cards_v1/simple_prep/tfidf/closest-50/hier_comp-10c/0001/"
with open(env_fastest_100 + "dict_of_clustering_performances.json", "r") as evaluation_file:
    dict_of_clustering_performances_for_fastest_100: Dict[
        str, Dict[str, float]
    ] = json.load(evaluation_file)
# Keep only the analyzed iterations that this attempt has evaluated.
list_of_iterations_for_fastest_100: List[str] = [
    iter_plot
    for iter_plot in LIST_OF_ITERATIONS
    if iter_plot in dict_of_clustering_performances_for_fastest_100
]
# Plot clustering performance evolution of the fastest attempt.
add_plot_of_performance_evolution_per_iteration_to_graph(
    axis=axis_plot_average1,
    list_of_x=list_of_iterations_for_fastest_100,
    dict_of_y={
        iter_plot: dict_of_clustering_performances_for_fastest_100[iter_plot]["v_measure"]
        for iter_plot in list_of_iterations_for_fastest_100
    },
    dict_of_y_err=None,
    label="Tentative la plus rapide",
    marker="",
    markersize=3,
    color="green",
    linewidth=2,
    linestyle="--",
    alpha=0.2,
)

# Load clustering evaluations of the attempt hard-coded as the slowest one (to reach vmeasure=100%).
env_slowest_100 = "../experiments/bank_cards_v1/no_prep/tfidf/farthest-50/spectral_SPEC-10c/0001/"
with open(env_slowest_100 + "dict_of_clustering_performances.json", "r") as evaluation_file:
    dict_of_clustering_performances_for_slowest_100: Dict[
        str, Dict[str, float]
    ] = json.load(evaluation_file)
# Keep only the analyzed iterations that this attempt has evaluated.
list_of_iterations_for_slowest_100: List[str] = [
    iter_plot
    for iter_plot in LIST_OF_ITERATIONS
    if iter_plot in dict_of_clustering_performances_for_slowest_100
]
# Plot clustering performance evolution of the slowest attempt.
add_plot_of_performance_evolution_per_iteration_to_graph(
    axis=axis_plot_average1,
    list_of_x=list_of_iterations_for_slowest_100,
    dict_of_y={
        iter_plot: dict_of_clustering_performances_for_slowest_100[iter_plot]["v_measure"]
        for iter_plot in list_of_iterations_for_slowest_100
    },
    dict_of_y_err=None,
    label="Tentative la plus lente",
    marker="",
    markersize=3,
    color="red",
    linewidth=2,
    linestyle="--",
    alpha=0.2,
)

# Mean, standard deviation and standard error of the mean of performance over all experiments.
(
    MEAN_convergence_ALL,
    STDEV_convergence_ALL,
    SEM_convergence_ALL,
) = get_MEAN_STDEV_SEM_of_performance_evolution_per_iteration(
    local_LIST_OF_ITERATIONS=LIST_OF_ITERATIONS,
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=LIST_OF_EXPERIMENT_ENVIRONMENTS,
)
# Plot average clustering performance evolution, with the standard error of the mean as error band.
# (Use `STDEV_convergence_ALL` instead to display the standard deviation.)
add_plot_of_performance_evolution_per_iteration_to_graph(
    axis=axis_plot_average1,
    list_of_x=LIST_OF_ITERATIONS,
    dict_of_y=MEAN_convergence_ALL,
    dict_of_y_err=SEM_convergence_ALL,
    label="Moyenne des tentatives",
    marker="",
    markersize=5,
    color="black",
    linewidth=2,
    linestyle="-",
    alpha=0.2,
)

# Set axis names and tick sizes.
axis_plot_average1.set_xlabel("itération [#]", fontsize=18)
axis_plot_average1.set_ylabel("v-measure [%]", fontsize=18)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

# Plot the legend and the grid.
axis_plot_average1.legend(fontsize=15)
axis_plot_average1.grid(True)

# Store the graph.
fig_plot_average1.savefig(
    "../results/etude-efficacite-evolution-moyenne-0par-iteration.png",
    dpi=300,
    transparent=True,
    bbox_inches="tight",
)

In [None]:
# Build a summary table of the v-measure at a few chosen iterations.
# Requires `MEAN_convergence_ALL`, `STDEV_convergence_ALL` and `SEM_convergence_ALL`
# to be already computed, and to contain every iteration listed below.

# Load the clustering evaluations of each experiment once
# (instead of re-reading every file for every described iteration).
list_of_dict_of_clustering_performances: List[Dict[str, Dict[str, float]]] = []
for env in LIST_OF_EXPERIMENT_ENVIRONMENTS:
    with open(
        env + "dict_of_clustering_performances.json", "r"
    ) as evaluation_file:
        list_of_dict_of_clustering_performances.append(json.load(evaluation_file))

description_of_iterations = []
for iteration in ["0000", "0025", "0050", "0075", "0100", "0125", "0150", "0200", "0250", "0300"]:

    # V-measure of each experiment at this iteration.
    # An experiment that has no evaluation for this iteration is counted as 1.0.
    list_of_vmeasures: List[float] = [
        (
            dict_of_clustering_performances[iteration]["v_measure"]
            if iteration in dict_of_clustering_performances
            else 1.0
        )
        for dict_of_clustering_performances in list_of_dict_of_clustering_performances
    ]

    # Extrema over experiments (bounded by 1.0 for the min and 0.0 for the max).
    min_vmeasure: float = min([1.0] + list_of_vmeasures)
    max_vmeasure: float = max([0.0] + list_of_vmeasures)

    description_of_iterations.append([
        iteration,
        # NOTE(review): presumably 50 constraints are annotated per iteration; confirm.
        int(iteration)*50,
        "{0:.2f}".format(MEAN_convergence_ALL[iteration]*100),
        "{0:.2f}".format(STDEV_convergence_ALL[iteration]*100),
        "{0:.2f}".format(SEM_convergence_ALL[iteration]*100),
        "{0:.2f}".format(min_vmeasure*100),
        "{0:.2f}".format(max_vmeasure*100),
    ])

# Display the table (last expression of the cell).
pd.DataFrame(
    data=description_of_iterations,
    columns=["iteration [#]", "constraints [#]", "mean [%]", "stdev [%]", "sem [%]", "min [%]", "max [%]"]
)

Evolution of performance per iteration of MEAN + PREPROCESSING.

In [None]:
# Create a new figure.
fig_plot_prep1: Figure = plt.figure(figsize=(15, 7.5), dpi=300)
axis_plot_prep1 = fig_plot_prep1.gca()

# Set range of axis (fixed x range, not derived from the list of iterations).
axis_plot_prep1.set_xlim(xmin=-1, xmax=201)
axis_plot_prep1.set_ylim(ymin=-0.01, ymax=1.01)

# Style shared by all the per-preprocessing curves.
style_of_prep_curves: Dict[str, Any] = {
    "marker": "",
    "markersize": 5,
    "linewidth": 2,
    "linestyle": "--",
    "alpha": 0.2,
}

# Average clustering performance evolution for no-prep
# (environments whose path contains "no_prep").
list_of_envs_no_prep: List[str] = [
    env for env in LIST_OF_EXPERIMENT_ENVIRONMENTS if "no_prep" in env
]
(
    MEAN_convergence_no_prep,
    STDEV_convergence_no_prep,
    SEM_convergence_no_prep,
) = get_MEAN_STDEV_SEM_of_performance_evolution_per_iteration(
    local_LIST_OF_ITERATIONS=LIST_OF_ITERATIONS,
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=list_of_envs_no_prep,
)
add_plot_of_performance_evolution_per_iteration_to_graph(
    axis=axis_plot_prep1,
    list_of_x=LIST_OF_ITERATIONS,
    dict_of_y=MEAN_convergence_no_prep,
    dict_of_y_err=SEM_convergence_no_prep,  # or STDEV_convergence_no_prep
    label="Moyenne des tentatives avec le prétraitement 'prep.no'",
    color="red",
    **style_of_prep_curves,
)

# Average clustering performance evolution for simple-prep
# (environments whose path contains "simple_prep").
list_of_envs_simple_prep: List[str] = [
    env for env in LIST_OF_EXPERIMENT_ENVIRONMENTS if "simple_prep" in env
]
(
    MEAN_convergence_simple_prep,
    STDEV_convergence_simple_prep,
    SEM_convergence_simple_prep,
) = get_MEAN_STDEV_SEM_of_performance_evolution_per_iteration(
    local_LIST_OF_ITERATIONS=LIST_OF_ITERATIONS,
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=list_of_envs_simple_prep,
)
add_plot_of_performance_evolution_per_iteration_to_graph(
    axis=axis_plot_prep1,
    list_of_x=LIST_OF_ITERATIONS,
    dict_of_y=MEAN_convergence_simple_prep,
    dict_of_y_err=SEM_convergence_simple_prep,  # or STDEV_convergence_simple_prep
    label="Moyenne des tentatives avec le prétraitement 'prep.simple'",
    color="blue",
    **style_of_prep_curves,
)

# Average clustering performance evolution for lemma-prep
# (environments whose path contains "lemma_prep").
list_of_envs_lemma_prep: List[str] = [
    env for env in LIST_OF_EXPERIMENT_ENVIRONMENTS if "lemma_prep" in env
]
(
    MEAN_convergence_lemma_prep,
    STDEV_convergence_lemma_prep,
    SEM_convergence_lemma_prep,
) = get_MEAN_STDEV_SEM_of_performance_evolution_per_iteration(
    local_LIST_OF_ITERATIONS=LIST_OF_ITERATIONS,
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=list_of_envs_lemma_prep,
)
add_plot_of_performance_evolution_per_iteration_to_graph(
    axis=axis_plot_prep1,
    list_of_x=LIST_OF_ITERATIONS,
    dict_of_y=MEAN_convergence_lemma_prep,
    dict_of_y_err=SEM_convergence_lemma_prep,  # or STDEV_convergence_lemma_prep
    label="Moyenne des tentatives avec le prétraitement 'prep.lemma'",
    color="green",
    **style_of_prep_curves,
)

# Average clustering performance evolution for filter-prep
# (environments whose path contains "filter_prep").
list_of_envs_filter_prep: List[str] = [
    env for env in LIST_OF_EXPERIMENT_ENVIRONMENTS if "filter_prep" in env
]
(
    MEAN_convergence_filter_prep,
    STDEV_convergence_filter_prep,
    SEM_convergence_filter_prep,
) = get_MEAN_STDEV_SEM_of_performance_evolution_per_iteration(
    local_LIST_OF_ITERATIONS=LIST_OF_ITERATIONS,
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=list_of_envs_filter_prep,
)
add_plot_of_performance_evolution_per_iteration_to_graph(
    axis=axis_plot_prep1,
    list_of_x=LIST_OF_ITERATIONS,
    dict_of_y=MEAN_convergence_filter_prep,
    dict_of_y_err=SEM_convergence_filter_prep,  # or STDEV_convergence_filter_prep
    label="Moyenne des tentatives avec le prétraitement 'prep.filter'",
    color="orange",
    **style_of_prep_curves,
)

# Average clustering performance evolution over all experiments (reference curve).
(
    MEAN_convergence_ALL,
    STDEV_convergence_ALL,
    SEM_convergence_ALL,
) = get_MEAN_STDEV_SEM_of_performance_evolution_per_iteration(
    local_LIST_OF_ITERATIONS=LIST_OF_ITERATIONS,
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=LIST_OF_EXPERIMENT_ENVIRONMENTS,
)
add_plot_of_performance_evolution_per_iteration_to_graph(
    axis=axis_plot_prep1,
    list_of_x=LIST_OF_ITERATIONS,
    dict_of_y=MEAN_convergence_ALL,
    dict_of_y_err=SEM_convergence_ALL,  # or STDEV_convergence_ALL
    label="Moyenne des tentatives",
    marker="",
    markersize=5,
    color="black",
    linewidth=2,
    linestyle="-",
    alpha=0.2,
)

# Set axis names and tick sizes.
axis_plot_prep1.set_xlabel("itération [#]", fontsize=18)
axis_plot_prep1.set_ylabel("v-measure [%]", fontsize=18)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

# Plot the legend and the grid.
axis_plot_prep1.legend(fontsize=15)
axis_plot_prep1.grid(True)

# Store the graph.
fig_plot_prep1.savefig(
    "../results/etude-efficacite-evolution-moyenne-1preprocessing-par-iteration.png",
    dpi=300,
    transparent=True,
    bbox_inches="tight",
)

Evolution of performance per iteration of MEAN + VECTORIZATION.

In [None]:
# Create a new figure.
fig_plot_vect1: Figure = plt.figure(figsize=(15, 7.5), dpi=300)
axis_plot_vect1 = fig_plot_vect1.gca()

# Set range of axis (fixed x range, not derived from the list of iterations).
axis_plot_vect1.set_xlim(xmin=-1, xmax=201)
axis_plot_vect1.set_ylim(ymin=-0.01, ymax=1.01)

# Style shared by all the per-vectorization curves.
style_of_vect_curves: Dict[str, Any] = {
    "marker": "",
    "markersize": 3,
    "linewidth": 2,
    "linestyle": "--",
    "alpha": 0.2,
}

# Average clustering performance evolution for tfidf
# (environments whose path contains "tfidf").
list_of_envs_tfidf: List[str] = [
    env for env in LIST_OF_EXPERIMENT_ENVIRONMENTS if "tfidf" in env
]
(
    MEAN_convergence_tfidf,
    STDEV_convergence_tfidf,
    SEM_convergence_tfidf,
) = get_MEAN_STDEV_SEM_of_performance_evolution_per_iteration(
    local_LIST_OF_ITERATIONS=LIST_OF_ITERATIONS,
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=list_of_envs_tfidf,
)
add_plot_of_performance_evolution_per_iteration_to_graph(
    axis=axis_plot_vect1,
    list_of_x=LIST_OF_ITERATIONS,
    dict_of_y=MEAN_convergence_tfidf,
    dict_of_y_err=SEM_convergence_tfidf,  # or STDEV_convergence_tfidf
    label="Moyenne des tentatives avec la vectorisation 'vect.tfidf'",
    color="red",
    **style_of_vect_curves,
)

# Average clustering performance evolution for fr-core-news-md
# (environments whose path contains "fr_core_news_md").
list_of_envs_fr_core_news_md: List[str] = [
    env for env in LIST_OF_EXPERIMENT_ENVIRONMENTS if "fr_core_news_md" in env
]
(
    MEAN_convergence_fr_core_news_md,
    STDEV_convergence_fr_core_news_md,
    SEM_convergence_fr_core_news_md,
) = get_MEAN_STDEV_SEM_of_performance_evolution_per_iteration(
    local_LIST_OF_ITERATIONS=LIST_OF_ITERATIONS,
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=list_of_envs_fr_core_news_md,
)
add_plot_of_performance_evolution_per_iteration_to_graph(
    axis=axis_plot_vect1,
    list_of_x=LIST_OF_ITERATIONS,
    dict_of_y=MEAN_convergence_fr_core_news_md,
    dict_of_y_err=SEM_convergence_fr_core_news_md,  # or STDEV_convergence_fr_core_news_md
    label="Moyenne des tentatives avec la vectorisation 'vect.frcorenewsmd'",
    color="blue",
    **style_of_vect_curves,
)

# Average clustering performance evolution over all experiments (reference curve).
(
    MEAN_convergence_ALL,
    STDEV_convergence_ALL,
    SEM_convergence_ALL,
) = get_MEAN_STDEV_SEM_of_performance_evolution_per_iteration(
    local_LIST_OF_ITERATIONS=LIST_OF_ITERATIONS,
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=LIST_OF_EXPERIMENT_ENVIRONMENTS,
)
add_plot_of_performance_evolution_per_iteration_to_graph(
    axis=axis_plot_vect1,
    list_of_x=LIST_OF_ITERATIONS,
    dict_of_y=MEAN_convergence_ALL,
    dict_of_y_err=SEM_convergence_ALL,  # or STDEV_convergence_ALL
    label="Moyenne des tentatives",
    marker="",
    markersize=5,
    color="black",
    linewidth=2,
    linestyle="-",
    alpha=0.2,
)

# Set axis names and tick sizes.
axis_plot_vect1.set_xlabel("itération [#]", fontsize=18)
axis_plot_vect1.set_ylabel("v-measure [%]", fontsize=18)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

# Plot the legend and the grid.
axis_plot_vect1.legend(fontsize=15)
axis_plot_vect1.grid(True)

# Store the graph.
fig_plot_vect1.savefig(
    "../results/etude-efficacite-evolution-moyenne-2vectorization-par-iteration.png",
    dpi=300,
    transparent=True,
    bbox_inches="tight",
)

Evolution of performance per iteration of MEAN + CLUSTERING.

In [None]:
# Create a new figure.
fig_plot_clust1: Figure = plt.figure(figsize=(15, 7.5), dpi=300)
axis_plot_clust1 = fig_plot_clust1.gca()

# Set range of axis.
axis_plot_clust1.set_xlim(xmin=-1, xmax=201)
axis_plot_clust1.set_ylim(ymin=-0.01, ymax=1.01)

# Plot average clustering performance evolution for kmeans_COP.
MEAN_convergence_kmeans_COP, STDEV_convergence_kmeans_COP, SEM_convergence_kmeans_COP = get_MEAN_STDEV_SEM_of_performance_evolution_per_iteration(
    local_LIST_OF_ITERATIONS=LIST_OF_ITERATIONS,
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=[
        env for env in LIST_OF_EXPERIMENT_ENVIRONMENTS
        if "kmeans_COP" in env
    ],
)
add_plot_of_performance_evolution_per_iteration_to_graph(
    axis=axis_plot_clust1,
    list_of_x=LIST_OF_ITERATIONS,
    dict_of_y=MEAN_convergence_kmeans_COP,
    #dict_of_y_err=STDEV_convergence_kmeans_COP,
    dict_of_y_err=SEM_convergence_kmeans_COP,
    label="Moyenne des tentatives avec le clustering 'clust.kmeans.cop'",
    marker="",
    markersize=3,
    color="red",
    linewidth=2,
    linestyle="--",
    alpha=0.2,
)

# Plot average clustering performance evolution for hier_sing.
MEAN_convergence_hier_sing, STDEV_convergence_hier_sing, SEM_convergence_hier_sing = get_MEAN_STDEV_SEM_of_performance_evolution_per_iteration(
    local_LIST_OF_ITERATIONS=LIST_OF_ITERATIONS,
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=[
        env for env in LIST_OF_EXPERIMENT_ENVIRONMENTS
        if "hier_sing" in env
    ],
)
add_plot_of_performance_evolution_per_iteration_to_graph(
    axis=axis_plot_clust1,
    list_of_x=LIST_OF_ITERATIONS,
    dict_of_y=MEAN_convergence_hier_sing,
    #dict_of_y_err=STDEV_convergence_hier_sing,
    dict_of_y_err=SEM_convergence_hier_sing,
    label="Moyenne des tentatives avec le clustering 'clust.hier.sing'",
    marker="",
    markersize=3,
    color="blue",
    linewidth=2,
    linestyle="--",
    alpha=0.2,
)

# Plot average clustering performance evolution for hier_comp.
MEAN_convergence_hier_comp, STDEV_convergence_hier_comp, SEM_convergence_hier_comp = get_MEAN_STDEV_SEM_of_performance_evolution_per_iteration(
    local_LIST_OF_ITERATIONS=LIST_OF_ITERATIONS,
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=[
        env for env in LIST_OF_EXPERIMENT_ENVIRONMENTS
        if "hier_comp" in env
    ],
)
add_plot_of_performance_evolution_per_iteration_to_graph(
    axis=axis_plot_clust1,
    list_of_x=LIST_OF_ITERATIONS,
    dict_of_y=MEAN_convergence_hier_comp,
    #dict_of_y_err=STDEV_convergence_hier_comp,
    dict_of_y_err=SEM_convergence_hier_comp,
    label="Moyenne des tentatives avec le clustering 'clust.hier.comp'",
    marker="",
    markersize=3,
    color="green",
    linewidth=2,
    linestyle="--",
    alpha=0.2,
)
    
# Plot average clustering performance evolution for hier_avg.
MEAN_convergence_hier_avg, STDEV_convergence_hier_avg, SEM_convergence_hier_avg = get_MEAN_STDEV_SEM_of_performance_evolution_per_iteration(
    local_LIST_OF_ITERATIONS=LIST_OF_ITERATIONS,
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=[
        env for env in LIST_OF_EXPERIMENT_ENVIRONMENTS
        if "hier_avg" in env
    ],
)
add_plot_of_performance_evolution_per_iteration_to_graph(
    axis=axis_plot_clust1,
    list_of_x=LIST_OF_ITERATIONS,
    dict_of_y=MEAN_convergence_hier_avg,
    #dict_of_y_err=STDEV_convergence_hier_avg,
    dict_of_y_err=SEM_convergence_hier_avg,
    label="Moyenne des tentatives avec le clustering 'clust.hier.avg'",
    marker="",
    markersize=3,
    color="orange",
    linewidth=2,
    linestyle="--",
    alpha=0.2,
)

# Plot average clustering performance evolution for hier_ward.
MEAN_convergence_hier_ward, STDEV_convergence_hier_ward, SEM_convergence_hier_ward = get_MEAN_STDEV_SEM_of_performance_evolution_per_iteration(
    local_LIST_OF_ITERATIONS=LIST_OF_ITERATIONS,
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=[
        env for env in LIST_OF_EXPERIMENT_ENVIRONMENTS
        if "hier_ward" in env
    ],
)
add_plot_of_performance_evolution_per_iteration_to_graph(
    axis=axis_plot_clust1,
    list_of_x=LIST_OF_ITERATIONS,
    dict_of_y=MEAN_convergence_hier_ward,
    #dict_of_y_err=STDEV_convergence_hier_ward,
    dict_of_y_err=SEM_convergence_hier_ward,
    label="Moyenne des tentatives avec le clustering 'clust.hier.ward'",
    marker="",
    markersize=3,
    color="violet",
    linewidth=2,
    linestyle="--",
    alpha=0.2,
)
    
# Plot average clustering performance evolution for spectral_SPEC.
MEAN_convergence_spectral_SPEC, STDEV_convergence_spectral_SPEC, SEM_convergence_spectral_SPEC = get_MEAN_STDEV_SEM_of_performance_evolution_per_iteration(
    local_LIST_OF_ITERATIONS=LIST_OF_ITERATIONS,
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=[
        env for env in LIST_OF_EXPERIMENT_ENVIRONMENTS
        if "spectral_SPEC" in env
    ],
)
add_plot_of_performance_evolution_per_iteration_to_graph(
    axis=axis_plot_clust1,
    list_of_x=LIST_OF_ITERATIONS,
    dict_of_y=MEAN_convergence_spectral_SPEC,
    #dict_of_y_err=STDEV_convergence_spectral_SPEC,
    dict_of_y_err=SEM_convergence_spectral_SPEC,
    label="Moyenne des tentatives avec le clustering 'clust.spec'",
    marker="",
    markersize=3,
    color="cyan",
    linewidth=2,
    linestyle="--",
    alpha=0.2,
)

# Plot average clustering performance evolution.
MEAN_convergence_ALL, STDEV_convergence_ALL, SEM_convergence_ALL = get_MEAN_STDEV_SEM_of_performance_evolution_per_iteration(
    local_LIST_OF_ITERATIONS=LIST_OF_ITERATIONS,
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=LIST_OF_EXPERIMENT_ENVIRONMENTS,
)
add_plot_of_performance_evolution_per_iteration_to_graph(
    axis=axis_plot_clust1,
    list_of_x=LIST_OF_ITERATIONS,
    dict_of_y=MEAN_convergence_ALL,
    #dict_of_y_err=STDEV_convergence_ALL,
    dict_of_y_err=SEM_convergence_ALL,
    label="Moyenne des tentatives",
    marker="",
    markersize=5,
    color="black",
    linewidth=2,
    linestyle="-",
    alpha=0.2,
)

# Set axis name.
axis_plot_clust1.set_xlabel("itération [#]", fontsize=18,)
axis_plot_clust1.set_ylabel("v-measure [%]", fontsize=18,)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

# Plot the legend.
axis_plot_clust1.legend(fontsize=15,)

# Plot the grid.
axis_plot_clust1.grid(True)

# Store the graph.
fig_plot_clust1.savefig(
    "../results/etude-efficacite-evolution-moyenne-3clustering-par-iteration.png",
    dpi=300,
    transparent=True,
    bbox_inches="tight",
)

Evolution of performance per iteration of MEAN + SAMPLING.

In [None]:
# Create the figure and get its axis.
fig_plot_samp1: Figure = plt.figure(figsize=(15, 7.5), dpi=300)
axis_plot_samp1 = fig_plot_samp1.gca()

# Set range of axis.
axis_plot_samp1.set_xlim(xmin=-1, xmax=201)
axis_plot_samp1.set_ylim(ymin=-0.01, ymax=1.01)

# Compute the performance evolution (mean, stdev, sem) for each sampling.
# Environments are selected with a substring test on their path.
# NOTE(review): "random" also matches any path containing e.g. "random_in_same";
# confirm against the sampling directory names that this is intended.
(
    MEAN_convergence_random,
    STDEV_convergence_random,
    SEM_convergence_random,
) = get_MEAN_STDEV_SEM_of_performance_evolution_per_iteration(
    local_LIST_OF_ITERATIONS=LIST_OF_ITERATIONS,
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=[
        env for env in LIST_OF_EXPERIMENT_ENVIRONMENTS if "random" in env
    ],
)
(
    MEAN_convergence_in_same,
    STDEV_convergence_in_same,
    SEM_convergence_in_same,
) = get_MEAN_STDEV_SEM_of_performance_evolution_per_iteration(
    local_LIST_OF_ITERATIONS=LIST_OF_ITERATIONS,
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=[
        env for env in LIST_OF_EXPERIMENT_ENVIRONMENTS if "in_same" in env
    ],
)
(
    MEAN_convergence_farthest,
    STDEV_convergence_farthest,
    SEM_convergence_farthest,
) = get_MEAN_STDEV_SEM_of_performance_evolution_per_iteration(
    local_LIST_OF_ITERATIONS=LIST_OF_ITERATIONS,
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=[
        env for env in LIST_OF_EXPERIMENT_ENVIRONMENTS if "farthest" in env
    ],
)
(
    MEAN_convergence_closest,
    STDEV_convergence_closest,
    SEM_convergence_closest,
) = get_MEAN_STDEV_SEM_of_performance_evolution_per_iteration(
    local_LIST_OF_ITERATIONS=LIST_OF_ITERATIONS,
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=[
        env for env in LIST_OF_EXPERIMENT_ENVIRONMENTS if "closest" in env
    ],
)

# Same computation over every experiment environment (global average).
(
    MEAN_convergence_ALL,
    STDEV_convergence_ALL,
    SEM_convergence_ALL,
) = get_MEAN_STDEV_SEM_of_performance_evolution_per_iteration(
    local_LIST_OF_ITERATIONS=LIST_OF_ITERATIONS,
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=LIST_OF_EXPERIMENT_ENVIRONMENTS,
)

# Draw the curves one after the other (this order is also the legend order).
# The error passed for each curve is the standard error of the mean.
for _mean, _sem, _label, _color, _linestyle in (
    (MEAN_convergence_random, SEM_convergence_random, "Moyenne des tentatives avec l'échantillonnage 'samp.random.full'", "red", "--"),
    (MEAN_convergence_in_same, SEM_convergence_in_same, "Moyenne des tentatives avec l'échantillonnage 'samp.random.same'", "blue", "--"),
    (MEAN_convergence_farthest, SEM_convergence_farthest, "Moyenne des tentatives avec l'échantillonnage 'samp.farthest.same'", "green", "--"),
    (MEAN_convergence_closest, SEM_convergence_closest, "Moyenne des tentatives avec l'échantillonnage 'samp.closest.diff'", "orange", "--"),
    (MEAN_convergence_ALL, SEM_convergence_ALL, "Moyenne des tentatives", "black", "-"),
):
    add_plot_of_performance_evolution_per_iteration_to_graph(
        axis=axis_plot_samp1,
        list_of_x=LIST_OF_ITERATIONS,
        dict_of_y=_mean,
        dict_of_y_err=_sem,
        label=_label,
        marker="",
        markersize=5,
        color=_color,
        linewidth=2,
        linestyle=_linestyle,
        alpha=0.2,
    )

# Set axis names and tick label sizes.
axis_plot_samp1.set_xlabel("itération [#]", fontsize=18)
axis_plot_samp1.set_ylabel("v-measure [%]", fontsize=18)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

# Plot the legend.
axis_plot_samp1.legend(fontsize=15)

# Plot the grid.
axis_plot_samp1.grid(True)

# Store the graph.
fig_plot_samp1.savefig(
    "../results/etude-efficacite-evolution-moyenne-4sampling-par-iteration.png",
    dpi=300,
    transparent=True,
    bbox_inches="tight",
)

Evolution of performance per iteration of MEAN + BEST SETTINGS.

In [None]:
# Create the figure and get its axis.
fig_plot_best_anova1: Figure = plt.figure(figsize=(15, 7.5), dpi=300)
axis_plot_best_anova1 = fig_plot_best_anova1.gca()

# Set range of axis.
axis_plot_best_anova1.set_xlim(xmin=-0.5, xmax=51)
axis_plot_best_anova1.set_ylim(ymin=-0.01, ymax=1.01)

# Compute the performance evolution (mean, stdev, sem) of three settings.
# An environment is kept when its path contains every listed substring.

# Setting plotted as the best one to reach 90% of v-measure.
(
    MEAN_convergence_BEST_ANOVA_90,
    STDEV_convergence_BEST_ANOVA_90,
    SEM_convergence_BEST_ANOVA_90,
) = get_MEAN_STDEV_SEM_of_performance_evolution_per_iteration(
    local_LIST_OF_ITERATIONS=LIST_OF_ITERATIONS,
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=[
        env for env in LIST_OF_EXPERIMENT_ENVIRONMENTS
        if all(token in env for token in ("simple_prep", "tfidf", "hier_avg", "closest"))
    ],
)

# Setting plotted as the best one to reach 100% of v-measure.
(
    MEAN_convergence_BEST_ANOVA_100,
    STDEV_convergence_BEST_ANOVA_100,
    SEM_convergence_BEST_ANOVA_100,
) = get_MEAN_STDEV_SEM_of_performance_evolution_per_iteration(
    local_LIST_OF_ITERATIONS=LIST_OF_ITERATIONS,
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=[
        env for env in LIST_OF_EXPERIMENT_ENVIRONMENTS
        if all(token in env for token in ("lemma_prep", "tfidf", "kmeans_COP", "closest"))
    ],
)

# Setting plotted as the best one to reach constraints completeness.
(
    MEAN_convergence_BEST_ANOVA_MAX,
    STDEV_convergence_BEST_ANOVA_MAX,
    SEM_convergence_BEST_ANOVA_MAX,
) = get_MEAN_STDEV_SEM_of_performance_evolution_per_iteration(
    local_LIST_OF_ITERATIONS=LIST_OF_ITERATIONS,
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=[
        env for env in LIST_OF_EXPERIMENT_ENVIRONMENTS
        if all(token in env for token in ("lemma_prep", "tfidf", "kmeans_COP", "in_same"))
    ],
)

# Same computation over every experiment environment (global average).
(
    MEAN_convergence_ALL,
    STDEV_convergence_ALL,
    SEM_convergence_ALL,
) = get_MEAN_STDEV_SEM_of_performance_evolution_per_iteration(
    local_LIST_OF_ITERATIONS=LIST_OF_ITERATIONS,
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=LIST_OF_EXPERIMENT_ENVIRONMENTS,
)

# Draw the curves one after the other (this order is also the legend order).
# The error passed for each curve is the standard error of the mean.
for _mean, _sem, _label, _markersize, _color, _linestyle in (
    (MEAN_convergence_BEST_ANOVA_90, SEM_convergence_BEST_ANOVA_90, "Moyenne des tentatives ayant le meilleur paramètrage moyen\npour atteindre une annotation partielle (90% de v-measure).", 3, "green", "--"),
    (MEAN_convergence_BEST_ANOVA_100, SEM_convergence_BEST_ANOVA_100, "Moyenne des tentatives ayant le meilleur paramètrage moyen\npour atteindre une annotation suffisante (100% de v-measure).", 3, "blue", "--"),
    (MEAN_convergence_BEST_ANOVA_MAX, SEM_convergence_BEST_ANOVA_MAX, "Moyenne des tentatives ayant le meilleur paramètrage moyen\npour atteindre une annotation exhaustive (toutes les contraintes).", 3, "red", "--"),
    (MEAN_convergence_ALL, SEM_convergence_ALL, "Moyenne des tentatives", 5, "black", "-"),
):
    add_plot_of_performance_evolution_per_iteration_to_graph(
        axis=axis_plot_best_anova1,
        list_of_x=LIST_OF_ITERATIONS,
        dict_of_y=_mean,
        dict_of_y_err=_sem,
        label=_label,
        marker="",
        markersize=_markersize,
        color=_color,
        linewidth=2,
        linestyle=_linestyle,
        alpha=0.2,
    )

# Set axis names and tick label sizes.
axis_plot_best_anova1.set_xlabel("itération [#]", fontsize=18)
axis_plot_best_anova1.set_ylabel("v-measure [%]", fontsize=18)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

# Plot the legend.
axis_plot_best_anova1.legend(fontsize=15)

# Plot the grid.
axis_plot_best_anova1.grid(True)

# Store the graph.
fig_plot_best_anova1.savefig(
    "../results/etude-efficacite-evolution-moyenne-5best-par-iteration.png",
    dpi=300,
    transparent=True,
    bbox_inches="tight",
)

### 2.4. Display mean number of iterations needed per performance goal (_effectiveness_)

Define main functions.

In [None]:
def get_MEAN_SEM_of_iteration_needed_for_performance_goal(
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS: List[str],
    local_LIST_OF_GOALS: List[float] = [0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95, 0.99, 1.00],
) -> Tuple[Dict[str, float], Dict[str, float], Dict[str, float]]:
    """
    Aggregate, per performance goal, the iteration at which experiments reached it.

    For each environment, the file `dict_of_iterations_to_highlight.json` is loaded.
    Each of its entries provides a `"goal"` and an `"iteration"`; the iteration is
    collected (as an `int`) when the goal is one of the requested goals.
    Entries with any other goal are ignored.

    Args:
        - local_LIST_OF_EXPERIMENT_ENVIRONMENTS (List[str]): The experiments to consider. The file name is appended directly to each path, so a path has to end with a separator.
        - local_LIST_OF_GOALS (List[float]): The performance goals to consider. The default list is only read, never modified.
    Returns:
        Tuple[Dict[str, float], Dict[str, float], Dict[str, float]]: Mean, standard deviation and standard error of the mean of the collected iterations. Each dictionary is keyed by the goal values of `local_LIST_OF_GOALS`.
    """

    # One list of collected iterations per requested goal.
    iterations_by_goal = {goal: [] for goal in local_LIST_OF_GOALS}

    # Collect iterations from every experiment.
    for env_path in local_LIST_OF_EXPERIMENT_ENVIRONMENTS:

        with open(env_path + "dict_of_iterations_to_highlight.json", "r") as highlight_file:
            highlights = json.load(highlight_file)

        for entry in highlights.values():
            reached_goal = entry["goal"]
            # Skip goals that were not requested.
            if reached_goal not in iterations_by_goal:
                continue
            iterations_by_goal[reached_goal].append(int(entry["iteration"]))

    # Mean of the collected iterations for each goal.
    mean_by_goal = {
        goal: np.mean(iterations_by_goal[goal]) for goal in local_LIST_OF_GOALS
    }
    # Standard deviation, with the default settings of `np.std`.
    stdev_by_goal = {
        goal: np.std(iterations_by_goal[goal]) for goal in local_LIST_OF_GOALS
    }
    # Standard error of the mean, with the default settings of `scipy.stats.sem`.
    sem_by_goal = {
        goal: scipystats.sem(iterations_by_goal[goal]) for goal in local_LIST_OF_GOALS
    }

    return mean_by_goal, stdev_by_goal, sem_by_goal

In [None]:
def add_plot_of_iteration_needed_per_performance_goal_to_graph(
    axis,
    list_of_x,
    dict_of_y,
    dict_of_y_err=None,
    label="",
    marker="",
    markersize=5,
    color="black",
    linewidth=2,
    linestyle="-",
    alpha=0.2,
):
    """
    Draw one curve on an axis, with an optional band of `y - err` to `y + err`.

    Args:
        - axis: The object on which `plot` and `fill_between` are called.
        - list_of_x: The abscissas. Each one is used as a key of `dict_of_y` (and `dict_of_y_err`) and converted with `float` for drawing.
        - dict_of_y: The ordinate of each abscissa.
        - dict_of_y_err: The half-height of the band at each abscissa. No band is drawn if `None`.
        - label: The label of the curve.
        - marker: The marker of the curve.
        - markersize: The size of the markers.
        - color: The color of the curve, of its markers and of the band.
        - linewidth: The width of the curve.
        - linestyle: The style of the curve.
        - alpha: The `alpha` value passed for the band only.
    """
    abscissas = [float(x) for x in list_of_x]
    ordinates = [dict_of_y[x] for x in list_of_x]

    # Draw the curve itself.
    axis.plot(
        abscissas,
        ordinates,
        label=label,
        marker=marker,
        markerfacecolor=color,
        markersize=markersize,
        color=color,
        linewidth=linewidth,
        linestyle=linestyle,
    )

    # Nothing more to draw without error values.
    if dict_of_y_err is None:
        return

    # Draw the band around the curve.
    lower_bound = [(dict_of_y[x] - dict_of_y_err[x]) for x in list_of_x]
    upper_bound = [(dict_of_y[x] + dict_of_y_err[x]) for x in list_of_x]
    axis.fill_between(
        abscissas,
        y1=lower_bound,
        y2=upper_bound,
        color=color,
        alpha=alpha,
    )

Define list of performance goals.

In [None]:
# Performance goals used as abscissas of the figures below.
# Same values as the default of `get_MEAN_SEM_of_iteration_needed_for_performance_goal`.
LIST_OF_GOALS = [
    0.05, 0.10, 0.15, 0.20, 0.25,
    0.30, 0.35, 0.40, 0.45, 0.50,
    0.55, 0.60, 0.65, 0.70, 0.75,
    0.80, 0.85, 0.90, 0.95, 0.99,
    1.00,
]

Iteration needed for MEAN + BEST

In [None]:
# Create the figure and get its axis.
fig_plot_average2: Figure = plt.figure(figsize=(15, 7.5), dpi=300)
axis_plot_average2 = fig_plot_average2.gca()

# Set range of axis.
axis_plot_average2.set_xlim(xmin=-0.005, xmax=1.005)
axis_plot_average2.set_ylim(ymin=-1, ymax=101)

# Compute the iterations needed per performance goal (mean, stdev, sem) of three settings.
# An environment is kept when its path contains every listed substring.

# Setting plotted as the best one to reach 90% of v-measure.
(
    MEAN_convergence_BEST_ANOVA_90,
    STDEV_convergence_BEST_ANOVA_90,
    SEM_convergence_BEST_ANOVA_90,
) = get_MEAN_SEM_of_iteration_needed_for_performance_goal(
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=[
        env for env in LIST_OF_EXPERIMENT_ENVIRONMENTS
        if all(token in env for token in ("simple_prep", "tfidf", "hier_avg", "closest"))
    ],
    local_LIST_OF_GOALS=LIST_OF_GOALS,
)

# Setting plotted as the best one to reach 100% of v-measure.
(
    MEAN_convergence_BEST_ANOVA_100,
    STDEV_convergence_BEST_ANOVA_100,
    SEM_convergence_BEST_ANOVA_100,
) = get_MEAN_SEM_of_iteration_needed_for_performance_goal(
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=[
        env for env in LIST_OF_EXPERIMENT_ENVIRONMENTS
        if all(token in env for token in ("lemma_prep", "tfidf", "kmeans_COP", "closest"))
    ],
    local_LIST_OF_GOALS=LIST_OF_GOALS,
)

# Setting plotted as the best one to reach constraints completeness.
(
    MEAN_convergence_BEST_ANOVA_MAX,
    STDEV_convergence_BEST_ANOVA_MAX,
    SEM_convergence_BEST_ANOVA_MAX,
) = get_MEAN_SEM_of_iteration_needed_for_performance_goal(
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=[
        env for env in LIST_OF_EXPERIMENT_ENVIRONMENTS
        if all(token in env for token in ("lemma_prep", "tfidf", "kmeans_COP", "in_same"))
    ],
    local_LIST_OF_GOALS=LIST_OF_GOALS,
)

# Same computation over every experiment environment (global average).
(
    MEAN_iteration_to_highlight_ALL,
    STDEV_iteration_to_highlight_ALL,
    SEM_iteration_to_highlight_ALL,
) = get_MEAN_SEM_of_iteration_needed_for_performance_goal(
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=LIST_OF_EXPERIMENT_ENVIRONMENTS,
    local_LIST_OF_GOALS=LIST_OF_GOALS,
)

# Draw the curves one after the other (this order is also the legend order).
# The band around each curve is the standard error of the mean.
for _mean, _sem, _label, _markersize, _color, _linestyle in (
    (MEAN_convergence_BEST_ANOVA_90, SEM_convergence_BEST_ANOVA_90, "Moyenne des tentatives ayant le meilleur paramètrage moyen\npour atteindre une annotation partielle (90% de v-measure).", 3, "green", "--"),
    (MEAN_convergence_BEST_ANOVA_100, SEM_convergence_BEST_ANOVA_100, "Moyenne des tentatives ayant le meilleur paramètrage moyen\npour atteindre une annotation suffisante (100% de v-measure).", 3, "blue", "--"),
    (MEAN_convergence_BEST_ANOVA_MAX, SEM_convergence_BEST_ANOVA_MAX, "Moyenne des tentatives ayant le meilleur paramètrage moyen\npour atteindre une annotation exhaustive (toutes les contraintes).", 3, "red", "--"),
    (MEAN_iteration_to_highlight_ALL, SEM_iteration_to_highlight_ALL, "Moyenne des tentatives", 5, "black", "-"),
):
    add_plot_of_iteration_needed_per_performance_goal_to_graph(
        axis=axis_plot_average2,
        list_of_x=LIST_OF_GOALS,
        dict_of_y=_mean,
        dict_of_y_err=_sem,
        label=_label,
        marker="",
        markersize=_markersize,
        color=_color,
        linewidth=2,
        linestyle=_linestyle,
        alpha=0.2,
    )

# Set axis names and tick label sizes.
axis_plot_average2.set_xlabel("v-measure [%]", fontsize=18)
axis_plot_average2.set_ylabel("itération [#]", fontsize=18)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

# Plot the legend.
axis_plot_average2.legend(loc="upper left", fontsize=15)

# Plot the grid.
axis_plot_average2.grid(True)

# Store the graph.
fig_plot_average2.savefig(
    "../results/etude-efficience-evolution-moyenne-5best-par-vmeasure.png",
    dpi=300,
    transparent=True,
    bbox_inches="tight",
)

Iteration needed for MEAN + PREPROCESSING

In [None]:
# Create the figure and get its axis.
fig_plot_prep2: Figure = plt.figure(figsize=(15, 7.5), dpi=300)
axis_plot_prep2 = fig_plot_prep2.gca()

# Set range of axis.
axis_plot_prep2.set_xlim(xmin=-0.005, xmax=1.005)
axis_plot_prep2.set_ylim(ymin=-1, ymax=101)

# Compute the iterations needed per performance goal (mean, stdev, sem) for each preprocessing.
# Environments are selected with a substring test on their path.
(
    MEAN_convergence_no_prep,
    STDEV_convergence_no_prep,
    SEM_convergence_no_prep,
) = get_MEAN_SEM_of_iteration_needed_for_performance_goal(
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=[
        env for env in LIST_OF_EXPERIMENT_ENVIRONMENTS if "no_prep" in env
    ],
    local_LIST_OF_GOALS=LIST_OF_GOALS,
)
(
    MEAN_convergence_simple_prep,
    STDEV_convergence_simple_prep,
    SEM_convergence_simple_prep,
) = get_MEAN_SEM_of_iteration_needed_for_performance_goal(
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=[
        env for env in LIST_OF_EXPERIMENT_ENVIRONMENTS if "simple_prep" in env
    ],
    local_LIST_OF_GOALS=LIST_OF_GOALS,
)
(
    MEAN_convergence_lemma_prep,
    STDEV_convergence_lemma_prep,
    SEM_convergence_lemma_prep,
) = get_MEAN_SEM_of_iteration_needed_for_performance_goal(
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=[
        env for env in LIST_OF_EXPERIMENT_ENVIRONMENTS if "lemma_prep" in env
    ],
    local_LIST_OF_GOALS=LIST_OF_GOALS,
)
(
    MEAN_convergence_filter_prep,
    STDEV_convergence_filter_prep,
    SEM_convergence_filter_prep,
) = get_MEAN_SEM_of_iteration_needed_for_performance_goal(
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=[
        env for env in LIST_OF_EXPERIMENT_ENVIRONMENTS if "filter_prep" in env
    ],
    local_LIST_OF_GOALS=LIST_OF_GOALS,
)

# Same computation over every experiment environment (global average).
(
    MEAN_convergence_ALL,
    STDEV_convergence_ALL,
    SEM_convergence_ALL,
) = get_MEAN_SEM_of_iteration_needed_for_performance_goal(
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=LIST_OF_EXPERIMENT_ENVIRONMENTS,
    local_LIST_OF_GOALS=LIST_OF_GOALS,
)

# Draw the curves one after the other (this order is also the legend order).
# The band around each curve is the standard error of the mean.
for _mean, _sem, _label, _color, _linestyle in (
    (MEAN_convergence_no_prep, SEM_convergence_no_prep, "Moyenne des tentatives avec le prétraitement 'prep.no'", "red", "--"),
    (MEAN_convergence_simple_prep, SEM_convergence_simple_prep, "Moyenne des tentatives avec le prétraitement 'prep.simple'", "blue", "--"),
    (MEAN_convergence_lemma_prep, SEM_convergence_lemma_prep, "Moyenne des tentatives avec le prétraitement 'prep.lemma'", "green", "--"),
    (MEAN_convergence_filter_prep, SEM_convergence_filter_prep, "Moyenne des tentatives avec le prétraitement 'prep.filter'", "orange", "--"),
    (MEAN_convergence_ALL, SEM_convergence_ALL, "Moyenne des tentatives", "black", "-"),
):
    add_plot_of_iteration_needed_per_performance_goal_to_graph(
        axis=axis_plot_prep2,
        list_of_x=LIST_OF_GOALS,
        dict_of_y=_mean,
        dict_of_y_err=_sem,
        label=_label,
        marker="",
        markersize=5,
        color=_color,
        linewidth=2,
        linestyle=_linestyle,
        alpha=0.2,
    )

# Set axis names and tick label sizes.
# The y label uses the same text as the other "iterations needed" figures.
axis_plot_prep2.set_xlabel("v-measure [%]", fontsize=18)
axis_plot_prep2.set_ylabel("itération [#]", fontsize=18)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

# Plot the legend.
axis_plot_prep2.legend(loc="upper left", fontsize=15)

# Plot the grid.
axis_plot_prep2.grid(True)

# Store the graph.
fig_plot_prep2.savefig(
    "../results/etude-efficience-evolution-moyenne-1preprocessing-par-vmeasure.png",
    dpi=300,
    transparent=True,
    bbox_inches="tight",
)

Iteration needed for MEAN + VECTORIZATION

In [None]:
# Create the figure and get its axis.
fig_plot_vect2: Figure = plt.figure(figsize=(15, 7.5), dpi=300)
axis_plot_vect2 = fig_plot_vect2.gca()

# Set range of axis.
axis_plot_vect2.set_xlim(xmin=-0.005, xmax=1.005)
axis_plot_vect2.set_ylim(ymin=-1, ymax=101)

# Compute the iterations needed per performance goal (mean, stdev, sem) for each vectorization.
# Environments are selected with a substring test on their path.
(
    MEAN_convergence_tfidf,
    STDEV_convergence_tfidf,
    SEM_convergence_tfidf,
) = get_MEAN_SEM_of_iteration_needed_for_performance_goal(
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=[
        env for env in LIST_OF_EXPERIMENT_ENVIRONMENTS if "tfidf" in env
    ],
    local_LIST_OF_GOALS=LIST_OF_GOALS,
)
(
    MEAN_convergence_fr_core_news_md,
    STDEV_convergence_fr_core_news_md,
    SEM_convergence_fr_core_news_md,
) = get_MEAN_SEM_of_iteration_needed_for_performance_goal(
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=[
        env for env in LIST_OF_EXPERIMENT_ENVIRONMENTS if "fr_core_news_md" in env
    ],
    local_LIST_OF_GOALS=LIST_OF_GOALS,
)

# Same computation over every experiment environment (global average).
(
    MEAN_convergence_ALL,
    STDEV_convergence_ALL,
    SEM_convergence_ALL,
) = get_MEAN_SEM_of_iteration_needed_for_performance_goal(
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=LIST_OF_EXPERIMENT_ENVIRONMENTS,
    local_LIST_OF_GOALS=LIST_OF_GOALS,
)

# Draw the curves one after the other (this order is also the legend order).
# The band around each curve is the standard error of the mean.
for _mean, _sem, _label, _markersize, _color, _linestyle in (
    (MEAN_convergence_tfidf, SEM_convergence_tfidf, "Moyenne des tentatives avec la vectorisation 'vect.tfidf'", 3, "red", "--"),
    (MEAN_convergence_fr_core_news_md, SEM_convergence_fr_core_news_md, "Moyenne des tentatives avec la vectorisation 'vect.frcorenewsmd'", 3, "blue", "--"),
    (MEAN_convergence_ALL, SEM_convergence_ALL, "Moyenne des tentatives", 5, "black", "-"),
):
    add_plot_of_iteration_needed_per_performance_goal_to_graph(
        axis=axis_plot_vect2,
        list_of_x=LIST_OF_GOALS,
        dict_of_y=_mean,
        dict_of_y_err=_sem,
        label=_label,
        marker="",
        markersize=_markersize,
        color=_color,
        linewidth=2,
        linestyle=_linestyle,
        alpha=0.2,
    )

# Set axis names and tick label sizes.
axis_plot_vect2.set_xlabel("v-measure [%]", fontsize=18)
axis_plot_vect2.set_ylabel("itération [#]", fontsize=18)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

# Plot the legend.
axis_plot_vect2.legend(loc="upper left", fontsize=15)

# Plot the grid.
axis_plot_vect2.grid(True)

# Store the graph.
fig_plot_vect2.savefig(
    "../results/etude-efficience-evolution-moyenne-2vectorization-par-vmeasure.png",
    dpi=300,
    transparent=True,
    bbox_inches="tight",
)

Iteration needed for MEAN + CLUSTERING

In [None]:
# Create a new figure.
fig_plot_clust2: Figure = plt.figure(figsize=(15, 7.5), dpi=300)
axis_plot_clust2 = fig_plot_clust2.gca()

# Set range of axis.
axis_plot_clust2.set_xlim(xmin=-0.005, xmax=1.005)
axis_plot_clust2.set_ylim(ymin=-1, ymax=101)

# Plot average clustering performance evolution for kmeans_COP.
MEAN_convergence_kmeans_COP, STDEV_convergence_kmeans_COP, SEM_convergence_kmeans_COP = get_MEAN_SEM_of_iteration_needed_for_performance_goal(
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=[
        env for env in LIST_OF_EXPERIMENT_ENVIRONMENTS
        if "kmeans_COP" in env
    ],
    local_LIST_OF_GOALS=LIST_OF_GOALS,
)
add_plot_of_iteration_needed_per_performance_goal_to_graph(
    axis=axis_plot_clust2,
    list_of_x=LIST_OF_GOALS,
    dict_of_y=MEAN_convergence_kmeans_COP,
    #dict_of_y_err=STDEV_convergence_kmeans_COP,
    dict_of_y_err=SEM_convergence_kmeans_COP,
    label="Moyenne des tentatives avec le clustering 'clust.kmeans.cop'",
    marker="",
    markersize=3,
    color="red",
    linewidth=2,
    linestyle="--",
    alpha=0.2,
)

# Plot average clustering performance evolution for hier_sing.
MEAN_convergence_hier_sing, STDEV_convergence_hier_sing, SEM_convergence_hier_sing = get_MEAN_SEM_of_iteration_needed_for_performance_goal(
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=[
        env for env in LIST_OF_EXPERIMENT_ENVIRONMENTS
        if "hier_sing" in env
    ],
    local_LIST_OF_GOALS=LIST_OF_GOALS,
)
add_plot_of_iteration_needed_per_performance_goal_to_graph(
    axis=axis_plot_clust2,
    list_of_x=LIST_OF_GOALS,
    dict_of_y=MEAN_convergence_hier_sing,
    #dict_of_y_err=STDEV_convergence_hier_sing,
    dict_of_y_err=SEM_convergence_hier_sing,
    label="Moyenne des tentatives avec le clustering 'clust.hier.sing'",
    marker="",
    markersize=3,
    color="blue",
    linewidth=2,
    linestyle="--",
    alpha=0.2,
)

# Compute the MEAN / STDEV / SEM summaries returned by the "iteration needed
# for performance goal" helper, restricted to the experiment environments
# whose path contains "hier_comp".
(
    MEAN_convergence_hier_comp,
    STDEV_convergence_hier_comp,
    SEM_convergence_hier_comp,
) = get_MEAN_SEM_of_iteration_needed_for_performance_goal(
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=list(
        filter(
            lambda env_path: "hier_comp" in env_path,
            LIST_OF_EXPERIMENT_ENVIRONMENTS,
        )
    ),
    local_LIST_OF_GOALS=LIST_OF_GOALS,
)

# Add the hier_comp curve to the clustering figure. The SEM values are passed
# as y-errors; the STDEV values are computed above but not plotted.
add_plot_of_iteration_needed_per_performance_goal_to_graph(
    axis=axis_plot_clust2,
    list_of_x=LIST_OF_GOALS,
    dict_of_y=MEAN_convergence_hier_comp,
    dict_of_y_err=SEM_convergence_hier_comp,
    label="Moyenne des tentatives avec le clustering 'clust.hier.comp'",
    color="green",
    alpha=0.2,
    linestyle="--",
    linewidth=2,
    marker="",
    markersize=3,
)
    
# Compute the MEAN / STDEV / SEM summaries returned by the "iteration needed
# for performance goal" helper, restricted to the experiment environments
# whose path contains "hier_avg".
(
    MEAN_convergence_hier_avg,
    STDEV_convergence_hier_avg,
    SEM_convergence_hier_avg,
) = get_MEAN_SEM_of_iteration_needed_for_performance_goal(
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=list(
        filter(
            lambda env_path: "hier_avg" in env_path,
            LIST_OF_EXPERIMENT_ENVIRONMENTS,
        )
    ),
    local_LIST_OF_GOALS=LIST_OF_GOALS,
)

# Add the hier_avg curve to the clustering figure. The SEM values are passed
# as y-errors; the STDEV values are computed above but not plotted.
add_plot_of_iteration_needed_per_performance_goal_to_graph(
    axis=axis_plot_clust2,
    list_of_x=LIST_OF_GOALS,
    dict_of_y=MEAN_convergence_hier_avg,
    dict_of_y_err=SEM_convergence_hier_avg,
    label="Moyenne des tentatives avec le clustering 'clust.hier.avg'",
    color="orange",
    alpha=0.2,
    linestyle="--",
    linewidth=2,
    marker="",
    markersize=3,
)

# Compute the MEAN / STDEV / SEM summaries returned by the "iteration needed
# for performance goal" helper, restricted to the experiment environments
# whose path contains "hier_ward".
(
    MEAN_convergence_hier_ward,
    STDEV_convergence_hier_ward,
    SEM_convergence_hier_ward,
) = get_MEAN_SEM_of_iteration_needed_for_performance_goal(
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=list(
        filter(
            lambda env_path: "hier_ward" in env_path,
            LIST_OF_EXPERIMENT_ENVIRONMENTS,
        )
    ),
    local_LIST_OF_GOALS=LIST_OF_GOALS,
)

# Add the hier_ward curve to the clustering figure. The SEM values are passed
# as y-errors; the STDEV values are computed above but not plotted.
add_plot_of_iteration_needed_per_performance_goal_to_graph(
    axis=axis_plot_clust2,
    list_of_x=LIST_OF_GOALS,
    dict_of_y=MEAN_convergence_hier_ward,
    dict_of_y_err=SEM_convergence_hier_ward,
    label="Moyenne des tentatives avec le clustering 'clust.hier.ward'",
    color="violet",
    alpha=0.2,
    linestyle="--",
    linewidth=2,
    marker="",
    markersize=3,
)
    
# Compute the MEAN / STDEV / SEM summaries returned by the "iteration needed
# for performance goal" helper, restricted to the experiment environments
# whose path contains "spectral_SPEC".
(
    MEAN_convergence_spectral_SPEC,
    STDEV_convergence_spectral_SPEC,
    SEM_convergence_spectral_SPEC,
) = get_MEAN_SEM_of_iteration_needed_for_performance_goal(
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=list(
        filter(
            lambda env_path: "spectral_SPEC" in env_path,
            LIST_OF_EXPERIMENT_ENVIRONMENTS,
        )
    ),
    local_LIST_OF_GOALS=LIST_OF_GOALS,
)

# Add the spectral_SPEC curve to the clustering figure. The SEM values are
# passed as y-errors; the STDEV values are computed above but not plotted.
add_plot_of_iteration_needed_per_performance_goal_to_graph(
    axis=axis_plot_clust2,
    list_of_x=LIST_OF_GOALS,
    dict_of_y=MEAN_convergence_spectral_SPEC,
    dict_of_y_err=SEM_convergence_spectral_SPEC,
    label="Moyenne des tentatives avec le clustering 'clust.spec'",
    color="cyan",
    alpha=0.2,
    linestyle="--",
    linewidth=2,
    marker="",
    markersize=3,
)

# Compute the MEAN / STDEV / SEM summaries returned by the "iteration needed
# for performance goal" helper over every experiment environment (no filter).
(
    MEAN_convergence_ALL,
    STDEV_convergence_ALL,
    SEM_convergence_ALL,
) = get_MEAN_SEM_of_iteration_needed_for_performance_goal(
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=LIST_OF_EXPERIMENT_ENVIRONMENTS,
    local_LIST_OF_GOALS=LIST_OF_GOALS,
)

# Add the overall curve to the clustering figure: solid black line, whereas
# the per-algorithm curves use dashed coloured lines. The SEM values are
# passed as y-errors; the STDEV values are computed above but not plotted.
add_plot_of_iteration_needed_per_performance_goal_to_graph(
    axis=axis_plot_clust2,
    list_of_x=LIST_OF_GOALS,
    dict_of_y=MEAN_convergence_ALL,
    dict_of_y_err=SEM_convergence_ALL,
    label="Moyenne des tentatives",
    color="black",
    alpha=0.2,
    linestyle="-",
    linewidth=2,
    marker="",
    markersize=5,
)

# Name both axes.
axis_plot_clust2.set_xlabel("v-measure [%]", fontsize=18)
axis_plot_clust2.set_ylabel("itération [#]", fontsize=18)

# Enlarge the tick labels. These are pyplot calls, so they act on the current
# axes (presumably axis_plot_clust2; the figure is created above this point).
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

# Show the grid and the legend.
axis_plot_clust2.grid(True)
axis_plot_clust2.legend(fontsize=15, loc="upper left")

# Export the figure to the results folder.
fig_plot_clust2.savefig(
    "../results/etude-efficience-evolution-moyenne-3clustering-par-vmeasure.png",
    bbox_inches="tight",
    transparent=True,
    dpi=300,
)

Iterations needed to reach each performance goal: overall mean and mean per SAMPLING strategy

In [None]:
# Open a fresh 15 x 7.5 inch figure at 300 dpi and grab its axes.
fig_plot_samp2: Figure = plt.figure(dpi=300, figsize=(15, 7.5))
axis_plot_samp2 = fig_plot_samp2.gca()

# Fix the visible range: x slightly wider than [0, 1], y slightly wider than
# [0, 100], so that points on the borders are not clipped.
axis_plot_samp2.set_xlim(-0.005, 1.005)
axis_plot_samp2.set_ylim(-1, 101)

# Compute the MEAN / STDEV / SEM summaries returned by the "iteration needed
# for performance goal" helper, restricted to the experiment environments
# whose path contains "random".
# NOTE(review): this is a substring test on the whole path; if the
# random-in-same environments also have "random" in their path, they are
# included here too - confirm against the directory names.
(
    MEAN_convergence_random,
    STDEV_convergence_random,
    SEM_convergence_random,
) = get_MEAN_SEM_of_iteration_needed_for_performance_goal(
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=list(
        filter(
            lambda env_path: "random" in env_path,
            LIST_OF_EXPERIMENT_ENVIRONMENTS,
        )
    ),
    local_LIST_OF_GOALS=LIST_OF_GOALS,
)

# Add the random-sampling curve to the sampling figure. The SEM values are
# passed as y-errors; the STDEV values are computed above but not plotted.
add_plot_of_iteration_needed_per_performance_goal_to_graph(
    axis=axis_plot_samp2,
    list_of_x=LIST_OF_GOALS,
    dict_of_y=MEAN_convergence_random,
    dict_of_y_err=SEM_convergence_random,
    label="Moyenne des tentatives avec l'échantillonnage 'samp.random.full'",
    color="red",
    alpha=0.2,
    linestyle="--",
    linewidth=2,
    marker="",
    markersize=5,
)

# Compute the MEAN / STDEV / SEM summaries returned by the "iteration needed
# for performance goal" helper, restricted to the experiment environments
# whose path contains "in_same" (random-in-same sampling).
# NOTE(review): this is a substring test on the whole path; if the
# farthest-in-same environments also have "in_same" in their path, they are
# included here too - confirm against the directory names.
(
    MEAN_convergence_in_same,
    STDEV_convergence_in_same,
    SEM_convergence_in_same,
) = get_MEAN_SEM_of_iteration_needed_for_performance_goal(
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=list(
        filter(
            lambda env_path: "in_same" in env_path,
            LIST_OF_EXPERIMENT_ENVIRONMENTS,
        )
    ),
    local_LIST_OF_GOALS=LIST_OF_GOALS,
)

# Add the random-in-same curve to the sampling figure. The SEM values are
# passed as y-errors; the STDEV values are computed above but not plotted.
add_plot_of_iteration_needed_per_performance_goal_to_graph(
    axis=axis_plot_samp2,
    list_of_x=LIST_OF_GOALS,
    dict_of_y=MEAN_convergence_in_same,
    dict_of_y_err=SEM_convergence_in_same,
    label="Moyenne des tentatives avec l'échantillonnage 'samp.random.same'",
    color="blue",
    alpha=0.2,
    linestyle="--",
    linewidth=2,
    marker="",
    markersize=5,
)

# Compute the MEAN / STDEV / SEM summaries returned by the "iteration needed
# for performance goal" helper, restricted to the experiment environments
# whose path contains "farthest".
(
    MEAN_convergence_farthest,
    STDEV_convergence_farthest,
    SEM_convergence_farthest,
) = get_MEAN_SEM_of_iteration_needed_for_performance_goal(
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=list(
        filter(
            lambda env_path: "farthest" in env_path,
            LIST_OF_EXPERIMENT_ENVIRONMENTS,
        )
    ),
    local_LIST_OF_GOALS=LIST_OF_GOALS,
)

# Add the farthest-sampling curve to the sampling figure. The SEM values are
# passed as y-errors; the STDEV values are computed above but not plotted.
add_plot_of_iteration_needed_per_performance_goal_to_graph(
    axis=axis_plot_samp2,
    list_of_x=LIST_OF_GOALS,
    dict_of_y=MEAN_convergence_farthest,
    dict_of_y_err=SEM_convergence_farthest,
    label="Moyenne des tentatives avec l'échantillonnage 'samp.farthest.same'",
    color="green",
    alpha=0.2,
    linestyle="--",
    linewidth=2,
    marker="",
    markersize=5,
)

# Compute the MEAN / STDEV / SEM summaries returned by the "iteration needed
# for performance goal" helper, restricted to the experiment environments
# whose path contains "closest".
(
    MEAN_convergence_closest,
    STDEV_convergence_closest,
    SEM_convergence_closest,
) = get_MEAN_SEM_of_iteration_needed_for_performance_goal(
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS=list(
        filter(
            lambda env_path: "closest" in env_path,
            LIST_OF_EXPERIMENT_ENVIRONMENTS,
        )
    ),
    local_LIST_OF_GOALS=LIST_OF_GOALS,
)

# Add the closest-sampling curve to the sampling figure. The SEM values are
# passed as y-errors; the STDEV values are computed above but not plotted.
add_plot_of_iteration_needed_per_performance_goal_to_graph(
    axis=axis_plot_samp2,
    list_of_x=LIST_OF_GOALS,
    dict_of_y=MEAN_convergence_closest,
    dict_of_y_err=SEM_convergence_closest,
    label="Moyenne des tentatives avec l'échantillonnage 'samp.closest.diff'",
    color="orange",
    alpha=0.2,
    linestyle="--",
    linewidth=2,
    marker="",
    markersize=5,
)

# Add the overall curve to the sampling figure: solid black line, with the
# SEM values passed as y-errors.
# NOTE(review): this reads MEAN_iteration_to_highlight_ALL and
# SEM_iteration_to_highlight_ALL, which are not assigned in this cell, while
# the clustering figure uses MEAN_convergence_ALL / SEM_convergence_ALL for
# the same curve - confirm these names are defined earlier and hold the
# per-goal values expected here.
add_plot_of_iteration_needed_per_performance_goal_to_graph(
    axis=axis_plot_samp2,
    list_of_x=LIST_OF_GOALS,
    dict_of_y=MEAN_iteration_to_highlight_ALL,
    dict_of_y_err=SEM_iteration_to_highlight_ALL,
    label="Moyenne des tentatives",
    color="black",
    alpha=0.2,
    linestyle="-",
    linewidth=2,
    marker="",
    markersize=5,
)

# Name both axes.
axis_plot_samp2.set_xlabel("v-measure [%]", fontsize=18)
axis_plot_samp2.set_ylabel("itération [#]", fontsize=18)

# Enlarge the tick labels. These are pyplot calls, so they act on the current
# axes (presumably axis_plot_samp2, taken from the most recently created figure).
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

# Show the grid and the legend.
axis_plot_samp2.grid(True)
axis_plot_samp2.legend(fontsize=15, loc="upper left")

# Export the figure to the results folder.
fig_plot_samp2.savefig(
    "../results/etude-efficience-evolution-moyenne-4sampling-par-vmeasure.png",
    bbox_inches="tight",
    transparent=True,
    dpi=300,
)