# ==== INTERACTIVE CLUSTERING : CONFLICTS FIX STUDY ====
> ### Stage 3: Model the impact of annotation errors on Interactive Clustering and plot some figures.

------------------------------
## READ-ME BEFORE RUNNING

### Quick Description

This notebook **aims to model the impact of annotation errors during interactive clustering and to plot overviews of the experiments**.
- Environments are represented by subdirectories in the `/experiments` folder. A full path to an experiment environment is `/experiments/[DATASET]/[CLUSTERING]/[CONSTRAINTS_SELECTION]/[ERRORS_SIMULATION]`.

Before running, **run the notebook `2_Simulate_errors_and_run_clustering.ipynb` on each algorithm you have set**.

### Description of each step

First of all, **load the experiments** that have been computed with the previous notebook.
- A config file contains parameters used for each experiment and annotation error simulations to analyze.
- For each algorithm and constraints selection method, print performance evolution according to constraints number and error rate.

------------------------------

## 1. IMPORT PYTHON DEPENDENCIES

In [None]:
from typing import Dict, List, Optional, Tuple, Union
import json
import listing_envs
import numpy as np
import openpyxl
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
from matplotlib.figure import Figure
import matplotlib.cm as cm
from matplotlib.colors import Normalize, LinearSegmentedColormap
from scipy import stats as scipystats
import statistics
import statsmodels
import statsmodels.api
import statsmodels.formula.api

------------------------------

## 2. Load data

In [None]:
# List the error-simulation environments, keeping only the paths that mention "bank_cards_v1".
LIST_OF_EXPERIMENT_ENVIRONMENTS: List[str] = [
    env_path
    for env_path in listing_envs.get_list_of_errors_simulation_env_paths()
    if "bank_cards_v1" in env_path
]
# Report how many environments were kept.
print(
    "There are",
    "`" + str(len(LIST_OF_EXPERIMENT_ENVIRONMENTS)) + "`",
    "created experiment environments in `../experiments`",
)
# Last expression of the cell: the notebook displays the list itself.
LIST_OF_EXPERIMENT_ENVIRONMENTS

In [None]:
# Load the synthesis of the experiments (";"-separated, first column used as index).
df_simulations = pd.read_csv("../results/experiments_synthesis.csv", sep=";", index_col=0)
# These columns are read as text with a "," decimal separator: swap it for "." and cast to float.
for _decimal_comma_column in (
    "error_simulation__error_rate",
    "clustering__v_measure",
    "clustering__homogeneity",
    "clustering__completeness",
):
    df_simulations[_decimal_comma_column] = (
        df_simulations[_decimal_comma_column].str.replace(",", ".").astype(float)
    )
# Preview the first rows.
df_simulations.head()

------------------------------

## 3. Display evolution of performances according to errors simulations

In [None]:
def get_MEAN_SEM_of_performance_evolution_per_constraints_number(
    local_LIST_OF_EXPERIMENT_ENVIRONMENTS: List[str],
) -> Tuple[Dict[int, float], Dict[int, float]]:
    """
    Compute the evolution of clustering performance (v-measure) across constraints numbers.

    For each experiment environment, the number of annotated constraints is read from
    `config.json` in the parent directory and the v-measure is read from
    `dict_of_clustering_performances.json`. Scores are grouped by constraints number, then
    the mean and the standard error of the mean (SEM) of each group are computed.

    Args:
        local_LIST_OF_EXPERIMENT_ENVIRONMENTS (List[str]): The list of experiment environment paths to consider.
            Paths are concatenated as-is with file names, so each one must end with a path separator.

    Returns:
        Tuple[Dict[int, float], Dict[int, float]]: Two dictionaries keyed by constraints number:
            the mean of the v-measure, and its standard error of the mean.
            Both are empty when no environment is given. For a group with a single
            experiment the SEM is undefined (scipy is expected to return `nan`).

    Raises:
        FileNotFoundError: If one of the two JSON files is missing for an environment.
        KeyError: If `nb_constraints` or `v_measure` is missing from the loaded files.
    """

    # Storage of experiment performances, grouped by constraints number.
    dict_of_global_performances_evolution_per_constraints_number: Dict[int, List[float]] = {}

    # For each experiment...
    for env_a in local_LIST_OF_EXPERIMENT_ENVIRONMENTS:

        # Get number of annotated constraints (stored one level above the error simulation).
        with open(env_a + "../config.json", "r", encoding="utf-8") as config_file:
            nb_constraints: int = json.load(config_file)["nb_constraints"]  # presumably an integer; used as-is as a key.

        # Load clustering evaluations.
        with open(env_a + "dict_of_clustering_performances.json", "r", encoding="utf-8") as evaluation_file:
            dict_of_clustering_performances: Dict[str, float] = json.load(evaluation_file)

        # Store the v-measure of this experiment under its constraints number.
        dict_of_global_performances_evolution_per_constraints_number.setdefault(
            nb_constraints, []
        ).append(dict_of_clustering_performances["v_measure"])

    # Compute mean of performance for each constraints number.
    dict_of_global_performances_evolution_per_constraints_number_MEAN: Dict[int, float] = {
        x: np.mean(performances)
        for x, performances in dict_of_global_performances_evolution_per_constraints_number.items()
    }
    # Compute standard error of the mean of performance for each constraints number.
    dict_of_global_performances_evolution_per_constraints_number_SEM: Dict[int, float] = {
        x: scipystats.sem(performances)
        for x, performances in dict_of_global_performances_evolution_per_constraints_number.items()
    }

    # Return.
    return (
        dict_of_global_performances_evolution_per_constraints_number_MEAN,
        dict_of_global_performances_evolution_per_constraints_number_SEM,
    )

In [None]:
def add_plot_of_performance_evolution_per_constraints_number_to_graph(
    axis,
    list_of_x: List[int],
    dict_of_y: Dict[int, float],
    dict_of_y_err: Optional[Dict[int, float]] = None,
    label: str = "",
    label_in_curve: Optional[str] = None,
    marker: str = "",
    markersize: int = 5,
    color: str = "black",
    linewidth: float = 2,
    linestyle: str = "-",
    alpha: float = 0.2,
) -> None:
    """
    Add a curve, and optionally its name and its error band, to an axis of a graph.

    Args:
        axis (): The axis to draw on. It must provide `plot`, `text` and `fill_between` (e.g. a matplotlib axis).
        list_of_x (List[int]): The abscissas to plot, in drawing order. Each one must be a key of `dict_of_y`.
        dict_of_y (Dict[int, float]): The ordinate of the curve for each abscissa.
        dict_of_y_err (Optional[Dict[int, float]]): The half-height of the error band for each abscissa. If `None`, no band is drawn. Defaults to `None`.
        label (str): The label of the curve, forwarded to `axis.plot` (used by the legend). Defaults to `""`.
        label_in_curve (Optional[str]): The text written at the last point of the curve. If `None`, or if there is no point, nothing is written. Defaults to `None`.
        marker (str): The marker of the points. Defaults to `""`.
        markersize (int): The size of the markers. Defaults to `5`.
        color (str): The color of the curve, of the marker faces and of the error band (forwarded as-is to the axis). Defaults to `"black"`.
        linewidth (float): The width of the curve. Defaults to `2`.
        linestyle (str): The style of the curve. Defaults to `"-"`.
        alpha (float): The opacity of the error band. Defaults to `0.2`.
    """
    # Abscissas are shared by the curve and by the error band.
    x_values: List[float] = [float(x) for x in list_of_x]

    # Add curve.
    axis.plot(
        x_values,  # x
        [dict_of_y[x] for x in list_of_x],  # y
        label=label,
        marker=marker,
        markerfacecolor=color,
        markersize=markersize,
        color=color,
        linewidth=linewidth,
        linestyle=linestyle,
    )
    # Add curve name at the last point (skipped when there is no point, to avoid an IndexError).
    if label_in_curve is not None and len(list_of_x) > 0:
        last_x = list_of_x[-1]
        axis.text(
            x=last_x,
            y=dict_of_y[last_x],
            s=label_in_curve,
        )
    # Add curve error band: from `y - err` to `y + err`.
    if dict_of_y_err is not None:
        axis.fill_between(
            x_values,  # x
            y1=[(dict_of_y[x] - dict_of_y_err[x]) for x in list_of_x],  # y1
            y2=[(dict_of_y[x] + dict_of_y_err[x]) for x in list_of_x],  # y2
            color=color,
            alpha=alpha,
        )

Define configuration for graphs.

In [None]:
# Largest constraints number kept on the figures; also used to set the upper limit of the x-axis.
MAX_NB_CONSTRAINTS_TO_PLOT: int = 3000

In [None]:
# Simulated error rates to plot (one curve per rate), from 0.00 to 0.50 by steps of 0.05.
LIST_OF_ERROR_RATES: List[float] = [0.00, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50]

In [None]:
# Each style below starts from one default shared by every error rate,
# then overrides the two extreme rates (0.00 and 0.50) so that they stand out.

# Config for markers: dots, except triangles for the extreme rates.
markers: Dict[float, str] = {**dict.fromkeys(LIST_OF_ERROR_RATES, "."), 0.00: "^", 0.50: "v"}
# Config for marker sizes: bigger markers for the extreme rates.
markersizes: Dict[float, int] = {**dict.fromkeys(LIST_OF_ERROR_RATES, 3), 0.00: 5, 0.50: 5}
# Config for linewidths: thicker lines for the extreme rates.
linewidths: Dict[float, float] = {**dict.fromkeys(LIST_OF_ERROR_RATES, 0.5), 0.00: 1, 0.50: 1}
# Config for linestyles: dashed lines, except solid lines for the extreme rates.
linestyles: Dict[float, str] = {**dict.fromkeys(LIST_OF_ERROR_RATES, "--"), 0.00: "-", 0.50: "-"}

In [None]:
# Define a custom color map going through the listed colors, from "darkred" to "green".
# (Alternative kept for reference: sample the "RdYlGn" colormap provided by matplotlib.)
#list_of_colors = matplotlib.colormaps["RdYlGn"](
#    np.linspace(1, 0, len(LIST_OF_ERROR_RATES))
#)
cmap = LinearSegmentedColormap.from_list(
    "RedOrangeGreen",
    ["darkred", "orangered", "orange", "gold", "yellowgreen", "green"],
    N=256,
    gamma=1.0,
)
# Sample one color per error rate; positions run from 1 down to 0, so the first rate gets the last color of the map.
list_of_colors = cmap(np.linspace(start=1, stop=0, num=len(LIST_OF_ERROR_RATES)))

### 3.1. Modelization of `random` `with_fix=False`.

In [None]:
# Fit a GLM of the v-measure on the constraints number, the error rate and their interaction
# (in the formula, `a*b` stands for `a + b + a:b`), using only the simulations run with
# `random` constraints selection and `with_fix == False`.
model_simulation_random_without_fix = statsmodels.formula.api.glm(
    formula="clustering__v_measure ~ 1 + constraints__annotated*error_simulation__error_rate",
    data=df_simulations.loc[
        df_simulations["constraints_selection__algorithm"].eq("random")
        & df_simulations["error_simulation__with_fix"].eq(False)
    ],
)
results_simulation_random_without_fix = model_simulation_random_without_fix.fit()
# Print the formula that was fitted, then the regression summary.
print("==============================================================================")
print(">>> formula:", model_simulation_random_without_fix.formula)
print("==============================================================================")
print(results_simulation_random_without_fix.summary())

In [None]:
# Print the fitted relation: one formatted coefficient per term.
# The interaction term is left out on purpose (its entry is commented out).
print(
    "random.without_fix ~",
    *(
        template.format(results_simulation_random_without_fix.params[term])
        for template, term in (
            ("{0:.2E}", "Intercept"),
            ("+ {0:.2E}*constraints_number", "constraints__annotated"),
            ("+ {0:.2E}*error_rate", "error_simulation__error_rate"),
            #("+ {0:.2E}*(constraints_number*error_rate)", "constraints__annotated:error_simulation__error_rate"),
        )
    ),
)

In [None]:
# Define the interpolation function.
def interpolation_random_without_fix(constraints_number, error_rate) -> Tuple[float, float, float]:
    """
    Evaluate the model fitted on `random` / without-fix simulations at a given point.

    The estimate sums the intercept and the two main effects, read from
    `results_simulation_random_without_fix`. The low (resp. high) estimate uses every
    coefficient minus (resp. plus) its standard error (`bse`).
    NOTE(review): the interaction term of the fitted formula is not used here (kept commented out) - confirm this is intended.

    Args:
        constraints_number: The number of annotated constraints.
        error_rate: The simulated error rate.

    Returns:
        Tuple[float, float, float]: The low, central and high estimates.
    """
    coefficients = results_simulation_random_without_fix.params
    standard_errors = results_simulation_random_without_fix.bse

    # Start from the intercept...
    res_low = 0.0 + (coefficients["Intercept"] - standard_errors["Intercept"])
    res = 0.0 + coefficients["Intercept"]
    res_high = 0.0 + (coefficients["Intercept"] + standard_errors["Intercept"])

    # ... then add each main effect, weighted by the value of its variable.
    for term, value in (
        ("constraints__annotated", constraints_number),
        ("error_simulation__error_rate", error_rate),
        #("constraints__annotated:error_simulation__error_rate", constraints_number*error_rate),
    ):
        res_low += (coefficients[term] - standard_errors[term]) * value
        res += coefficients[term] * value
        res_high += (coefficients[term] + standard_errors[term]) * value

    return res_low, res, res_high

In [None]:
# Create a new figure.
fig_plot_error_simulation_random_without_fix: Figure = plt.figure(figsize=(15, 7.5), dpi=300)
axis_plot_error_simulation_random_without_fix = fig_plot_error_simulation_random_without_fix.gca()

# Set range of axis (with a small margin around the plotted ranges).
axis_plot_error_simulation_random_without_fix.set_xlim(xmin=-10, xmax=MAX_NB_CONSTRAINTS_TO_PLOT+100)
axis_plot_error_simulation_random_without_fix.set_ylim(ymin=-0.01, ymax=1.01)

# Plot error simulation: one curve per simulated error rate.
for k, error_rate_k in enumerate(LIST_OF_ERROR_RATES):

    # Compute performance MEAN and SEM for this error rate.
    # Environments are selected on components 4 and 5 of their path
    # (presumably the constraints selection and error simulation folders - confirm against `listing_envs`).
    performances_random_without_fix_MEAN, performances_random_without_fix_SEM = get_MEAN_SEM_of_performance_evolution_per_constraints_number(
        local_LIST_OF_EXPERIMENT_ENVIRONMENTS=[
            env
            for env in LIST_OF_EXPERIMENT_ENVIRONMENTS
            if (
                "-rand_" in env.split("/")[4]
                and "{rate:.2f}".format(rate=error_rate_k) in env.split("/")[5]
                and "-without_fix" in env.split("/")[5]
            )
        ],
    )

    # Add plot.
    # The rate is shown as an integer percentage: `round` avoids float truncation (e.g. `int(0.29*100)` gives 28).
    add_plot_of_performance_evolution_per_constraints_number_to_graph(
        axis=axis_plot_error_simulation_random_without_fix,
        list_of_x=[
            constraints_number
            for constraints_number in sorted(performances_random_without_fix_MEAN.keys())
            if constraints_number <= MAX_NB_CONSTRAINTS_TO_PLOT
        ],
        dict_of_y=performances_random_without_fix_MEAN,
        dict_of_y_err=performances_random_without_fix_SEM,
        label="{rate:2d}% de différences".format(rate=int(round(error_rate_k*100))),
        label_in_curve="{rate:2d}%".format(rate=int(round(error_rate_k*100))),
        marker=markers[error_rate_k],
        markersize=markersizes[error_rate_k],
        color=list_of_colors[k],
        linewidth=linewidths[error_rate_k],
        linestyle=linestyles[error_rate_k],
        alpha=0.2,
    )

    # Add modelization plot (disabled).
    #add_plot_of_performance_evolution_per_constraints_number_to_graph(
    #    axis=axis_plot_error_simulation_random_without_fix,
    #    list_of_x=[
    #        constraints_number
    #        for constraints_number in sorted(performances_random_without_fix_MEAN.keys())
    #        if constraints_number <= MAX_NB_CONSTRAINTS_TO_PLOT
    #    ],
    #    dict_of_y={
    #        constraints_number: interpolation_random_without_fix(constraints_number, error_rate_k)[1]
    #        for constraints_number in sorted(performances_random_without_fix_MEAN.keys())
    #        if constraints_number <= MAX_NB_CONSTRAINTS_TO_PLOT
    #    },
    #    dict_of_y_err=None,
    #    label=None,
    #    label_in_curve=None,
    #    marker="",
    #    markersize=markersizes[error_rate_k],
    #    color="black",
    #    linewidth=linewidths[error_rate_k],
    #    linestyle=linestyles[error_rate_k],
    #    alpha=0.2,
    #)

# Set axis name.
# NOTE(review): the y-axis spans 0 to 1 while its label says "[%]" - confirm the intended unit.
axis_plot_error_simulation_random_without_fix.set_xlabel("nombre de contraintes [#]", fontsize=18,)
axis_plot_error_simulation_random_without_fix.set_ylabel("v-measure [%]", fontsize=18,)
# `plt.xticks` / `plt.yticks` act on the current pyplot axes, i.e. the ones of the figure created above.
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

# Plot the legend.
axis_plot_error_simulation_random_without_fix.legend(ncol=4, loc="upper left", fontsize=12,)

# Plot the grid.
axis_plot_error_simulation_random_without_fix.grid(True)

# Store the graph.
fig_plot_error_simulation_random_without_fix.savefig(
    "../results/etude-erreur-simulation-impact-1random-without-fix.png",
    dpi=300,
    transparent=True,
    bbox_inches="tight",
)

### 3.2. Modelization of `closest_in_different_clusters` `with_fix=False`.

In [None]:
# Fit a GLM of the v-measure on the constraints number, the error rate and their interaction
# (in the formula, `a*b` stands for `a + b + a:b`), using only the simulations run with
# `closest_in_different_clusters` constraints selection and `with_fix == False`.
model_simulation_closest_without_fix = statsmodels.formula.api.glm(
    formula="clustering__v_measure ~ 1 + constraints__annotated*error_simulation__error_rate",
    data=df_simulations.loc[
        df_simulations["constraints_selection__algorithm"].eq("closest_in_different_clusters")
        & df_simulations["error_simulation__with_fix"].eq(False)
    ],
)
results_simulation_closest_without_fix = model_simulation_closest_without_fix.fit()
# Print the formula that was fitted, then the regression summary.
print("==============================================================================")
print(">>> formula:", model_simulation_closest_without_fix.formula)
print("==============================================================================")
print(results_simulation_closest_without_fix.summary())

In [None]:
# Print the fitted relation for `closest_in_different_clusters` without fix
# (intercept and main effects only; the interaction coefficient is not printed).
# The label names the model actually printed (it used to read "random.without_fix").
print(
    "closest.without_fix ~",
    "{0:.2E}".format(results_simulation_closest_without_fix.params["Intercept"]),
    "+ {0:.2E}*constraints_number".format(results_simulation_closest_without_fix.params["constraints__annotated"]),
    "+ {0:.2E}*error_rate".format(results_simulation_closest_without_fix.params["error_simulation__error_rate"]),
)

In [None]:
# Define the interpolation function.
def interpolation_closest_without_fix(constraints_number, error_rate) -> Tuple[float, float, float]:
    """
    Evaluate the model fitted on `closest_in_different_clusters` / without-fix simulations at a given point.

    The estimate sums the intercept and the two main effects, read from
    `results_simulation_closest_without_fix`. The low (resp. high) estimate uses every
    coefficient minus (resp. plus) its standard error (`bse`).
    NOTE(review): the interaction term of the fitted formula is not used here - confirm this is intended.

    Args:
        constraints_number: The number of annotated constraints.
        error_rate: The simulated error rate.

    Returns:
        Tuple[float, float, float]: The low, central and high estimates.
    """
    coefficients = results_simulation_closest_without_fix.params
    standard_errors = results_simulation_closest_without_fix.bse

    # Start from the intercept...
    res_low = 0.0 + (coefficients["Intercept"] - standard_errors["Intercept"])
    res = 0.0 + coefficients["Intercept"]
    res_high = 0.0 + (coefficients["Intercept"] + standard_errors["Intercept"])

    # ... then add each main effect, weighted by the value of its variable.
    for term, value in (
        ("constraints__annotated", constraints_number),
        ("error_simulation__error_rate", error_rate),
    ):
        res_low += (coefficients[term] - standard_errors[term]) * value
        res += coefficients[term] * value
        res_high += (coefficients[term] + standard_errors[term]) * value

    return res_low, res, res_high

In [None]:
# Create a new figure.
fig_plot_error_simulation_closest_without_fix: Figure = plt.figure(figsize=(15, 7.5), dpi=300)
axis_plot_error_simulation_closest_without_fix = fig_plot_error_simulation_closest_without_fix.gca()

# Set range of axis (with a small margin around the plotted ranges).
axis_plot_error_simulation_closest_without_fix.set_xlim(xmin=-10, xmax=MAX_NB_CONSTRAINTS_TO_PLOT+100)
axis_plot_error_simulation_closest_without_fix.set_ylim(ymin=-0.01, ymax=1.01)

# Plot error simulation: one curve per simulated error rate.
for k, error_rate_k in enumerate(LIST_OF_ERROR_RATES):

    # Compute performance MEAN and SEM for this error rate.
    # Environments are selected on components 4 and 5 of their path
    # (presumably the constraints selection and error simulation folders - confirm against `listing_envs`).
    performances_closest_without_fix_MEAN, performances_closest_without_fix_SEM = get_MEAN_SEM_of_performance_evolution_per_constraints_number(
        local_LIST_OF_EXPERIMENT_ENVIRONMENTS=[
            env
            for env in LIST_OF_EXPERIMENT_ENVIRONMENTS
            if (
                "-closest_" in env.split("/")[4]
                and "{rate:.2f}".format(rate=error_rate_k) in env.split("/")[5]
                and "-without_fix" in env.split("/")[5]
            )
        ],
    )

    # Add plot.
    # The rate is shown as an integer percentage: `round` avoids float truncation (e.g. `int(0.29*100)` gives 28).
    add_plot_of_performance_evolution_per_constraints_number_to_graph(
        axis=axis_plot_error_simulation_closest_without_fix,
        list_of_x=[
            constraints_number
            for constraints_number in sorted(performances_closest_without_fix_MEAN.keys())
            if constraints_number <= MAX_NB_CONSTRAINTS_TO_PLOT
        ],
        dict_of_y=performances_closest_without_fix_MEAN,
        dict_of_y_err=performances_closest_without_fix_SEM,
        label="{rate:2d}% de différences".format(rate=int(round(error_rate_k*100))),
        label_in_curve="{rate:2d}%".format(rate=int(round(error_rate_k*100))),
        marker=markers[error_rate_k],
        markersize=markersizes[error_rate_k],
        color=list_of_colors[k],
        linewidth=linewidths[error_rate_k],
        linestyle=linestyles[error_rate_k],
        alpha=0.2,
    )

# Set axis name.
# NOTE(review): the y-axis spans 0 to 1 while its label says "[%]" - confirm the intended unit.
axis_plot_error_simulation_closest_without_fix.set_xlabel("nombre de contraintes [#]", fontsize=18,)
axis_plot_error_simulation_closest_without_fix.set_ylabel("v-measure [%]", fontsize=18,)
# `plt.xticks` / `plt.yticks` act on the current pyplot axes, i.e. the ones of the figure created above.
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

# Plot the legend.
axis_plot_error_simulation_closest_without_fix.legend(ncol=4, loc="lower right", fontsize=12,)

# Plot the grid.
axis_plot_error_simulation_closest_without_fix.grid(True)

# Store the graph.
fig_plot_error_simulation_closest_without_fix.savefig(
    "../results/etude-erreur-simulation-impact-1closest-without-fix.png",
    dpi=300,
    transparent=True,
    bbox_inches="tight",
)

### 3.3. Modelization of `random` `with_fix=True`.

In [None]:
# Fit a GLM of the v-measure on the constraints number, the error rate and their interaction
# (in the formula, `a*b` stands for `a + b + a:b`), using only the simulations run with
# `random` constraints selection and `with_fix == True`.
model_simulation_random_with_fix = statsmodels.formula.api.glm(
    formula="clustering__v_measure ~ 1 + constraints__annotated*error_simulation__error_rate",
    data=df_simulations.loc[
        df_simulations["constraints_selection__algorithm"].eq("random")
        & df_simulations["error_simulation__with_fix"].eq(True)
    ],
)
results_simulation_random_with_fix = model_simulation_random_with_fix.fit()
# Print the formula that was fitted, then the regression summary.
print("==============================================================================")
print(">>> formula:", model_simulation_random_with_fix.formula)
print("==============================================================================")
print(results_simulation_random_with_fix.summary())

In [None]:
# Print the fitted relation for `random` with fix
# (intercept and main effects only; the interaction coefficient is not printed).
# The label names the model actually printed (it used to read "random.without_fix").
print(
    "random.with_fix ~",
    "{0:.2E}".format(results_simulation_random_with_fix.params["Intercept"]),
    "+ {0:.2E}*constraints".format(results_simulation_random_with_fix.params["constraints__annotated"]),
    "+ {0:.2E}*error_rate".format(results_simulation_random_with_fix.params["error_simulation__error_rate"]),
)

In [None]:
# Define the interpolation function.
def interpolation_random_with_fix(constraints_number, error_rate) -> Tuple[float, float, float]:
    """
    Evaluate the model fitted on `random` / with-fix simulations at a given point.

    The estimate sums the intercept and the two main effects, read from
    `results_simulation_random_with_fix`. The low (resp. high) estimate uses every
    coefficient minus (resp. plus) its standard error (`bse`).
    NOTE(review): the interaction term of the fitted formula is not used here - confirm this is intended.

    Args:
        constraints_number: The number of annotated constraints.
        error_rate: The simulated error rate.

    Returns:
        Tuple[float, float, float]: The low, central and high estimates.
    """
    coefficients = results_simulation_random_with_fix.params
    standard_errors = results_simulation_random_with_fix.bse

    # Start from the intercept...
    res_low = 0.0 + (coefficients["Intercept"] - standard_errors["Intercept"])
    res = 0.0 + coefficients["Intercept"]
    res_high = 0.0 + (coefficients["Intercept"] + standard_errors["Intercept"])

    # ... then add each main effect, weighted by the value of its variable.
    for term, value in (
        ("constraints__annotated", constraints_number),
        ("error_simulation__error_rate", error_rate),
    ):
        res_low += (coefficients[term] - standard_errors[term]) * value
        res += coefficients[term] * value
        res_high += (coefficients[term] + standard_errors[term]) * value

    return res_low, res, res_high

In [None]:
# Create a new figure.
fig_plot_error_simulation_random_with_fix: Figure = plt.figure(figsize=(15, 7.5), dpi=300)
axis_plot_error_simulation_random_with_fix = fig_plot_error_simulation_random_with_fix.gca()

# Set range of axis (with a small margin around the plotted ranges).
axis_plot_error_simulation_random_with_fix.set_xlim(xmin=-10, xmax=MAX_NB_CONSTRAINTS_TO_PLOT+100)
axis_plot_error_simulation_random_with_fix.set_ylim(ymin=-0.01, ymax=1.01)

# Plot error simulation: one curve per simulated error rate.
for k, error_rate_k in enumerate(LIST_OF_ERROR_RATES):

    # Compute performance MEAN and SEM for this error rate.
    # Environments are selected on components 4 and 5 of their path
    # (presumably the constraints selection and error simulation folders - confirm against `listing_envs`).
    # "-with_fix" is not a substring of "-without_fix", so simulations without fix are not selected.
    performances_random_with_fix_MEAN, performances_random_with_fix_SEM = get_MEAN_SEM_of_performance_evolution_per_constraints_number(
        local_LIST_OF_EXPERIMENT_ENVIRONMENTS=[
            env
            for env in LIST_OF_EXPERIMENT_ENVIRONMENTS
            if (
                "-rand_" in env.split("/")[4]
                and "{rate:.2f}".format(rate=error_rate_k) in env.split("/")[5]
                and "-with_fix" in env.split("/")[5]
            )
        ],
    )

    # Add plot.
    # The rate is shown as an integer percentage: `round` avoids float truncation (e.g. `int(0.29*100)` gives 28).
    add_plot_of_performance_evolution_per_constraints_number_to_graph(
        axis=axis_plot_error_simulation_random_with_fix,
        list_of_x=[
            constraints_number
            for constraints_number in sorted(performances_random_with_fix_MEAN.keys())
            if constraints_number <= MAX_NB_CONSTRAINTS_TO_PLOT
        ],
        dict_of_y=performances_random_with_fix_MEAN,
        dict_of_y_err=performances_random_with_fix_SEM,
        label="{rate:2d}% de différences".format(rate=int(round(error_rate_k*100))),
        label_in_curve="{rate:2d}%".format(rate=int(round(error_rate_k*100))),
        marker=markers[error_rate_k],
        markersize=markersizes[error_rate_k],
        color=list_of_colors[k],
        linewidth=linewidths[error_rate_k],
        linestyle=linestyles[error_rate_k],
        alpha=0.2,
    )

# Set axis name.
# NOTE(review): the y-axis spans 0 to 1 while its label says "[%]" - confirm the intended unit.
axis_plot_error_simulation_random_with_fix.set_xlabel("nombre de contraintes [#]", fontsize=18,)
axis_plot_error_simulation_random_with_fix.set_ylabel("v-measure [%]", fontsize=18,)
# `plt.xticks` / `plt.yticks` act on the current pyplot axes, i.e. the ones of the figure created above.
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

# Plot the legend.
axis_plot_error_simulation_random_with_fix.legend(ncol=4, loc="upper left", fontsize=12,)

# Plot the grid.
axis_plot_error_simulation_random_with_fix.grid(True)

# Store the graph.
fig_plot_error_simulation_random_with_fix.savefig(
    "../results/etude-erreur-simulation-impact-2random-with-fix.png",
    dpi=300,
    transparent=True,
    bbox_inches="tight",
)

### 3.4. Modelization of `closest_in_different_clusters` `with_fix=True`.

In [None]:
# Fit a GLM of the v-measure on the constraints number, the error rate and their interaction
# (in the formula, `a*b` stands for `a + b + a:b`), using only the simulations run with
# `closest_in_different_clusters` constraints selection and `with_fix == True`.
model_simulation_closest_with_fix = statsmodels.formula.api.glm(
    formula="clustering__v_measure ~ 1 + constraints__annotated*error_simulation__error_rate",
    data=df_simulations.loc[
        df_simulations["constraints_selection__algorithm"].eq("closest_in_different_clusters")
        & df_simulations["error_simulation__with_fix"].eq(True)
    ],
)
results_simulation_closest_with_fix = model_simulation_closest_with_fix.fit()
# Print the formula that was fitted, then the regression summary.
print("==============================================================================")
print(">>> formula:", model_simulation_closest_with_fix.formula)
print("==============================================================================")
print(results_simulation_closest_with_fix.summary())

In [None]:
# Print the fitted relation: one formatted coefficient per term
# (intercept and main effects only; the interaction coefficient is not printed).
print(
    "closest.with_fix ~",
    *(
        template.format(results_simulation_closest_with_fix.params[term])
        for template, term in (
            ("{0:.2E}", "Intercept"),
            ("+ {0:.2E}*constraints", "constraints__annotated"),
            ("+ {0:.2E}*error_rate", "error_simulation__error_rate"),
        )
    ),
)

In [None]:
# Define the interpolation function.
def interpolation_closest_with_fix(constraints_number, error_rate) -> Tuple[float, float, float]:
    """
    Evaluate the model fitted on `closest_in_different_clusters` / with-fix simulations at a given point.

    The estimate sums the intercept and the two main effects, read from
    `results_simulation_closest_with_fix`. The low (resp. high) estimate uses every
    coefficient minus (resp. plus) its standard error (`bse`).
    NOTE(review): the interaction term of the fitted formula is not used here - confirm this is intended.

    Args:
        constraints_number: The number of annotated constraints.
        error_rate: The simulated error rate.

    Returns:
        Tuple[float, float, float]: The low, central and high estimates.
    """
    coefficients = results_simulation_closest_with_fix.params
    standard_errors = results_simulation_closest_with_fix.bse

    # Start from the intercept...
    res_low = 0.0 + (coefficients["Intercept"] - standard_errors["Intercept"])
    res = 0.0 + coefficients["Intercept"]
    res_high = 0.0 + (coefficients["Intercept"] + standard_errors["Intercept"])

    # ... then add each main effect, weighted by the value of its variable.
    for term, value in (
        ("constraints__annotated", constraints_number),
        ("error_simulation__error_rate", error_rate),
    ):
        res_low += (coefficients[term] - standard_errors[term]) * value
        res += coefficients[term] * value
        res_high += (coefficients[term] + standard_errors[term]) * value

    return res_low, res, res_high

In [None]:
# Create a new figure.
fig_plot_error_simulation_closest_with_fix: Figure = plt.figure(figsize=(15, 7.5), dpi=300)
axis_plot_error_simulation_closest_with_fix = fig_plot_error_simulation_closest_with_fix.gca()

# Set range of axis (with a small margin around the plotted ranges).
axis_plot_error_simulation_closest_with_fix.set_xlim(xmin=-10, xmax=MAX_NB_CONSTRAINTS_TO_PLOT+100)
axis_plot_error_simulation_closest_with_fix.set_ylim(ymin=-0.01, ymax=1.01)

# Plot error simulation: one curve per simulated error rate.
for k, error_rate_k in enumerate(LIST_OF_ERROR_RATES):

    # Compute performance MEAN and SEM for this error rate.
    # Environments are selected on components 4 and 5 of their path
    # (presumably the constraints selection and error simulation folders - confirm against `listing_envs`).
    # "-with_fix" is not a substring of "-without_fix", so simulations without fix are not selected.
    performances_closest_with_fix_MEAN, performances_closest_with_fix_SEM = get_MEAN_SEM_of_performance_evolution_per_constraints_number(
        local_LIST_OF_EXPERIMENT_ENVIRONMENTS=[
            env
            for env in LIST_OF_EXPERIMENT_ENVIRONMENTS
            if (
                "-closest_" in env.split("/")[4]
                and "{rate:.2f}".format(rate=error_rate_k) in env.split("/")[5]
                and "-with_fix" in env.split("/")[5]
            )
        ],
    )

    # Add plot.
    # The rate is shown as an integer percentage: `round` avoids float truncation (e.g. `int(0.29*100)` gives 28).
    add_plot_of_performance_evolution_per_constraints_number_to_graph(
        axis=axis_plot_error_simulation_closest_with_fix,
        list_of_x=[
            constraints_number
            for constraints_number in sorted(performances_closest_with_fix_MEAN.keys())
            if constraints_number <= MAX_NB_CONSTRAINTS_TO_PLOT
        ],
        dict_of_y=performances_closest_with_fix_MEAN,
        dict_of_y_err=performances_closest_with_fix_SEM,
        label="{rate:2d}% de différences".format(rate=int(round(error_rate_k*100))),
        label_in_curve="{rate:2d}%".format(rate=int(round(error_rate_k*100))),
        marker=markers[error_rate_k],
        markersize=markersizes[error_rate_k],
        color=list_of_colors[k],
        linewidth=linewidths[error_rate_k],
        linestyle=linestyles[error_rate_k],
        alpha=0.2,
    )

# Set axis name.
# NOTE(review): the y-axis spans 0 to 1 while its label says "[%]" - confirm the intended unit.
axis_plot_error_simulation_closest_with_fix.set_xlabel("nombre de contraintes [#]", fontsize=18,)
axis_plot_error_simulation_closest_with_fix.set_ylabel("v-measure [%]", fontsize=18,)
# `plt.xticks` / `plt.yticks` act on the current pyplot axes, i.e. the ones of the figure created above.
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

# Plot the legend.
axis_plot_error_simulation_closest_with_fix.legend(ncol=4, loc="lower right", fontsize=12,)

# Plot the grid.
axis_plot_error_simulation_closest_with_fix.grid(True)

# Store the graph.
fig_plot_error_simulation_closest_with_fix.savefig(
    "../results/etude-erreur-simulation-impact-2closest-with-fix.png",
    dpi=300,
    transparent=True,
    bbox_inches="tight",
)