# ==== INTERACTIVE CLUSTERING : CONSTRAINTS NUMBER STUDY ====
> ### Stage 3 : Model the number of constraints required to converge and plot some figures.

-----

## READ-ME BEFORE RUNNING

### Quick Description

This notebook **aims to model the number of constraints required to converge and to plot several figures based on the previous analyses**.
- Environments are represented by subdirectories in the `/experiments` folder. A full path to an experiment environment is `/experiments/[DATASET]/[PREPROCESSING]/[VECTORIZATION]/[SAMPLING]/[CLUSTERING]/[EXPERIMENT]`.
- An experiment run is composed of iterations of _interactive clustering_.
- An experiment evaluation looks at each _interactive clustering_ iteration of the experiment.

Before running, **run the notebook `2_Run_until_convergence_and_evaluate_constraints_number_required.ipynb` to run the interactive clustering experiments until convergence and to estimate the number of constraints required to converge**.

### Description of each step

First of all, **load the experiment synthesis CSV file** that was produced during the interactive clustering experiments.
1. Model the number of constraints as a function of the dataset size

***WARNING***: _Before running this section, launch the experiment runs, evaluations and synthesis, as well as the main effects analysis!_

-----

## 1. IMPORT PYTHON DEPENDENCIES

In [None]:
from typing import Dict, List, Optional, Tuple, Union
import numpy as np
import openpyxl
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib.figure import Figure
import matplotlib.cm as cm
from matplotlib.colors import Normalize
from scipy import stats as scipystats
import statistics
import statsmodels
import statsmodels.api
import statsmodels.formula.api

-----

## 2. LOAD DATA

In [None]:
# Load the experiments synthesis (semicolon-separated CSV) into a DataFrame.
# The columns read later in this notebook are `dataset_reference`, `dataset_size` and `V090v__constraints_total`.
df_experiments: pd.DataFrame = pd.read_csv(
    filepath_or_buffer="../results/experiments_synthesis.csv",
    sep=";",
)
# Disabled clean-up step kept for reference (comma/dot handling before casting the constraints column to float).
# NOTE(review): presumably obsolete now that the column is used as-is below; confirm before deleting.
#df_experiments["V090v__constraints_total"] = df_experiments["V090v__constraints_total"].replace(",", ".").astype(float)
# Last expression of the cell: display the first rows as a quick sanity check.
df_experiments.head()

-----

## 3. ANALYZE DATA

- `pvalue(dataset_reference) < 10**(-3)`
- `bank_cards_v2            ~ -52 + 2.91 * dataset_size` / `~  2.90 * dataset_size`
- `mlsum_fr_train_subset_v1 ~ 763 + 3.19 * dataset_size` / `~  3.41 * dataset_size`

In [None]:
# Gap between the slope used for all datasets (3.15, see `estimate_constraints_number`) and the
# intercept-free slope quoted above for each dataset: absolute gap, then gap in percent of 3.15.
# First line: `mlsum_fr_train_subset_v1` (3.41). Second line: `bank_cards_v2` (2.90).
print(3.41-3.15, (3.41-3.15)/3.15*100)
print(3.15-2.90, (3.15-2.90)/3.15*100)

In [None]:
# Fit a GLM of the total number of constraints against the dataset size, then print the fit summary.
# The `0 +` term of the formula removes the intercept, so the fitted law is proportional to `dataset_size`.
# NOTE(review): no `family` is passed, so the statsmodels default applies (Gaussian per the statsmodels docs) — confirm this is intended.
model_constraints_number = statsmodels.formula.api.glm(
    formula="V090v__constraints_total ~ 0 + dataset_size",
    data=df_experiments,
)
# `results_constraints_number` is reused by the following cells (`params` for the coefficients, `bse` for their standard errors).
results_constraints_number = model_constraints_number.fit()
print(results_constraints_number.summary())

In [None]:
# Print the modelization as `CONSTRAINTS_NUMBER ~ [intercept + ] slope*dataset_size`.
# Only the terms present in the fitted model are listed and joined with " + ", so a model
# fitted without intercept no longer prints an empty term followed by a dangling "+".
print(
    "CONSTRAINTS_NUMBER ~",
    " + ".join(
        (
            ["{0:.2E}".format(results_constraints_number.params["Intercept"])]
            if "Intercept" in results_constraints_number.params.keys()
            else []
        )
        + ["{0:.2E}*{1}".format(results_constraints_number.params["dataset_size"], "dataset_size")]
    ),
)

In [None]:
# Define the interpolation function.
def interpolation_constraints_number(dataset_size) -> Tuple[float, float, float]:
    """
    Interpolate the number of constraints for a dataset size with the fitted model.

    The fitted coefficients are read from `results_constraints_number.params` and their
    standard errors from `results_constraints_number.bse`. The low and high values are
    obtained by shifting every coefficient by minus / plus one standard error.

    Args:
        dataset_size: number of data.

    Return:
        Tuple[float, float, float]: the low, central and high interpolations, in this order.
    """
    coefficients = results_constraints_number.params
    standard_errors = results_constraints_number.bse

    # Start from zero so that a model fitted without intercept only has the slope term.
    low: float = 0.0
    central: float = 0.0
    high: float = 0.0

    # Optional intercept term.
    if "Intercept" in coefficients.keys():
        offset = coefficients["Intercept"]
        offset_error = standard_errors["Intercept"]
        low += (offset - offset_error)
        central += offset
        high += (offset + offset_error)

    # Slope term.
    slope = coefficients["dataset_size"]
    slope_error = standard_errors["dataset_size"]
    low += (slope - slope_error) * dataset_size
    central += slope * dataset_size
    high += (slope + slope_error) * dataset_size

    return low, central, high

In [None]:
# Figure: observed number of constraints per dataset, and the fitted model, against the dataset size.
# Create a new figure.
fig_plot_constraints_number: Figure = plt.figure(figsize=(15, 7.5), dpi=300)
axis_plot_constraints_number = fig_plot_constraints_number.gca()

# Set range of axis.
axis_plot_constraints_number.set_xlim(xmin=0, xmax=5250)
axis_plot_constraints_number.set_ylim(ymin=0, ymax=20000)

# Plot constraints number (observations).
# One marker per experiment of the dataset; `linewidth=0` and `linestyle=""` keep the markers unconnected.
axis_plot_constraints_number.plot(
    df_experiments[df_experiments["dataset_reference"]=="bank_cards_v2"]["dataset_size"],  # x
    df_experiments[df_experiments["dataset_reference"]=="bank_cards_v2"]["V090v__constraints_total"],  # y
    label="Nombre de contraintes observé pour 'Bank Cards (v2.0.0)'",
    marker="x",
    markerfacecolor="red",
    markersize=5, 
    color="red",
    linewidth=0,
    linestyle="",
)
axis_plot_constraints_number.plot(
    df_experiments[df_experiments["dataset_reference"]=="mlsum_fr_train_subset_v1"]["dataset_size"],  # x
    df_experiments[df_experiments["dataset_reference"]=="mlsum_fr_train_subset_v1"]["V090v__constraints_total"],  # y
    label="Nombre de contraintes observé pour 'MLSUM FR Train Subset (v1.0.0-schild)'",
    marker="+",
    markerfacecolor="blue",
    markersize=5,
    color="blue",
    linewidth=0,
    linestyle="",
)
# Plot constraints number (modelization).
# Central interpolation (index 1 of the tuple) evaluated every 100 data from 1000 to 5000.
axis_plot_constraints_number.plot(
    range(1000, 5001, 100),  # x
    [
        interpolation_constraints_number(x)[1]
        for x in range(1000, 5001, 100)  # x
    ],  # y
    label="Nombre de contraintes modélisé",
    marker="",
    markerfacecolor="purple",
    markersize=3,
    color="purple",
    linewidth=2,
    linestyle="--",
)
# Shaded band between the low (index 0) and high (index 2) interpolations.
axis_plot_constraints_number.fill_between(
    x=range(1000, 5001, 100),  # x
    y1=[
        interpolation_constraints_number(x)[0]
        for x in range(1000, 5001, 100)  # x
    ],  # y1
    y2=[
        interpolation_constraints_number(x)[2]
        for x in range(1000, 5001, 100)  # x
    ],  # y2
    color="purple",
    alpha=0.2,
)

# Set axis name.
axis_plot_constraints_number.set_xlabel("nombre de données [#]", fontsize=18,)
axis_plot_constraints_number.set_ylabel("nombre de contraintes [#]", fontsize=18,)
# `plt.xticks` / `plt.yticks` act on the current pyplot axes, i.e. the figure created at the top of this cell.
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

# Plot the legend.
axis_plot_constraints_number.legend(
    loc="upper left",
    fontsize=15,
)

# Plot the grid.
axis_plot_constraints_number.grid(True)
    
# Store the graph.
fig_plot_constraints_number.savefig(
    "../results/etude-nombre-contraintes-1-modelisation-nombre.png",
    dpi=300,
    transparent=True,
    bbox_inches="tight",
)

Estimation of constraints lower and upper limits

In [None]:
def estimation_of_constraints_lower_limits(dataset_size: int, nb_clusters: int) -> int:
    """
    Estimation of constraints lower limit:
    - first estimate the minimal path to define connected components with MUST_LINK.
    - then define clusters by adding minimal number of CANNOT_LINK in order to distinguish clusters.

    The computation is done with integers only. Going through the float average
    cluster size (`dataset_size / nb_clusters`) and truncating with `int()` can lose
    one constraint when the division is not exact.

    Args:
        dataset_size (int): number of data.
        nb_clusters (int): number of clusters. Must be strictly positive.

    Return:
        int: number of constraints

    Raises:
        ValueError: if `nb_clusters` is not strictly positive.
    """
    if nb_clusters <= 0:
        raise ValueError("`nb_clusters` must be strictly positive, got {0!r}.".format(nb_clusters))
    # A chain of MUST_LINK over a cluster of size s needs s-1 constraints; summed over all
    # clusters this is nb_clusters * (dataset_size/nb_clusters - 1) = dataset_size - nb_clusters.
    nb_must_link = dataset_size - nb_clusters
    # One CANNOT_LINK per pair of clusters: sum of (nb_clusters-1-k) for k in range(nb_clusters).
    nb_cannot_link = nb_clusters * (nb_clusters - 1) // 2
    return int(nb_must_link + nb_cannot_link)

In [None]:
def estimation_of_constraints_upper_limits(dataset_size: int) -> int:
    """
    Estimation of constraints upper limit: one constraint per pair of data.

    The count `dataset_size * (dataset_size - 1) / 2` is computed with an integer floor
    division. The product of two consecutive integers is always even, so the result is
    exact, whereas a true division goes through a float and loses precision above 2**53.

    Args:
        dataset_size (int): number of data.

    Return:
        int: number of constraints
    """
    return int(dataset_size * (dataset_size - 1) // 2)

In [None]:
# Lower and upper limits of the number of constraints for 1000 data split into 10 clusters.
estimation_of_constraints_lower_limits(1000, 10), estimation_of_constraints_upper_limits(1000)

In [None]:
# Lower and upper limits of the number of constraints for 5000 data split into 50 clusters.
estimation_of_constraints_lower_limits(5000, 50), estimation_of_constraints_upper_limits(5000)

-----
## 4. CONCLUSION

In [None]:
# Annotation time.
def estimate_annotation_time(batch_size: int = 50) -> Dict[str, float]:
    """
        Estimate the time needed to annotate one batch of constraints.

        The time is proportional to the batch size: 7.77 per constraint, plus or minus 0.29.

        Args:
            batch_size (int): The number of constraints in the batch. Defaults to `50`.

        Return:
            Dict[str, float]: Estimated annotation time of the batch under the keys `"min"`, `"mean"` and `"max"`.
    """
    # An alternative law with intercept (202 + 7 * batch_size) is not used.
    unit_time: float = 7.77
    unit_error: float = 0.29
    unit_time_per_bound: Dict[str, float] = {
        "min": unit_time - unit_error,
        "mean": unit_time,
        "max": unit_time + unit_error,
    }
    return {
        bound: unit * batch_size
        for bound, unit in unit_time_per_bound.items()
    }

In [None]:
# Computation time.
def estimate_computation_time(dataset_size: int) -> Dict[str, float]:
    """
        Estimate the computation time needed to apply interactive clustering methodology on a dataset.

        The estimation is a quadratic law of the dataset size. Only the linear coefficient
        differs between the bounds; the quadratic term is shared.

        Args:
            dataset_size (int): The dataset size.

        Return:
            Dict[str, float]: Estimated computation time under the keys `"min"`, `"mean"` and `"max"`.
    """
    # An alternative linear law with intercept (-180 + 0.211 * dataset_size) is not used.
    quadratic_term: float = 1.43*10**(-6) * dataset_size**2
    linear_coefficient_per_bound: Dict[str, float] = {
        "min": 0.160,
        "mean": 0.167,
        "max": 0.175,
    }
    return {
        bound: (coefficient * dataset_size + quadratic_term)
        for bound, coefficient in linear_coefficient_per_bound.items()
    }

In [None]:
# Constraints number.
def estimate_constraints_number(dataset_size: int) -> Dict[str, float]:
    """
        Estimate the number of constraints required to converge with an interactive clustering methodology.

        The number is proportional to the dataset size: 3.15 constraints per data, plus or minus 0.016.

        Args:
            dataset_size (int): The dataset size.

        Return:
            Dict[str, float]: Estimated number of constraints under the keys `"min"`, `"mean"` and `"max"`.
    """
    # An alternative law with intercept (356 + 3.05 * dataset_size) is not used.
    slope: float = 3.15
    slope_error: float = 0.016
    slope_per_bound: Dict[str, float] = {
        "min": slope - slope_error,
        "mean": slope,
        "max": slope + slope_error,
    }
    return {
        bound: coefficient * dataset_size
        for bound, coefficient in slope_per_bound.items()
    }

In [None]:
# Total time.
def estimate_total_time(dataset_size: int, batch_size: Optional[int] = 50, with_parallelization: bool = False) -> Dict[str, float]:
    """
        Estimate the total time needed to apply interactive clustering methodology on a dataset.

        The number of iterations is the estimated number of constraints divided by the batch size.
        Each iteration costs one clustering computation and one batch annotation.

        Args:
            dataset_size (int): The dataset size.
            batch_size (Optional[int]): The annotation batch size. If `None`, use the batch size for which the annotation time and the computation time of one iteration are equivalent, clamped between `50` and `150`. Defaults to `50`.
            with_parallelization (bool): The option to simulate parallelization between clustering and annotation: the total is then the longest of the two times instead of their sum. Defaults to `False`.

        Return:
            Dict[str, float]: Total, annotation and computation times, each under its `-min`, unsuffixed (mean) and `-max` key.
    """
    # Unitary estimations: total number of constraints and computation time of one iteration.
    required_constraints: Dict[str, float] = estimate_constraints_number(dataset_size)
    computation_time_per_iteration: Dict[str, float] = estimate_computation_time(dataset_size)

    # Balanced batch size: as many annotations as fit in one mean computation, kept within [50, 150].
    if batch_size is None:
        balanced_batch_size = computation_time_per_iteration["mean"] / estimate_annotation_time(1)["mean"]
        batch_size = max(50, min(150, balanced_batch_size))

    # Unitary estimation: annotation time of one iteration.
    annotation_time_per_iteration: Dict[str, float] = estimate_annotation_time(batch_size)

    # Accumulate the totals bound by bound ("min", "mean", "max").
    nb_iterations: Dict[str, float] = {}
    total_computation_time: Dict[str, float] = {}
    total_annotation_time: Dict[str, float] = {}
    total_time: Dict[str, float] = {}
    for bound, nb_constraints in required_constraints.items():
        nb_iterations[bound] = nb_constraints / batch_size
        total_computation_time[bound] = computation_time_per_iteration[bound] * nb_iterations[bound]
        total_annotation_time[bound] = annotation_time_per_iteration[bound] * nb_iterations[bound]
        if with_parallelization:
            # Annotation and clustering overlap: the slowest one drives the total.
            total_time[bound] = max(total_annotation_time[bound], total_computation_time[bound])
        else:
            # Annotation waits for clustering: both times add up.
            total_time[bound] = total_annotation_time[bound] + total_computation_time[bound]

    print("nb_iterations", nb_iterations)
    return {
        "total-min": total_time["min"],
        "total": total_time["mean"],
        "total-max": total_time["max"],
        "annotation-min": total_annotation_time["min"],
        "annotation": total_annotation_time["mean"],
        "annotation-max": total_annotation_time["max"],
        "computation-min": total_computation_time["min"],
        "computation": total_computation_time["mean"],
        "computation-max": total_computation_time["max"],
    }

In [None]:
# Display total time.
def display_total_time(dataset_size: int, batch_size: Optional[int] = 50, with_parallelization: bool = False) -> pd.DataFrame:
    """
        Display the estimated total time to apply interactive clustering methodology on a dataset.

        The estimations of `estimate_total_time` are laid out as one row per estimation
        (total / annotation / computation, each with its min and max) and one column per time unit.

        Args:
            dataset_size (int): The dataset size.
            batch_size (Optional[int]): The annotation batch size, forwarded to `estimate_total_time` (where `None` is accepted). Defaults to `50`.
            with_parallelization (bool): The option to simulate parallelization between clustering and annotation. Defaults to `False`.

        Return:
            pd.DataFrame: Total estimated time in a DataFrame, with columns `time [s]`, `time [m]`, `time [h]`, `time [d]` and `time [wd]`.
    """
    # One row per estimation key; the raw values fill the "time [s]" column.
    df = pd.DataFrame.from_dict(
        data={
            key: [value]
            for key, value in estimate_total_time(dataset_size=dataset_size, batch_size=batch_size, with_parallelization=with_parallelization).items()
        },
        orient="index",
        columns=["time [s]"],
    )
    # Conversions to other units, rounded to 2 decimals.
    df["time [m]"] = df.apply(lambda row: round(row["time [s]"] / 60, 2), axis=1)
    df["time [h]"] = df.apply(lambda row: round(row["time [s]"] / 60 / 60, 2), axis=1)
    df["time [d]"] = df.apply(lambda row: round(row["time [s]"] / 60 / 60 / 24, 2), axis=1)
    # Hours divided by 7 — presumably "wd" stands for working days of 7 hours.
    df["time [wd]"] = df.apply(lambda row: round(row["time [s]"] / 60 / 60 / 7, 2), axis=1)
    return df

In [None]:
# dataset_size=5000, batch_size=50, without parallelization (annotation and computation times add up)
display_total_time(dataset_size=5000, batch_size=50, with_parallelization=False)

In [None]:
# dataset_size=5000, batch_size=50, with parallelization (the longest of annotation and computation times)
display_total_time(dataset_size=5000, batch_size=50, with_parallelization=True)

In [None]:
# dataset_size=5000, batch_size=150, without parallelization (annotation and computation times add up)
display_total_time(dataset_size=5000, batch_size=150, with_parallelization=False)

In [None]:
# dataset_size=5000, batch_size=150, with parallelization (the longest of annotation and computation times)
display_total_time(dataset_size=5000, batch_size=150, with_parallelization=True)

In [None]:
# dataset_size=5000, batch_size=None (balanced batch size chosen by `estimate_total_time`, within [50, 150]), without parallelization
display_total_time(dataset_size=5000, batch_size=None, with_parallelization=False)

In [None]:
# dataset_size=5000, batch_size=None (balanced batch size chosen by `estimate_total_time`, within [50, 150]), with parallelization
display_total_time(dataset_size=5000, batch_size=None, with_parallelization=True)

In [None]:
# Figure: estimated total time (in hours) against the dataset size.
# It is saved twice: first with the sequential curves only, then with the parallel curve added on the same axes.


def _plot_total_time_curve(axis, batch_size: Optional[int], with_parallelization: bool, label: str, color: str, linestyle: str) -> None:
    """
        Plot the mean estimated total time against the dataset size, with a shaded min/max band.

        The estimation is computed once per dataset size and reused for the curve and
        for both edges of the band.

        Args:
            axis: The matplotlib axes to draw on.
            batch_size (Optional[int]): The annotation batch size, forwarded to `estimate_total_time`.
            with_parallelization (bool): The parallelization option, forwarded to `estimate_total_time`.
            label (str): The legend label of the curve.
            color (str): The color of the curve and of the band.
            linestyle (str): The line style of the curve.
    """
    dataset_sizes = range(1000, 5001, 100)
    estimations: List[Dict[str, float]] = [
        estimate_total_time(dataset_size=dataset_size, batch_size=batch_size, with_parallelization=with_parallelization)
        for dataset_size in dataset_sizes
    ]
    # Mean estimation, converted from seconds to hours.
    axis.plot(
        dataset_sizes,  # x
        [estimation["total"]/60/60 for estimation in estimations],  # y
        label=label,
        marker="",
        markerfacecolor=color,
        markersize=3,
        color=color,
        linewidth=2,
        linestyle=linestyle,
    )
    # Band between the min and max estimations.
    axis.fill_between(
        x=dataset_sizes,  # x
        y1=[estimation["total-min"]/60/60 for estimation in estimations],  # y1
        y2=[estimation["total-max"]/60/60 for estimation in estimations],  # y2
        color=color,
        alpha=0.2,
    )


def _decorate_and_save_total_time_figure(figure: Figure, axis, path: str) -> None:
    """
        Set the axis names, tick sizes, legend and grid of the figure, then store it.

        The legend is rebuilt at each call so that it lists every curve drawn so far.

        Args:
            figure (Figure): The figure to store.
            axis: The axes of the figure.
            path (str): The path of the image file to write.
    """
    # Set axis name.
    axis.set_xlabel("nombre de données [#]", fontsize=18,)
    axis.set_ylabel("temps [h]", fontsize=18,)
    # `plt.xticks` / `plt.yticks` act on the current pyplot axes.
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)

    # Plot the legend.
    axis.legend(
        loc="upper left",
        fontsize=15,
    )

    # Plot the grid.
    axis.grid(True)

    # Store the graph.
    figure.savefig(
        path,
        dpi=300,
        transparent=True,
        bbox_inches="tight",
    )


# Create a new figure.
fig_plot_total_time: Figure = plt.figure(figsize=(15, 7.5), dpi=300)
axis_plot_total_time = fig_plot_total_time.gca()

# Set range of axis.
axis_plot_total_time.set_xlim(xmin=-50, xmax=5050)
axis_plot_total_time.set_ylim(ymin=-2, ymax=122)

###
### Sequential.
###

# Plot total time (batch 50).
_plot_total_time_curve(
    axis=axis_plot_total_time,
    batch_size=50,
    with_parallelization=False,
    label="Temps total nécessaire en annotant après le clustering par paquet de 50",
    color="red",
    linestyle="-.",
)

# Plot total time (batch 100).
_plot_total_time_curve(
    axis=axis_plot_total_time,
    batch_size=100,
    with_parallelization=False,
    label="Temps total nécessaire en annotant après le clustering par paquet de 100",
    color="orange",
    linestyle="-.",
)

# Plot total time (batch 150).
_plot_total_time_curve(
    axis=axis_plot_total_time,
    batch_size=150,
    with_parallelization=False,
    label="Temps total nécessaire en annotant après le clustering par paquet de 150",
    color="khaki",
    linestyle="-.",
)

# Decorate and store the graph (sequential curves only).
_decorate_and_save_total_time_figure(
    figure=fig_plot_total_time,
    axis=axis_plot_total_time,
    path="../results/etude-temps-total-1-modelisation-sequentielle.png",
)

###
### Parallelization.
###

# Plot total time (batch optimal).
_plot_total_time_curve(
    axis=axis_plot_total_time,
    batch_size=None,
    with_parallelization=True,
    label="Temps total nécessaire en annotant en parallèle de l'exécution du clustering",
    color="green",
    linestyle="--",
)

# Decorate and store the graph (sequential curves and parallel curve).
_decorate_and_save_total_time_figure(
    figure=fig_plot_total_time,
    axis=axis_plot_total_time,
    path="../results/etude-temps-total-2-modelisation-parallele.png",
)