# ==== INTERACTIVE CLUSTERING : CONSTRAINTS NUMBER STUDY ====
> ### Stage 3 : Model the number of constraints required to converge and plot some figures.

-----

## READ-ME BEFORE RUNNING

### Quick Description

This notebook **aims at modeling the number of constraints required to converge and at plotting several figures based on the previous analyses**.
- Environments are represented by subdirectories in the `/experiments` folder. A full path to an experiment environment is `/experiments/[DATASET]/[PREPROCESSING]/[VECTORIZATION]/[SAMPLING]/[CLUSTERING]/[EXPERIMENT]`.
- An experiment run is composed of iterations of _interactive clustering_.
- An experiment evaluation looks at each _interactive clustering_ iteration of the experiment.

Before running, **run the notebook `2_Run_until_convergence_and_evaluate_constraints_number_required.ipynb` to run the interactive clustering experiments until convergence and to estimate the number of constraints required to converge**.

### Description of each step

First of all, **load the experiment synthesis CSV file** that was produced during the interactive clustering experiments.
1. Model the number of constraints as a function of the dataset size

***WARNING***: _Start by launching the experiment runs, evaluations and synthesis, and run the main effects analysis before this section!_

-----

## 1. IMPORT PYTHON DEPENDENCIES

In [None]:
from typing import Dict, List, Optional, Tuple, Union
import numpy as np
import openpyxl
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib.figure import Figure
import matplotlib.cm as cm
from matplotlib.colors import Normalize
from scipy import stats as scipystats
import statistics
import statsmodels
import statsmodels.api
import statsmodels.formula.api

-----

## 2. LOAD DATA

In [None]:
# Load the experiments synthesis (semicolon-separated CSV file).
df_experiments: pd.DataFrame = pd.read_csv("../results/experiments_synthesis.csv", sep=";")
# NOTE(review): "V090v__constraints_total" is used as a numeric column below; if it is ever
# read as text with decimal commas it must be converted to float first — TODO confirm.

# Preview the first rows.
df_experiments.head()

-----

## 3. ANALYZE DATA

- `pvalue(dataset_reference) < 10**(-3)`
- `bank_cards_v2            ~  52 + 2.91 * dataset_size`
- `mlsum_fr_train_subset_v1 ~ 763 + 3.19 * dataset_size`

In [None]:
# Fit a generalized linear model (statsmodels default family) explaining the total
# constraints number by an intercept and the dataset size, then print its summary.
model_constraints_number = statsmodels.api.GLM.from_formula(
    formula="V090v__constraints_total ~ 1 + dataset_size",
    data=df_experiments,
)
results_constraints_number = model_constraints_number.fit()
print(results_constraints_number.summary())

In [None]:
# Print the modelization as a readable formula.
# The slope is formatted with an explicit sign ("+"/"-") so that the two terms are
# joined by an operator instead of being printed side by side.
print(
    "CONSTRAINTS_NUMBER ~",
    "{0:.2E}".format(results_constraints_number.params["Intercept"]),
    "{0:+.2E}*{1}".format(results_constraints_number.params["dataset_size"], "dataset_size"),
)

In [None]:
# Define the interpolation function.
def interpolation_constraints_number(dataset_size) -> Tuple[float, float, float]:
    """Evaluate the fitted linear model, with a one-standard-error band, at ``dataset_size``.

    The coefficients and their standard errors are read from the module-level
    ``results_constraints_number`` (``params`` and ``bse``).
    The low (resp. high) bound shifts both the intercept and the slope down
    (resp. up) by one standard error before evaluating the line.

    Args:
        dataset_size: Dataset size at which the model is evaluated.

    Returns:
        A ``(low, estimate, high)`` tuple of constraints numbers.
    """
    coefficients = results_constraints_number.params
    standard_errors = results_constraints_number.bse

    intercept = coefficients["Intercept"]
    intercept_error = standard_errors["Intercept"]
    slope = coefficients["dataset_size"]
    slope_error = standard_errors["dataset_size"]

    lower = (intercept - intercept_error) + (slope - slope_error) * dataset_size
    central = intercept + slope * dataset_size
    upper = (intercept + intercept_error) + (slope + slope_error) * dataset_size
    return lower, central, upper

In [None]:
# Create the figure and get its axes.
fig_plot_constraints_number: Figure = plt.figure(figsize=(15, 7.5), dpi=300)
axis_plot_constraints_number = fig_plot_constraints_number.gca()

# Fix the visible range of both axes.
axis_plot_constraints_number.set_xlim(left=0, right=5250)
axis_plot_constraints_number.set_ylim(bottom=0, top=20000)

# Plot the observed constraints numbers: markers only, one style per dataset.
for observed_reference, observed_marker, observed_color in (
    ("bank_cards_v2", "x", "red"),
    ("mlsum_fr_train_subset_v1", "+", "blue"),
):
    observed_rows = df_experiments[df_experiments["dataset_reference"] == observed_reference]
    axis_plot_constraints_number.plot(
        observed_rows["dataset_size"],  # x
        observed_rows["V090v__constraints_total"],  # y
        label="Nombre de contraintes observé pour '{0}'".format(observed_reference),
        marker=observed_marker,
        markerfacecolor=observed_color,
        markersize=5,
        color=observed_color,
        linewidth=0,
        linestyle="",
    )

# Evaluate the model once per abscissa: each item is a (low, estimate, high) tuple.
modeled_sizes = range(0, 5001, 100)
modeled_bands = [interpolation_constraints_number(size) for size in modeled_sizes]

# Plot the modeled constraints number (dashed line) and its band (shaded area).
axis_plot_constraints_number.plot(
    modeled_sizes,  # x
    [band[1] for band in modeled_bands],  # y
    label="Nombre de contraintes modélisé",
    marker="",
    markerfacecolor="purple",
    markersize=3,
    color="purple",
    linewidth=2,
    linestyle="--",
)
axis_plot_constraints_number.fill_between(
    x=modeled_sizes,
    y1=[band[0] for band in modeled_bands],
    y2=[band[2] for band in modeled_bands],
    color="purple",
    alpha=0.2,
)

# Name the axes.
axis_plot_constraints_number.set_xlabel("nombre de données [#]", fontsize=18)
axis_plot_constraints_number.set_ylabel("nombre de contraintes [#]", fontsize=18)

# Add the legend and the grid.
axis_plot_constraints_number.legend(loc="upper left", fontsize=15)
axis_plot_constraints_number.grid(True)

# Store the graph.
fig_plot_constraints_number.savefig(
    "../results/etude-nombre-contraintes-1-modelisation-nombre.png",
    dpi=300,
    transparent=True,
    bbox_inches="tight",
)

-----
## 4. CONCLUSION

In [None]:
# Annotation time.
def estimate_annotation_time(batch_size: int = 50) -> Dict[str, float]:
    """Estimate the time needed to annotate one batch of constraints.

    Each bound is an affine function of the batch size:
    ``intercept + slope * batch_size``.
    NOTE(review): the unit is presumably seconds (the result ends up in the
    "time (seconds)" column of ``display_total_time``) — confirm.

    Args:
        batch_size: Number of constraints annotated per batch. Defaults to 50.

    Returns:
        Dictionary with the keys "min", "mean" and "max".
    """
    # (intercept, slope) of the affine model for each bound.
    affine_coefficients = {
        "min": (95, 6.39),
        "mean": (202, 6.92),
        "max": (309, 7.45),
    }
    return {
        bound: intercept + slope * batch_size
        for bound, (intercept, slope) in affine_coefficients.items()
    }

In [None]:
# Computation time.
def estimate_computation_time(dataset_size: int) -> Dict[str, float]:
    """Estimate the computation time of one iteration for a given dataset size.

    Each bound is a fitted second-degree polynomial of the dataset size.
    The polynomials have a negative intercept, so they evaluate to a negative
    duration for small datasets (below roughly 1,100 items). A negative time
    is meaningless and, once multiplied by a number of iterations, can even
    swap the "min" and "max" bounds, so every bound is clamped at zero.
    NOTE(review): the unit is presumably seconds (the result ends up in the
    "time (seconds)" column of ``display_total_time``) — confirm.

    Args:
        dataset_size: Number of items in the dataset.

    Returns:
        Dictionary with the keys "min", "mean" and "max", all non-negative.
    """
    raw_estimates = {
        "min": (-243 + 0.216 * dataset_size + 1.463*10**(-6) * dataset_size**2),
        "mean": (-239 + 0.217 * dataset_size + 1.464*10**(-6) * dataset_size**2),
        "max": (-235 + 0.218 * dataset_size + 1.465*10**(-6) * dataset_size**2),
    }
    # Clamp at zero: the regression is extrapolated outside its valid range for small datasets.
    return {bound: max(0.0, value) for bound, value in raw_estimates.items()}

In [None]:
# Constraints number.
def estimate_constraints_number(dataset_size: int) -> Dict[str, float]:
    """Estimate the number of constraints needed to converge for a dataset size.

    Each bound is an affine function of the dataset size.

    Args:
        dataset_size: Number of items in the dataset.

    Returns:
        Dictionary with the keys "min", "mean" and "max".
    """
    lower_bound = 219 + 3.01 * dataset_size
    expected = 356 + 3.05 * dataset_size
    upper_bound = 492 + 3.10 * dataset_size
    return {"min": lower_bound, "mean": expected, "max": upper_bound}

In [None]:
# Total time.
def estimate_total_time(dataset_size: int, batch_size: int) -> Dict[str, float]:
    """Estimate the total annotation and computation time needed to converge.

    The number of iterations is the estimated constraints number divided by
    ``batch_size``. The per-batch annotation time and the per-iteration
    computation time are each multiplied by that number, then summed.
    Every bound ("min", "mean", "max") is combined with the same bound of the
    other estimates.

    NOTE(review): combining bound with bound only yields ordered bounds while
    every factor is non-negative — confirm the estimators never go negative.

    Args:
        dataset_size: Number of items in the dataset.
        batch_size: Number of constraints annotated per iteration.

    Returns:
        Dictionary with the keys "total-min", "total", "total-max",
        "annotation-min", "annotation", "annotation-max", "computation-min",
        "computation" and "computation-max", in that order.

    Raises:
        ZeroDivisionError: If ``batch_size`` is 0.
    """
    constraints = estimate_constraints_number(dataset_size)
    annotation_per_batch = estimate_annotation_time(batch_size)
    computation_per_iteration = estimate_computation_time(dataset_size)

    # Suffix appended to the output keys for each bound of the estimators.
    suffixes = {"min": "-min", "mean": "", "max": "-max"}

    totals: Dict[str, float] = {}
    annotations: Dict[str, float] = {}
    computations: Dict[str, float] = {}
    for bound, suffix in suffixes.items():
        iterations = constraints[bound] / batch_size
        annotation_time = annotation_per_batch[bound] * iterations
        computation_time = computation_per_iteration[bound] * iterations
        annotations["annotation" + suffix] = annotation_time
        computations["computation" + suffix] = computation_time
        totals["total" + suffix] = annotation_time + computation_time

    # Merge in the documented key order: totals, then annotation, then computation.
    return {**totals, **annotations, **computations}

In [None]:
# Display total time.
def display_total_time(dataset_size: int, batch_size: int) -> pd.DataFrame:
    """Tabulate the estimated times in several units.

    Args:
        dataset_size: Number of items in the dataset.
        batch_size: Number of constraints annotated per iteration.

    Returns:
        DataFrame indexed by the keys of ``estimate_total_time`` with the
        columns "time (seconds)", "time (minutes)", "time (hours)",
        "time (days)" and "time (work days)". Converted values are rounded to
        two decimals; a work day counts 8 hours.
    """
    estimates = estimate_total_time(dataset_size=dataset_size, batch_size=batch_size)
    table = pd.DataFrame.from_dict(
        data={label: [seconds] for label, seconds in estimates.items()},
        orient="index",
        columns=["time (seconds)"],
    )

    def make_converter(divisors):
        # Build a row-wise converter dividing the seconds by each divisor in turn.
        def convert(row):
            value = row["time (seconds)"]
            for divisor in divisors:
                value = value / divisor
            return round(value, 2)
        return convert

    # Target column -> successive divisors applied to the number of seconds.
    unit_conversions = (
        ("time (minutes)", (60,)),
        ("time (hours)", (60, 60)),
        ("time (days)", (60, 60, 24)),
        ("time (work days)", (60, 60, 8)),
    )
    for column, divisors in unit_conversions:
        table[column] = table.apply(make_converter(divisors), axis=1)
    return table

In [None]:
# Show the estimated times for dataset_size=5000 and batch_size=50.
display_total_time(dataset_size=5000, batch_size=50)

In [None]:
# Show the estimated times for dataset_size=5000 and batch_size=100.
display_total_time(dataset_size=5000, batch_size=100)

In [None]:
# Show the estimated times for dataset_size=5000 and batch_size=150.
display_total_time(dataset_size=5000, batch_size=150)

In [None]:
# Show the estimated times for dataset_size=5000 and batch_size=200.
display_total_time(dataset_size=5000, batch_size=200)

In [None]:
# Create the figure and get its axes.
fig_plot_total_time: Figure = plt.figure(figsize=(15, 7.5), dpi=300)
axis_plot_total_time = fig_plot_total_time.gca()

# Fix the visible range of both axes.
axis_plot_total_time.set_xlim(left=0, right=5050)
axis_plot_total_time.set_ylim(bottom=0, top=140)

# Abscissas shared by every curve.
total_time_sizes = range(0, 5001, 100)

# One curve per batch size: (batch size, color, legend label).
total_time_settings = (
    (50, "red", "Temps total nécessaire en annotant par paquet de   50"),
    (100, "blue", "Temps total nécessaire en annotant par paquet de 100"),
    (150, "green", "Temps total nécessaire en annotant par paquet de 150"),
    (200, "orange", "Temps total nécessaire en annotant par paquet de 200"),
)

for curve_batch_size, curve_color, curve_label in total_time_settings:
    # Evaluate the estimation once per abscissa, then pick the needed bounds.
    curve_estimates = [
        estimate_total_time(dataset_size=size, batch_size=curve_batch_size)
        for size in total_time_sizes
    ]
    # Plot the total time (dashed line), divided twice by 60 to match the "heures" axis.
    axis_plot_total_time.plot(
        total_time_sizes,  # x
        [estimate["total"]/60/60 for estimate in curve_estimates],  # y
        label=curve_label,
        marker="",
        markerfacecolor=curve_color,
        markersize=3,
        color=curve_color,
        linewidth=2,
        linestyle="--",
    )
    # Shade the area between the "total-min" and "total-max" estimates.
    axis_plot_total_time.fill_between(
        x=total_time_sizes,
        y1=[estimate["total-min"]/60/60 for estimate in curve_estimates],
        y2=[estimate["total-max"]/60/60 for estimate in curve_estimates],
        color=curve_color,
        alpha=0.2,
    )

# Name the axes.
axis_plot_total_time.set_xlabel("nombre de données [#]", fontsize=18)
axis_plot_total_time.set_ylabel("temps [heures]", fontsize=18)

# Add the legend and the grid.
axis_plot_total_time.legend(loc="upper left", fontsize=15)
axis_plot_total_time.grid(True)

# Store the graph.
fig_plot_total_time.savefig(
    "../results/etude-temps-total-1-modelisation.png",
    dpi=300,
    transparent=True,
    bbox_inches="tight",
)