# ==== INTERACTIVE CLUSTERING : COMPUTATION TIME STUDY ====
> ### Stage 3 : Model the computation time of Interactive Clustering tasks and plot some figures.

-----

## READ-ME BEFORE RUNNING

### Quick Description

This notebook **aims to model the computation time of interactive clustering over experiments**.
- Environments are represented by subdirectories in the `/experiments` folder. A full path to an experiment environment is `/experiments/[TASK]/[DATASET]/[ALGORITHM]/`.
- Experiments have to be run and evaluated in order to analyze convergency speed.

Before running, **run the notebook `2_Estimate_computation_time.ipynb` to run each algorithm you have set**.

Then, **go to the notebook `4_Plot_some_figures.ipynb` to create figures on interactive clustering computation time**.

### Description of each step

First of all, **load the experiment synthesis CSV file** that has been computed by the previous notebook.
- It contains parameters used for each experiment and convergency metric to compare.
- Several parameters are studied depending on the task:
    - _preprocessing_: `dataset_size`, `algorithm_name`;
    - _vectorization_: `dataset_size`, `algorithm_name`;
    - _sampling_: `dataset_size`, `algorithm_name`, `previous_nb_constraints`, `previous_nb_clusters`, `algorithm_nb_to_select`;
    - _clustering_: `dataset_size`, `algorithm_name`, `previous_nb_constraints`, `algorithm_nb_clusters`.
- Two random effects are used : `dataset_random_seed`, `algorithm_random_seed`.
- One value is modeled with these factors : `time_total`.

Then, for each task :
1. Compute interactions of factors (`1`, `X1`, `X1²`, `X1*X2`, ...)
2. Sort interactions by correlation in order to choose an efficient modelization of computation time
3. Compute GLM to get the modelization parameters
4. Plot modelized computation time

-----

## 1. IMPORT PYTHON DEPENDENCIES

In [None]:
from typing import Dict, List, Optional, Tuple, Union
import numpy as np
import openpyxl
import pandas as pd
from itertools import combinations_with_replacement, permutations
from matplotlib import pyplot as plt
from matplotlib.figure import Figure
import matplotlib.cm as cm
from matplotlib.colors import Normalize
from scipy import stats as scipystats
import statistics
import statsmodels
import statsmodels.api
import statsmodels.formula.api

In [None]:
# Turn on the `SET_USE_BIC_LLF` switch of statsmodels' GLM module before fitting any model.
# NOTE(review): presumably selects the log-likelihood-based BIC (rather than the deviance-based one)
# and silences the related warning — confirm against the statsmodels documentation.
statsmodels.genmod.generalized_linear_model.SET_USE_BIC_LLF(True)

In [None]:
def compute_combinations_of_interactions_of_factors(
    df: pd.DataFrame,
    factors: List[str],
    range_of_powers: Optional[List[int]] = None,
    max_power: int = 3,
) -> Tuple[Dict[str, str], pd.DataFrame]:
    """
        Compute combinations of interactions of factors in dataframe.

        Each interaction is a product of factors raised to some powers. It is stored in `df`
        under a key such as `X1POW2_X3POW1`, where `Xi` is the 1-based position of the factor
        in `factors` and `POWp` is its power; factors with power `0` are left out.

        Args:
            df (pd.DataFrame): Dataframe of results. It is updated in place with one column per interaction.
            factors (List[str]): Columns of the dataframe considered as factors.
            range_of_powers (Optional[List[int]]): Powers that each factor can take. Defaults to `None`, meaning `[0, 1, 2, 3]`.
            max_power (int): Maximum interaction level, i.e. maximum sum of powers. Defaults to `3`.

        Return:
            Tuple[Dict[str, str], pd.DataFrame]: The factors interactions (column key name -> readable formula) and the updated dataframe of results.
    """

    # Resolve the default here rather than in the signature to avoid a shared mutable default.
    if range_of_powers is None:
        range_of_powers = [0, 1, 2, 3]

    # The columns computed (key name -> full name).
    factors_interactions: Dict[str, str] = {}

    # Define combinations of powers.
    for power_combination in combinations_with_replacement(range_of_powers, r=len(factors)):

        # Skip the constant term and the combinations above the maximum interaction level.
        total_power = sum(power_combination)
        if total_power <= 0 or max_power < total_power:
            continue

        # Assign the powers to the factors in every distinct order.
        for powers in sorted(set(permutations(power_combination)), reverse=True):

            # Keep only the factors that actually take part in the interaction.
            active_terms = [
                (position, factor, power)
                for position, (factor, power) in enumerate(zip(factors, powers), start=1)
                if power != 0
            ]

            # Define names (key name and full name) of the factors interaction.
            column_key_name: str = "_".join(
                # Fractional powers would put a "." in the key, which is not valid in a formula term.
                "X{0}POW{1}".format(position, power).replace(".", "_")
                for position, _, power in active_terms
            )
            term_full_names: List[str] = [
                "{0}".format(factor) if power == 1 else "{0}**{1}".format(factor, power)
                for _, factor, power in active_terms
            ]
            column_full_name: str = (
                term_full_names[0]
                if len(term_full_names) == 1
                else " * ".join("({0})".format(name) for name in term_full_names)
            )
            factors_interactions[column_key_name] = column_full_name

            # Store the interaction in a column, computed column-wise (no per-row Python call).
            interaction_values = None
            for _, factor, power in active_terms:
                term_values = np.power(df[factor], power)
                interaction_values = (
                    term_values
                    if interaction_values is None
                    else interaction_values * term_values
                )
            df[column_key_name] = interaction_values

    return factors_interactions, df

In [None]:
def compute_correlation_of_factors(
    df: pd.DataFrame,
    factors_interactions: Dict[str, str],
    algorithm_name: Optional[str],
) -> pd.DataFrame:
    """
        Compute Pearson (standard) correlation coefficient between factors and computation time.

        Args:
            df (pd.DataFrame): Dataframe of results. It must contain `time_total` and, when filtering, `algorithm_name`.
            factors_interactions (Dict[str, str]): Dictionary of columns in dataframe that correspond to computed interaction between factors.
            algorithm_name (Optional[str]): The algorithm name to filter dataframe. Use `None` to keep all rows.

        Return:
            pd.DataFrame: One row per factors interaction, sorted by decreasing absolute correlation to `time_total`, with columns `r` (absolute correlation), `r^2` and `full_name`.
    """
    # Get columns of factors interactions.
    columns_of_factors_interactions: List[str] = list(factors_interactions.keys())

    # Filter data on `algorithm_name`.
    df_subset = (
        df
        if algorithm_name is None
        else df[df["algorithm_name"] == algorithm_name]
    )

    # Exclude columns that are neither the computation time nor a factors interaction.
    kept_columns = df_subset.columns.isin(["time_total"] + columns_of_factors_interactions)

    # Correlation of every kept column to the computation time.
    correlation_to_time = df_subset.loc[:, kept_columns].corr()["time_total"]

    # Keep the strength of the correlation only, strongest first, without the time column itself.
    df_correlation = (
        correlation_to_time
        .abs()
        .sort_values(ascending=False)
        .to_frame(name="r")
        .drop(["time_total"])
    )

    # Compute R^2.
    df_correlation["r^2"] = np.power(df_correlation["r"], 2)

    # Add factors interactions full name in columns (direct lookup, also valid when there is no row).
    df_correlation["full_name"] = [
        factors_interactions[key_name]
        for key_name in df_correlation.index
    ]

    return df_correlation

In [None]:
def compute_information_score_evolution(
    df: pd.DataFrame,
    df_correlation: pd.DataFrame,
    factors_interactions: Dict[str, str],
    algorithm_name: Optional[str],
    with_intercept: bool = True,
    graph_filepath: Optional[str] = None,
    graph_plot_description: Optional[str] = None,
) -> List[Dict]:
    """
        Fit OLS models of growing complexity and plot the evolution of their R².

        Factors are added one at a time, in the order of `df_correlation.index`
        (optionally preceded by the constant term), and a new OLS model is fitted at each step.

        Args:
            df (pd.DataFrame): Dataframe of results.
            df_correlation (pd.DataFrame): The correlation of combination of factors to algorithm computation time. Its index order defines the order in which factors are added.
            factors_interactions (Dict[str, str]): Dictionary of columns in dataframe that correspond to computed interaction between factors.
            algorithm_name (Optional[str]): The algorithm name to filter dataframe. Use `None` to keep all rows.
            with_intercept (bool): The option to add an intercept in the modelization. Defaults to `True`.
            graph_filepath (Optional[str]): The path where to store the information score evolution plot. Defaults to `None`.
            graph_plot_description (Optional[str]): The description of the information score evolution plot. Defaults to `None`.

        Return:
            List[Dict]: One entry per fitted model, with the `"factors"` used and the `"scores"` obtained (`"aic"`, `"bic"`, `"rsquared"`, `"llf"`).
    """

    # Filter data on `algorithm_name` once: the subset is the same for every fitted model.
    df_subset = (
        df
        if algorithm_name is None
        else df[df["algorithm_name"]==algorithm_name]
    )

    # Constant (1) first if requested, then each interaction factor (df_correlation.index).
    ordered_factors: List[str] = list(df_correlation.index)
    if with_intercept:
        ordered_factors = ["1"] + ordered_factors

    information_score_evolution: List[Dict] = []
    factors_key_names: List[str] = []
    factors_full_names: List[str] = []

    for factor in ordered_factors:

        # Compute modelization with previous factors and this new one.
        factors_key_names.append(factor)
        # Anything that is not a known interaction (i.e. the constant "1") is labelled "intercept".
        factors_full_names.append(factors_interactions.get(factor, "intercept"))
        model = statsmodels.formula.api.ols(
            # The formula starts with "0 +" so the constant is only present when "1" is listed explicitly.
            formula="time_total ~ 0 + " + " + ".join(factors_key_names),
            data=df_subset,
        )
        result = model.fit()

        # Store results.
        information_score_evolution.append({
            "factors": factors_key_names.copy(),
            "scores": {"aic": result.aic, "bic": result.bic, "rsquared": result.rsquared, "llf": result.llf}
        })

    # One x position per fitted model, labelled by its number of factors.
    x_positions: List[str] = [
        str(step + 1)
        for step in range(len(information_score_evolution))
    ]

    # Create a new figure.
    fig_plot: Figure = plt.figure(figsize=(15, 7.5), dpi=50)
    axis_plot = fig_plot.gca()

    # Plot information criteria.
    axis_plot.plot(
        x_positions,  # x
        [entry["scores"]["rsquared"] for entry in information_score_evolution],  # y
        label=(
            "R²"
            if graph_plot_description is None
            else graph_plot_description
        ),
        marker="x",
        markerfacecolor="red",
        markersize=3,
        color="red",
        linewidth=1,
        linestyle="-",
    )

    # Plot x label names.
    axis_plot.set_xticks(
        x_positions,
        factors_full_names,
        rotation=30,
        ha='right'
    )

    # Set axis name.
    axis_plot.set_xlabel("complexité de la modélisation [# de facteurs d'analyse]", fontsize=18,)
    axis_plot.set_ylabel("score du critère d'informations [%]", fontsize=18,)

    # Set range of axis.
    axis_plot.set_ylim(ymin=0, ymax=1)

    # Plot the legend.
    axis_plot.legend(
        loc="lower right",
        fontsize=15,
    )

    # Plot the grid.
    axis_plot.grid(True)

    # Store the graph.
    if graph_filepath is not None:
        fig_plot.savefig(
            graph_filepath,
            dpi=300,
            transparent=True,
            bbox_inches="tight",
        )

    return information_score_evolution

In [None]:
# Get best GLM model:
def compare_glm_models(
    df: pd.DataFrame,
    algorithm_name: Optional[str],
    formulas: List[str],
) -> pd.DataFrame:
    """
        Compare GLM models.

        Args:
            df (pd.DataFrame): Dataframe of results.
            algorithm_name (Optional[str]): The algorithm name to filter dataframe. Use `None` to keep all rows.
            formulas (List[str]): The list of formulas used to train GLM models.

        Return:
            pd.DataFrame: One row per formula, in the given order, with the degree of freedom, the pseudo R² and the log-likelihood of the fitted GLM model.
    """
    score_columns: List[str] = ["formula", "degree of freedom", "pseudo R² (Cox-Snell)", "log-likelihood"]

    # Filter data on `algorithm_name` once: the subset is the same for every formula.
    df_subset = (
        df
        if algorithm_name is None
        else df[df["algorithm_name"]==algorithm_name]
    )

    # Collect one record per formula and build the dataframe once at the end,
    # instead of concatenating onto an initially empty dataframe at each iteration.
    score_rows: List[Dict] = []
    for formula in formulas:
        # Fit the model to the data.
        results = statsmodels.formula.api.glm(
            formula=formula,
            data=df_subset,
        ).fit()
        # Store results.
        score_rows.append({
            "formula": formula,
            "degree of freedom": results.df_model,
            "pseudo R² (Cox-Snell)": results.pseudo_rsquared(),
            "log-likelihood": results.llf,
        })

    # Return results (the explicit columns keep the header when `formulas` is empty).
    return pd.DataFrame(score_rows, columns=score_columns)

-----

## 2.1. ANALYSIS FOR PREPROCESSING

> - algorithms: `simple_prep`, `lemma_prep`, `filter_prep`
> - factors: `dataset_size`

In [None]:
# Load the preprocessing synthesis: ";"-separated CSV whose first column is used as index.
df_preprocessing = pd.read_csv("../results/experiments_synthesis_for_preprocessing.csv", sep=";", index_col=0)
# `time_total` is handled as text with a decimal comma: switch to a decimal point, then cast to float.
df_preprocessing["time_total"] = df_preprocessing["time_total"].str.replace(",", ".").astype(float)
# Preview the first rows (cell output).
df_preprocessing.head()

In [None]:
# Compute interaction of factors for preprocessing.
# With the single factor `dataset_size` and powers 0..2, this adds the columns `X1POW1` and `X1POW2`.
factors_interactions_for_preprocessing, df_preprocessing = compute_combinations_of_interactions_of_factors(
    df=df_preprocessing,
    factors=["dataset_size"],
    range_of_powers=[0, 1, 2],
    max_power=3,
)
# Show the mapping "column key name -> readable formula" (cell output).
factors_interactions_for_preprocessing

### 2.1.0. Determine if preprocessing computation time is `algorithm_name` dependent.

In [None]:
# Fit a GLM explaining the computation time by the algorithm name only (categorical factor plus intercept),
# using all preprocessing rows, and print its summary.
# No `family` argument is passed, so the default family of statsmodels applies.
model_prep_ALGONAME = statsmodels.formula.api.glm(
    formula="time_total ~ 1 + C(algorithm_name)",
    data=df_preprocessing,
)
results_prep_ALGONAME = model_prep_ALGONAME.fit()
print(results_prep_ALGONAME.summary())

### 2.1.1. Modelize all preprocessing `algorithm_name`.

In [None]:
# Compute correlation of factors for preprocessing (all algorithms together: no filter on `algorithm_name`).
df_correlation_preprocessing: pd.DataFrame = compute_correlation_of_factors(
    df=df_preprocessing,
    factors_interactions=factors_interactions_for_preprocessing,
    algorithm_name=None,
)
# Fit OLS models of growing complexity, then plot and store the evolution of their R².
information_score_evolution_for_preprocessing = compute_information_score_evolution(
    df=df_preprocessing,
    df_correlation=df_correlation_preprocessing,
    factors_interactions=factors_interactions_for_preprocessing,
    algorithm_name=None,
    graph_filepath="../results/etude-temps-calcul-analyse-facteurs-1prep.png",
    graph_plot_description="R² pour modéliser les prétraitements 'prep.simple', 'prep.lemma', 'prep.filter'",
)
# Show the correlation table (cell output).
df_correlation_preprocessing

In [None]:
# Compare several GLM models to get the best one (lower deviance, maximum log-likelihood):
# constant only, `dataset_size` without intercept, `dataset_size` with intercept.
df_scores_prep = compare_glm_models(
    df=df_preprocessing,
    algorithm_name=None,
    formulas=[
        "time_total ~ 1",
        "time_total ~ 0 + X1POW1",
        "time_total ~ 1 + X1POW1",
    ],
)
# Show the comparison table (cell output).
df_scores_prep

> `preprocessing ~ 1 + dataset_size`

In [None]:
# Fit the retained model (intercept + dataset_size) on all preprocessing rows and print its summary.
# `best_results_prep` is read later by `interpolation_prep`.
best_model_prep = statsmodels.formula.api.glm(
    formula="time_total ~ 1 + X1POW1",  # X1POW1=dataset_size
    data=df_preprocessing,
)
best_results_prep = best_model_prep.fit()
print("==============================================================================")
print(">>> formula:", best_model_prep.formula)
print("==============================================================================")
print("   ", best_results_prep.summary())

In [None]:
# Print the modelization as a readable formula: fitted coefficients in scientific notation (2 decimals),
# followed by the readable name of each factor.
print(
    "preprocessing ~",
    "{0:.2E}".format(best_results_prep.params["Intercept"]),
    "+ {0:.2E}*{1}".format(best_results_prep.params["X1POW1"], factors_interactions_for_preprocessing["X1POW1"]),
)

In [None]:
# Define the interpolation function.
def interpolation_prep(dataset_size) -> Tuple[float, float, float]:
    """
        Evaluate the fitted preprocessing model `Intercept + X1POW1 * dataset_size`.

        The coefficients and their `bse` values are read from the module-level `best_results_prep`.

        Args:
            dataset_size: The dataset size at which the model is evaluated.

        Return:
            Tuple[float, float, float]: The low estimate (each coefficient minus its `bse`), the estimate with the fitted coefficients, and the high estimate (each coefficient plus its `bse`).
    """
    coefficients = best_results_prep.params
    deviations = best_results_prep.bse

    # Constant term and its deviation.
    intercept = coefficients["Intercept"]
    intercept_deviation = deviations["Intercept"]

    # Term proportional to dataset_size and its deviation.
    slope = coefficients["X1POW1"]
    slope_deviation = deviations["X1POW1"]

    estimate_low = (intercept - intercept_deviation) + (slope - slope_deviation) * dataset_size
    estimate = intercept + slope * dataset_size
    estimate_high = (intercept + intercept_deviation) + (slope + slope_deviation) * dataset_size
    return estimate_low, estimate, estimate_high

### 2.1.2. Print the preprocessing computation time model.

In [None]:
# Create a new figure.
fig_plot_preprocessing: Figure = plt.figure(figsize=(15, 7.5), dpi=300)
axis_plot_preprocessing = fig_plot_preprocessing.gca()

# Set range of axis.
axis_plot_preprocessing.set_xlim(xmin=0, xmax=5500)
axis_plot_preprocessing.set_ylim(ymin=0, ymax=35)

# Plot observed computation time for preprocessing (markers only, no connecting line).
axis_plot_preprocessing.plot(
    df_preprocessing["dataset_size"],  # x
    df_preprocessing["time_total"],  # y
    label="Temps de calcul observé des prétraitements 'prep.simple', 'prep.lemma', 'prep.filter'",
    marker="x",
    markerfacecolor="red",
    markersize=3,
    color="red",
    linewidth=0,
    linestyle="",
)
# Plot modelized computation time (central estimate), evaluated every 100 data from 0 to 5400.
axis_plot_preprocessing.plot(
    range(0, 5500, 100),  # x
    [
        interpolation_prep(x)[1]
        for x in range(0, 5500, 100)
    ],  # y
    label="Temps de calcul modélisé des prétraitements 'prep.simple', 'prep.lemma', 'prep.filter'",
    marker="",
    markerfacecolor="red",
    markersize=3,
    color="red",
    linewidth=2,
    linestyle="--",
)
# Shade the area between the low and high estimates of the model.
axis_plot_preprocessing.fill_between(
    x=range(0, 5500, 100),  # x
    y1=[
        interpolation_prep(x)[0]
        for x in range(0, 5500, 100)
    ],  # y1
    y2=[
        interpolation_prep(x)[2]
        for x in range(0, 5500, 100)
    ],  # y2
    color="red",
    alpha=0.2,
)

# Set axis name.
axis_plot_preprocessing.set_xlabel("nombre de données [#]", fontsize=18,)
axis_plot_preprocessing.set_ylabel("temps de calcul [secondes]", fontsize=18,)

# Plot the legend.
axis_plot_preprocessing.legend(
    loc="upper left",
    fontsize=15,
)

# Plot the grid.
axis_plot_preprocessing.grid(True)

# Store the graph.
fig_plot_preprocessing.savefig(
    "../results/etude-temps-calcul-modelisation-1prep.png",
    dpi=300,
    transparent=True,
    bbox_inches="tight",
)

----------

## 2.2. ANALYSIS FOR VECTORIZATION

> - algorithms: `tfidf`, `spacy`
> - factors: `dataset_size`

In [None]:
# Load the vectorization synthesis: ";"-separated CSV whose first column is used as index.
df_vectorization = pd.read_csv("../results/experiments_synthesis_for_vectorization.csv", sep=";", index_col=0)
# `time_total` is handled as text with a decimal comma: switch to a decimal point, then cast to float.
df_vectorization["time_total"] = df_vectorization["time_total"].str.replace(",", ".").astype(float)
# Preview the first rows (cell output).
df_vectorization.head()

In [None]:
# Compute interaction of factors for vectorization.
# With the single factor `dataset_size` and powers 0..2, this adds the columns `X1POW1` and `X1POW2`.
factors_interactions_for_vectorization, df_vectorization = compute_combinations_of_interactions_of_factors(
    df=df_vectorization,
    factors=["dataset_size"],
    range_of_powers=[0, 1, 2],
    max_power=3
)
# Show the mapping "column key name -> readable formula" (cell output).
factors_interactions_for_vectorization

### 2.2.0. Determine if vectorization computation time is `algorithm_name` dependent.

In [None]:
# Fit a GLM explaining the computation time by the algorithm name only (categorical factor),
# using all vectorization rows, and print its summary.
# No `family` argument is passed, so the default family of statsmodels applies.
model_vect_ALGONAME = statsmodels.formula.api.glm(
    formula="time_total ~ C(algorithm_name)",
    data=df_vectorization,
)
results_vect_ALGONAME = model_vect_ALGONAME.fit()
print(results_vect_ALGONAME.summary())

### 2.2.1 Modelize `tfidf` vectorization `algorithm_name`.

In [None]:
# Compute correlation of factors for vectorization, restricted to the rows of the `tfidf` algorithm.
df_correlation_vectorization_tfidf: pd.DataFrame = compute_correlation_of_factors(
    df=df_vectorization,
    factors_interactions=factors_interactions_for_vectorization,
    algorithm_name="tfidf",
)
# Fit OLS models of growing complexity, then plot and store the evolution of their R².
information_score_evolution_for_vectorization_tfidf = compute_information_score_evolution(
    df=df_vectorization,
    df_correlation=df_correlation_vectorization_tfidf,
    factors_interactions=factors_interactions_for_vectorization,
    algorithm_name="tfidf",
    graph_filepath="../results/etude-temps-calcul-analyse-facteurs-2vect-tfidf.png",
    graph_plot_description="R² pour modéliser la vectorisation 'vect.tfidf'",
)
# Show the correlation table (cell output).
df_correlation_vectorization_tfidf

In [None]:
# Compare several GLM models to get the best one (lower deviance, maximum log-likelihood):
# constant only, `dataset_size` without intercept, `dataset_size` with intercept.
df_scores_vectorization_tfidf = compare_glm_models(
    df=df_vectorization,
    algorithm_name="tfidf",
    formulas=[
        "time_total ~ 1",
        "time_total ~ 0 + X1POW1",
        "time_total ~ 1 + X1POW1",
    ],
)
# Show the comparison table (cell output).
df_scores_vectorization_tfidf

> `vectorization.tfidf ~ 1 + dataset_size`

In [None]:
# Fit the retained model (intercept + dataset_size) on the `tfidf` rows only and print its summary.
# `best_results_vect_tfidf` is read later by `interpolation_vect_tfidf`.
best_model_vect_tfidf = statsmodels.formula.api.glm(
    formula="time_total ~ 1 + X1POW1",  # X1POW1 = dataset_size
    data=df_vectorization[df_vectorization["algorithm_name"]=="tfidf"],
)
best_results_vect_tfidf = best_model_vect_tfidf.fit()
print("==============================================================================")
print(">>> formula:", best_model_vect_tfidf.formula)
print("==============================================================================")
print(best_results_vect_tfidf.summary())

In [None]:
# Print the modelization as a readable formula: fitted coefficients in scientific notation (2 decimals),
# followed by the readable name of each factor.
print(
    "vectorisation.tfidf ~",
    "{0:.2E}".format(best_results_vect_tfidf.params["Intercept"]),
    "+ {0:.2E}*{1}".format(best_results_vect_tfidf.params["X1POW1"], factors_interactions_for_vectorization["X1POW1"])
)

In [None]:
# Define the interpolation function.
def interpolation_vect_tfidf(dataset_size) -> Tuple[float, float, float]:
    """
        Evaluate the fitted `tfidf` vectorization model `Intercept + X1POW1 * dataset_size`.

        The coefficients and their `bse` values are read from the module-level `best_results_vect_tfidf`.

        Args:
            dataset_size: The dataset size at which the model is evaluated.

        Return:
            Tuple[float, float, float]: The low estimate (each coefficient minus its `bse`), the estimate with the fitted coefficients, and the high estimate (each coefficient plus its `bse`).
    """
    coefficients = best_results_vect_tfidf.params
    deviations = best_results_vect_tfidf.bse

    # Constant term and its deviation.
    intercept = coefficients["Intercept"]
    intercept_deviation = deviations["Intercept"]

    # Term proportional to dataset_size and its deviation.
    slope = coefficients["X1POW1"]
    slope_deviation = deviations["X1POW1"]

    estimate_low = (intercept - intercept_deviation) + (slope - slope_deviation) * dataset_size
    estimate = intercept + slope * dataset_size
    estimate_high = (intercept + intercept_deviation) + (slope + slope_deviation) * dataset_size
    return estimate_low, estimate, estimate_high

### 2.2.2. Modelize `frcorenewsmd` vectorization `algorithm_name` (named `spacy` in the data).

In [None]:
# Compute correlation of factors for vectorization, restricted to the rows whose `algorithm_name` is `spacy`
# (reported as 'frcorenewsmd' in variable names and figures).
df_correlation_vectorization_frcorenewsmd: pd.DataFrame = compute_correlation_of_factors(
    df=df_vectorization,
    factors_interactions=factors_interactions_for_vectorization,
    algorithm_name="spacy",
)
# Fit OLS models of growing complexity, then plot and store the evolution of their R².
information_score_evolution_for_vectorization_frcorenewsmd = compute_information_score_evolution(
    df=df_vectorization,
    df_correlation=df_correlation_vectorization_frcorenewsmd,
    factors_interactions=factors_interactions_for_vectorization,
    algorithm_name="spacy",
    graph_filepath="../results/etude-temps-calcul-analyse-facteurs-2vect-frcorenewsmd.png",
    graph_plot_description="R² pour modéliser la vectorisation 'vect.frcorenewsmd'",
)
# Show the correlation table (cell output).
df_correlation_vectorization_frcorenewsmd

In [None]:
# Compare several GLM models to get the best one (lower deviance, maximum log-likelihood):
# constant only, `dataset_size` without intercept, `dataset_size` with intercept.
df_scores_vectorization_frcorenewsmd = compare_glm_models(
    df=df_vectorization,
    algorithm_name="spacy",
    formulas=[
        "time_total ~ 1",
        "time_total ~ 0 + X1POW1",
        "time_total ~ 1 + X1POW1",
    ],
)
# Show the comparison table (cell output).
df_scores_vectorization_frcorenewsmd

> `vectorization.spacy ~ 1 + dataset_size`

In [None]:
# Fit the retained model (intercept + dataset_size) on the `spacy` rows only and print its summary.
# `best_results_vect_frcorenewsmd` is read later by `interpolation_vect_frcorenewsmd`.
best_model_vect_frcorenewsmd = statsmodels.formula.api.glm(
    formula="time_total ~ 1 + X1POW1",  # X1POW1 = dataset_size
    data=df_vectorization[df_vectorization["algorithm_name"]=="spacy"],
)
best_results_vect_frcorenewsmd = best_model_vect_frcorenewsmd.fit()
print("==============================================================================")
print(">>> formula:", best_model_vect_frcorenewsmd.formula)
print("==============================================================================")
print(best_results_vect_frcorenewsmd.summary())

In [None]:
# Print the modelization as a readable formula: fitted coefficients in scientific notation (2 decimals),
# followed by the readable name of each factor.
print(
    "vectorization.frcorenewsmd ~",
    "{0:.2E}".format(best_results_vect_frcorenewsmd.params["Intercept"]),
    "+ {0:.2E}*{1}".format(best_results_vect_frcorenewsmd.params["X1POW1"], factors_interactions_for_vectorization["X1POW1"])
)

In [None]:
# Define the interpolation function.
def interpolation_vect_frcorenewsmd(dataset_size) -> Tuple[float, float, float]:
    """
        Evaluate the fitted `frcorenewsmd` vectorization model `Intercept + X1POW1 * dataset_size`.

        The coefficients and their `bse` values are read from the module-level `best_results_vect_frcorenewsmd`.

        Args:
            dataset_size: The dataset size at which the model is evaluated.

        Return:
            Tuple[float, float, float]: The low estimate (each coefficient minus its `bse`), the estimate with the fitted coefficients, and the high estimate (each coefficient plus its `bse`).
    """
    coefficients = best_results_vect_frcorenewsmd.params
    deviations = best_results_vect_frcorenewsmd.bse

    # Constant term and its deviation.
    intercept = coefficients["Intercept"]
    intercept_deviation = deviations["Intercept"]

    # Term proportional to dataset_size and its deviation.
    slope = coefficients["X1POW1"]
    slope_deviation = deviations["X1POW1"]

    estimate_low = (intercept - intercept_deviation) + (slope - slope_deviation) * dataset_size
    estimate = intercept + slope * dataset_size
    estimate_high = (intercept + intercept_deviation) + (slope + slope_deviation) * dataset_size
    return estimate_low, estimate, estimate_high

### 2.2.3. Print all vectorization computation time model.

In [None]:
# Create a new figure.
fig_plot_vectorization: Figure = plt.figure(figsize=(15, 7.5), dpi=300)
axis_plot_vectorization = fig_plot_vectorization.gca()

# Set range of axis.
axis_plot_vectorization.set_xlim(xmin=0, xmax=5500)
axis_plot_vectorization.set_ylim(ymin=0, ymax=25)

# Plot observed computation time for tfidf (markers only, no connecting line).
axis_plot_vectorization.plot(
    df_vectorization[df_vectorization["algorithm_name"]=="tfidf"]["dataset_size"],  # x
    df_vectorization[df_vectorization["algorithm_name"]=="tfidf"]["time_total"],  # y
    label="Temps de calcul observé de la vectorisation 'vect.tfidf'",
    marker="x",
    markerfacecolor="red",
    markersize=3,
    color="red",
    linewidth=0,
    linestyle="",
)
# Plot modelized computation time for tfidf (central estimate), evaluated every 100 data from 0 to 5400.
axis_plot_vectorization.plot(
    range(0, 5500, 100),  # x
    [
        interpolation_vect_tfidf(x)[1]
        for x in range(0, 5500, 100)
    ],  # y
    label="Temps de calcul modélisé de la vectorisation 'vect.tfidf'",
    marker="",
    markerfacecolor="red",
    markersize=3,
    color="red",
    linewidth=2,
    linestyle="--",
)
# Shade the area between the low and high estimates of the tfidf model.
axis_plot_vectorization.fill_between(
    x=range(0, 5500, 100),  # x
    y1=[
        interpolation_vect_tfidf(x)[0]
        for x in range(0, 5500, 100)
    ],  # y1
    y2=[
        interpolation_vect_tfidf(x)[2]
        for x in range(0, 5500, 100)
    ],  # y2
    color="red",
    alpha=0.2,
)

# Plot observed computation time for frcorenewsmd (rows whose `algorithm_name` is "spacy"; markers only).
axis_plot_vectorization.plot(
    df_vectorization[df_vectorization["algorithm_name"]=="spacy"]["dataset_size"],  # x
    df_vectorization[df_vectorization["algorithm_name"]=="spacy"]["time_total"],  # y
    label="Temps de calcul observé de la vectorisation 'vect.frcorenewsmd'",
    marker="x",
    markerfacecolor="blue",
    markersize=3,
    color="blue",
    linewidth=0,
    linestyle="",
)
# Plot modelized computation time for frcorenewsmd (central estimate), evaluated every 100 data from 0 to 5400.
axis_plot_vectorization.plot(
    range(0, 5500, 100),  # x
    [
        interpolation_vect_frcorenewsmd(x)[1]
        for x in range(0, 5500, 100)
    ],  # y
    label="Temps de calcul modélisé de la vectorisation 'vect.frcorenewsmd'",
    marker="",
    markerfacecolor="blue",
    markersize=3,
    color="blue",
    linewidth=2,
    linestyle="--",
)
# Shade the area between the low and high estimates of the frcorenewsmd model.
axis_plot_vectorization.fill_between(
    x=range(0, 5500, 100),  # x
    y1=[
        interpolation_vect_frcorenewsmd(x)[0]
        for x in range(0, 5500, 100)
    ],  # y1
    y2=[
        interpolation_vect_frcorenewsmd(x)[2]
        for x in range(0, 5500, 100)
    ],  # y2
    color="blue",
    alpha=0.2,
)

# Set axis name.
axis_plot_vectorization.set_xlabel("nombre de données [#]", fontsize=18,)
axis_plot_vectorization.set_ylabel("temps de calcul [secondes]", fontsize=18,)

# Plot the legend.
axis_plot_vectorization.legend(
    loc="upper left",
    fontsize=15,
)

# Plot the grid.
axis_plot_vectorization.grid(True)

# Store the graph.
fig_plot_vectorization.savefig(
    "../results/etude-temps-calcul-modelisation-2vect.png",
    dpi=300,
    transparent=True,
    bbox_inches="tight",
)

----------

## 2.3. ANALYSIS FOR CLUSTERING

> - algorithms: `kmeans_COP`, `hier_single`, `hier_complete`, `hier_average`, `hier_ward`, `spectral_SPEC`
> - factors: `dataset_size`, `previous_nb_constraints`, `algorithm_nb_clusters`

In [None]:
# Load the clustering synthesis: ";"-separated CSV whose first column is used as index.
df_clustering = pd.read_csv("../results/experiments_synthesis_for_clustering.csv", sep=";", index_col=0)
# `time_total` is handled as text with a decimal comma: switch to a decimal point, then cast to float.
df_clustering["time_total"] = df_clustering["time_total"].str.replace(",", ".").astype(float)
# Preview the first rows (cell output).
df_clustering.head()

In [None]:
# Compute interaction of factors for clustering.
# Three factors (X1=dataset_size, X2=previous_nb_constraints, X3=algorithm_nb_clusters),
# each with a power in 0..2 and a total power of at most 3.
factors_interactions_for_clustering, df_clustering = compute_combinations_of_interactions_of_factors(
    df=df_clustering,
    factors=["dataset_size", "previous_nb_constraints", "algorithm_nb_clusters"],
    range_of_powers=[0, 1, 2],
    max_power=3,
)
# Show the mapping "column key name -> readable formula" (cell output).
factors_interactions_for_clustering

### 2.3.0. Determine if clustering computation time is `algorithm_name` dependent.

In [None]:
# Fit a GLM explaining the computation time by the algorithm name only (categorical factor),
# using all clustering rows, and print its summary.
# No `family` argument is passed, so the default family of statsmodels applies.
model_clust_ALGONAME = statsmodels.formula.api.glm(
    formula="time_total ~ C(algorithm_name)",
    data=df_clustering,
)
results_clust_ALGONAME = model_clust_ALGONAME.fit()
print(results_clust_ALGONAME.summary())

### 2.3.1. Modelize `kmeans_COP` clustering `algorithm_name`.

In [None]:
# Compute correlation of factors for clustering, restricted to the rows of the `kmeans_COP` algorithm.
df_correlation_clustering_kmeans_COP: pd.DataFrame = compute_correlation_of_factors(
    df=df_clustering,
    factors_interactions=factors_interactions_for_clustering,
    algorithm_name="kmeans_COP",
)
# Fit OLS models of growing complexity, then plot and store the evolution of their R².
information_score_evolution_for_clustering_kmeans_COP = compute_information_score_evolution(
    df=df_clustering,
    df_correlation=df_correlation_clustering_kmeans_COP,
    factors_interactions=factors_interactions_for_clustering,
    algorithm_name="kmeans_COP",
    graph_filepath="../results/etude-temps-calcul-analyse-facteurs-3clust-kmeans-cop.png",
    graph_plot_description="R² pour modéliser le clustering 'clust.kmeans.cop'",
)
# Show the correlation table (cell output).
df_correlation_clustering_kmeans_COP

In [None]:
# Compare several GLM models to get the best one (lower deviance, maximum log-likelihood):
# constant only, intercept + dataset_size, intercept + (dataset_size**2)*(algorithm_nb_clusters).
df_scores_clustering_kmeans_COP = compare_glm_models(
    df=df_clustering,
    algorithm_name="kmeans_COP",
    formulas=[
        "time_total ~ 1",
        "time_total ~ 1 + X1POW1",
        "time_total ~ 1 + X1POW2_X3POW1",
    ],
)
# Show the comparison table (cell output).
df_scores_clustering_kmeans_COP

> `clustering.kmeans_COP ~ 1 + dataset_size`

In [None]:
# Fit the retained model (intercept + dataset_size) on the `kmeans_COP` rows only and print its summary.
# `best_results_clust_kmeans_COP` is read later by `interpolation_clust_kmeans_COP`.
best_model_clust_kmeans_COP = statsmodels.formula.api.glm(
    formula="time_total ~ 1 + X1POW1",  # X1POW1 = dataset_size (the X1POW2_X3POW1 term is not part of this formula)
    data=df_clustering[df_clustering["algorithm_name"]=="kmeans_COP"],
)
best_results_clust_kmeans_COP = best_model_clust_kmeans_COP.fit()
print("==============================================================================")
print(">>> formula:", best_model_clust_kmeans_COP.formula)
print("==============================================================================")
print(best_results_clust_kmeans_COP.summary())

In [None]:
# Print the modelization as a readable formula: fitted coefficients in scientific notation (2 decimals),
# followed by the readable name of each factor.
print(
    "clustering.kmeans.cop ~",
    "{0:.2E}".format(best_results_clust_kmeans_COP.params["Intercept"]),
    "+ {0:.2E}*{1}".format(best_results_clust_kmeans_COP.params["X1POW1"], factors_interactions_for_clustering["X1POW1"])
    # Disabled: term for X1POW2_X3POW1, which is not in the fitted formula.
    # "+ {0:.2E}(+/-{1:.2E})*{2}".format(best_results_clust_kmeans_COP.params["X1POW2_X3POW1"], best_results_clust_kmeans_COP.bse["X1POW2_X3POW1"], factors_interactions_for_clustering["X1POW2_X3POW1"])
)

In [None]:
# Define the interpolation function.
def interpolation_clust_kmeans_COP(dataset_size, previous_nb_constraints, algorithm_nb_clusters) -> Tuple[float, float, float]:
    """
    Estimate the `kmeans_COP` clustering computation time with the fitted GLM.

    The estimation is `Intercept + X1POW1 * dataset_size`. The low and high
    estimations evaluate the same expression with every coefficient shifted by
    minus or plus its standard error (`bse`).

    Args:
        dataset_size: The size of the dataset.
        previous_nb_constraints: Not read by this model.
        algorithm_nb_clusters: Not read by this model (only involved in the disabled interaction term).

    Returns:
        Tuple[float, float, float]: The low, central and high estimations.
    """
    coefficients = best_results_clust_kmeans_COP.params
    standard_errors = best_results_clust_kmeans_COP.bse

    # Coefficient and standard error of each term kept in the model.
    intercept, intercept_error = coefficients["Intercept"], standard_errors["Intercept"]
    slope, slope_error = coefficients["X1POW1"], standard_errors["X1POW1"]

    # The (dataset_size**2)*(algorithm_nb_clusters) interaction is not part of the kept model: it contributes nothing.
    disabled_interaction = 0

    low = sum(((intercept - intercept_error), (slope - slope_error) * dataset_size, disabled_interaction))
    central = sum((intercept, slope * dataset_size, disabled_interaction))
    high = sum(((intercept + intercept_error), (slope + slope_error) * dataset_size, disabled_interaction))
    return low, central, high

### 2.3.2. Modelize `hier_single` clustering `algorithm_name`.

In [None]:
# Compute correlation of factors for clustering, restricted to the `hier_single` algorithm.
df_correlation_clustering_hier_single: pd.DataFrame = compute_correlation_of_factors(
    df=df_clustering,
    factors_interactions=factors_interactions_for_clustering,
    algorithm_name="hier_single",
)
# Compute the evolution of the information score (R², per the plot description) with model complexity.
# NOTE(review): the graph is presumably stored at `graph_filepath` — confirm in `compute_information_score_evolution`.
information_score_evolution_for_clustering_hier_single = compute_information_score_evolution(
    df=df_clustering,
    df_correlation=df_correlation_clustering_hier_single,
    factors_interactions=factors_interactions_for_clustering,
    algorithm_name="hier_single",
    graph_filepath="../results/etude-temps-calcul-analyse-facteurs-3clust-hier-sing.png",
    graph_plot_description="R² pour modéliser le clustering 'clust.hier.sing'",
)
# Display the correlation table as the cell output.
df_correlation_clustering_hier_single

In [None]:
# Compare candidate GLM formulas for `hier_single`: the model to keep is the one
# with the lowest deviance and the highest log-likelihood.
df_scores_clustering_hier_sing = compare_glm_models(
    df=df_clustering,
    algorithm_name="hier_single",
    formulas=[
        "time_total ~ 1",  # Intercept only (baseline).
        "time_total ~ 1 + X1POW2",  # Intercept + dataset_size**2.
        "time_total ~ 1 + X1POW2_X3POW1",  # Intercept + (dataset_size**2)*(algorithm_nb_clusters).
    ],
)
# Display the comparison table as the cell output.
df_scores_clustering_hier_sing

> `clustering.hier_sing ~ 1 + dataset_size**2`

In [None]:
# Fit the selected GLM on the `hier_single` experiments only, then print the fit summary.
# No `family` is given, so the statsmodels default family is used.
best_model_clust_hier_single = statsmodels.formula.api.glm(
    formula="time_total ~ 1 + X1POW2",  # X1POW2 is dataset_size**2.
    data=df_clustering[df_clustering["algorithm_name"]=="hier_single"],
)
# The fitted results are kept in a global: they are read by the printing and interpolation cells below.
best_results_clust_hier_single = best_model_clust_hier_single.fit()
print(best_results_clust_hier_single.summary())

In [None]:
# Print the fitted `hier_single` model as a readable equation:
# "<intercept> + <coefficient>*<description of the dataset_size**2 factor>".
print(
    "clustering.hier.sing ~",
    f'{best_results_clust_hier_single.params["Intercept"]:.2E}',
    f'+ {best_results_clust_hier_single.params["X1POW2"]:.2E}*{factors_interactions_for_clustering["X1POW2"]}',
)

In [None]:
# Define the interpolation function.
def interpolation_clust_hier_single(dataset_size, previous_nb_constraints, algorithm_nb_clusters) -> Tuple[float, float, float]:
    """
    Estimate the `hier_single` clustering computation time with the fitted GLM.

    The estimation is `Intercept + X1POW2 * dataset_size**2`. The low and high
    estimations evaluate the same expression with every coefficient shifted by
    minus or plus its standard error (`bse`).

    Args:
        dataset_size: The size of the dataset.
        previous_nb_constraints: Not read by this model (only involved in the disabled interaction term).
        algorithm_nb_clusters: Not read by this model.

    Returns:
        Tuple[float, float, float]: The low, central and high estimations.
    """
    coefficients = best_results_clust_hier_single.params
    standard_errors = best_results_clust_hier_single.bse

    # Coefficient and standard error of each term kept in the model.
    intercept, intercept_error = coefficients["Intercept"], standard_errors["Intercept"]
    slope, slope_error = coefficients["X1POW2"], standard_errors["X1POW2"]
    squared_size = np.power(dataset_size, 2)

    # The dataset_size*previous_nb_constraints interaction is not part of the kept model: it contributes nothing.
    disabled_interaction = 0.0

    low = sum(((intercept - intercept_error), (slope - slope_error) * squared_size, disabled_interaction))
    central = sum((intercept, slope * squared_size, disabled_interaction))
    high = sum(((intercept + intercept_error), (slope + slope_error) * squared_size, disabled_interaction))
    return low, central, high

### 2.3.3. Modelize `hier_complete` clustering `algorithm_name`.

In [None]:
# Compute correlation of factors for clustering, restricted to the `hier_complete` algorithm.
df_correlation_clustering_hier_complete: pd.DataFrame = compute_correlation_of_factors(
    df=df_clustering,
    factors_interactions=factors_interactions_for_clustering,
    algorithm_name="hier_complete",
)
# Compute the evolution of the information score (R², per the plot description) with model complexity.
# NOTE(review): the graph is presumably stored at `graph_filepath` — confirm in `compute_information_score_evolution`.
information_score_evolution_for_clustering_hier_complete = compute_information_score_evolution(
    df=df_clustering,
    df_correlation=df_correlation_clustering_hier_complete,
    factors_interactions=factors_interactions_for_clustering,
    algorithm_name="hier_complete",
    graph_filepath="../results/etude-temps-calcul-analyse-facteurs-3clust-hier-comp.png",
    graph_plot_description="R² pour modéliser le clustering 'clust.hier.comp'",
)
# Display the correlation table as the cell output.
df_correlation_clustering_hier_complete

In [None]:
# Compare candidate GLM formulas for `hier_complete`: the model to keep is the one
# with the lowest deviance and the highest log-likelihood.
df_scores_clustering_hier_comp = compare_glm_models(
    df=df_clustering,
    algorithm_name="hier_complete",
    formulas=[
        "time_total ~ 1",  # Intercept only (baseline).
        "time_total ~ 1 + X1POW2",  # Intercept + dataset_size**2.
        "time_total ~ 1 + X1POW2_X3POW1",  # Intercept + (dataset_size**2)*(algorithm_nb_clusters).
    ],
)
# Display the comparison table as the cell output.
df_scores_clustering_hier_comp

> `clustering.hier_complete ~ 1 + dataset_size**2`

In [None]:
# Fit the selected GLM on the `hier_complete` experiments only, then print the fit summary.
# No `family` is given, so the statsmodels default family is used.
best_model_clust_hier_complete = statsmodels.formula.api.glm(
    formula="time_total ~ 1 + X1POW2",  # X1POW2 is dataset_size**2.
    data=df_clustering[df_clustering["algorithm_name"]=="hier_complete"],
)
# The fitted results are kept in a global: they are read by the printing and interpolation cells below.
best_results_clust_hier_complete = best_model_clust_hier_complete.fit()
print(best_results_clust_hier_complete.summary())

In [None]:
# Print the fitted `hier_complete` model as a readable equation:
# "<intercept> + <coefficient>*<description of the dataset_size**2 factor>".
# The "*" between the coefficient and the factor description matches the other model printouts of this notebook.
print(
    "clustering.hier.comp ~",
    "{0:.2E}".format(best_results_clust_hier_complete.params["Intercept"]),
    "+ {0:.2E}*{1}".format(best_results_clust_hier_complete.params["X1POW2"], factors_interactions_for_clustering["X1POW2"]),
)

In [None]:
# Define the interpolation function.
def interpolation_clust_hier_complete(dataset_size, previous_nb_constraints, algorithm_nb_clusters) -> Tuple[float, float, float]:
    """
    Estimate the `hier_complete` clustering computation time with the fitted GLM.

    The estimation is `Intercept + X1POW2 * dataset_size**2`. The low and high
    estimations evaluate the same expression with every coefficient shifted by
    minus or plus its standard error (`bse`).

    Args:
        dataset_size: The size of the dataset.
        previous_nb_constraints: Not read by this model.
        algorithm_nb_clusters: Not read by this model.

    Returns:
        Tuple[float, float, float]: The low, central and high estimations.
    """
    coefficients = best_results_clust_hier_complete.params
    standard_errors = best_results_clust_hier_complete.bse

    # Coefficient and standard error of each term of the model.
    intercept, intercept_error = coefficients["Intercept"], standard_errors["Intercept"]
    slope, slope_error = coefficients["X1POW2"], standard_errors["X1POW2"]
    squared_size = np.power(dataset_size, 2)

    low = sum(((intercept - intercept_error), (slope - slope_error) * squared_size))
    central = sum((intercept, slope * squared_size))
    high = sum(((intercept + intercept_error), (slope + slope_error) * squared_size))
    return low, central, high

### 2.3.4. Modelize `hier_average` clustering `algorithm_name`.

In [None]:
# Compute correlation of factors for clustering, restricted to the `hier_average` algorithm.
df_correlation_clustering_hier_average: pd.DataFrame = compute_correlation_of_factors(
    df=df_clustering,
    factors_interactions=factors_interactions_for_clustering,
    algorithm_name="hier_average",
)
# Compute the evolution of the information score (R², per the plot description) with model complexity.
# NOTE(review): the graph is presumably stored at `graph_filepath` — confirm in `compute_information_score_evolution`.
information_score_evolution_for_clustering_hier_average = compute_information_score_evolution(
    df=df_clustering,
    df_correlation=df_correlation_clustering_hier_average,
    factors_interactions=factors_interactions_for_clustering,
    algorithm_name="hier_average",
    graph_filepath="../results/etude-temps-calcul-analyse-facteurs-3clust-hier-avg.png",
    graph_plot_description="R² pour modéliser le clustering 'clust.hier.avg'",
)
# Display the correlation table as the cell output.
df_correlation_clustering_hier_average

In [None]:
# Compare candidate GLM formulas for `hier_average`: the model to keep is the one
# with the lowest deviance and the highest log-likelihood.
df_scores_clustering_hier_avg = compare_glm_models(
    df=df_clustering,
    algorithm_name="hier_average",
    formulas=[
        "time_total ~ 1",  # Intercept only (baseline).
        "time_total ~ 1 + X1POW2",  # Intercept + dataset_size**2.
        "time_total ~ 1 + X1POW2_X3POW1",  # Intercept + (dataset_size**2)*(algorithm_nb_clusters).
    ],
)
# Display the comparison table as the cell output.
df_scores_clustering_hier_avg

> `clustering.hier_average ~ 1 + dataset_size**2`

In [None]:
# Fit the selected GLM on the `hier_average` experiments only, then print the fit summary.
# No `family` is given, so the statsmodels default family is used.
best_model_clust_hier_average = statsmodels.formula.api.glm(
    formula="time_total ~ 1 + X1POW2",  # X1POW2 is dataset_size**2.
    data=df_clustering[df_clustering["algorithm_name"]=="hier_average"],
)
# The fitted results are kept in a global: they are read by the printing and interpolation cells below.
best_results_clust_hier_average = best_model_clust_hier_average.fit()
print(best_results_clust_hier_average.summary())

In [None]:
# Print the fitted `hier_average` model as a readable equation:
# "<intercept> + <coefficient>*<description of the dataset_size**2 factor>".
# The "*" between the coefficient and the factor description matches the other model printouts of this notebook.
print(
    "clustering.hier.avg ~",
    "{0:.2E}".format(best_results_clust_hier_average.params["Intercept"]),
    "+ {0:.2E}*{1}".format(best_results_clust_hier_average.params["X1POW2"], factors_interactions_for_clustering["X1POW2"]),
)

In [None]:
# Define the interpolation function.
def interpolation_clust_hier_average(dataset_size, previous_nb_constraints, algorithm_nb_clusters) -> Tuple[float, float, float]:
    """
    Estimate the `hier_average` clustering computation time with the fitted GLM.

    The estimation is `Intercept + X1POW2 * dataset_size**2`. The low and high
    estimations evaluate the same expression with every coefficient shifted by
    minus or plus its standard error (`bse`).

    Args:
        dataset_size: The size of the dataset.
        previous_nb_constraints: Not read by this model.
        algorithm_nb_clusters: Not read by this model.

    Returns:
        Tuple[float, float, float]: The low, central and high estimations.
    """
    coefficients = best_results_clust_hier_average.params
    standard_errors = best_results_clust_hier_average.bse

    # Coefficient and standard error of each term of the model.
    intercept, intercept_error = coefficients["Intercept"], standard_errors["Intercept"]
    slope, slope_error = coefficients["X1POW2"], standard_errors["X1POW2"]
    squared_size = np.power(dataset_size, 2)

    low = sum(((intercept - intercept_error), (slope - slope_error) * squared_size))
    central = sum((intercept, slope * squared_size))
    high = sum(((intercept + intercept_error), (slope + slope_error) * squared_size))
    return low, central, high

### 2.3.5. Modelize `hier_ward` clustering `algorithm_name`.

In [None]:
# Compute correlation of factors for clustering, restricted to the `hier_ward` algorithm.
df_correlation_clustering_hier_ward: pd.DataFrame = compute_correlation_of_factors(
    df=df_clustering,
    factors_interactions=factors_interactions_for_clustering,
    algorithm_name="hier_ward",
)
# Compute the evolution of the information score (R², per the plot description) with model complexity.
# NOTE(review): the graph is presumably stored at `graph_filepath` — confirm in `compute_information_score_evolution`.
information_score_evolution_for_clustering_hier_ward = compute_information_score_evolution(
    df=df_clustering,
    df_correlation=df_correlation_clustering_hier_ward,
    factors_interactions=factors_interactions_for_clustering,
    algorithm_name="hier_ward",
    graph_filepath="../results/etude-temps-calcul-analyse-facteurs-3clust-hier-ward.png",
    graph_plot_description="R² pour modéliser le clustering 'clust.hier.ward'",
)
# Display the correlation table as the cell output.
df_correlation_clustering_hier_ward

In [None]:
# Compare candidate GLM formulas for `hier_ward`: the model to keep is the one
# with the lowest deviance and the highest log-likelihood.
df_scores_clustering_hier_ward = compare_glm_models(
    df=df_clustering,
    algorithm_name="hier_ward",
    formulas=[
        "time_total ~ 1",  # Intercept only (baseline).
        "time_total ~ 1 + X1POW2",  # Intercept + dataset_size**2.
        "time_total ~ 1 + X1POW2_X3POW1",  # Intercept + (dataset_size**2)*(algorithm_nb_clusters).
    ],
)
# Display the comparison table as the cell output.
df_scores_clustering_hier_ward

> `clustering.hier_ward ~ 1 + dataset_size**2`

In [None]:
# Fit the selected GLM on the `hier_ward` experiments only, then print the fit summary.
# No `family` is given, so the statsmodels default family is used.
best_model_clust_hier_ward = statsmodels.formula.api.glm(
    # X1POW2 is dataset_size**2. The formula has no explicit `1 +`: the intercept is added implicitly
    # (the cells below read `params["Intercept"]` from these results).
    formula="time_total ~ X1POW2",
    data=df_clustering[df_clustering["algorithm_name"]=="hier_ward"],
)
# The fitted results are kept in a global: they are read by the printing and interpolation cells below.
best_results_clust_hier_ward = best_model_clust_hier_ward.fit()
print(best_results_clust_hier_ward.summary())

In [None]:
# Print the fitted `hier_ward` model as a readable equation:
# "<intercept> + <coefficient>*<description of the dataset_size**2 factor>".
print(
    "clustering.hier.ward ~",
    f'{best_results_clust_hier_ward.params["Intercept"]:.2E}',
    f'+ {best_results_clust_hier_ward.params["X1POW2"]:.2E}*{factors_interactions_for_clustering["X1POW2"]}',
)

In [None]:
# Define the interpolation function.
def interpolation_clust_hier_ward(dataset_size, previous_nb_constraints, algorithm_nb_clusters) -> Tuple[float, float, float]:
    """
    Estimate the `hier_ward` clustering computation time with the fitted GLM.

    The estimation is `Intercept + X1POW2 * dataset_size**2`. The low and high
    estimations evaluate the same expression with every coefficient shifted by
    minus or plus its standard error (`bse`).

    Args:
        dataset_size: The size of the dataset.
        previous_nb_constraints: Not read by this model.
        algorithm_nb_clusters: Not read by this model.

    Returns:
        Tuple[float, float, float]: The low, central and high estimations.
    """
    coefficients = best_results_clust_hier_ward.params
    standard_errors = best_results_clust_hier_ward.bse

    # Coefficient and standard error of each term of the model.
    intercept, intercept_error = coefficients["Intercept"], standard_errors["Intercept"]
    slope, slope_error = coefficients["X1POW2"], standard_errors["X1POW2"]
    squared_size = np.power(dataset_size, 2)

    low = sum(((intercept - intercept_error), (slope - slope_error) * squared_size))
    central = sum((intercept, slope * squared_size))
    high = sum(((intercept + intercept_error), (slope + slope_error) * squared_size))
    return low, central, high

### 2.3.6. Modelize `spectral_SPEC` clustering `algorithm_name`.

In [None]:
# Compute correlation of factors for clustering, restricted to the `spectral_SPEC` algorithm.
df_correlation_clustering_spectral_SPEC: pd.DataFrame = compute_correlation_of_factors(
    df=df_clustering,
    factors_interactions=factors_interactions_for_clustering,
    algorithm_name="spectral_SPEC",
)
# Compute the evolution of the information score (R², per the plot description) with model complexity.
# NOTE(review): the graph is presumably stored at `graph_filepath` — confirm in `compute_information_score_evolution`.
information_score_evolution_for_clustering_spectral_SPEC = compute_information_score_evolution(
    df=df_clustering,
    df_correlation=df_correlation_clustering_spectral_SPEC,
    factors_interactions=factors_interactions_for_clustering,
    algorithm_name="spectral_SPEC",
    graph_filepath="../results/etude-temps-calcul-analyse-facteurs-3clust-spec.png",
    graph_plot_description="R² pour modéliser le clustering 'clust.spec'",
)
# Display the correlation table as the cell output.
df_correlation_clustering_spectral_SPEC

In [None]:
# Compare candidate GLM formulas for `spectral_SPEC`: the model to keep is the one
# with the lowest deviance and the highest log-likelihood.
df_scores_clustering_spec = compare_glm_models(
    df=df_clustering,
    algorithm_name="spectral_SPEC",
    formulas=[
        "time_total ~ 1",  # Intercept only (baseline).
        "time_total ~ 1 + X1POW2",  # Intercept + dataset_size**2.
        "time_total ~ 1 + X1POW2 + X1POW2_X3POW1",  # Previous model + (dataset_size**2)*(algorithm_nb_clusters).
    ],
)
# Display the comparison table as the cell output.
df_scores_clustering_spec

> `clustering.spectral_SPEC ~ 1 + dataset_size**2`

In [None]:
# Fit the selected GLM on the `spectral_SPEC` experiments only, then print the fit summary.
# No `family` is given, so the statsmodels default family is used.
best_model_clust_spectral_SPEC = statsmodels.formula.api.glm(
    # X1POW2 is dataset_size**2. The formula has no explicit `1 +`: the intercept is added implicitly
    # (the cells below read `params["Intercept"]` from these results).
    formula="time_total ~ X1POW2",
    data=df_clustering[df_clustering["algorithm_name"]=="spectral_SPEC"],
)
# The fitted results are kept in a global: they are read by the printing and interpolation cells below.
best_results_clust_spectral_SPEC = best_model_clust_spectral_SPEC.fit()
print(best_results_clust_spectral_SPEC.summary())

In [None]:
# Print the fitted `spectral_SPEC` model as a readable equation:
# "<intercept> + <coefficient>*<description of the dataset_size**2 factor>".
print(
    "clustering.spectral.spec ~",
    f'{best_results_clust_spectral_SPEC.params["Intercept"]:.2E}',
    f'+ {best_results_clust_spectral_SPEC.params["X1POW2"]:.2E}*{factors_interactions_for_clustering["X1POW2"]}',
)

In [None]:
# Define the interpolation function.
def interpolation_clust_spectral_SPEC(dataset_size, previous_nb_constraints, algorithm_nb_clusters) -> Tuple[float, float, float]:
    """
    Estimate the `spectral_SPEC` clustering computation time with the fitted GLM.

    The estimation is `Intercept + X1POW2 * dataset_size**2`. The low and high
    estimations evaluate the same expression with every coefficient shifted by
    minus or plus its standard error (`bse`).

    Args:
        dataset_size: The size of the dataset.
        previous_nb_constraints: Not read by this model.
        algorithm_nb_clusters: Not read by this model.

    Returns:
        Tuple[float, float, float]: The low, central and high estimations.
    """
    coefficients = best_results_clust_spectral_SPEC.params
    standard_errors = best_results_clust_spectral_SPEC.bse

    # Coefficient and standard error of each term of the model.
    intercept, intercept_error = coefficients["Intercept"], standard_errors["Intercept"]
    slope, slope_error = coefficients["X1POW2"], standard_errors["X1POW2"]
    squared_size = np.power(dataset_size, 2)

    low = sum(((intercept - intercept_error), (slope - slope_error) * squared_size))
    central = sum((intercept, slope * squared_size))
    high = sum(((intercept + intercept_error), (slope + slope_error) * squared_size))
    return low, central, high

### 2.3.7. Plot all clustering computation time models.

In [None]:
# Create a new figure.
fig_plot_clustering: Figure = plt.figure(figsize=(15, 7.5), dpi=300)
axis_plot_clustering = fig_plot_clustering.gca()

# Set range of axis.
axis_plot_clustering.set_xlim(xmin=0, xmax=5500)
axis_plot_clustering.set_ylim(ymin=0, ymax=20000)


def _plot_clustering_computation_time(axis, algorithm_name, display_name, interpolation, color) -> None:
    """
    Plot the observed and modelized computation time of one clustering algorithm.

    Three artists are added to `axis`, all drawn in `color`:
    the observed `time_total` against `dataset_size` (crosses, with a legend entry),
    the modelized computation time (dashed line, with a legend entry),
    and the band between the low and high estimations (translucent fill, no legend entry).

    Args:
        axis: The matplotlib axis to draw on.
        algorithm_name: The value of the `algorithm_name` column of `df_clustering` selecting the observations.
        display_name: The algorithm name written in the two legend labels.
        interpolation: The function called as `interpolation(dataset_size, previous_nb_constraints, algorithm_nb_clusters)` and returning the `(low, central, high)` estimations.
        color: The matplotlib color used for the three artists.
    """
    # Plot the observed computation times.
    observations = df_clustering[df_clustering["algorithm_name"] == algorithm_name]
    axis.plot(
        observations["dataset_size"],  # x
        observations["time_total"],  # y
        label="Temps de calcul observé du clustering '{0}'".format(display_name),
        marker="x",
        markerfacecolor=color,
        markersize=3,
        color=color,
        linewidth=0,
        linestyle="",
    )

    # For each dataset size, average the estimations over a grid of the two other factors.
    dataset_sizes = range(0, 5500, 100)
    low_curve = []
    central_curve = []
    high_curve = []
    for dataset_size in dataset_sizes:
        estimations = [
            interpolation(dataset_size, nb_constraints, nb_clusters)
            for nb_constraints in range(0, 5500, 500)
            for nb_clusters in range(0, 55, 10)
        ]
        low_curve.append(np.mean([estimation[0] for estimation in estimations]))
        central_curve.append(np.mean([estimation[1] for estimation in estimations]))
        high_curve.append(np.mean([estimation[2] for estimation in estimations]))

    # Plot the modelized computation times.
    axis.plot(
        dataset_sizes,  # x
        central_curve,  # y
        label="Temps de calcul modélisé du clustering '{0}'".format(display_name),
        marker="",
        markerfacecolor=color,
        markersize=3,
        color=color,
        linewidth=2,
        linestyle="--",
    )

    # Plot the band between the low and high estimations.
    axis.fill_between(
        x=dataset_sizes,  # x
        y1=low_curve,  # y1
        y2=high_curve,  # y2
        color=color,
        alpha=0.2,
    )


# Plot computation time for each clustering algorithm (column value, legend name, model, color).
# The same legend name is used for the observed and modelized series of an algorithm.
for _algorithm_name, _display_name, _interpolation, _color in (
    ("kmeans_COP", "clust.kmeans.cop", interpolation_clust_kmeans_COP, "red"),
    ("hier_single", "clust.hier.sing", interpolation_clust_hier_single, "blue"),
    ("hier_complete", "clust.hier.comp", interpolation_clust_hier_complete, "green"),
    ("hier_average", "clust.hier.avg", interpolation_clust_hier_average, "orange"),
    ("hier_ward", "clust.hier.ward", interpolation_clust_hier_ward, "violet"),
    ("spectral_SPEC", "clust.spec", interpolation_clust_spectral_SPEC, "cyan"),
):
    _plot_clustering_computation_time(
        axis=axis_plot_clustering,
        algorithm_name=_algorithm_name,
        display_name=_display_name,
        interpolation=_interpolation,
        color=_color,
    )

# Set axis name.
axis_plot_clustering.set_xlabel("nombre de données [#]", fontsize=18,)
axis_plot_clustering.set_ylabel("temps de calcul [secondes]", fontsize=18,)

# Plot the legend.
axis_plot_clustering.legend(
    loc="upper left",
    fontsize=15,
)

# Plot the grid.
axis_plot_clustering.grid(True)

# Store the graph.
fig_plot_clustering.savefig(
    "../results/etude-temps-calcul-modelisation-3clust.png",
    dpi=300,
    transparent=True,
    bbox_inches="tight",
)

----------

## 2.4. ANALYSIS FOR SAMPLING

> - algorithms: `random`, `in_same`, `farthest`, `closest`
> - factors: `dataset_size`, `previous_nb_constraints`, `previous_nb_clusters`, `algorithm_nb_to_select`

In [None]:
# Load the sampling experiments synthesis (";"-separated CSV, first column used as index).
df_sampling = pd.read_csv("../results/experiments_synthesis_for_sampling.csv", sep=";", index_col=0)
# `time_total` is handled as text here: replace the decimal comma by a decimal point, then cast to float.
df_sampling["time_total"] = df_sampling["time_total"].str.replace(",", ".").astype(float)
# Preview the first rows as the cell output.
df_sampling.head()

In [None]:
# Compute interaction of factors for sampling.
# The call returns the interactions (indexed below by names such as `X1POW2`) and the dataframe to use from now on;
# the GLM formulas below reference these interaction names as columns of `df_sampling`.
# NOTE(review): factors are presumably numbered in list order (X1=dataset_size, X2=previous_nb_constraints,
# X3=previous_nb_clusters, X4=algorithm_nb_to_select) — confirm in `compute_combinations_of_interactions_of_factors`.
factors_interactions_for_sampling, df_sampling = compute_combinations_of_interactions_of_factors(
    df=df_sampling,
    factors=["dataset_size", "previous_nb_constraints", "previous_nb_clusters", "algorithm_nb_to_select"],
    range_of_powers=[0, 1, 2],
    max_power=3,
)
# Display the interactions as the cell output.
factors_interactions_for_sampling

### 2.4.0. Determine if sampling computation time is `algorithm_name` dependent.

In [None]:
# Fit a GLM explaining `time_total` by `algorithm_name` alone (treated as a categorical factor with `C(...)`),
# on all sampling experiments, then print the fit summary.
# No `family` is given, so the statsmodels default family is used.
model_samp_ALGONAME = statsmodels.formula.api.glm(
    formula="time_total ~ C(algorithm_name)",
    data=df_sampling,
)
results_samp_ALGONAME = model_samp_ALGONAME.fit()
print(results_samp_ALGONAME.summary())

### 2.4.1. Modelize `random` sampling `algorithm_name`.

In [None]:
# Compute correlation of factors for sampling, restricted to the `random` algorithm.
df_correlation_sampling_random: pd.DataFrame = compute_correlation_of_factors(
    df=df_sampling,
    factors_interactions=factors_interactions_for_sampling,
    algorithm_name="random",
)
# Compute the evolution of the information score (R², per the plot description) with model complexity.
# NOTE(review): the graph is presumably stored at `graph_filepath` — confirm in `compute_information_score_evolution`.
information_score_evolution_for_sampling_random = compute_information_score_evolution(
    df=df_sampling,
    df_correlation=df_correlation_sampling_random,
    factors_interactions=factors_interactions_for_sampling,
    algorithm_name="random",
    graph_filepath="../results/etude-temps-calcul-analyse-facteurs-4samp-rand-full.png",
    graph_plot_description="R² pour modéliser l'échantillonnage 'samp.rand.full'",
)
# Display the correlation table as the cell output.
df_correlation_sampling_random

In [None]:
# Compare candidate GLM formulas for `random` sampling: the model to keep is the one
# with the lowest deviance and the highest log-likelihood.
df_scores_sampling_random = compare_glm_models(
    df=df_sampling,
    algorithm_name="random",
    formulas=[
        "time_total ~ 1",  # Intercept only (baseline).
        "time_total ~ 1 + X1POW2",  # Intercept + dataset_size**2.
        "time_total ~ 1 + X1POW2 + X1POW2_X3POW1",  # Previous model + presumably (dataset_size**2)*(previous_nb_clusters) — TODO confirm.
    ],
)
# Display the comparison table as the cell output.
df_scores_sampling_random

> `sampling.random ~ 1 + dataset_size**2`

In [None]:
# Fit the selected GLM on the `random` sampling experiments only, then print the fit summary.
# No `family` is given, so the statsmodels default family is used.
best_model_samp_random = statsmodels.formula.api.glm(
    formula="time_total ~ 1 + X1POW2",  # X1POW2 is dataset_size**2.
    data=df_sampling[df_sampling["algorithm_name"]=="random"],
)
# The fitted results are kept in a global: they are read by the printing and interpolation cells below.
best_results_samp_random = best_model_samp_random.fit()
print(best_results_samp_random.summary())

In [None]:
# Print the fitted `random` sampling model as a readable equation:
# "<intercept> + <coefficient>*<description of the dataset_size**2 factor>".
print(
    "sampling.rand.full ~",
    f'{best_results_samp_random.params["Intercept"]:.2E}',
    f'+ {best_results_samp_random.params["X1POW2"]:.2E}*{factors_interactions_for_sampling["X1POW2"]}',
)

In [None]:
# Define the interpolation function.
def interpolation_samp_random(dataset_size, previous_nb_constraints, previous_nb_clusters, algorithm_nb_to_select) -> Tuple[float, float, float]:
    """
    Estimate the `random` sampling computation time with the fitted GLM.

    The estimation is `Intercept + X1POW2 * dataset_size**2`. The low and high
    estimations evaluate the same expression with every coefficient shifted by
    minus or plus its standard error (`bse`).

    Args:
        dataset_size: The size of the dataset.
        previous_nb_constraints: Not read by this model.
        previous_nb_clusters: Not read by this model.
        algorithm_nb_to_select: Not read by this model.

    Returns:
        Tuple[float, float, float]: The low, central and high estimations.
    """
    coefficients = best_results_samp_random.params
    standard_errors = best_results_samp_random.bse

    # Coefficient and standard error of each term of the model.
    intercept, intercept_error = coefficients["Intercept"], standard_errors["Intercept"]
    slope, slope_error = coefficients["X1POW2"], standard_errors["X1POW2"]
    squared_size = np.power(dataset_size, 2)

    low = sum(((intercept - intercept_error), (slope - slope_error) * squared_size))
    central = sum((intercept, slope * squared_size))
    high = sum(((intercept + intercept_error), (slope + slope_error) * squared_size))
    return low, central, high

### 2.4.2. Modelize `random_in_same` sampling `algorithm_name`.

In [None]:
# Compute correlation of factors for sampling, restricted to the `in_same` algorithm.
df_correlation_sampling_random_in_same: pd.DataFrame = compute_correlation_of_factors(
    df=df_sampling,
    factors_interactions=factors_interactions_for_sampling,
    algorithm_name="in_same",
)
# Compute the evolution of the information score (R², per the plot description) with model complexity.
# NOTE(review): the graph is presumably stored at `graph_filepath` — confirm in `compute_information_score_evolution`.
information_score_evolution_for_sampling_random_in_same = compute_information_score_evolution(
    df=df_sampling,
    df_correlation=df_correlation_sampling_random_in_same,
    factors_interactions=factors_interactions_for_sampling,
    algorithm_name="in_same",
    graph_filepath="../results/etude-temps-calcul-analyse-facteurs-4samp-rand-same.png",
    graph_plot_description="R² pour modéliser l'échantillonnage 'samp.rand.same'",
)
# Display the correlation table as the cell output.
df_correlation_sampling_random_in_same

In [None]:
# Compare candidate GLM formulas for `in_same` sampling: the model to keep is the one
# with the lowest deviance and the highest log-likelihood.
df_scores_sampling_random_in_same = compare_glm_models(
    df=df_sampling,
    algorithm_name="in_same",
    formulas=[
        "time_total ~ 1",  # Intercept only (baseline).
        "time_total ~ 1 + X1POW2",  # Intercept + dataset_size**2.
        "time_total ~ 1 + X1POW1_X4POW1",  # Intercept + presumably dataset_size*algorithm_nb_to_select — TODO confirm.
        "time_total ~ 1 + X1POW2 + X1POW1_X4POW1",  # Both previous terms together.
    ],
)
# Display the comparison table as the cell output.
df_scores_sampling_random_in_same

> `sampling.random_in_same ~ 1 + dataset_size**2`

In [None]:
# Fit the selected GLM on the `in_same` sampling experiments only, then print the fit summary.
# No `family` is given, so the statsmodels default family is used.
best_model_samp_random_in_same = statsmodels.formula.api.glm(
    formula="time_total ~ 1 + X1POW2",  # X1POW2 is dataset_size**2.
    data=df_sampling[df_sampling["algorithm_name"]=="in_same"],
)
# The fitted results are kept in a global: they are read by the printing and interpolation cells below.
best_results_samp_random_in_same = best_model_samp_random_in_same.fit()
print(best_results_samp_random_in_same.summary())

In [None]:
# Print the fitted `in_same` sampling model as a readable equation:
# "<intercept> + <coefficient>*<description of the dataset_size**2 factor>".
print(
    "sampling.rand.same ~",
    f'{best_results_samp_random_in_same.params["Intercept"]:.2E}',
    f'+ {best_results_samp_random_in_same.params["X1POW2"]:.2E}*{factors_interactions_for_sampling["X1POW2"]}',
)

In [None]:
# Define the interpolation function.
def interpolation_samp_random_in_same(dataset_size, previous_nb_constraints, previous_nb_clusters, algorithm_nb_to_select) -> Tuple[float, float, float]:
    """
    Estimate the `in_same` sampling computation time with the fitted GLM.

    The estimation is `Intercept + X1POW2 * dataset_size**2`. The low and high
    estimations evaluate the same expression with every coefficient shifted by
    minus or plus its standard error (`bse`).

    Args:
        dataset_size: The size of the dataset.
        previous_nb_constraints: Not read by this model.
        previous_nb_clusters: Not read by this model.
        algorithm_nb_to_select: Not read by this model.

    Returns:
        Tuple[float, float, float]: The low, central and high estimations.
    """
    coefficients = best_results_samp_random_in_same.params
    standard_errors = best_results_samp_random_in_same.bse

    # Coefficient and standard error of each term of the model.
    intercept, intercept_error = coefficients["Intercept"], standard_errors["Intercept"]
    slope, slope_error = coefficients["X1POW2"], standard_errors["X1POW2"]
    squared_size = np.power(dataset_size, 2)

    low = sum(((intercept - intercept_error), (slope - slope_error) * squared_size))
    central = sum((intercept, slope * squared_size))
    high = sum(((intercept + intercept_error), (slope + slope_error) * squared_size))
    return low, central, high

### 2.4.3. Modelize `farthest_in_same` sampling `algorithm_name`.

In [None]:
# Correlation of the sampling factor interactions, restricted to `algorithm_name="farthest"`.
df_correlation_sampling_farthest_in_same: pd.DataFrame = compute_correlation_of_factors(
    algorithm_name="farthest",
    factors_interactions=factors_interactions_for_sampling,
    df=df_sampling,
)
# Evolution of the information score per model complexity, computed from the correlation table above.
# The graph file path and its description ("R² ...") are handed over to the helper.
information_score_evolution_for_sampling_farthest_in_same = compute_information_score_evolution(
    algorithm_name="farthest",
    df=df_sampling,
    df_correlation=df_correlation_sampling_farthest_in_same,
    factors_interactions=factors_interactions_for_sampling,
    graph_plot_description="R² pour modéliser l'échantillonnage 'samp.farthest.same'",
    graph_filepath="../results/etude-temps-calcul-analyse-facteurs-4samp-farthest-same.png",
)
# Show the correlation table as the cell output.
df_correlation_sampling_farthest_in_same

In [None]:
# Score several candidate GLM formulas to pick the best one (lowest deviance, highest log-likelihood).
df_scores_sampling_farthest_in_same = compare_glm_models(
    formulas=[
        "time_total ~ 1",  # Intercept only.
        "time_total ~ 1 + X1POW2",  # Intercept + dataset_size**2.
        "time_total ~ 1 + X1POW2_X3POW1",
        "time_total ~ 1 + X1POW2 + X1POW2_X3POW1",
    ],
    algorithm_name="farthest",
    df=df_sampling,
)
# Show the score table as the cell output.
df_scores_sampling_farthest_in_same

> `sampling.farthest_in_same ~ 1 + dataset_size**2`

In [None]:
# Build the retained GLM on the `farthest` observations only.
# The formula has no explicit `1` term; the following cells still read `params["Intercept"]`
# because the formula interface adds the intercept by default.
best_model_samp_farthest_in_same = statsmodels.formula.api.glm(
    data=df_sampling.loc[df_sampling["algorithm_name"] == "farthest"],
    formula="time_total ~ X1POW2",  # dataset_size**2
)
# Fit it, keep the results for the next cells, and print the fit summary.
best_results_samp_farthest_in_same = best_model_samp_farthest_in_same.fit()
print(best_results_samp_farthest_in_same.summary())

In [None]:
# Print the fitted model as "intercept + coefficient*factor", coefficients in scientific notation.
print(
    "sampling.farthest.same ~",
    f"{best_results_samp_farthest_in_same.params['Intercept']:.2E}",
    f"+ {best_results_samp_farthest_in_same.params['X1POW2']:.2E}*{factors_interactions_for_sampling['X1POW2']}",
)

In [None]:
# Interpolation function of the `farthest_in_same` sampling computation time.
def interpolation_samp_farthest_in_same(dataset_size, previous_nb_constraints, previous_nb_clusters, algorithm_nb_to_select) -> Tuple[float, float, float]:
    """
    Evaluate the fitted model `Intercept + X1POW2 * dataset_size**2`.

    Coefficients are read from `best_results_samp_farthest_in_same.params` and
    their standard errors from `best_results_samp_farthest_in_same.bse`.

    Args:
        dataset_size: Dataset size; its square is the only factor of this model.
        previous_nb_constraints: Not used by this model.
        previous_nb_clusters: Not used by this model.
        algorithm_nb_to_select: Not used by this model.

    Returns:
        A `(low, estimate, high)` tuple: `estimate` uses the fitted coefficients,
        `low` / `high` use every coefficient minus / plus its standard error.
    """
    fitted = best_results_samp_farthest_in_same
    intercept, intercept_se = fitted.params["Intercept"], fitted.bse["Intercept"]
    slope, slope_se = fitted.params["X1POW2"], fitted.bse["X1POW2"]

    # X1POW2 factor: dataset_size**2.
    squared_size = np.power(dataset_size, 2)

    lower = (intercept - intercept_se) + (slope - slope_se) * squared_size
    center = intercept + slope * squared_size
    upper = (intercept + intercept_se) + (slope + slope_se) * squared_size
    return lower, center, upper

### 2.4.4. Modelize `closest_in_different` sampling `algorithm_name`.

In [None]:
# Correlation of the sampling factor interactions, restricted to `algorithm_name="closest"`.
df_correlation_sampling_closest_in_different: pd.DataFrame = compute_correlation_of_factors(
    algorithm_name="closest",
    factors_interactions=factors_interactions_for_sampling,
    df=df_sampling,
)
# Evolution of the information score per model complexity, computed from the correlation table above.
# The graph file path and its description ("R² ...") are handed over to the helper.
information_score_evolution_for_sampling_closest_in_different = compute_information_score_evolution(
    algorithm_name="closest",
    df=df_sampling,
    df_correlation=df_correlation_sampling_closest_in_different,
    factors_interactions=factors_interactions_for_sampling,
    graph_plot_description="R² pour modéliser l'échantillonnage 'samp.closest.diff'",
    graph_filepath="../results/etude-temps-calcul-analyse-facteurs-4samp-closest-diff.png",
)
# Show the correlation table as the cell output.
df_correlation_sampling_closest_in_different

In [None]:
# Score several candidate GLM formulas to pick the best one (lowest deviance, highest log-likelihood).
df_scores_sampling_closest_in_different = compare_glm_models(
    formulas=[
        "time_total ~ 1",  # Intercept only.
        "time_total ~ 1 + X1POW2",  # Intercept + dataset_size**2.
        "time_total ~ 1 + X1POW2_X3POW1",
        "time_total ~ 1 + X1POW2 + X1POW2_X3POW1",
    ],
    algorithm_name="closest",
    df=df_sampling,
)
# Show the score table as the cell output.
df_scores_sampling_closest_in_different

> `sampling.closest_in_different ~ 1 + dataset_size**2`

In [None]:
# Build the retained GLM on the `closest` observations only.
# The formula has no explicit `1` term; the following cells still read `params["Intercept"]`
# because the formula interface adds the intercept by default.
best_model_samp_closest_in_different = statsmodels.formula.api.glm(
    data=df_sampling.loc[df_sampling["algorithm_name"] == "closest"],
    formula="time_total ~ X1POW2",  # dataset_size**2
)
# Fit it, keep the results for the next cells, and print the fit summary.
best_results_samp_closest_in_different = best_model_samp_closest_in_different.fit()
print(best_results_samp_closest_in_different.summary())

In [None]:
# Print the fitted model as "intercept + coefficient*factor", coefficients in scientific notation.
print(
    "sampling.closest.diff ~",
    f"{best_results_samp_closest_in_different.params['Intercept']:.2E}",
    f"+ {best_results_samp_closest_in_different.params['X1POW2']:.2E}*{factors_interactions_for_sampling['X1POW2']}",
)

In [None]:
# Interpolation function of the `closest_in_different` sampling computation time.
def interpolation_samp_closest_in_different(dataset_size, previous_nb_constraints, previous_nb_clusters, algorithm_nb_to_select) -> Tuple[float, float, float]:
    """
    Evaluate the fitted model `Intercept + X1POW2 * dataset_size**2`.

    Coefficients are read from `best_results_samp_closest_in_different.params` and
    their standard errors from `best_results_samp_closest_in_different.bse`.

    Args:
        dataset_size: Dataset size; its square is the only factor of this model.
        previous_nb_constraints: Not used by this model.
        previous_nb_clusters: Not used by this model.
        algorithm_nb_to_select: Not used by this model.

    Returns:
        A `(low, estimate, high)` tuple: `estimate` uses the fitted coefficients,
        `low` / `high` use every coefficient minus / plus its standard error.
    """
    fitted = best_results_samp_closest_in_different
    intercept, intercept_se = fitted.params["Intercept"], fitted.bse["Intercept"]
    slope, slope_se = fitted.params["X1POW2"], fitted.bse["X1POW2"]

    # X1POW2 factor: dataset_size**2.
    squared_size = np.power(dataset_size, 2)

    lower = (intercept - intercept_se) + (slope - slope_se) * squared_size
    center = intercept + slope * squared_size
    upper = (intercept + intercept_se) + (slope + slope_se) * squared_size
    return lower, center, upper

### 2.4.5. Plot all sampling computation time models.

In [None]:
# Define how one sampling algorithm is drawn (observations, modelized curve, uncertainty band).
def _plot_sampling_computation_time(axis, algorithm_name: str, display_name: str, color: str, interpolation) -> None:
    """
    Draw the observed and modelized computation time of one sampling algorithm.

    Args:
        axis: Matplotlib axis to draw on.
        algorithm_name: Value of the `algorithm_name` column of `df_sampling` selecting the observations.
        display_name: Algorithm name written in the legend labels.
        color: Matplotlib color used for the markers, the curve and the band.
        interpolation: Function called as `interpolation(dataset_size, previous_nb_constraints, previous_nb_clusters, algorithm_nb_to_select)` and returning a `(low, estimate, high)` tuple.
    """
    # Observed computation times (markers only, no line).
    observations = df_sampling[df_sampling["algorithm_name"] == algorithm_name]
    axis.plot(
        observations["dataset_size"],  # x
        observations["time_total"],  # y
        label=f"Temps de calcul observé de l'échantillonnage '{display_name}'",
        marker="x",
        markerfacecolor=color,
        markersize=3,
        color=color,
        linewidth=0,
        linestyle="",
    )

    # Evaluate the model once per dataset size (x1) and average it over a grid of the
    # other factors (x2, x3, x4). `axis=0` averages low / estimate / high separately.
    dataset_sizes = range(0, 5500, 100)
    averaged_bounds = np.array([
        np.mean(
            [
                interpolation(x1, x2, x3, x4)
                for x2 in range(0, 5500, 500)
                for x3 in range(0, 55, 5)
                for x4 in range(0, 300, 50)
            ],
            axis=0,
        )
        for x1 in dataset_sizes
    ])
    # Tuple order of the interpolation functions: (low, estimate, high).
    low, estimate, high = averaged_bounds[:, 0], averaged_bounds[:, 1], averaged_bounds[:, 2]

    # Modelized computation time: the central estimate (previously the low bound was drawn).
    axis.plot(
        dataset_sizes,  # x
        estimate,  # y
        label=f"Temps de calcul modélisé de l'échantillonnage '{display_name}'",
        marker="",
        markerfacecolor=color,
        markersize=3,
        color=color,
        linewidth=2,
        linestyle="--",
    )
    # Uncertainty band: from the low bound to the high bound, so that it surrounds the curve
    # (previously it only covered estimate -> high).
    axis.fill_between(
        x=dataset_sizes,  # x
        y1=low,
        y2=high,
        color=color,
        alpha=0.2,
    )


# Create a new figure.
fig_plot_sampling: Figure = plt.figure(figsize=(15, 7.5), dpi=300)
axis_plot_sampling = fig_plot_sampling.gca()

# Set range of axis.
axis_plot_sampling.set_xlim(xmin=0, xmax=5500)
axis_plot_sampling.set_ylim(ymin=0, ymax=45)

# Plot computation time for random.
# NOTE(review): `interpolation_samp_random` is defined earlier in the notebook; this assumes it
# returns `(low, estimate, high)` like the three other sampling interpolation functions - confirm.
_plot_sampling_computation_time(
    axis=axis_plot_sampling,
    algorithm_name="random",
    display_name="samp.random.full",
    color="red",
    interpolation=interpolation_samp_random,
)

# Plot computation time for random_in_same.
_plot_sampling_computation_time(
    axis=axis_plot_sampling,
    algorithm_name="in_same",
    display_name="samp.random.same",
    color="blue",
    interpolation=interpolation_samp_random_in_same,
)

# Plot computation time for farthest_in_same.
_plot_sampling_computation_time(
    axis=axis_plot_sampling,
    algorithm_name="farthest",
    display_name="samp.farthest.same",
    color="green",
    interpolation=interpolation_samp_farthest_in_same,
)

# Plot computation time for closest_in_different.
_plot_sampling_computation_time(
    axis=axis_plot_sampling,
    algorithm_name="closest",
    display_name="samp.closest.diff",
    color="orange",
    interpolation=interpolation_samp_closest_in_different,
)

# Set axis name.
axis_plot_sampling.set_xlabel("nombre de données [#]", fontsize=18,)
axis_plot_sampling.set_ylabel("temps de calcul [secondes]", fontsize=18,)

# Plot the legend.
axis_plot_sampling.legend(
    loc="upper left",
    fontsize=15,
)

# Plot the grid.
axis_plot_sampling.grid(True)

# Store the graph.
fig_plot_sampling.savefig(
    "../results/etude-temps-calcul-modelisation-4samp.png",
    dpi=300,
    transparent=True,
    bbox_inches="tight",
)