# ==== INTERACTIVE CLUSTERING COMPARATIVE STUDY ====
> ### Drafts.

------------------------------
## READ-ME BEFORE RUNNING

### Quick Description

***#TODO:***

### Description of each step

***#TODO:***

## [DRAFT] Compare k-fold SVM trainings on clustering results

***WARNING***: _Before running the k-fold SVM comparison, first launch the experiment runs, evaluations and synthesis, then run the main effects analysis!_

Import Python dependencies.

In [None]:
from typing import Dict, List, Union  # Python code typing (mypy).
from scipy.sparse import csr_matrix  # To handle matrix and vectors.
import json  # Serialization.
import pickle  # noqa: S403  # Serialization.

from sklearn import svm  # SVM training.
from sklearn.model_selection import StratifiedKFold  # Cross validation management.
from sklearn.model_selection import cross_val_score  # Cross validation management.
import numpy  # Statistics computation.
from scipy import stats as scipystats  # Statistics computation.

from matplotlib import pyplot as plt  # Graph management.
from matplotlib.figure import Figure  # Graph management.

Define settings to use (based on main effects and post-hoc analyses).

In [None]:
# Parameters to use (chosen from the main effects and post-hoc analyses).
settings_to_use: Dict[str, str] = dict(
    dataset="bank_cards",
    preprocessing="simple_prep",
    vectorisation="tfidf",
    sampling="closest-50",
    clustering="hier_avg-10c",
    experiment_id="0001",
)
random_state: int = 42

# Experiment environment to use: one folder level per setting, in the order
# dataset / preprocessing / vectorisation / sampling / clustering / experiment_id.
# Every level (the last one included) is followed by "/" so that file names can
# be appended directly to the resulting path.
env_to_use: str = "".join(
    path_level + "/"
    for path_level in (
        "../experiments",
        settings_to_use["dataset"],
        settings_to_use["preprocessing"],
        settings_to_use["vectorisation"],
        settings_to_use["sampling"],
        settings_to_use["clustering"],
        settings_to_use["experiment_id"],
    )
)
env_to_use

Load needed data (vectors, data IDs).

In [None]:
# Read the pickled dictionary of vectors, stored three folder levels above the
# experiment environment.
# NOTE: unpickling executes arbitrary code; only load files produced by this study.
with open(env_to_use + "../../../dict_of_vectors.pkl", "rb") as file_vec:
    dict_of_vectors: Dict[str, csr_matrix] = pickle.load(file_vec)  # noqa: S301

# Data IDs in sorted order: this order defines the row order of `vectors`.
list_of_data_IDs: List[str] = sorted(dict_of_vectors)

# Build one sparse matrix whose i-th row is the first row of the vector
# associated with the i-th data ID.
vectors = csr_matrix(
    [dict_of_vectors[vector_key].toarray()[0] for vector_key in list_of_data_IDs]
)

Initialize a linear SVC and a stratified KFold for classification training.

In [None]:
# Cross validator: 5 stratified folds, shuffled with a fixed seed.
cross_valider = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

# Classifier to train: a linear SVC seeded with the same random state.
classification_model = svm.LinearSVC(random_state=random_state)

Train the linear SVC with cross validation for several results of clustering.

In [None]:
# Containers filled by the training loop, all keyed by clustering v-measure.
iterations: Dict[float, str] = {}  # Iteration ID.
clustering_labels: Dict[float, List[str]] = {}  # One label per data ID.
clustering_repartition: Dict[float, Dict[str, int]] = {}  # Size of each cluster.
cross_validation_scores: Dict[float, List[float]] = {}  # One score per fold.
cross_validation_mean_scores: Dict[float, float] = {}  # Mean over the folds.
cross_validation_sem_scores: Dict[float, float] = {}  # Standard error of the mean.

# Clustering results (indexed by iteration, then by data ID).
with open(env_to_use + "dict_of_clustering_results.json", "r") as file_clustering:
    CLUSTERING_RESULTS: Dict[str, Dict[str, str]] = json.load(file_clustering)

# Clustering evaluations (indexed by iteration, then by metric name).
with open(env_to_use + "dict_of_clustering_performances.json", "r") as file_evaluation:
    CLUSTERING_EVALUATIONS: Dict[str, Dict[str, float]] = json.load(file_evaluation)

# Iterations to highlight.
with open(env_to_use + "dict_of_iterations_to_highlight.json", "r") as file_iterations:
    ITERATIONS_TO_HIGHTLIGH: Dict[str, Dict[str, Union[None, str, float]]] = json.load(
        file_iterations
    )

# Always add the entry "0", pointing to iteration "0000".
ITERATIONS_TO_HIGHTLIGH["0"] = {"iteration": "0000"}

# For each iteration to highlight (visited in sorted key order)...
for _, values in sorted(ITERATIONS_TO_HIGHTLIGH.items()):

    # Skip entries without an iteration: `str(None)` would give the key "None",
    # which is presumably absent from the clustering evaluations (KeyError).
    if values["iteration"] is None:
        continue

    # Get iteration.
    iteration: str = str(values["iteration"])

    # Get clustering performance. The v-measure is used as key for all results,
    # so two iterations with the same v-measure share a single entry.
    v_measure = CLUSTERING_EVALUATIONS[iteration]["v_measure"]
    iterations[v_measure] = iteration

    # Load clustering labels, aligned with the row order of `vectors`.
    clustering_labels[v_measure] = [
        CLUSTERING_RESULTS[iteration][data_ID] for data_ID in list_of_data_IDs
    ]

    # Compute clusters repartition (number of data per cluster label).
    clustering_repartition[v_measure] = {
        clustering_label: clustering_labels[v_measure].count(clustering_label)
        for clustering_label in set(clustering_labels[v_measure])
    }

    # Compute cross validation scores: the classifier is trained to predict the
    # clustering labels from the vectors, with one accuracy score per fold.
    cross_validation_scores[v_measure] = cross_val_score(
        estimator=classification_model,
        X=vectors,
        y=clustering_labels[v_measure],
        scoring="accuracy",  # Other candidate: f1_macro.
        cv=cross_valider,
        n_jobs=-1,
    )

    # Compute cross validation mean score and its standard error of the mean.
    cross_validation_mean_scores[v_measure] = numpy.mean(
        cross_validation_scores[v_measure]
    )
    cross_validation_sem_scores[v_measure] = scipystats.sem(
        cross_validation_scores[v_measure]
    )

# Display the experiment environment and, for each v-measure (in increasing
# order), the mean and standard error of the cross validation scores.
# Values are rounded numerically to 4 decimals: slicing the text representation
# would give wrong values for numbers written in scientific notation
# (e.g. "1.2345678e-05"[:6] is read back as 1.2345).
env_to_use, {
    v_measure: {
        "mean": round(float(cross_validation_mean_scores[v_measure]), 4),
        "sem": round(float(cross_validation_sem_scores[v_measure]), 4),
    }
    for v_measure in sorted(cross_validation_mean_scores)
}

Plot the evolution of the linear SVC performance as a function of the clustering v-measure.

In [None]:
# Create a new figure.
fig: Figure = plt.figure(figsize=(15, 7.5), dpi=300.0)
axis = fig.gca()

# Set range of axis.
axis.set_xlim(xmin=0, xmax=1)
# axis.set_ylim(ymin=0.85, ymax=1)

# Order the points by increasing v-measure: the results are stored in the order
# the iterations were processed, and an unsorted x axis would make the line and
# the error band fold back on themselves.
sorted_v_measures = sorted(cross_validation_mean_scores)
x_values = [float(v_measure) for v_measure in sorted_v_measures]
mean_values = [
    cross_validation_mean_scores[v_measure] for v_measure in sorted_v_measures
]
sem_values = [
    cross_validation_sem_scores[v_measure] for v_measure in sorted_v_measures
]

# Plot svm performance.
axis.plot(
    x_values,  # x
    mean_values,  # y
    marker="o",
    markerfacecolor="black",
    markersize=5,
    color="black",
    linewidth=1,
)

# Plot error bars for svm performance (mean +/- standard error of the mean).
axis.fill_between(
    x=x_values,  # x
    y1=[mean - sem for mean, sem in zip(mean_values, sem_values)],  # y1
    y2=[mean + sem for mean, sem in zip(mean_values, sem_values)],  # y2
    color="black",
    alpha=0.2,
)

# Plot a horizontal reference line at the score obtained for a v-measure of 1.
# It is only drawn when such a clustering exists, to avoid a KeyError otherwise.
if 1 in cross_validation_mean_scores:
    axis.axhline(
        y=cross_validation_mean_scores[1],
        color="gray",
        linestyle="--",
    )

# Set axis name.
# NOTE(review): labels mention "(%)" while the x axis is limited to [0, 1] and
# the scores look like fractions; confirm the intended unit.
axis.set_xlabel(
    "v-measure (%)",
    fontsize=18,
)
axis.set_ylabel(
    "accuracy (%)",
    fontsize=18,
)

# Plot the title.
axis.set_title(
    "Evolution of accuracy obtained during training of an intents classification model",
    fontsize=20,
)


# Plot the grid.
axis.grid(True)

# Store the graph.
fig.savefig(
    "../experiments/plot_svm_training_best_parameters.png",
    dpi=300,
    transparent=True,
    bbox_inches="tight",
)

# Close this figure explicitly to release its resources.
plt.close(fig)