# ==== INTERACTIVE CLUSTERING : RENTABILITY STUDY ====
> ### Stage 1 : Evaluate rentability of one more iteration.

------------------------------
## READ-ME BEFORE RUNNING

### Quick Description

This notebook **aims to compute the rentability of performing one more iteration of interactive clustering**.
- Computations are based on previous experiments (cf. efficiency study) exported in the `./previous` folder.
- Environment results are stored in their `.JSON` files in the `/experiments` folder.

Then, **go to the notebook `2_Plot_some_figures.ipynb` to plot several figures according to these computations**.

### Description of each step

The computations are:
- `annotation_agreement_score`: Compute the proportion of annotations that agree with the previous clustering results ("MUST_LINK" when same cluster, "CANNOT_LINK" when different clusters). If the agreement is low (close to `0.0`), then the annotated constraints will considerably correct the clustering in the next iteration. Otherwise, the clustering is stable.
- `clustering_similarity`: Compute the v-measure between two clusterings to estimate their local similarity.

------------------------------
## 1. IMPORT PYTHON DEPENDENCIES

In [None]:
from typing import Any, Dict, List, Optional, Tuple
import os
import json

In [None]:
import warnings
warnings.filterwarnings("ignore")

------------------------------
## 2. LOAD EXPERIMENTS TO STUDY

Find all implementations to analyze and associated experiments.

In [None]:
# Implementations (environments) from the previous studies that this notebook analyzes.
# Defined once so the whitelist is not rebuilt for every folder found in `../previous`.
IMPLEMENTATIONS_TO_ANALYZE: List[str] = [
    "bank_cards_v1_-_settings_0_partial",  # "bank_cards_v1" + "settings_0_partial" # "bank_cards_v1" + "simple_prep_-_tfidf" + "closest-50" + "hier_avg-10c", best to reach 90% of v-measure (cf. efficiency study)
    "bank_cards_v1_-_settings_1_sufficient",  # "bank_cards_v1" + "lemma_prep_-_tfidf" + "closest-50" + "kmeans_COP-10c", best to reach 100% of v-measure (cf. efficiency study)
    "bank_cards_v1_-_settings_2_exhaustive",  # "bank_cards_v1" + "lemma_prep_-_tfidf" + "in_same-50" + "kmeans_COP-10c", best to reach annotation completeness (cf. efficiency study)
    "bank_cards_v1_-_settings_3_favorite",  # "bank_cards_v1" + "simple_prep_-_tfidf" + "closest-50" + "kmeans_COP-10c", choice of author (cf. cost study)
    "bank_cards_v1",  # Mean.
    "bank_cards_v1_-_simple_tfidf_kmeans",  # Mean.
    "bank_cards_v1_-_simple_tfidf_kmeans_random",  # "bank_cards_v1" + "simple_prep_-_tfidf" + "random-50" + "kmeans_COP-10c", choice of author with random
    "bank_cards_v1_-_simple_tfidf_kmeans_same",  # "bank_cards_v1" + "simple_prep_-_tfidf" + "in_same-50" + "kmeans_COP-10c", choice of author with random in same cluster
    "bank_cards_v1_-_simple_tfidf_kmeans_farthest",  # "bank_cards_v1" + "simple_prep_-_tfidf" + "farthest-50" + "kmeans_COP-10c", choice of author with farthest in same cluster
    "bank_cards_v1_-_simple_tfidf_kmeans_closest",  # "bank_cards_v1" + "simple_prep_-_tfidf" + "closest-50" + "kmeans_COP-10c", choice of author with closest in different clusters
]

# List of run tasks to parallelize.
# Each task holds the implementation name and the list of JSON experiment files
# found in its `../previous/<implementation>` folder.
LIST_OF_TASKS: List[Dict[str, Any]] = [
    {
        "implementation": implementation,  # Environment of experiment.
        # `os.listdir` returns entries in arbitrary order: sort them for reproducible runs.
        # Only keep files whose name really ends with ".json" (not e.g. "x.json.bak").
        "experiments": sorted(
            exp_path
            for exp_path in os.listdir("../previous/" + implementation)
            if exp_path.endswith(".json")
        ),
    }
    for implementation in sorted(os.listdir("../previous"))
    if (
        os.path.isdir("../previous/" + implementation)
        and implementation in IMPLEMENTATIONS_TO_ANALYZE
    )
]
print("There are", "`" + str(len(LIST_OF_TASKS)) + "`", "implementations to analyze.")

Create one folder per implementation to analyze.

In [None]:
# Create the analysis folder of each implementation and seed it with the previous results.
for task in LIST_OF_TASKS:
    previous_folder: str = "../previous/" + task["implementation"]
    analysis_folder: str = "../experiments/" + task["implementation"]

    # If folder exists: data are considered already copied, so continue.
    if os.path.isdir(analysis_folder):
        continue

    # Create folder for analyses.
    # `os.makedirs` also creates `../experiments` itself when it does not exist yet,
    # where `os.mkdir` would raise `FileNotFoundError`.
    os.makedirs(analysis_folder)

    # Copy data (each JSON file is re-serialized under a `previous_results___` prefix).
    for experiment in task["experiments"]:
        with open(previous_folder + "/" + experiment, "r") as file_previous_results_r:
            previous_results = json.load(file_previous_results_r)
        with open(analysis_folder + "/previous_results___" + experiment, "w") as file_previous_results_w:
            json.dump(
                previous_results,
                file_previous_results_w
            )

------------------------------
## 3. START ANALYSES

----------
### 3.1. Analyze annotation agreement score between annotator and previous clustering.

Define annotation agreement score.

In [None]:
import pandas as pd
from annotation_agreement_score import compute_annotation_agreement_score, display_annotation_agreement_score

Compute all annotation agreement scores.

In [None]:
# Walk through every experiment of every implementation.
for task in LIST_OF_TASKS:
    print("IMPLEMENTATION:", task["implementation"])
    implementation_folder: str = "../experiments/" + task["implementation"]

    for experiment in task["experiments"]:
        print("    ", "experiment:", experiment)
        path_to_scores: str = implementation_folder + "/annotation_agreement_score___" + experiment

        # Scores already stored for this experiment: nothing to do.
        if os.path.exists(path_to_scores):
            continue

        # Read the previous results copied for this experiment.
        with open(implementation_folder + "/previous_results___" + experiment, "r") as file_experiment_data_r:
            experiment_data: Dict[str, Any] = json.load(file_experiment_data_r)
        dict_of_constraints_annotations: Dict[str, List[Tuple[str, str, str]]] = experiment_data["dict_of_constraints_annotations"]
        dict_of_clustering_results: Dict[str, Dict[str, str]] = experiment_data["dict_of_clustering_results"]

        # Pair each iteration with the one stored just before it (`None` for the first one).
        list_of_iterations: List[str] = list(dict_of_constraints_annotations)
        annotation_agreement_score_evolution: Dict[str, float] = {}
        for previous_iteration, iteration in zip([None] + list_of_iterations, list_of_iterations):
            # No annotation in iteration "0000", so no agreement score to compute.
            if iteration == "0000":
                continue
            # Annotations of the iteration are compared to the clustering of the previous one.
            annotation_agreement_score_evolution[iteration] = compute_annotation_agreement_score(
                clustering=dict_of_clustering_results[previous_iteration],
                annotations=dict_of_constraints_annotations[iteration],
            )

        # Persist the evolution of the score next to the previous results.
        with open(path_to_scores, "w") as file_annotation_agreement_score_w:
            json.dump(
                annotation_agreement_score_evolution,
                file_annotation_agreement_score_w,
            )

Display annotation agreement score evolution in a graph.

In [None]:
# Plot settings of the annotation agreement score graphs, keyed by implementation name.
# For each implementation, the entry gives the values forwarded to `display_annotation_agreement_score`:
#   - "plot_label": legend label of the agreement score curve;
#   - "plot_groundtruth_label": legend label of the groundtruth similarity curve;
#   - "plot_color": color of the plot;
#   - "legend_loc": location of the legend;
#   - "graph_filename": name of the file in which the graph is stored.
# Labels are runtime strings written in French: do not translate them here.
config_annotation_agreement_score: Dict[str, Dict[str, str]] = {
    "bank_cards_v1_-_settings_0_partial": {
        "plot_label": "Accord annotations/clustering moyen des tentatives\nayant le meilleur paramétrage moyen pour atteindre\nune annotation partielle (90% de v-measure).",
        "plot_groundtruth_label": "Similarité moyenne entre le clustering et la vérité terrain",
        "plot_color": "green",
        "legend_loc": "lower right",
        "graph_filename": "etude-rentabilite-accord-annotation-clustering-annotation-partielle.png",
    },  # best to reach 90% of v-measure (cf. efficiency study)
    "bank_cards_v1_-_settings_1_sufficient": {
        "plot_label": "Accord annotations/clustering moyen des tentatives\nayant le meilleur paramétrage moyen pour atteindre\nune annotation suffisante (100% de v-measure).",
        "plot_groundtruth_label": "Similarité moyenne entre le clustering et la vérité terrain",
        "plot_color": "blue",
        "legend_loc": "lower right",
        "graph_filename": "etude-rentabilite-accord-annotation-clustering-annotation-suffisante.png",
    },  # best to reach 100% of v-measure (cf. efficiency study)
    "bank_cards_v1_-_settings_2_exhaustive": {
        "plot_label": "Accord annotations/clustering moyen des tentatives\nayant le meilleur paramétrage moyen pour atteindre\nune annotation exhaustive (toutes les contraintes).",
        "plot_groundtruth_label": "Similarité moyenne entre le clustering et la vérité terrain",
        "plot_color": "red",
        "legend_loc": "lower right",
        "graph_filename": "etude-rentabilite-accord-annotation-clustering-annotation-exhaustive.png",
    },  # best to reach annotation completeness (cf. efficiency study)
    "bank_cards_v1_-_settings_3_favorite": {
        "plot_label": "Accord annotations/clustering moyen des tentatives\nayant notre paramétrage favori pour atteindre\n90% de v-measure avec un coût global minimal.",
        "plot_groundtruth_label": "Similarité moyenne entre le clustering et la vérité terrain",
        "plot_color": "gold",
        "legend_loc": "lower right",
        "graph_filename": "etude-rentabilite-accord-annotation-clustering-annotation-favori.png",
    },  # choice of author (cf. cost study)
    "bank_cards_v1": {
        "plot_label": "Accord annotations/clustering moyen des tentatives.",
        "plot_groundtruth_label": "Similarité moyenne entre le clustering et la vérité terrain",
        "plot_color": "black",
        "legend_loc": "lower right",
        "graph_filename": "etude-rentabilite-accord-annotation-clustering-moyen.png",
    },  # Mean.
    "bank_cards_v1_-_simple_tfidf_kmeans": {
        "plot_label": "Accord annotations/clustering moyen des tentatives\navec un clustering 'clust.kmeans.cop'.",
        "plot_groundtruth_label": "Similarité moyenne entre le clustering et la vérité terrain",
        "plot_color": "orange",
        "legend_loc": "lower right",
        "graph_filename": "etude-rentabilite-accord-annotation-clustering-annotation-favori-with-kmeans.png",
    },  # Mean.
    "bank_cards_v1_-_simple_tfidf_kmeans_random": {
        "plot_label": "Accord annotations/clustering moyen des tentatives\navec une sélection 'samp.rand.full'.",
        "plot_groundtruth_label": "Similarité moyenne entre le clustering et la vérité terrain",
        "plot_color": "orange",
        "legend_loc": "lower right",
        "graph_filename": "etude-rentabilite-accord-annotation-clustering-annotation-favori-with-random.png",
    },  # choice of author with random
    "bank_cards_v1_-_simple_tfidf_kmeans_same": {
        "plot_label": "Accord annotations/clustering moyen des tentatives\navec une sélection 'samp.rand.same'.",
        "plot_groundtruth_label": "Similarité moyenne entre le clustering et la vérité terrain",
        "plot_color": "orange",
        "legend_loc": "lower right",
        "graph_filename": "etude-rentabilite-accord-annotation-clustering-annotation-favori-with-in-same.png",
    },  # choice of author with random in same cluster
    "bank_cards_v1_-_simple_tfidf_kmeans_farthest": {
        "plot_label": "Accord annotations/clustering moyen des tentatives\navec une sélection 'samp.farthest.same'.",
        "plot_groundtruth_label": "Similarité moyenne entre le clustering et la vérité terrain",
        "plot_color": "orange",
        "legend_loc": "lower right",
        "graph_filename": "etude-rentabilite-accord-annotation-clustering-annotation-favori-with-farthest.png",
    },  # choice of author with farthest in same cluster
    "bank_cards_v1_-_simple_tfidf_kmeans_closest": {
        "plot_label": "Accord annotations/clustering moyen des tentatives\navec une sélection 'samp.closest.diff'.",
        "plot_groundtruth_label": "Similarité moyenne entre le clustering et la vérité terrain",
        "plot_color": "orange",
        "legend_loc": "lower right",
        "graph_filename": "etude-rentabilite-accord-annotation-clustering-annotation-favori-with-closest.png",
    },  # choice of author with closest in different clusters
}

In [None]:
# Draw the annotation agreement score graph of every implementation.
for task in LIST_OF_TASKS:
    implementation_name = task["implementation"]
    # Plot settings registered for this implementation.
    plot_settings = config_annotation_agreement_score[implementation_name]
    print("IMPLEMENTATION:", implementation_name, "\n    (", plot_settings["graph_filename"], ")")
    display_annotation_agreement_score(
        implementation=implementation_name,
        list_of_experiments=task["experiments"],
        # Iteration identifiers "0001" to "0049" (zero-padded on 4 characters).
        list_of_iterations=[str(index).zfill(4) for index in range(1, 50)],
        plot_label=plot_settings["plot_label"],
        plot_groundtruth_label=plot_settings["plot_groundtruth_label"],
        plot_color=plot_settings["plot_color"],
        legend_loc=plot_settings["legend_loc"],
        graph_filename=plot_settings["graph_filename"],
    )

Compute the correlation factor between the annotation agreement score and the v-measure.

In [None]:
# For each implementation: correlate the annotation agreement score with the v-measure.
for task in LIST_OF_TASKS:
    print("IMPLEMENTATION:", task["implementation"], end=": ")
    implementation_folder: str = "../experiments/" + task["implementation"]

    # Collect one row per (experiment, iteration).
    # Rows are gathered in a list and the dataframe is built once afterwards,
    # because `DataFrame.append` was removed in pandas 2.0.
    list_of_rows: List[Dict[str, Any]] = []

    # For each experiments.
    for experiment in task["experiments"]:
        path_to_scores: str = implementation_folder + "/annotation_agreement_score___" + experiment

        # If agreement scores have not been computed for this experiment: skip it.
        if not os.path.exists(path_to_scores):
            continue

        # Load data.
        with open(implementation_folder + "/previous_results___" + experiment, "r") as file_experiment_data_r:
            experiment_data: Dict[str, Any] = json.load(file_experiment_data_r)
        dict_of_clustering_performances: Dict[str, Dict[str, float]] = experiment_data["dict_of_clustering_performances"]

        # Load results.
        with open(path_to_scores, "r") as file_annotation_agreement_score_r:
            annotation_agreement_score_evolution: Dict[str, float] = json.load(file_annotation_agreement_score_r)

        # Add scores.
        for iteration, agreement_score in annotation_agreement_score_evolution.items():
            list_of_rows.append(
                {
                    "experiment": experiment,
                    "iteration": iteration,
                    "agreement_score": agreement_score,
                    "v_measure": dict_of_clustering_performances[iteration]["v_measure"],
                }
            )

    # Store scores in a dataframe.
    df_scores: pd.DataFrame = pd.DataFrame(
        list_of_rows,
        columns=["experiment", "iteration", "agreement_score", "v_measure"],
    )

    # Compute correlation (Pearson, the pandas default) on the two numeric columns only,
    # so the string columns ("experiment", "iteration") cannot interfere.
    # The result is `nan` when no score is available.
    correlation: float = df_scores["agreement_score"].astype(float).corr(
        df_scores["v_measure"].astype(float)
    )
    print("r={0:.2f} ; r^2={1:.2f}".format(correlation, correlation**2))

----------
### 3.2. Analyze clustering similarity.

Load Python dependencies.

In [None]:
import pandas as pd
from clustering_similarity import compute_clustering_similarity, display_clustering_similarity

Compute clustering similarity.

In [None]:
# Walk through every experiment of every implementation.
for task in LIST_OF_TASKS:
    print("IMPLEMENTATION:", task["implementation"])
    implementation_folder: str = "../experiments/" + task["implementation"]

    for experiment in task["experiments"]:
        print("    ", "experiment:", experiment)
        path_to_similarity: str = implementation_folder + "/clustering_similarity___" + experiment

        # Similarity already stored for this experiment: nothing to do.
        if os.path.exists(path_to_similarity):
            continue

        # Read the previous results copied for this experiment.
        with open(implementation_folder + "/previous_results___" + experiment, "r") as file_experiment_data_r:
            experiment_data: Dict[str, Any] = json.load(file_experiment_data_r)
        dict_of_clustering_results: Dict[str, Dict[str, str]] = experiment_data["dict_of_clustering_results"]

        # Compute the clustering similarity with a short (2) and a long (4) average setting.
        clustering_similarity_evolution: Dict[str, Dict[str, float]] = compute_clustering_similarity(
            dict_of_clustering_results=dict_of_clustering_results,
            short_average=2,
            long_average=4,
        )

        # Persist the evolution of the similarity next to the previous results.
        with open(path_to_similarity, "w") as file_clustering_similarity_w:
            json.dump(
                clustering_similarity_evolution,
                file_clustering_similarity_w,
            )

Display clustering similarity in a graph.

In [None]:
# Plot settings of the clustering similarity graphs, keyed by implementation name.
# For each implementation, the entry gives the values forwarded to `display_clustering_similarity`:
#   - "plot_label": legend label of the clustering similarity curve;
#   - "plot_groundtruth_label": legend label of the groundtruth similarity curve;
#   - "plot_color": color of the plot;
#   - "legend_loc": location of the legend;
#   - "graph_filename": name of the file in which the graph is stored.
# Labels are runtime strings written in French: do not translate them here.
config_clustering_similarity: Dict[str, Dict[str, str]] = {
    "bank_cards_v1_-_settings_0_partial": {
        "plot_label": "Différence moyenne entre deux itérations de clustering\navec le meilleur paramétrage moyen pour atteindre\nune annotation partielle (90% de v-measure).",
        "plot_groundtruth_label": "Similarité moyenne entre le clustering et la vérité terrain",
        "plot_color": "green",
        "legend_loc": "center right",
        "graph_filename": "etude-rentabilite-similarite-clustering-annotation-partielle.png",
    },  # best to reach 90% of v-measure (cf. efficiency study)
    "bank_cards_v1_-_settings_1_sufficient": {
        "plot_label": "Différence moyenne entre deux itérations de clustering\navec le meilleur paramétrage moyen pour atteindre\nune annotation suffisante (100% de v-measure).",
        "plot_groundtruth_label": "Similarité moyenne entre le clustering et la vérité terrain",
        "plot_color": "blue",
        "legend_loc": "center right",
        "graph_filename": "etude-rentabilite-similarite-clustering-annotation-suffisante.png",
    },  # best to reach 100% of v-measure (cf. efficiency study)
    "bank_cards_v1_-_settings_2_exhaustive": {
        "plot_label": "Différence moyenne entre deux itérations de clustering\navec le meilleur paramétrage moyen pour atteindre\nune annotation exhaustive (toutes les contraintes).",
        "plot_groundtruth_label": "Similarité moyenne entre le clustering et la vérité terrain",
        "plot_color": "red",
        "legend_loc": "center right",
        "graph_filename": "etude-rentabilite-similarite-clustering-annotation-exhaustive.png",
    },  # best to reach annotation completeness (cf. efficiency study)
    "bank_cards_v1_-_settings_3_favorite": {
        "plot_label": "Différence moyenne entre deux itérations de clustering\navec notre paramétrage favori pour atteindre\n90% de v-measure avec un coût global minimal.",
        "plot_groundtruth_label": "Similarité moyenne entre le clustering et la vérité terrain",
        "plot_color": "gold",
        "legend_loc": "center right",
        "graph_filename": "etude-rentabilite-similarite-clustering-annotation-favori.png",
    },  # choice of author (cf. cost study)
    "bank_cards_v1": {
        "plot_label": "Différence moyenne entre deux itérations de clustering.",
        "plot_groundtruth_label": "Similarité moyenne entre le clustering et la vérité terrain",
        "plot_color": "black",
        "legend_loc": "upper left",
        "graph_filename": "etude-rentabilite-similarite-clustering-moyen.png",
    },  # Mean.
    "bank_cards_v1_-_simple_tfidf_kmeans": {
        "plot_label": "Différence moyenne entre deux itérations de clustering\navec un clustering 'clust.kmeans.cop'.",
        "plot_groundtruth_label": "Similarité moyenne entre le clustering et la vérité terrain",
        "plot_color": "orange",
        "legend_loc": "upper left",
        "graph_filename": "etude-rentabilite-similarite-clustering-annotation-favori-with-kmeans.png",
    },  # Mean.
    "bank_cards_v1_-_simple_tfidf_kmeans_random": {
        "plot_label": "Différence moyenne entre deux itérations de clustering\navec une sélection 'samp.rand.full'.",
        "plot_groundtruth_label": "Similarité moyenne entre le clustering et la vérité terrain",
        "plot_color": "orange",
        "legend_loc": "upper left",
        "graph_filename": "etude-rentabilite-similarite-clustering-annotation-favori-with-random.png",
    },  # choice of author with random
    "bank_cards_v1_-_simple_tfidf_kmeans_same": {
        "plot_label": "Différence moyenne entre deux itérations de clustering\navec une sélection 'samp.rand.same'.",
        "plot_groundtruth_label": "Similarité moyenne entre le clustering et la vérité terrain",
        "plot_color": "orange",
        "legend_loc": "upper left",
        "graph_filename": "etude-rentabilite-similarite-clustering-annotation-favori-with-in-same.png",
    },  # choice of author with random in same cluster
    "bank_cards_v1_-_simple_tfidf_kmeans_farthest": {
        "plot_label": "Différence moyenne entre deux itérations de clustering\navec une sélection 'samp.farthest.same'.",
        "plot_groundtruth_label": "Similarité moyenne entre le clustering et la vérité terrain",
        "plot_color": "orange",
        "legend_loc": "upper left",
        "graph_filename": "etude-rentabilite-similarite-clustering-annotation-favori-with-farthest.png",
    },  # choice of author with farthest in same cluster
    "bank_cards_v1_-_simple_tfidf_kmeans_closest": {
        "plot_label": "Différence moyenne entre deux itérations de clustering\navec une sélection 'samp.closest.diff'.",
        "plot_groundtruth_label": "Similarité moyenne entre le clustering et la vérité terrain",
        "plot_color": "orange",
        "legend_loc": "upper left",
        "graph_filename": "etude-rentabilite-similarite-clustering-annotation-favori-with-closest.png",
    },  # choice of author with closest in different clusters
}

In [None]:
# Draw the clustering similarity graph of every implementation.
for task in LIST_OF_TASKS:
    implementation_name = task["implementation"]
    # Plot settings registered for this implementation.
    plot_settings = config_clustering_similarity[implementation_name]
    print("IMPLEMENTATION:", implementation_name, "\n    (", plot_settings["graph_filename"], ")")
    display_clustering_similarity(
        implementation=implementation_name,
        list_of_experiments=task["experiments"],
        # Iteration identifiers "0001" to "0049" (zero-padded on 4 characters).
        list_of_iterations=[str(index).zfill(4) for index in range(1, 50)],
        plot_label=plot_settings["plot_label"],
        plot_groundtruth_label=plot_settings["plot_groundtruth_label"],
        plot_color=plot_settings["plot_color"],
        legend_loc=plot_settings["legend_loc"],
        graph_filename=plot_settings["graph_filename"],
    )

Compute the correlation factor between the clustering similarity and the v-measure.

In [None]:
# For each implementation: correlate the clustering similarity with the v-measure.
# NOTE: every implementation is processed (no early `break`), like the agreement score correlation.
for task in LIST_OF_TASKS:
    print("IMPLEMENTATION:", task["implementation"], end=": ")
    implementation_folder: str = "../experiments/" + task["implementation"]

    # Collect one row per (experiment, iteration).
    # Rows are gathered in a list and the dataframe is built once afterwards,
    # because `DataFrame.append` was removed in pandas 2.0.
    list_of_rows: List[Dict[str, Any]] = []

    # For each experiments.
    for experiment in task["experiments"]:
        path_to_similarity: str = implementation_folder + "/clustering_similarity___" + experiment

        # If clustering similarity has not been computed for this experiment: skip it.
        if not os.path.exists(path_to_similarity):
            continue

        # Load data.
        with open(implementation_folder + "/previous_results___" + experiment, "r") as file_experiment_data_r:
            experiment_data: Dict[str, Any] = json.load(file_experiment_data_r)
        dict_of_clustering_performances: Dict[str, Dict[str, float]] = experiment_data["dict_of_clustering_performances"]

        # Load results.
        with open(path_to_similarity, "r") as file_clustering_similarity_r:
            clustering_similarity_evolution: Dict[str, Dict[str, float]] = json.load(file_clustering_similarity_r)

        # Add scores.
        # The column is named "clustering_difference" but holds the stored "similarity" values.
        for iteration, similarity in clustering_similarity_evolution["similarity"].items():
            list_of_rows.append(
                {
                    "experiment": experiment,
                    "iteration": iteration,
                    "clustering_difference": similarity,
                    "v_measure": dict_of_clustering_performances[iteration]["v_measure"],
                }
            )

    # Store scores in a dataframe.
    df_scores: pd.DataFrame = pd.DataFrame(
        list_of_rows,
        columns=["experiment", "iteration", "clustering_difference", "v_measure"],
    )

    # Compute correlation (Pearson, the pandas default) on the two numeric columns only,
    # so the string columns ("experiment", "iteration") cannot interfere.
    # The result is `nan` when no score is available.
    correlation: float = df_scores["clustering_difference"].astype(float).corr(
        df_scores["v_measure"].astype(float)
    )
    print("r={0:.2f} ; r^2={1:.2f}".format(correlation, correlation**2))