# ==== INTERACTIVE CLUSTERING : BUSINESS RELEVANCE STUDY ====
> ### Stage 1 : Evaluate business relevance on previous experiments.

------------------------------
## READ-ME BEFORE RUNNING

### Quick Description

This notebook is **aimed at computing business relevance on interactive clustering experiments**.
- Computations are based on previous experiments (cf. efficiency study) exported in `./previous` folder.
- Environment results are stored as `.json` files in the `/experiments` folder.

Then, **go to the notebook `2_Plot_some_figures.ipynb` to plot several figures according to these computations**.

### Description of each step

The computations are:
- `consistency_score`: Test a TF-IDF + Linear SVM model on its own trainset with a high prediction score threshold in order to check data consistency. The consistency score is the f1-score computed. If f1-score is low (i.e. less than `0.75`), then trainset can be inconsistent : Data may be badly labeled or classes may overlap. Otherwise, f1-score should tend to 100%.
- `fmc_description`: Use the Features Maximization Contrast method of feature selection in order to describe a cluster by its relevant linguistic patterns.
- `llm_summarization`: Use a large language model (`openai/GPT3.5`) to summarize each cluster as a one sentence description.

Some combinations of parameters are studied:
- `simple_prep` + `tfidf` + `closest-50` + `hier_avg-10c`: best to reach 90% of v-measure (cf. efficiency study)
- `lemma_prep` + `tfidf` + `closest-50` + `kmeans_COP-10c`: best to reach 100% of v-measure (cf. efficiency study)
- `lemma_prep` + `tfidf` + `in_same-50` + `kmeans_COP-10c`: best to reach annotation completeness (cf. efficiency study)
- `simple_prep` + `tfidf` + `closest-50` + `kmeans_COP-10c`: choice of author (cf. cost study)

------------------------------
## 1. IMPORT PYTHON DEPENDENCIES

In [None]:
from typing import Any, Dict, List, Any, Optional
import os
import json

In [None]:
import warnings
# Install a global filter that ignores every warning raised from this point on.
warnings.filterwarnings("ignore")

------------------------------
## 2. LOAD EXPERIMENTS TO STUDY

Find all implementations to analyze and associated experiments.

In [None]:
# List of run tasks to parallelize.
# Each task holds:
#   - "implementation": name of the environment folder found in `../previous`;
#   - "experiments": sorted list of the JSON file names found in that folder.
# Folder and file listings are sorted so that the processing order is reproducible
# (`os.listdir` returns entries in arbitrary order).
LIST_OF_TASKS: List[Dict[str, Any]] = [
    {
        "implementation": implementation,  # Environment of experiment.
        "experiments": sorted(
            exp_path
            for exp_path in os.listdir("../previous/" + implementation)
            if exp_path.endswith(".json")  # Only keep JSON exports (suffix test, not substring test).
        ),
    }
    for implementation in sorted(os.listdir("../previous"))
    if (
        os.path.isdir("../previous/" + implementation)
        and implementation in [
            "bank_cards_v1_-_settings_0_partial",  # "bank_cards_v1" + "settings_0_partial" # "bank_cards_v1" + "simple_prep_-_tfidf" + "closest-50" + "hier_avg-10c", best to reach 90% of v-measure (cf. efficiency study)
            "bank_cards_v1_-_settings_1_sufficient",  # "bank_cards_v1" + "lemma_prep_-_tfidf" + "closest-50" + "kmeans_COP-10c", best to reach 100% of v-measure (cf. efficiency study)
            "bank_cards_v1_-_settings_2_exhaustive",  # "bank_cards_v1" + "lemma_prep_-_tfidf" + "in_same-50" + "kmeans_COP-10c", best to reach annotation completeness (cf. efficiency study)
            "bank_cards_v1_-_settings_3_favorite",  # "bank_cards_v1" + "simple_prep_-_tfidf" + "closest-50" + "kmeans_COP-10c", choice of author (cf. cost study)
            #"bank_cards_v1_-_simple_prep_-_tfidf",  # Mean.
            #"bank_cards_v1",  # Mean.
        ]
    )
]
print("There are", "`" + str(len(LIST_OF_TASKS)) + "`", "implementations to analyze.")

Create one folder per implementation to analyze.

In [None]:
# Mirror each implementation of `../previous` into `../experiments`.
for task in LIST_OF_TASKS:
    implementation_folder: str = "../experiments/" + task["implementation"]

    # If folder exists: continue (its content is not refreshed).
    if os.path.isdir(implementation_folder):
        continue

    # Create folder for analyses.
    # `makedirs` also creates the parent `../experiments` folder when it is missing,
    # where `mkdir` would raise `FileNotFoundError`.
    os.makedirs(implementation_folder)

    # Copy data: each previous result is re-serialized under a `previous_results___` prefix.
    for experiment in task["experiments"]:
        with open("../previous/" + task["implementation"] + "/" + experiment, "r") as file_previous_results_r:
            previous_results = json.load(file_previous_results_r)
        with open(implementation_folder + "/previous_results___" + experiment, "w") as file_previous_results_w:
            json.dump(
                previous_results,
                file_previous_results_w
            )

------------------------------
## 3. START ANALYSES

----------
### 3.1. Analyze clustering consistency.

Load Python dependencies.

In [None]:
from consistency_score import compute_consistency_score, display_consistency_score

Compute all consistency scores.

In [None]:
# For each implementation.
for task in LIST_OF_TASKS:
    print("IMPLEMENTATION:", task["implementation"])

    # For each experiment.
    for experiment in task["experiments"]:
        print("    ", "experiment:", experiment)

        # Input data file and output score file of this experiment.
        # NOTE: the "constistency" misspelling is part of the file name written by this cell; it is
        # kept unchanged so that score files produced by earlier runs are still detected below.
        path_experiment_data: str = "../experiments/" + task["implementation"] + "/previous_results___" + experiment
        path_consistency_score: str = "../experiments/" + task["implementation"] + "/constistency_score___" + experiment

        # If the score file already exists: continue.
        if os.path.exists(path_consistency_score):
            continue

        # Load data.
        with open(path_experiment_data, "r") as file_experiment_data_r:
            experiment_data: Dict[str, Any] = json.load(file_experiment_data_r)
        dict_of_preprocessed_texts: Dict[str, str] = experiment_data["dict_of_preprocessed_texts"]
        dict_of_true_intents: Dict[str, str] = experiment_data["dict_of_true_intents"]
        dict_of_clustering_results: Dict[str, Dict[str, str]] = experiment_data["dict_of_clustering_results"]

        # The training texts are the same for the groundtruth and for every iteration: build them once.
        list_of_text_ids: List[str] = list(dict_of_preprocessed_texts.keys())
        x_train: List[str] = [dict_of_preprocessed_texts[text_ID] for text_ID in list_of_text_ids]

        # Compute consistency score of groundtruth.
        groundtruth_consistency_score: float = compute_consistency_score(
            x_train=x_train,
            y_train=[dict_of_true_intents[text_ID] for text_ID in list_of_text_ids],
            prediction_score_threshold=0.75,
        )

        # Compute consistency score of clustering, one score per iteration.
        clustering_consistency_score_evolution: Dict[str, float] = {
            iteration: compute_consistency_score(
                x_train=x_train,
                # Cluster ids are cast to `str` before being used as labels.
                y_train=[str(clustering_result[text_ID]) for text_ID in list_of_text_ids],
                prediction_score_threshold=0.75,
            )
            for iteration, clustering_result in dict_of_clustering_results.items()
        }

        # Store results.
        with open(path_consistency_score, "w") as file_constistency_score_w:
            json.dump(
                {
                    "groundtruth": groundtruth_consistency_score,
                    "evolution": clustering_consistency_score_evolution,
                },
                file_constistency_score_w
            )

Display consistency score evolution in a graph.

In [None]:
# Plot settings of the consistency score graphs, keyed by implementation name.
# Each row of the table below is: (implementation, plot label, plot color, graph filename).
config_consistency_score = {
    implementation_key: {
        "plot_label": plot_label,
        "plot_color": plot_color,
        "graph_filename": graph_filename,
    }
    for implementation_key, plot_label, plot_color, graph_filename in [
        (
            "bank_cards_v1_-_settings_0_partial",  # best to reach 90% of v-measure (cf. efficiency study)
            "Score de cohérence moyen des tentatives\nayant le meilleur paramétrage moyen pour atteindre\nune annotation partielle (90% de v-measure).",
            "green",
            "etude-pertinence-consistence-annotation-partielle.png",
        ),
        (
            "bank_cards_v1_-_settings_1_sufficient",  # best to reach 100% of v-measure (cf. efficiency study)
            "Score de cohérence moyen des tentatives\nayant le meilleur paramétrage moyen pour atteindre\nune annotation suffisante (100% de v-measure).",
            "blue",
            "etude-pertinence-consistence-annotation-suffisante.png",
        ),
        (
            "bank_cards_v1_-_settings_2_exhaustive",  # best to reach annotation completeness (cf. efficiency study)
            "Score de cohérence moyen des tentatives\nayant le meilleur paramétrage moyen pour atteindre\nune annotation exhaustive (toutes les contraintes).",
            "red",
            "etude-pertinence-consistence-annotation-exhaustive.png",
        ),
        (
            "bank_cards_v1_-_settings_3_favorite",  # choice of author (cf. cost study)
            "Score de cohérence moyen des tentatives\nayant notre paramétrage favori pour atteindre\n90% de v-measure avec un coût global minimal.",
            "gold",
            "etude-pertinence-consistence-annotation-favori.png",
        ),
    ]
}

In [None]:
# Plot the consistency score evolution of every implementation, using its plot settings.
for task in LIST_OF_TASKS:
    implementation_name = task["implementation"]
    plot_settings = config_consistency_score[implementation_name]
    print("IMPLEMENTATION:", implementation_name, "\n    (", plot_settings["graph_filename"], ")")
    display_consistency_score(
        implementation=implementation_name,
        list_of_experiments=task["experiments"],
        # Iteration identifiers "0000" to "0049" (zero-padded on four digits).
        list_of_iterations=["{0:04d}".format(iteration_number) for iteration_number in range(50)],
        plot_label=plot_settings["plot_label"],
        plot_color=plot_settings["plot_color"],
        graph_filename=plot_settings["graph_filename"],
    )

----------
### 3.2. Describe clusters by their relevant linguistic patterns.

Load Python dependencies.

In [None]:
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from cognitivefactory.features_maximization_metric.fmc import FeaturesMaximizationMetric
import openpyxl
import pandas as pd

Describe clusters with FMC.

In [None]:
# For each implementation.
for task in LIST_OF_TASKS:
    print("IMPLEMENTATION:", task["implementation"])

    # For each experiment.
    for experiment in task["experiments"]:
        print("    ", "experiment:", experiment)

        # Input data file and FMC description file (read, then updated in place) of this experiment.
        path_experiment_data: str = "../experiments/" + task["implementation"] + "/previous_results___" + experiment
        path_fmc_description: str = "../experiments/" + task["implementation"] + "/fmc_description___" + experiment

        # Load data.
        with open(path_experiment_data, "r") as file_experiment_data_r:
            experiment_data: Dict[str, Any] = json.load(file_experiment_data_r)
        dict_of_preprocessed_texts: Dict[str, str] = experiment_data["dict_of_preprocessed_texts"]
        dict_of_clustering_results: Dict[str, Dict[str, int]] = experiment_data["dict_of_clustering_results"]

        # Initialize FMC descriptions if needed.
        if not os.path.exists(path_fmc_description):
            # Define list of iterations to analyze: one iteration out of five, plus the last one.
            # The last iteration is the greatest key (string comparison), computed once.
            last_iteration: Optional[str] = max(dict_of_clustering_results.keys(), default=None)
            list_of_iteration: List[str] = [
                i
                for i in dict_of_clustering_results.keys()
                if (
                    int(i) % 5 == 0
                    or i == last_iteration
                )
            ]
            # Define initial FMC descriptions for each cluster of each iteration to analyze.
            # `None` marks a cluster whose description is not computed yet.
            initial_fmc_descriptions: Dict[str, Dict[str, Optional[List[str]]]] = {
                iteration: {
                    str(cluster_id): None  # need to force cluster_id to str for serialization
                    for cluster_id in sorted(set(dict_of_clustering_results[iteration].values()))
                }
                for iteration in list_of_iteration
            }
            # Store initial FMC descriptions.
            with open(path_fmc_description, "w") as file_initial_fmc_description_w:
                json.dump(initial_fmc_descriptions, file_initial_fmc_description_w)

        # Load FMC descriptions already done.
        with open(path_fmc_description, "r") as file_fmc_description_r:
            descriptions: Dict[str, Dict[str, Optional[List[str]]]] = json.load(file_fmc_description_r)

        # The TF-IDF vectors only depend on the texts, not on the iteration:
        # they are computed lazily, at most once per experiment.
        list_of_text_ids: List[str] = list(dict_of_preprocessed_texts.keys())
        matrix_of_vectors: Optional[csr_matrix] = None
        list_of_possible_vectors_features: List[str] = []

        # For each iteration...
        for iteration in descriptions.keys():
            print("    ", "    ", "iteration:", iteration)

            # Only model the iteration when at least one of its clusters is still to describe.
            fmc_computer: Optional[FeaturesMaximizationMetric] = None
            if any(description is None for description in descriptions[iteration].values()):

                # Define vectorizer (first need only).
                if matrix_of_vectors is None:
                    # `min_df=1` keeps every term, exactly like the former `min_df=0`,
                    # but is also accepted by scikit-learn versions that validate parameters.
                    vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1, 3), analyzer="word", sublinear_tf=True)
                    matrix_of_vectors = vectorizer.fit_transform(
                        [
                            str(dict_of_preprocessed_texts[text_id])
                            for text_id in list_of_text_ids
                        ]
                    )
                    list_of_possible_vectors_features = list(vectorizer.get_feature_names_out())

                # Define FMC modelization.
                fmc_computer = FeaturesMaximizationMetric(
                    data_vectors=matrix_of_vectors,
                    data_classes=[dict_of_clustering_results[iteration][text_id] for text_id in list_of_text_ids],
                    list_of_possible_features=list_of_possible_vectors_features,
                    amplification_factor=1,
                )

            # For each cluster
            for cluster_id in descriptions[iteration].keys():
                print("    ", "    ", "    ", cluster_id, end=": ")

                # If already done: continue
                if descriptions[iteration][str(cluster_id)] is not None:
                    print("(already done)")
                    continue

                # Get FMC description of cluster: keep the patterns for which this cluster
                # is the only most activated class.
                cluster_description: List[str] = [
                    linguistic_pattern
                    for linguistic_pattern in fmc_computer.get_most_active_features_by_a_classe(
                        classe=int(cluster_id),
                        activation_only=True,
                        sort_by='fmeasure',  # "contrast"
                        max_number=50,
                    )
                    if fmc_computer.get_most_activated_classes_by_a_feature(linguistic_pattern) == [int(cluster_id)]
                ]
                print(cluster_description)

                # Store updated descriptions after each cluster, so that an interrupted run can be resumed.
                descriptions[iteration][str(cluster_id)] = cluster_description
                with open(path_fmc_description, "w") as file_fmc_description_w:
                    json.dump(descriptions, file_fmc_description_w)

Export in XLSX file.

In [None]:
def translate_token_in_text(text: str, dict_of_translation: Dict[str, str]) -> str:
    """
        In a text, replace all tokens according to a translation dictionary.

        All tokens are replaced in a single pass over the text, so that:
        - a translation is never translated again by another token;
        - when several tokens match at the same position, the longest one wins
          (e.g. an n-gram is not broken by one of the words it contains).
        Tokens are matched as plain substrings. Empty tokens are ignored.

        Args:
            text (str): The text to translate.
            dict_of_translation (Dict[str, str]): The translations, indexed by the token to replace.

        Return:
            str: The translated text.
    """
    import re  # Local import: keeps this notebook cell self-contained.

    # Longest tokens first: the regex alternation picks the first alternative that matches.
    list_of_tokens = sorted(
        (token for token in dict_of_translation if token),
        key=len,
        reverse=True,
    )

    # Nothing to translate (an empty pattern would match everywhere).
    if not list_of_tokens:
        return text

    # Tokens are escaped in order to be matched literally.
    pattern = re.compile("|".join(re.escape(token) for token in list_of_tokens))
    return pattern.sub(lambda match: dict_of_translation[match.group(0)], text)

In [None]:
# Columns of the exported sheets. Their order matters: the sheet formatting below uses column indices.
list_of_export_columns: List[str] = [
    "implementation_id", "experiment_id", "iteration_id", "cluster_id", "cluster", "fmc_description", "cluster_emphasized", "confusion",
    "[Q] cluster well-designed?", "[Q] cluster main topic?", "[Q] FMC description relevant?", "[Q] FMC description main topic?",
]

# For each implementation.
for task in LIST_OF_TASKS:
    print("IMPLEMENTATION:", task["implementation"])

    # For each experiment.
    # NOTE: the export is always regenerated, even when the XLSX file already exists.
    for experiment in task["experiments"]:
        print("    ", "experiment:", experiment)

        # Load data.
        with open("../experiments/" + task["implementation"] + "/previous_results___" + experiment, "r") as file_experiment_data_r:
            experiment_data: Dict[str, Any] = json.load(file_experiment_data_r)
        dict_of_texts: Dict[str, str] = experiment_data["dict_of_texts"]
        dict_of_preprocessed_texts: Dict[str, str] = experiment_data["dict_of_preprocessed_texts"]
        dict_of_true_intents: Dict[str, str] = experiment_data["dict_of_true_intents"]
        list_of_possible_true_labels: List[str] = sorted(set(dict_of_true_intents.values()))
        dict_of_clustering_results: Dict[str, Dict[str, int]] = experiment_data["dict_of_clustering_results"]

        # Load FMC descriptions.
        with open("../experiments/" + task["implementation"] + "/fmc_description___" + experiment, "r") as file_fmc_description_r:
            descriptions: Dict[str, Dict[str, Optional[List[str]]]] = json.load(file_fmc_description_r)

        # Initialize list of exports (one dataframe per iteration).
        df_exports: Dict[str, pd.DataFrame] = {}

        # For each iteration...
        for iteration in descriptions.keys():
            clustering_result: Dict[str, int] = dict_of_clustering_results[iteration]

            # Rows are collected in a list and the dataframe is built once:
            # `DataFrame.append` does not exist anymore in pandas 2.
            list_of_rows: List[Dict[str, Any]] = []

            # For each cluster.
            for cluster_id in descriptions[iteration].keys():
                # Description keys are strings whereas clustering results are compared as integers.
                cluster_number: int = int(cluster_id)

                # Texts of the cluster, in the order of `dict_of_texts`.
                list_of_cluster_text_ids: List[str] = [
                    text_id
                    for text_id in dict_of_texts.keys()
                    if clustering_result[text_id] == cluster_number
                ]

                # Estimate confusion: number of texts of the cluster for each true label present in it.
                dict_of_confusion: Dict[str, int] = {}
                for text_id in list_of_cluster_text_ids:
                    true_label: str = dict_of_true_intents[text_id]
                    dict_of_confusion[true_label] = dict_of_confusion.get(true_label, 0) + 1

                # Emphasis of FMC patterns: each pattern is upper-cased and put between brackets.
                # The same translation applies to all the texts of the cluster, so it is built once.
                dict_of_emphasis: Dict[str, str] = {
                    token: "[{0}]".format(token.upper())
                    for token in descriptions[iteration][cluster_id]
                }

                # Update export data.
                # Multi-line cells start with a quote and use "\r\n" as line separator.
                list_of_rows.append({
                    "implementation_id": task["implementation"],
                    "experiment_id": experiment.split(".")[0],
                    "iteration_id": iteration,
                    "cluster_id": cluster_id,
                    "cluster": "'" + "\r\n".join([
                        "- {0}".format(dict_of_texts[text_id])
                        for text_id in list_of_cluster_text_ids
                    ]),
                    "cluster_emphasized": "'" + "\r\n".join([
                        "- {0}".format(
                            translate_token_in_text(
                                text=text,
                                dict_of_translation=dict_of_emphasis,
                            )
                        )
                        for text_id, text in dict_of_preprocessed_texts.items()
                        if clustering_result[text_id] == cluster_number
                    ]),
                    "fmc_description": "'" + "\r\n".join([
                        "- {0}".format(pattern)
                        for pattern in descriptions[iteration][cluster_id]
                    ]),
                    "confusion": "'" + "\r\n".join([
                        "- {0}: {1}".format(true_label, dict_of_confusion[true_label])
                        for true_label in sorted(dict_of_confusion)
                    ]),
                    "[Q] cluster well-designed?": "",
                    "[Q] cluster main topic?": "",
                    "[Q] FMC description relevant?": "",
                    "[Q] FMC description main topic?": "",
                })

            df_exports[iteration] = pd.DataFrame(list_of_rows, columns=list_of_export_columns)

        # Export summaries.
        # The formatting below relies on the XlsxWriter API (`add_format`, `set_column`, `set_row`),
        # so the engine is requested explicitly instead of depending on the installed packages.
        with pd.ExcelWriter(
            "../experiments/" + task["implementation"] + "/fmc_description___" + experiment.split(".")[0] + ".xlsx",
            engine="xlsxwriter",
        ) as writer:

            # Define cell formats once for the whole workbook.
            workbook = writer.book
            format_header = workbook.add_format({"text_wrap": True, "bold": True})
            format_cluster = workbook.add_format({"text_wrap": True})
            format_description = workbook.add_format({"text_wrap": True, "bold": True, "valign": "top", "font_color": "orange"})
            format_confusion = workbook.add_format({"text_wrap": True, "valign": "top"})
            format_cluster_analysis = workbook.add_format({"text_wrap": True, "bold": True, "valign": "top", "bg_color": "#97D2D4"})
            format_fmc_analysis = workbook.add_format({"text_wrap": True, "bold": True, "valign": "top", "bg_color": "#EBDB5E"})

            # Export all iterations, one iteration per sheet.
            for iteration_key, df_export in df_exports.items():
                # Export data.
                df_export.to_excel(
                    excel_writer=writer,
                    sheet_name=iteration_key,
                    index=False,
                )
                # Format data.
                worksheet = writer.sheets[iteration_key]
                worksheet.set_row(row=0, height=None, cell_format=format_header)
                worksheet.set_column(first_col=0, last_col=3, width=10, cell_format=None)  # identifiers
                worksheet.set_column(first_col=4, last_col=4, width=70, cell_format=format_cluster)  # cluster
                worksheet.set_column(first_col=5, last_col=5, width=25, cell_format=format_description)  # fmc_description
                worksheet.set_column(first_col=6, last_col=6, width=70, cell_format=format_cluster)  # cluster_emphasized
                worksheet.set_column(first_col=7, last_col=7, width=25, cell_format=format_confusion)  # confusion
                worksheet.set_column(first_col=8, last_col=9, width=20, cell_format=format_cluster_analysis)  # [Q] cluster
                worksheet.set_column(first_col=10, last_col=11, width=20, cell_format=format_fmc_analysis)  # [Q] FMC
                # One tall row per cluster (row 0 is the header).
                for row_index in range(1, len(df_export) + 1):
                    worksheet.set_row(row=row_index, height=150, cell_format=None)

Manually analyze each cluster and its FMC description.

1. **Questions for annotators/reviewers**:
    - **on cluster analysis**:
        - `is this cluster well-designed / consistent / suited for training ?` (_all well-designed clusters are accepted, even if they do not match the groundtruth_).
        - `if so, what is the single main topic of this cluster ?`
    - **on FMC description analysis**:
        - `do these linguistic patterns represent a single main topic ?`
        - `if so, what is the single main topic of this description ?`

2. **Notation scale**:
    - `not_exploitable`:
        - definition is not clear: the cluster has no main topic or too many topics.
        - so content is not trainable: the cluster is too small or too big.
        - $\Rightarrow$ it's absolutely not usable.
    - `partially_exploitable`:
        - definition can be deduced: the cluster has several topic ideas but lacks relevance.
        - so content is not trainable: the cluster is not usable as is.
        - $\Rightarrow$ it gives some clues to manually define and create an exploitable cluster.
    - `exploitable`:
        - definition is clear: the cluster has one identifiable main topic.
        - content is trainable: the cluster is consistent, has enough data and few intruders.
        - $\Rightarrow$ the cluster can be used with little manual editing.

/!\ Annotations are expected in the `fmc_description___XXXX_checked.xlsx` file /!\
- **cluster analysis** is annotated in columns `[Q] cluster well-designed?` and `[Q] cluster main topic?`.
- **FMC description analysis** is annotated in columns `[Q] FMC description relevant?` and `[Q] FMC description main topic?`.

Conclusions of this manual analysis.

In [None]:
# Columns of the annotation sheets (same columns as the exported FMC description sheets).
list_of_annotation_columns: List[str] = [
    "implementation_id", "experiment_id", "iteration_id", "cluster_id", "cluster", "fmc_description", "cluster_emphasized", "confusion",
    "[Q] cluster well-designed?", "[Q] cluster main topic?", "[Q] FMC description relevant?", "[Q] FMC description main topic?",
]

# Annotation labels used in the column "[Q] cluster well-designed?".
list_of_relevances: List[str] = ["exploitable", "partially_exploitable", "not_exploitable"]

# For each implementation.
for task in LIST_OF_TASKS:
    print("IMPLEMENTATION:", task["implementation"])

    # Check if annotations are performed.
    if task["implementation"] != "bank_cards_v1_-_settings_3_favorite":
        print("--> SKIP: task not annotated")
        continue

    ###
    ### GET ANNOTATIONS
    ###
    print("    ", "Get annotations")

    # Initialize annotations loading: one dataframe per experiment and per iteration.
    df_fmc_annotations: Dict[str, Dict[str, pd.DataFrame]] = {}

    # For each experiment.
    for experiment in task["experiments"]:

        # Load FMC descriptions: their keys are the analyzed iterations, i.e. the sheet names to read.
        # (The LLM summary file was loaded here before, but it is only created in section 3.3.)
        with open("../experiments/" + task["implementation"] + "/fmc_description___" + experiment, "r") as file_fmc_description_r:
            fmc_descriptions: Dict[str, Dict[str, Optional[List[str]]]] = json.load(file_fmc_description_r)

        # Load annotations.
        df_fmc_annotations[experiment] = {
            iteration: pd.read_excel(
                io="../experiments/" + task["implementation"] + "/fmc_description___" + experiment.split(".")[0] + "_checked.xlsx",
                sheet_name=iteration,
                engine="openpyxl",
            )
            for iteration in fmc_descriptions.keys()
        }

    # Group by relevance (annotation label).
    # Selections are collected in lists and concatenated once:
    # `DataFrame.append` does not exist anymore in pandas 2.
    frames_by_relevance: Dict[str, List[pd.DataFrame]] = {relevance: [] for relevance in list_of_relevances}
    for experiment in df_fmc_annotations.keys():
        for iteration in df_fmc_annotations[experiment].keys():
            df_annotations: pd.DataFrame = df_fmc_annotations[experiment][iteration]
            for relevance in list_of_relevances:
                frames_by_relevance[relevance].append(
                    df_annotations[df_annotations["[Q] cluster well-designed?"] == relevance]
                )
    df_fmc_annotations_by_relevance: Dict[str, pd.DataFrame] = {
        relevance: (
            pd.concat(frames_by_relevance[relevance], ignore_index=True)
            if frames_by_relevance[relevance]
            else pd.DataFrame(columns=list_of_annotation_columns)
        )
        for relevance in list_of_relevances
    }

    # Compute number of FMC patterns identified and print mean/min/max for each relevance category.
    for key in df_fmc_annotations_by_relevance.keys():
        # Patterns are stored one per line as "- pattern"; "_x000D_\n" is the line separator found
        # when the exported sheet is read back. `str()` makes an empty (NaN) cell count zero pattern.
        df_fmc_annotations_by_relevance[key]["len(fmc_description)"] = [
            len([
                pattern
                for pattern in str(fmc_description).split("_x000D_\n")
                if len(pattern.split("- ")) == 2
            ])
            for fmc_description in df_fmc_annotations_by_relevance[key]["fmc_description"]
        ]

        # No statistics can be computed when no cluster has this annotation label.
        if df_fmc_annotations_by_relevance[key].empty:
            print("    ", "FMC description size for '{0}':".format(key), "(no cluster with this annotation)")
            continue

        print(
            "    ", "FMC description size for '{0}':".format(key),
            "Mean: {0:.1f}".format(df_fmc_annotations_by_relevance[key]["len(fmc_description)"].mean()),
            "Min: {0:.1f}".format(min(df_fmc_annotations_by_relevance[key]["len(fmc_description)"])),
            "Max: {0:.1f}".format(max(df_fmc_annotations_by_relevance[key]["len(fmc_description)"])),
        )

----------
### 3.3. Summarize clusters by a large language model.

Load Python dependencies.

In [None]:
import openai
import openpyxl
import pandas as pd
import time
import numpy as np
from scipy import stats as scipystats
from matplotlib import pyplot as plt
from matplotlib.figure import Figure

Load credentials.
> Need a file `credentials.py` in project home (`../..`)
> with content `OPENAI_API_TOKEN = "..."` from https://platform.openai.com/account/api-keys

In [None]:
import sys
# Add the folder two levels up to the module search path, so that `credentials` can be imported.
sys.path.append("../..")
import credentials  # Need a file `credentials.py` in project home (`../..`) with content `OPENAI_API_TOKEN = "..."` from https://platform.openai.com/account/api-keys
# Register the API token on the `openai` module.
openai.api_key = credentials.OPENAI_API_TOKEN

Define large language model to call and prompt template

In [None]:
# Model to use: passed as the `model` argument of the chat completion request.
OPENAI_MODEL: str = "gpt-3.5-turbo"

In [None]:
# Prompts to use (kept in French; English glosses are given for readers).
# System message, i.e. "You are an expert in the banking, insurance and finance sectors."
prompt_context: str = "Tu es un expert des secteurs banque, assurance et finance."
# User instruction, i.e. "Summarize in one sentence the topic addressed in the following texts";
# the texts of the cluster are appended after it when the request is built.
prompt_task: str = "Résume-moi en une phrase la thématique traitée dans les textes suivants"

Summarize clusters.

In [None]:
# Maximum number of attempts for one LLM request; the request is attempted again when it raises (e.g. on timeout).
MAX_RETRY: int = 5

In [None]:
# Summarize every cluster of the selected iterations with the LLM.
# Summaries are written back to the `llm_summary___<experiment>` JSON file after each cluster,
# so an interrupted run can be resumed: clusters that already have a summary are skipped.
# For each implementation.
for task in LIST_OF_TASKS:
    print("IMPLEMENTATION:", task["implementation"])

    # For each experiments.
    for experiment in task["experiments"]:
        print("    ", "experiment:", experiment)

        # Paths of the experiment results (input) and of the LLM summaries (resumable output).
        path_experiment_data: str = "../experiments/" + task["implementation"] + "/previous_results___" + experiment
        path_llm_summary: str = "../experiments/" + task["implementation"] + "/llm_summary___" + experiment

        # Load data.
        with open(path_experiment_data, "r") as file_experiment_data_r:
            experiment_data: Dict[str, Any] = json.load(file_experiment_data_r)
        dict_of_texts: Dict[str, str] = experiment_data["dict_of_texts"]
        # Cluster ids are compared against `int(cluster_id)` below, hence the `int` value type.
        dict_of_clustering_results: Dict[str, Dict[str, int]] = experiment_data["dict_of_clustering_results"]

        # Initialize clustering summaries if need.
        if not os.path.exists(path_llm_summary):
            # Define list of iteration to analyze: one iteration out of five, plus the last one.
            # NOTE: the last iteration is the greatest key in string order (as in the original code).
            last_iteration: str = max(dict_of_clustering_results.keys(), default="")
            list_of_iteration: List[str] = [
                i
                for i in dict_of_clustering_results.keys()
                if int(i) % 5 == 0 or i == last_iteration
            ]
            # Define initial clustering summaries for each cluster of each iteration to analyze.
            initial_clustering_summaries: Dict[str, Dict[str, Optional[str]]] = {
                iteration: {
                    str(cluster_id): None  # need to force cluster_id to str for serialization
                    for cluster_id in sorted(set(dict_of_clustering_results[iteration].values()))
                }
                for iteration in list_of_iteration
            }
            # Store initial clustering summaries.
            with open(path_llm_summary, "w") as file_initial_llm_summaries_w:
                json.dump(initial_clustering_summaries, file_initial_llm_summaries_w)

        # Load clustering summaries already done.
        with open(path_llm_summary, "r") as file_llm_summary_r:
            summaries: Dict[str, Dict[str, Optional[str]]] = json.load(file_llm_summary_r)

        # For each iteration...
        for iteration in summaries.keys():
            print("\n    ", "    ", "iteration:", iteration)

            # For each cluster
            for cluster_id in summaries[iteration].keys():
                print("    ", "    ", "    ", cluster_id, end=": ")

                # If already done: continue
                if summaries[iteration][str(cluster_id)] is not None:
                    print("(already done)")
                    continue

                # Get texts of this cluster.
                cluster = [
                    text
                    for text_id, text in dict_of_texts.items()
                    if dict_of_clustering_results[iteration][text_id] == int(cluster_id)
                ]
                print("(cluster length: {0})".format(len(cluster)), end=" ; ")

                # Build the request once: it is identical for every attempt.
                messages = [
                    {
                        "role": "system",
                        "content": "{context}".format(
                            context=prompt_context,
                        )
                    },
                    {
                        "role": "user",
                        "content": "{task} :\n\n{data}".format(
                            task=prompt_task,
                            data="- " + "\n- ".join(cluster),
                        )
                    }
                ]

                # Call the LLM to summarize the cluster (use loop to by-pass timeout).
                # Success is detected by the presence of an answer, not by the attempt counter:
                # a call that succeeds on the very last attempt must not be reported as a failure.
                chat_answers = None
                last_err: Optional[Exception] = None
                for _attempt in range(MAX_RETRY):
                    time.sleep(21)  # To avoid "Rate limit reached" => "default-gpt-3.5-turbo" limited at 3 request per minute.
                    try:
                        # Create a chat completion.
                        chat_answers = openai.ChatCompletion.create(
                            model=OPENAI_MODEL,
                            messages=messages,
                        )
                        break
                    except Exception as err:  # best effort: any failure triggers another attempt.
                        last_err = err

                # If every attempt failed: print the last error and go to the next cluster.
                # The summary stays `None` in the file, so a later run will try this cluster again.
                if chat_answers is None:
                    print(last_err)
                    continue

                # Get summary from answer.
                cluster_summary: str = chat_answers.choices[0].message.content
                print(cluster_summary[:100], "[...]")

                # Store updated summaries.
                summaries[iteration][str(cluster_id)] = cluster_summary
                with open(path_llm_summary, "w") as file_llm_summary_w:
                    json.dump(summaries, file_llm_summary_w)

Export in XLSX file.

In [None]:
# Export clusters, their LLM summaries and their confusion with the true intents to one XLSX
# workbook per experiment (one sheet per iteration). The "[Q] ..." columns are left empty:
# they are meant to be filled in by the annotators.
# For each implementation.
for task in LIST_OF_TASKS:
    print("IMPLEMENTATION:", task["implementation"])

    # For each experiments.
    for experiment in task["experiments"]:
        #if os.path.isfile("../experiments/" + task["implementation"] + "/llm_summary___" + experiment.split(".")[0] + ".xlsx"):
        #    print("    ", "experiment:", experiment, "--> SKIP: export already done")
        #    continue
        print("    ", "experiment:", experiment)

        # Load data.
        with open("../experiments/" + task["implementation"] + "/previous_results___" + experiment, "r") as file_experiment_data_r:
            experiment_data: Dict[str, Any] = json.load(file_experiment_data_r)
        dict_of_texts: Dict[str, str] = experiment_data["dict_of_texts"]
        dict_of_true_intents: Dict[str, str] = experiment_data["dict_of_true_intents"]
        # Cluster ids are compared against `int(cluster_id)` below, hence the `int` value type.
        dict_of_clustering_results: Dict[str, Dict[str, int]] = experiment_data["dict_of_clustering_results"]

        # Load clustering summaries.
        with open("../experiments/" + task["implementation"] + "/llm_summary___" + experiment, "r") as file_llm_summary_r:
            summaries: Dict[str, Dict[str, Optional[str]]] = json.load(file_llm_summary_r)

        # Columns of every exported sheet (the order defines the column indexes formatted below).
        export_columns: List[str] = [
            "implementation_id", "experiment_id", "iteration_id", "cluster_id", "cluster", "summary", "confusion",
            "[Q] cluster well-designed?", "[Q] cluster main topic?", "[Q] summary relevant?", "[Q] summary main topic?",
        ]

        # Initialize list of exports.
        df_exports: Dict[str, pd.DataFrame] = {}

        # For each iteration...
        for iteration in summaries.keys():

            # Rows are collected in a list and the DataFrame is built once:
            # `DataFrame.append` was removed in pandas 2.0.
            export_rows: List[Dict[str, Any]] = []

            # For each cluster.
            for cluster_id in summaries[iteration].keys():

                # Get ids of the texts of this cluster (computed once, in `dict_of_texts` order).
                cluster_text_ids: List[str] = [
                    text_id
                    for text_id in dict_of_texts.keys()
                    if dict_of_clustering_results[iteration][text_id] == int(cluster_id)
                ]

                # Estimate confusion: number of texts of this cluster for each true label present in it.
                true_label_counts: Dict[str, int] = {}
                for text_id in cluster_text_ids:
                    true_label = dict_of_true_intents[text_id]
                    true_label_counts[true_label] = true_label_counts.get(true_label, 0) + 1

                # Update export data.
                export_rows.append({
                    "implementation_id": task["implementation"],
                    "experiment_id": experiment.split(".")[0],
                    "iteration_id": iteration,
                    "cluster_id": cluster_id,
                    # Multi-line cells start with a quote character, as in the original export.
                    "cluster": "'" + "\r\n".join([
                        "- {0}".format(dict_of_texts[text_id])
                        for text_id in cluster_text_ids
                    ]),
                    "summary": summaries[iteration][cluster_id],
                    "confusion": "'" + "\r\n".join([
                        "- {0}: {1}".format(true_label, true_label_counts[true_label])
                        for true_label in sorted(true_label_counts)
                    ]),
                    "[Q] cluster well-designed?": "",
                    "[Q] cluster main topic?": "",
                    "[Q] summary relevant?": "",
                    "[Q] summary main topic?": "",
                })

            # Build export data (keeps the expected columns even when there is no cluster).
            df_exports[iteration] = pd.DataFrame(export_rows, columns=export_columns)

        # Export summaries.
        # The formatting below relies on the XlsxWriter API (`add_format`, `set_column`, `set_row`),
        # so the engine is requested explicitly instead of relying on the pandas default.
        with pd.ExcelWriter(
            "../experiments/" + task["implementation"] + "/llm_summary___" + experiment.split(".")[0] + ".xlsx",
            engine="xlsxwriter",
        ) as writer:

            # Define cell formats (they belong to the workbook, so they are shared by all sheets).
            workbook = writer.book
            format_header = workbook.add_format({"text_wrap": True, "bold": True})
            format_cluster = workbook.add_format({"text_wrap": True})
            format_summary = workbook.add_format({"text_wrap": True, "bold": True, "valign": "top", "font_color": "blue"})
            format_confusion = workbook.add_format({"text_wrap": True, "valign": "top"})
            format_cluster_analysis = workbook.add_format({"text_wrap": True, "bold": True, "valign": "top", "bg_color": "#97D2D4"})
            format_llm_analysis = workbook.add_format({"text_wrap": True, "bold": True, "valign": "top", "bg_color": "#EBDB5E"})

            # Export all iterations, one iteration per sheet.
            for iteration_key, df_export in df_exports.items():
                # Export data.
                df_export.to_excel(
                    excel_writer=writer,
                    sheet_name=iteration_key,
                    index=False,
                )
                # Format data.
                worksheet = writer.sheets[iteration_key]
                worksheet.set_row(row=0, height=None, cell_format=format_header)
                worksheet.set_column(first_col=0, last_col=3, width=10, cell_format=None)  # ids
                worksheet.set_column(first_col=4, last_col=4, width=70, cell_format=format_cluster)  # cluster
                worksheet.set_column(first_col=5, last_col=5, width=40, cell_format=format_summary)  # summary
                worksheet.set_column(first_col=6, last_col=6, width=25, cell_format=format_confusion)  # confusion
                worksheet.set_column(first_col=7, last_col=8, width=20, cell_format=format_cluster_analysis)  # [Q] cluster
                worksheet.set_column(first_col=9, last_col=10, width=20, cell_format=format_llm_analysis)  # [Q] summary
                # Data rows start at row 1 (row 0 is the header).
                for row_index in range(1, len(df_export) + 1):
                    worksheet.set_row(row=row_index, height=150, cell_format=None)

Manually analyze each cluster and its summary.

1. **Questions for annotators/reviewers**:
    - **on cluster analysis**:
        - `is this cluster well-designed / consistent / suited for training ?` (_all well-designed clusters are accepted, even if they do not match the ground truth_).
        - `if so, what is the single main topic of this cluster ?`
    - **on summary analysis**:
        - `does this summary represent a single main topic ?`
        - `if so, what is the single main topic of this summary ?`

2. **Rating scale**:
    - `not_exploitable`:
        - definition is not clear: the cluster has no main topic or too many topics.
        - so content is not trainable: the cluster is too small or too big.
        - $\Rightarrow$ it's absolutely not usable.
    - `partially_exploitable`:
        - definition can be deduced: the cluster covers several topics or ideas but lacks relevance.
        - so content is not trainable: the cluster is not usable as is.
        - $\Rightarrow$ it gives some clues to manually define and create an exploitable cluster.
    - `exploitable`:
        - definition is clear: the cluster has one identifiable main topic.
        - content is trainable: the cluster is consistent, has enough data and few intruders.
        - $\Rightarrow$ the cluster can be used with little manual editing.

/!\ Annotations are expected in the `llm_summary___XXXX_checked.xlsx` file /!\
- **cluster analysis** are annotated in columns `[Q] cluster well-designed?` and `[Q] cluster main topic?`.
- **summary analysis** are annotated in columns `[Q] summary relevant?` and `[Q] summary main topic?`.

Conclusions of this manual analysis.

In [None]:
# TODO: change and create a python file for llm summary relevance display

In [None]:
def add_plot_of_evolution_per_iteration_to_graph(
    axis,
    list_of_x: List[str],
    dict_of_y: Dict[str, float],
    dict_of_y_err: Optional[Dict[str, float]] = None,
    label: str = "",
    marker: str = "",
    markersize: int = 5,
    color: str = "black",
    linewidth: int = 2,
    linestyle: str = "-",
    alpha: float = 0.2,
):
    """
    Draw one curve on an axis, with an optional band of `y - err` to `y + err` around it.

    Args:
        axis (): The object to draw on; its `plot` and `fill_between` methods are called (presumably a matplotlib axis).
        list_of_x (List[str]): The abscissas, in drawing order. Each one is converted with `float` and is also the key used to read `dict_of_y` and `dict_of_y_err`.
        dict_of_y (Dict[str, float]): The ordinate of the curve for each abscissa.
        dict_of_y_err (Optional[Dict[str, float]]): The half-height of the band for each abscissa. If `None`, no band is drawn. Defaults to `None`.
        label (str): The label of the curve. Defaults to `""`.
        marker (str): The marker of the curve points. Defaults to `""`.
        markersize (int): The size of the markers. Defaults to `5`.
        color (str): The color of the curve, of its marker faces and of the band. Defaults to `"black"`.
        linewidth (int): The width of the curve. Defaults to `2`.
        linestyle (str): The style of the curve. Defaults to `"-"`.
        alpha (float): The opacity of the band. Defaults to `0.2`.
    """
    # Coordinates of the curve.
    abscissas = [float(x) for x in list_of_x]
    ordinates = [dict_of_y[x] for x in list_of_x]

    # Curve.
    axis.plot(
        abscissas,
        ordinates,
        label=label,
        marker=marker,
        markerfacecolor=color,
        markersize=markersize,
        color=color,
        linewidth=linewidth,
        linestyle=linestyle,
    )

    # No error given: nothing more to draw.
    if dict_of_y_err is None:
        return

    # Band around the curve.
    lower_bounds = [dict_of_y[x] - dict_of_y_err[x] for x in list_of_x]
    upper_bounds = [dict_of_y[x] + dict_of_y_err[x] for x in list_of_x]
    axis.fill_between(
        abscissas,
        y1=lower_bounds,
        y2=upper_bounds,
        color=color,
        alpha=alpha,
    )

In [None]:
# For each implementation.
for task in LIST_OF_TASKS:
    print("IMPLEMENTATION:", task["implementation"])
    
    # Check if annotations are performed.
    if task["implementation"] != "bank_cards_v1_-_settings_3_favorite":
        print("--> SKIP: task not annotated")
        continue
        
    ###
    ### GET ANNOTATIONS
    ###
    print("    ", "Get annotations")
        
    # Initialize annotations loading.
    df_llm_annotations: Dict[str, Dict[str, pd.DataFrame]] = {}
    
    # For each experiments.
    for experiment in task["experiments"]:

        # Load clustering summaries.
        with open("../experiments/" + task["implementation"] + "/llm_summary___" + experiment, "r") as file_llm_summary_r:
            summaries: Dict[str, Dict[str, Optional[str]]] = json.load(file_llm_summary_r)
        
        # Load annotations.
        df_llm_annotations[experiment] = {
            iteration: pd.read_excel(
                io="../experiments/" + task["implementation"] + "/llm_summary___" + experiment.split(".")[0] + "_checked.xlsx",
                sheet_name=iteration,
                engine="openpyxl",
            )
            for iteration in summaries.keys()
        }
        
    # Get list of iterations to analyze.
    max_iteration: str = max(set(
        iteration_2
        for experiment_2 in df_llm_annotations.keys()
        for iteration_2 in df_llm_annotations[experiment_2].keys()
    ))
    list_of_iterations: List[str] = [
        str(iteration_3).zfill(4)
        for iteration_3 in range(0, 51, 5)
    ]
        
    ###
    ### GET PERFORMANCE
    ###
    print("    ", "Get performance")
        
    # Initialize storage of experiment performances for all iterations.
    dict_of_performances_evolution_per_iteration: Dict[str, List[float]] = {
        iter_perf: []
        for iter_perf in list_of_iterations
    }
    
    # For each experiments.
    for experiment_perf in task["experiments"]:
        
        # Load data.
        with open("../experiments/" + task["implementation"] + "/previous_results___" + experiment_perf, "r") as file_experiment_data_r:
            experiment_data: Dict[str, Any] = json.load(file_experiment_data_r)
        dict_of_clustering_performances: Dict[str, Dict[str, float]] = experiment_data["dict_of_clustering_performances"]

        # For each requested iteration...
        for iter_perf in list_of_iterations:

            # Append the clustering performance for the current experiment and for this iteration.
            if iter_perf in dict_of_clustering_performances.keys():
                dict_of_performances_evolution_per_iteration[iter_perf].append(
                    dict_of_clustering_performances[iter_perf]["v_measure"]
                )
            # If iteration isn't reached by this experiment, duplicate the last known results.
            # Most of the time: the experiment has reached annotation completeness and there is no more iteration because clustering is "perfect" (v-measure==1.0).
            else:
                dict_of_performances_evolution_per_iteration[iter_perf].append(1.0)
                
    # Compute mean of performance evolution.
    dict_of_performances_evolution_per_iteration_MEAN: Dict[str, Dict[str, float]] = {
        iteration_0m: np.mean(dict_of_performances_evolution_per_iteration[iteration_0m])
        for iteration_0m in dict_of_performances_evolution_per_iteration.keys()
    }
        
    # Compute sem of performance evolution.
    dict_of_performances_evolution_per_iteration_SEM: Dict[str, Dict[str, float]] = {
        iteration_0s: scipystats.sem(dict_of_performances_evolution_per_iteration[iteration_0s])
        for iteration_0s in dict_of_performances_evolution_per_iteration.keys()
    }


    ###
    ### ANALYZE CLUSTERING ANNOTATIONS
    ###
    print("    ", "Analyze clustering annotation")
        
    # Analyze clustering annotations.
    dict_of_clustering_relevance: Dict[str, Dict[str, List[float]]] = {}
    for iteration_4 in list_of_iterations:
        dict_of_clustering_relevance[iteration_4] = {
            "not_exploitable": [],
            "partially_exploitable": [],
            "exploitable": [],
        }
        # For all experiment...
        for experiment_4 in df_llm_annotations.keys():
            # Case of iteration exist in this experiment.
            if iteration_4 in df_llm_annotations[experiment_4].keys():
                # Count ratio of not exploitable clusters.
                dict_of_clustering_relevance[iteration_4]["not_exploitable"].append(
                    sum([
                        annotation == "not_exploitable"
                        for annotation in df_llm_annotations[experiment_4][iteration_4]["[Q] cluster well-designed?"]
                    ]) / len(df_llm_annotations[experiment_4][iteration_4]["cluster_id"])
                )
                # Count ratio of partially exploitable clusters.
                dict_of_clustering_relevance[iteration_4]["partially_exploitable"].append(
                    sum([
                        annotation == "partially_exploitable"
                        for annotation in df_llm_annotations[experiment_4][iteration_4]["[Q] cluster well-designed?"]
                    ]) / len(df_llm_annotations[experiment_4][iteration_4]["cluster_id"])
                )
                # Count ratio of exploitable clusters.
                dict_of_clustering_relevance[iteration_4]["exploitable"].append(
                    sum([
                        annotation == "exploitable"
                        for annotation in df_llm_annotations[experiment_4][iteration_4]["[Q] cluster well-designed?"]
                    ]) / len(df_llm_annotations[experiment_4][iteration_4]["cluster_id"])
                )
            # Case of iteration doesn't exist in this experiment, so take the last existing iteration.
            else:
                # Define the last existing iteration.
                max_experiment_iteration_4: str = max(df_llm_annotations[experiment_4].keys())
                # Count ratio of not exploitable clusters.
                dict_of_clustering_relevance[iteration_4]["not_exploitable"].append(
                    sum([
                        annotation == "not_exploitable"
                        for annotation in df_llm_annotations[experiment_4][max_experiment_iteration_4]["[Q] cluster well-designed?"]
                    ]) / len(df_llm_annotations[experiment_4][max_experiment_iteration_4]["cluster_id"])
                )
                # Count ratio of partially exploitable clusters.
                dict_of_clustering_relevance[iteration_4]["partially_exploitable"].append(
                    sum([
                        annotation == "partially_exploitable"
                        for annotation in df_llm_annotations[experiment_4][max_experiment_iteration_4]["[Q] cluster well-designed?"]
                    ]) / len(df_llm_annotations[experiment_4][max_experiment_iteration_4]["cluster_id"])
                )
                # Count ratio of exploitable clusters.
                dict_of_clustering_relevance[iteration_4]["exploitable"].append(
                    sum([
                        annotation == "exploitable"
                        for annotation in df_llm_annotations[experiment_4][max_experiment_iteration_4]["[Q] cluster well-designed?"]
                    ]) / len(df_llm_annotations[experiment_4][max_experiment_iteration_4]["cluster_id"])
                )
                
    # Compute mean of clustering relevance.
    dict_of_clustering_relevance_MEAN: Dict[str, Dict[str, float]] = {
        iteration_4m: {
            level: np.mean(dict_of_clustering_relevance[iteration_4m][level])
            for level in dict_of_clustering_relevance[iteration_4m].keys()
        }
        for iteration_4m in dict_of_clustering_relevance.keys()
    }
        
    # Compute sem of clustering relevance.
    dict_of_clustering_relevance_SEM: Dict[str, Dict[str, float]] = {
        iteration_4s: {
            level: scipystats.sem(dict_of_clustering_relevance[iteration_4s][level])
            for level in dict_of_clustering_relevance[iteration_4s].keys()
        }
        for iteration_4s in dict_of_clustering_relevance.keys()
    }
    
    # Create a new figure.
    fig_plot_clustering_relevance: Figure = plt.figure(figsize=(15, 7.5), dpi=300)
    axis_plot_clustering_relevance = fig_plot_clustering_relevance.gca()

    # Set range of axis.
    axis_plot_clustering_relevance.set_xlim(xmin=-0.5, xmax=int(max(list_of_iterations))+0.5)
    axis_plot_clustering_relevance.set_ylim(ymin=-0.01, ymax=1.01)
    
    # Plot not_exploitable clustering relevance evolution.
    add_plot_of_evolution_per_iteration_to_graph(
        axis=axis_plot_clustering_relevance,
        list_of_x=list_of_iterations,
        dict_of_y={
            iter_plot: dict_of_clustering_relevance_MEAN[iter_plot]["not_exploitable"]
            for iter_plot in dict_of_clustering_relevance_MEAN.keys()
        },
        dict_of_y_err={
            iter_plot: dict_of_clustering_relevance_SEM[iter_plot]["not_exploitable"]
            for iter_plot in dict_of_clustering_relevance_SEM.keys()
        },
        label="Non exploitable",
        marker="",
        markersize=3,
        color="red",
        linewidth=2,
        linestyle="--",
        alpha=0.2,
    )
    
    # Plot partially_exploitable clustering relevance evolution.
    add_plot_of_evolution_per_iteration_to_graph(
        axis=axis_plot_clustering_relevance,
        list_of_x=list_of_iterations,
        dict_of_y={
            iter_plot: dict_of_clustering_relevance_MEAN[iter_plot]["partially_exploitable"]
            for iter_plot in dict_of_clustering_relevance_MEAN.keys()
        },
        dict_of_y_err={
            iter_plot: dict_of_clustering_relevance_SEM[iter_plot]["partially_exploitable"]
            for iter_plot in dict_of_clustering_relevance_SEM.keys()
        },
        label="Partiellement exploitable",
        marker="",
        markersize=3,
        color="orange",
        linewidth=2,
        linestyle="--",
        alpha=0.2,
    )
    
    # Plot exploitable clustering relevance evolution.
    add_plot_of_evolution_per_iteration_to_graph(
        axis=axis_plot_clustering_relevance,
        list_of_x=list_of_iterations,
        dict_of_y={
            iter_plot: dict_of_clustering_relevance_MEAN[iter_plot]["exploitable"]
            for iter_plot in dict_of_clustering_relevance_MEAN.keys()
        },
        dict_of_y_err={
            iter_plot: dict_of_clustering_relevance_SEM[iter_plot]["exploitable"]
            for iter_plot in dict_of_clustering_relevance_SEM.keys()
        },
        label="Exploitable",
        marker="",
        markersize=3,
        color="green",
        linewidth=2,
        linestyle="--",
        alpha=0.2,
    )

    # Set axis name.
    axis_plot_clustering_relevance.set_xlabel("itération [#]", fontsize=18,)
    axis_plot_clustering_relevance.set_ylabel("ratio de clusters [%]", fontsize=18,)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)

    # Plot the legend.
    axis_plot_clustering_relevance.legend(fontsize=15, loc="lower right",)

    # Plot the grid.
    axis_plot_clustering_relevance.grid(True)

    # Store the graph.
    fig_plot_clustering_relevance.savefig(
        "../results/etude-pertinence-llm-check-clustering-annotation-favori.png",
        dpi=300,
        transparent=True,
        bbox_inches="tight",
    )


    ###
    ### ANALYSE SUMMARY ANNOTATIONS
    ###
    print("    ", "Analyze summary annotation")
    
    # Analyze summary annotations.
    dict_of_summary_relevance: Dict[str, Dict[str, List[float]]] = {}
    for iteration_5 in list_of_iterations:
        dict_of_summary_relevance[iteration_5] = {
            "not_exploitable": [],
            "partially_exploitable": [],
            "exploitable": [],
        }
        # For all experiment...
        for experiment_5 in df_llm_annotations.keys():
            # Case of iteration exist in this experiment.
            if iteration_5 in df_llm_annotations[experiment_5].keys():
                # Count ratio of not exploitable clusters.
                dict_of_summary_relevance[iteration_5]["not_exploitable"].append(
                    sum([
                        annotation == "not_exploitable"
                        for annotation in df_llm_annotations[experiment_5][iteration_5]["[Q] summary relevant?"]
                    ]) / len(df_llm_annotations[experiment_5][iteration_5]["cluster_id"])
                )
                # Count ratio of partially exploitable clusters.
                dict_of_summary_relevance[iteration_5]["partially_exploitable"].append(
                    sum([
                        annotation == "partially_exploitable"
                        for annotation in df_llm_annotations[experiment_5][iteration_5]["[Q] summary relevant?"]
                    ]) / len(df_llm_annotations[experiment_5][iteration_5]["cluster_id"])
                )
                # Count ratio of exploitable clusters.
                dict_of_summary_relevance[iteration_5]["exploitable"].append(
                    sum([
                        annotation == "exploitable"
                        for annotation in df_llm_annotations[experiment_5][iteration_5]["[Q] summary relevant?"]
                    ]) / len(df_llm_annotations[experiment_5][iteration_5]["cluster_id"])
                )
            # Case of iteration doesn't exist in this experiment, so take the last existing iteration.
            else:
                # Define the last existing iteration.
                max_experiment_iteration_5: str = max(df_llm_annotations[experiment_5].keys())
                # Count ratio of not exploitable clusters.
                dict_of_summary_relevance[iteration_5]["not_exploitable"].append(
                    sum([
                        annotation == "not_exploitable"
                        for annotation in df_llm_annotations[experiment_5][max_experiment_iteration_5]["[Q] summary relevant?"]
                    ]) / len(df_llm_annotations[experiment_5][max_experiment_iteration_5]["cluster_id"])
                )
                # Count ratio of partially exploitable clusters.
                dict_of_summary_relevance[iteration_5]["partially_exploitable"].append(
                    sum([
                        annotation == "partially_exploitable"
                        for annotation in df_llm_annotations[experiment_5][max_experiment_iteration_5]["[Q] summary relevant?"]
                    ]) / len(df_llm_annotations[experiment_5][max_experiment_iteration_5]["cluster_id"])
                )
                # Count ratio of exploitable clusters.
                dict_of_summary_relevance[iteration_5]["exploitable"].append(
                    sum([
                        annotation == "exploitable"
                        for annotation in df_llm_annotations[experiment_5][max_experiment_iteration_5]["[Q] summary relevant?"]
                    ]) / len(df_llm_annotations[experiment_5][max_experiment_iteration_5]["cluster_id"])
                )

    # Compute mean of summary relevance.
    dict_of_summary_relevance_MEAN: Dict[str, Dict[str, float]] = {
        iteration_5m: {
            level: np.mean(dict_of_summary_relevance[iteration_5m][level])
            for level in dict_of_summary_relevance[iteration_5m].keys()
        }
        for iteration_5m in dict_of_summary_relevance.keys()
    }

    # Compute sem of summary relevance.
    dict_of_summary_relevance_SEM: Dict[str, Dict[str, float]] = {
        iteration_5s: {
            level: scipystats.sem(dict_of_summary_relevance[iteration_5s][level])
            for level in dict_of_summary_relevance[iteration_5s].keys()
        }
        for iteration_5s in dict_of_summary_relevance.keys()
    }
        
    # Create a new figure and grab its (single) axis.
    fig_plot_summary_relevance: Figure = plt.figure(figsize=(15, 7.5), dpi=300)
    axis_plot_summary_relevance = fig_plot_summary_relevance.gca()

    # Set range of axis: x spans the iterations (with a half-step margin),
    # y spans the ratio range [0, 1] (with a small margin).
    axis_plot_summary_relevance.set_xlim(xmin=-0.5, xmax=int(max(list_of_iterations))+0.5)
    axis_plot_summary_relevance.set_ylim(ymin=-0.01, ymax=1.01)

    # Plot the evolution of each relevance level, in this order:
    # not exploitable, partially exploitable, exploitable.
    # Each entry is (key in the MEAN/SEM dictionaries, legend label, curve color).
    # The three curves share every other drawing option, so they are drawn
    # from one loop; each dictionary is iterated over its own keys.
    for summary_relevance_level, summary_relevance_label, summary_relevance_color in (
        ("not_exploitable", "Non exploitable", "red"),
        ("partially_exploitable", "Partiellement exploitable", "orange"),
        ("exploitable", "Exploitable", "green"),
    ):
        add_plot_of_evolution_per_iteration_to_graph(
            axis=axis_plot_summary_relevance,
            list_of_x=list_of_iterations,
            dict_of_y={
                iter_plot: mean_per_level[summary_relevance_level]
                for iter_plot, mean_per_level in dict_of_summary_relevance_MEAN.items()
            },
            dict_of_y_err={
                iter_plot: sem_per_level[summary_relevance_level]
                for iter_plot, sem_per_level in dict_of_summary_relevance_SEM.items()
            },
            label=summary_relevance_label,
            marker="",
            markersize=3,
            color=summary_relevance_color,
            linewidth=2,
            linestyle="--",
            alpha=0.2,
        )

    # Set axis name.
    # NOTE(review): the plotted values are ratios in [0, 1] (see the y-range
    # above) while the y label announces "[%]" -- confirm the intended unit.
    axis_plot_summary_relevance.set_xlabel("itération [#]", fontsize=18,)
    axis_plot_summary_relevance.set_ylabel("ratio de résumés [%]", fontsize=18,)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)

    # Plot the legend.
    axis_plot_summary_relevance.legend(fontsize=15, loc="lower right",)

    # Plot the grid.
    axis_plot_summary_relevance.grid(True)

    # Store the graph as a transparent PNG, cropped to its content.
    fig_plot_summary_relevance.savefig(
        "../results/etude-pertinence-llm-check-resume-annotation-favori.png",
        dpi=300,
        transparent=True,
        bbox_inches="tight",
    )