# ==== INTERACTIVE CLUSTERING : BUSINESS RELEVANCE STUDY ====
> ### Stage 1 : Evaluate business relevance on previous experiments.

------------------------------
## READ-ME BEFORE RUNNING

### Quick Description

This notebook **aims to compute the business relevance of interactive clustering experiments**.
- Computations are based on previous experiments (cf. efficiency study) exported in the `../previous` folder.
- Results of each environment are stored as `.json` files in the `../experiments` folder.

Then, **go to the notebook `2_Plot_some_figures.ipynb` to plot several figures according to these computations**.

### Description of each step

The computations are:
- `consistency_score`: Test a TF-IDF + Linear SVM model on its own trainset with a high prediction score threshold in order to check data consistency. The consistency score is the resulting f1-score. If the f1-score is low (i.e. less than `0.75`), then the trainset may be inconsistent: data may be badly labeled or classes may overlap. Otherwise, the f1-score should tend to 100%.
- `llm_summarization`: Use a large language model to summarize each cluster as a one sentence description.

Some combinations of parameters are studied:
- `simple_prep` + `tfidf` + `closest-50` + `hier_avg-10c`: best to reach 90% of v-measure (cf. efficiency study)
- `lemma_prep` + `tfidf` + `closest-50` + `kmeans_COP-10c`: best to reach 100% of v-measure (cf. efficiency study)
- `lemma_prep` + `tfidf` + `in_same-50` + `kmeans_COP-10c`: best to reach annotation completeness (cf. efficiency study)
- `simple_prep` + `tfidf` + `closest-50` + `kmeans_COP-10c`: choice of author (cf. cost study)

------------------------------
## 1. IMPORT PYTHON DEPENDENCIES

In [None]:
from typing import Dict, List, Any, Optional
import os
import json
from consistency_score import compute_consistency_score, display_consistency_score
import time
import haystack
from haystack.nodes import PromptTemplate
from haystack.nodes import PromptNode

In [None]:
import warnings
# Install a blanket "ignore" filter: no category, message or module is given,
# so every warning emitted after this cell has run is suppressed.
warnings.filterwarnings("ignore")

------------------------------
## 2. LOAD EXPERIMENTS TO STUDY

Find all implementations to analyze and associated experiments.

In [None]:
# Implementations (one per studied combination of parameters) to analyze.
# Each name is looked up as a sub-folder of "../previous".
_SELECTED_IMPLEMENTATIONS: List[str] = [
    "bank_cards_v1_-_simple_prep_-_tfidf_-_closest-50_-_hier_avg-10c",  # best to reach 90% of v-measure (cf. efficiency study)
    "bank_cards_v1_-_lemma_prep_-_tfidf_-_closest-50_-_kmeans_COP-10c",  # best to reach 100% of v-measure (cf. efficiency study)
    "bank_cards_v1_-_lemma_prep_-_tfidf_-_in_same-50_-_kmeans_COP-10c",  # best to reach annotation completeness (cf. efficiency study)
    "bank_cards_v1_-_simple_prep_-_tfidf_-_closest-50_-_kmeans_COP-10c",  # choice of author (cf. cost study)
    # Not studied here (kept for reference):
    # "bank_cards_v1_-_simple_prep_-_tfidf",  # Mean.
    # "bank_cards_v1",  # Mean.
]

# List of tasks to run: one entry per selected implementation found in "../previous".
#   - "implementation": name of the implementation folder (str).
#   - "experiments": names of the ".json" experiment files of that folder (List[str]).
# Directory listings are sorted because `os.listdir` returns entries in arbitrary
# order, which would make the processing order differ from one run to another.
LIST_OF_TASKS: List[Dict[str, Any]] = [
    {
        "implementation": implementation,  # Environment of experiment.
        "experiments": sorted(
            exp_path
            for exp_path in os.listdir("../previous/" + implementation)
            # Match on the extension only, so that names merely containing
            # ".json" (e.g. "x.json.bak") are not taken for experiments.
            if exp_path.endswith(".json")
        ),
    }
    for implementation in sorted(os.listdir("../previous"))
    if (
        implementation in _SELECTED_IMPLEMENTATIONS
        and os.path.isdir("../previous/" + implementation)
    )
]
print("There are", "`" + str(len(LIST_OF_TASKS)) + "`", "implementations to analyze.")

Create one folder per implementation to analyze.

In [None]:
# Mirror the previous results of every task into "../experiments/<implementation>/".
for task in LIST_OF_TASKS:
    analysis_folder: str = "../experiments/" + task["implementation"]

    # Create the folder for analyses, including "../experiments" itself when it
    # does not exist yet. An already existing folder is not an error.
    os.makedirs(analysis_folder, exist_ok=True)

    # Copy data, one experiment file at a time.
    for experiment in task["experiments"]:
        copy_path: str = analysis_folder + "/previous_results___" + experiment

        # Skip per file rather than per folder: a copy interrupted half-way is
        # completed on the next run, and existing copies are never overwritten.
        if os.path.exists(copy_path):
            continue

        # The content goes through `json.load` / `json.dump` (not a raw byte copy),
        # so a file that is not valid JSON fails here.
        with open("../previous/" + task["implementation"] + "/" + experiment, "r") as file_previous_results_r:
            previous_results = json.load(file_previous_results_r)
        with open(copy_path, "w") as file_previous_results_w:
            json.dump(previous_results, file_previous_results_w)

------------------------------
## 3. START ANALYSES

----------
### 3.1. Analyze clustering consistency.

Compute all consistency scores.

In [None]:
def _score_labelling(texts_by_id: Dict[str, str], labels_by_id: Dict[str, str]) -> float:
    """Return the consistency score of a labelling of the given texts.

    Texts and labels are aligned on the keys of `texts_by_id` (in that dict's
    iteration order) and handed to `compute_consistency_score` with a
    prediction score threshold of `0.75`.
    """
    return compute_consistency_score(
        x_train=[texts_by_id[text_id] for text_id in texts_by_id],
        y_train=[labels_by_id[text_id] for text_id in texts_by_id],
        prediction_score_threshold=0.75,
    )


# Walk through every experiment of every implementation.
for task in LIST_OF_TASKS:
    print("IMPLEMENTATION:", task["implementation"])
    implementation_folder = "../experiments/" + task["implementation"]

    for experiment in task["experiments"]:
        print("    ", "experiment:", experiment)

        # Scores already stored for this experiment: nothing to compute.
        # NOTE: the "constistency" spelling of the file name is kept as-is,
        # since other code may read these files under that name.
        scores_path = implementation_folder + "/constistency_score___" + experiment
        if os.path.exists(scores_path):
            continue

        # Load the copied results of the experiment.
        with open(implementation_folder + "/previous_results___" + experiment, "r") as results_file:
            experiment_data: Dict[str, Any] = json.load(results_file)
        dict_of_preprocessed_texts: Dict[str, str] = experiment_data["dict_of_preprocessed_texts"]
        dict_of_true_intents: Dict[str, str] = experiment_data["dict_of_true_intents"]
        dict_of_clustering_results: Dict[str, Dict[str, str]] = experiment_data["dict_of_clustering_results"]

        # Consistency score of the groundtruth labelling.
        groundtruth_consistency_score: float = _score_labelling(
            dict_of_preprocessed_texts,
            dict_of_true_intents,
        )

        # Consistency score of the clustering obtained at each iteration
        # (cluster ids are cast to `str` before being used as labels).
        clustering_consistency_score_evolution: Dict[str, float] = {}
        for iteration in dict_of_clustering_results:
            clusters_as_labels = {
                text_id: str(dict_of_clustering_results[iteration][text_id])
                for text_id in dict_of_preprocessed_texts
            }
            clustering_consistency_score_evolution[iteration] = _score_labelling(
                dict_of_preprocessed_texts,
                clusters_as_labels,
            )

        # Store both results in a single JSON file.
        scores_to_store = {
            "groundtruth": groundtruth_consistency_score,
            "evolution": clustering_consistency_score_evolution,
        }
        with open(scores_path, "w") as scores_file:
            json.dump(scores_to_store, scores_file)

Display consistency score evolution in a graph.

In [None]:
# Plot settings of each studied implementation, one row per implementation:
# (implementation name, plot label, plot color, graph filename).
_CONSISTENCY_PLOT_SETTINGS = [
    (  # best to reach 90% of v-measure (cf. efficiency study)
        "bank_cards_v1_-_simple_prep_-_tfidf_-_closest-50_-_hier_avg-10c",
        "Score de cohérence moyen des tentatives ayant le meilleur paramétrage moyen\npour atteindre une annotation partielle (90% de v-measure).",
        "green",
        "etude-pertinence-consistence-annotation-partielle.png",
    ),
    (  # best to reach 100% of v-measure (cf. efficiency study)
        "bank_cards_v1_-_lemma_prep_-_tfidf_-_closest-50_-_kmeans_COP-10c",
        "Score de cohérence moyen des tentatives ayant le meilleur paramétrage moyen\npour atteindre une annotation suffisante (100% de v-measure).",
        "blue",
        "etude-pertinence-consistence-annotation-suffisante.png",
    ),
    (  # best to reach annotation completeness (cf. efficiency study)
        "bank_cards_v1_-_lemma_prep_-_tfidf_-_in_same-50_-_kmeans_COP-10c",
        "Score de cohérence moyen des tentatives ayant le meilleur paramétrage moyen\npour atteindre une annotation exhaustive (toutes les contraintes).",
        "red",
        "etude-pertinence-consistence-annotation-exhaustive.png",
    ),
    (  # choice of author (cf. cost study)
        "bank_cards_v1_-_simple_prep_-_tfidf_-_closest-50_-_kmeans_COP-10c",
        "Score de cohérence moyen des tentatives ayant le paramétrage favori\n(atteindre 90% de v-measure avec un coût global minimal).",
        "gold",
        "etude-pertinence-consistence-annotation-retenue.png",
    ),
]

# Mapping: implementation name -> {"plot_label", "plot_color", "graph_filename"}.
config_consistency_score: Dict[str, Dict[str, str]] = {
    implementation_name: {
        "plot_label": label,
        "plot_color": color,
        "graph_filename": filename,
    }
    for implementation_name, label, color, filename in _CONSISTENCY_PLOT_SETTINGS
}

In [None]:
# Iterations to display: "0000", "0001", ..., "0039" (the same for every implementation).
iterations_to_plot: List[str] = [str(index).zfill(4) for index in range(40)]

# Plot the consistency score evolution of each implementation with its own settings.
for task in LIST_OF_TASKS:
    plot_settings = config_consistency_score[task["implementation"]]
    print("IMPLEMENTATION:", task["implementation"], "\n    (", plot_settings["graph_filename"], ")")
    display_consistency_score(
        implementation=task["implementation"],
        list_of_experiments=task["experiments"],
        list_of_iterations=list(iterations_to_plot),  # fresh list for each call
        plot_label=plot_settings["plot_label"],
        plot_color=plot_settings["plot_color"],
        graph_filename=plot_settings["graph_filename"],
    )
    
    

----------
### 3.2. Summarize clusters by a large language model.

Load credentials.
> Needs a file `credentials.py` in the project home (`../..`)
> with content `OPENAI_API_TOKEN = "..."` from https://platform.openai.com/account/api-keys

In [None]:
import sys
# Add the project home (`../..`) to the import path so that `credentials.py` can be imported.
sys.path.append("../..")
import credentials  # Needs a file `credentials.py` in the project home (`../..`) with content `OPENAI_API_TOKEN = "..."` from https://platform.openai.com/account/api-keys

Define the large language model to call and the prompt template.

In [None]:
# Name of the model queried through the prompt node (alternative: "gpt-3.5-turbo").
openai_model_name: str = "text-davinci-003"

# Prompt node used to call the language model, authenticated with the token
# read from `credentials.py`.
language_model: PromptNode = PromptNode(
    api_key=credentials.OPENAI_API_TOKEN,
    model_name_or_path=openai_model_name,
)

In [None]:
# Text of the prompt (in French: "find the topic addressed in the given text").
# `$text` is the placeholder of the template; the surrounding whitespace is part
# of the prompt and is kept unchanged.
topic_prompt_text: str = """
    Trouver la thématique traitée dans le texte donné.
    Texte : $text
    """

# Prompt template handed to the language model for each call.
template: PromptTemplate = PromptTemplate(
    name="Description d'un topic",
    prompt_text=topic_prompt_text,
)

Summarize clusters.

In [None]:
# Maximum number of attempts for one language model call before giving up on a cluster.
MAX_RETRY: int = 50  # for proxy error...

In [None]:
# Summarize the clusters of every experiment of every implementation.
#
# For each experiment, a `clustering_summary___<experiment>` file maps
# iteration -> cluster id -> summary (`None` while not summarized yet).
# Clusters whose summary is already filled in that file are skipped.
for task in LIST_OF_TASKS:
    print("IMPLEMENTATION:", task["implementation"])
    experiment_folder: str = "../experiments/" + task["implementation"]

    # For each experiment.
    for experiment in task["experiments"]:
        print("    ", "experiment:", experiment)
        summary_path: str = experiment_folder + "/clustering_summary___" + experiment

        # Load data.
        with open(experiment_folder + "/previous_results___" + experiment, "r") as file_experiment_data_r:
            experiment_data: Dict[str, Any] = json.load(file_experiment_data_r)
        dict_of_texts: Dict[str, str] = experiment_data["dict_of_texts"]
        # NOTE(review): cluster ids are presumably integers (they were compared
        # with `int(cluster_id)` before) - confirm against the exported data.
        dict_of_clustering_results: Dict[str, Dict[str, Any]] = experiment_data["dict_of_clustering_results"]

        # Initialize clustering summaries if needed.
        if not os.path.exists(summary_path):
            # Only the last iteration is analyzed, i.e. the greatest iteration key.
            # `default=None` avoids a crash when there is no clustering result at all.
            last_iteration: Optional[str] = max(dict_of_clustering_results, default=None)
            list_of_iteration: List[str] = [] if last_iteration is None else [last_iteration]
            # One empty slot per cluster of each iteration to analyze.
            initial_clustering_summaries: Dict[str, Dict[str, Optional[str]]] = {
                iteration: {
                    str(cluster_id): None  # need to force cluster_id to str for serialization
                    for cluster_id in sorted(set(dict_of_clustering_results[iteration].values()))
                }
                for iteration in list_of_iteration
            }
            # Store initial clustering summaries.
            with open(summary_path, "w") as file_initial_clustering_summaries_w:
                json.dump(initial_clustering_summaries, file_initial_clustering_summaries_w)

        # Load clustering summaries already done.
        with open(summary_path, "r") as file_clustering_summary_r:
            summaries: Dict[str, Dict[str, Optional[str]]] = json.load(file_clustering_summary_r)

        # For each iteration...
        for iteration, iteration_summaries in summaries.items():
            print("    ", "    ", "iteration:", iteration)
            clusters_by_text_id: Dict[str, Any] = dict_of_clustering_results[iteration]

            # For each cluster...
            for cluster_id, known_summary in iteration_summaries.items():
                print("    ", "    ", "    ", cluster_id, end=": ")

                # If already done: continue.
                if known_summary is not None:
                    print("(already done)")
                    continue

                # Get texts of this cluster.
                # Ids are compared as strings: the keys of `summaries` are always
                # strings (JSON object keys), and casting them back with `int()`
                # would crash on any non-integer cluster id.
                cluster: List[str] = [
                    text
                    for text_id, text in dict_of_texts.items()
                    if str(clusters_by_text_id[text_id]) == cluster_id
                ]
                print(f"(cluster length: {len(cluster)})", end=" ; ")

                # Call the LLM, retrying up to MAX_RETRY times (use loop to by-pass timeout).
                # `answer` stays `None` only if no attempt succeeded, so a success
                # on the very last allowed attempt is kept instead of being
                # mistaken for a failure.
                answer: Optional[List[str]] = None
                last_err: Optional[Exception] = None
                for _ in range(MAX_RETRY):
                    time.sleep(.01)
                    try:
                        # NOTE(review): the list of texts is passed as-is; how the
                        # prompt node expands a list (one prompt per text or one
                        # prompt for all) is not visible here, and only
                        # `answer[0]` is used below - confirm this is intended.
                        answer = language_model.prompt(
                            prompt_template=template,
                            text=cluster,
                        )
                    except Exception as err:  # deliberately broad: any failure triggers a retry
                        last_err = err
                        continue
                    break

                # If every attempt failed: report the last error and go to the next cluster.
                if answer is None:
                    print(last_err)
                    continue

                # If the model returned nothing: there is no summary to read.
                if not answer:
                    print("(empty answer)")
                    continue

                # Get summary from answer.
                cluster_summary: str = answer[0]
                print(cluster_summary)

                # NOTE(review): storing the summaries is currently disabled, so
                # summaries are only printed and the "(already done)" shortcut
                # never applies to summaries computed here. Re-enable the lines
                # below to persist them.
                #summaries[iteration][str(cluster_id)] = cluster_summary
                #with open(summary_path, "w") as file_clustering_summary_w:
                #    json.dump(summaries, file_clustering_summary_w)