# Interactive Clustering: How analyze an annotation project ?

> Table of contents

- [ ] TODO
- [ ] TODO
- [ ] TODO

> Load Python dependencies.

In [None]:
# pip install numpy pandas plotly cognitivefactory_features_maximization_metric openai

In [18]:
# Typing.
from typing import List, Dict, Optional, Tuple

# File and path management.
import os
import json
import pathlib
import pickle

# Data management matrix.
import pandas as pd
from scipy.sparse import csr_matrix

# Statistics.
import numpy as np
from sklearn import metrics

# FMC analysis.
from sklearn.feature_extraction.text import TfidfVectorizer
from cognitivefactory.features_maximization_metric.fmc import FeaturesMaximizationMetric

# LLM call.
import openai

# Graph display.
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import display, HTML

-----
## I. Load project

> ### Set the path to the project data:

In [19]:
PATH_TO_PROJECT_DATA: pathlib.Path = pathlib.Path("./demo-project-data")

> ### List accessible files in the project.

In [20]:
print(f"Accessible files are: {os.listdir(PATH_TO_PROJECT_DATA)}")

Accessible files are: ['clustering.json', 'constraints.json', 'constraints_manager.pkl', 'metadata.json', 'modelization.json', 'sampling.json', 'settings.json', 'status.json', 'texts.json', 'vectors.pkl', 'vectors_2D.json', 'vectors_3D.json']


> ### Display project metadata (`project_id`, `project_name`, ...).
> _It needs the files `metadata.json`._

In [21]:
# Load data in a DataFrame/Series.
with open(PATH_TO_PROJECT_DATA / "metadata.json", "r") as metadata_r:
    project_medatata = json.load(metadata_r)
df_metadata: pd.DataFrame = pd.Series(project_medatata)

# Display loaded data.
df_metadata

project_id             1680008094996461
project_name                       DEMO
creation_timestamp    1680008094.996461
dtype: object

> ### Display project status (`iteration_id`, `state`, `task`).
> _It needs the files `status.json`._

In [22]:
# Load data in a DataFrame/Series.
with open(PATH_TO_PROJECT_DATA / "status.json", "r") as status_r:
    project_status = json.load(status_r)
df_status: pd.Series = pd.Series(project_status)

# Display loaded data.
df_status

iteration_id                                                    3
state           ANNOTATION_WITH_OUTDATED_MODELIZATION_WITH_CON...
task                                                         None
dtype: object

> ### Load texts (`text`, `text_preprocessed`, `is_deleted`, ...).
> _It needs the files `texts.json`._

In [23]:
# Load data in a DataFrame/Series.
with open(PATH_TO_PROJECT_DATA / "texts.json", "r") as texts_r:
    project_texts = json.load(texts_r)
df_texts: pd.DataFrame = pd.DataFrame.from_dict(project_texts, orient='index')

# Display loaded data.
print(f">> Total number of texts: '{len(df_texts)}'")
print(f">> Number of deleted texts: '{len(df_texts[df_texts['is_deleted']==True])}'")
df_texts.head()

>> Total number of texts: '1000'
>> Number of deleted texts: '0'


Unnamed: 0,text_original,text,text_preprocessed,is_deleted
0,À combien s'élève le solde de mon compte ?,À combien s'élève le solde de mon compte ?,a combien s eleve le solde de mon compte,False
1,A combien s'élève les disponibilités sur mes c...,A combien s'élève les disponibilités sur mes c...,a combien s eleve les disponibilites sur mes c...,False
2,A quoi sert les numéros de cartes virtuelles ?,A quoi sert les numéros de cartes virtuelles ?,a quoi sert les numeros de cartes virtuelles,False
3,activer le moyen de paiement NFC sur ma carte ...,activer le moyen de paiement NFC sur ma carte ...,activer le moyen de paiement nfc sur ma carte ...,False
4,activer le paiement nfc sur une Mastercard,activer le paiement nfc sur une Mastercard,activer le paiement nfc sur une mastercard,False


> ### Load annotated constraints (`data`, `constraint_type`, `to_annotate`, `is_hidden`, `to_fix_conflict`, ...).
> _It needs the files `constraints.json`._

In [24]:
# Load data in a DataFrame/Series.
with open(PATH_TO_PROJECT_DATA / "constraints.json", "r") as constraints_r:
    project_constraints = json.load(constraints_r)
df_constraints: pd.DataFrame = pd.DataFrame.from_dict(project_constraints, orient='index')
df_constraints["data_id_1"] = df_constraints.apply(lambda row: row["data"]["id_1"], axis=1)
df_constraints["data_id_2"] = df_constraints.apply(lambda row: row["data"]["id_2"], axis=1)

# Display loaded data.
print(f">> Total number of constraints: '{len(df_constraints)}'")
print(f">> Number of annotated constraints: '{len(df_constraints[(df_constraints['to_annotate']==False) & (df_constraints['is_hidden']==False)])}'")
print(f">> Number of constraints in conflict: '{len(df_constraints[(df_constraints['to_fix_conflict']==True) & (df_constraints['is_hidden']==False)])}'")
df_constraints.head()

>> Total number of constraints: '50'
>> Number of annotated constraints: '50'
>> Number of constraints in conflict: '3'


Unnamed: 0,data,constraint_type,constraint_type_previous,is_hidden,to_annotate,to_review,to_fix_conflict,comment,date_of_update,iteration_of_sampling,data_id_1,data_id_2
"(487,488)","{'id_1': '487', 'id_2': '488'}",MUST_LINK,[None],False,False,False,False,,1680008000.0,1,487,488
"(841,842)","{'id_1': '841', 'id_2': '842'}",CANNOT_LINK,[None],False,False,False,False,,1680008000.0,1,841,842
"(727,728)","{'id_1': '727', 'id_2': '728'}",MUST_LINK,[None],False,False,False,False,,1710519000.0,1,727,728
"(488,489)","{'id_1': '488', 'id_2': '489'}",MUST_LINK,[None],False,False,False,False,,1710519000.0,1,488,489
"(780,781)","{'id_1': '780', 'id_2': '781'}",MUST_LINK,[None],False,False,False,False,,1710519000.0,1,780,781


> ### Load clustering results.
> _It needs the files `clustering.json`. Result can be `NaN` if text was deleted._

In [25]:
# Load data in a DataFrame.
with open(PATH_TO_PROJECT_DATA / "clustering.json", "r") as clustering_r:
    project_clustering = json.load(clustering_r)

df_clusterings: pd.DataFrame = pd.DataFrame.from_dict(project_clustering)
df_clusterings.fillna(value=-1, inplace=True)  # fill `NaN` values by `-1`.
df_clusterings = df_clusterings.astype(int).astype(str)  # to `int` to remove float, then to `str` to have categorical values.
df_clusterings.sort_index(key=lambda x: x.astype(int), inplace=True)

# Display loaded data.
print(f">> Number of clustering iterations: '{len(project_clustering)}'")
df_clusterings.head()

>> Number of clustering iterations: '3'


Unnamed: 0,0,1,2
0,-1,0,0
1,-1,0,0
2,0,0,0
3,0,0,0
4,0,0,0


> ### Load vectorial representation.
> _It needs files `vectors_2D.json` and `vectors.pkl`._

In [26]:
# Load data in a csr_matrix.
with open(PATH_TO_PROJECT_DATA / "vectors.pkl", "rb") as vectors_r:
    project_vectors: Dict[str, csr_matrix] = pickle.load(vectors_r)

In [27]:
# Load data in a DataFrame.
with open(PATH_TO_PROJECT_DATA / "vectors_3D.json", "r") as vectors_3d_r:
    project_vectors_3d = json.load(vectors_3d_r)
df_vectors_3d: pd.DataFrame = pd.DataFrame.from_dict(project_vectors_3d, orient="index")

# Display loaded data.
df_vectors_3d

Unnamed: 0,x,y,z
0,31.306932,3.156924,-20.107103
1,32.993519,15.797846,-21.651396
2,-19.677374,24.773836,4.550860
3,-4.766092,21.231388,-4.144927
4,-1.561593,24.885757,-3.984297
...,...,...,...
995,23.185547,17.572229,9.212809
996,-5.224369,6.395019,36.467518
997,-8.432920,-18.672470,26.864052
998,18.604958,-9.611234,3.791892


-----
## II. Analyze clustering evolution

> ### Display evolution of clustering differences.
> _It is computed by `1 - vmeasure(current_clustering, previous_clustering)`_

In [28]:
# Compute clustering difference score for each iteration.
clustering_differences: Dict[str, float] = {}
for iteration in range(1, int(df_status["iteration_id"])):
    # Get iteration ids.
    current_iteration_id: str = str(iteration)
    previous_iteration_id: str = str(iteration-1)
    # Format clustering results: Get common text ids.
    list_of_common_text_ids: List[str] = [
        text_id
        for text_id in df_clusterings.index
        if (
            df_texts["is_deleted"][text_id] == False
        ) and (
            df_clusterings[previous_iteration_id][text_id] != "-1"
        ) and (
            df_clusterings[current_iteration_id][text_id] != "-1"
        )
    ]
    # Compute scores.
    clustering_differences[current_iteration_id] = 1.0 - metrics.v_measure_score(
        labels_true=df_clusterings[previous_iteration_id][list_of_common_text_ids],
        labels_pred=df_clusterings[current_iteration_id][list_of_common_text_ids],
    )
df_clustering_differences: pd.DataFrame = pd.DataFrame.from_dict(clustering_differences, orient="index", columns=["difference rate"])
df_clustering_differences["iteration"] = df_clustering_differences.index

# Display clustering difference scores.
fig = px.line(
    df_clustering_differences,
    x="iteration",
    y="difference rate",
    markers=True,
    title="<b>Evolution of clustering results differences</b>",
    color_discrete_sequence=["red"],
)
fig.update_layout(yaxis=dict(range=[0.0, 1.0]))
fig.show()

-----
## III. Explore clustering content

> ### Define iteration to analyze.
> NB: _Replace with the iteration you want to display.
> Defaults to the latest iteration._

In [63]:
ITERATION_ID: str = "0"  # max(df_clusterings.columns)
print(f"Iteration to analyze: {ITERATION_ID}")

Iteration to analyze: 0


> ### Limit clusters to analyze.
> NB: _Defaults to all clusters without garbage._

In [64]:
LIST_OF_CLUSTER_IDS: List[str] = sorted([
    cluster_id
    for cluster_id in df_clusterings[ITERATION_ID].unique()
    if cluster_id != "-1"  # do not analyse garbage clusters.
])
print(f"List of clusters to analyze: {LIST_OF_CLUSTER_IDS}")

List of clusters to analyze: ['0', '1', '10', '3', '4']


> ### Limit texts to analyze.
> NB: _Defaults to all texts that are not deleted and that are clustered._

In [67]:
LIST_OF_TEXTS_IDS: pd.Index = (
    (df_texts["is_deleted"]==False)
    & (df_clusterings[ITERATION_ID].isin(LIST_OF_CLUSTER_IDS))
)
print(f"Number of texts to analyze: {sum(LIST_OF_TEXTS_IDS)}")
print(f"List of texts to analyze:\n{LIST_OF_TEXTS_IDS}")

Number of texts to analyze: 998
List of texts to analyze:
0      False
1      False
2       True
3       True
4       True
       ...  
995     True
996     True
997     True
998     True
999     True
Length: 1000, dtype: bool


> ### Display clustering content in 3D.

In [68]:
# Prepare data and filter deleted data.
df_clustering_3d: pd.DataFrame = df_vectors_3d.copy()
df_clustering_3d["cluster"] = df_clusterings[ITERATION_ID]
df_clustering_3d["text"] = df_texts["text"]
df_clustering_3d = df_clustering_3d[LIST_OF_TEXTS_IDS]

# Display clusters.
fig = px.scatter_3d(
    df_clustering_3d, 
    x="x",
    y="y",
    z="z",
    color="cluster",
    hover_name="text",
    title=f"<b>Clustering result at iteration '{ITERATION_ID}'.</b>",
    width=800,
    height=800,
)
fig.update_layout(
    xaxis=dict(range=[min(df_clustering_3d["x"])-1, max(df_clustering_3d["x"])+1]),
    yaxis=dict(range=[min(df_clustering_3d["y"])-1, max(df_clustering_3d["y"])+1]),
)
fig.show()

> ### Compute relevant linguistic pattern according to `FMC` analysis.
> _`FMC` means `Features Maximization Contrast`: it selects the list of linguistic patterns that are the most relevant to represent a cluster and distinguish it from other clusters._

In [31]:
# Prepare data and filter deleted data.
df_data_for_fmc: pd.DataFrame = pd.DataFrame()
df_data_for_fmc["text_preprocessed"] = df_texts["text_preprocessed"]
df_data_for_fmc["cluster"] = df_clusterings[ITERATION_ID]
df_data_for_fmc = df_data_for_fmc[LIST_OF_TEXTS_IDS]

# Define vectorizer.
vectorizer = TfidfVectorizer(min_df=0.0, ngram_range=(1, 3), analyzer="word", sublinear_tf=True)
matrix_of_vectors: csr_matrix = vectorizer.fit_transform(df_data_for_fmc["text_preprocessed"])
list_of_possible_vectors_features: List[str] = list(vectorizer.get_feature_names_out())

# Define FMC modelization.
fmc_computer: FeaturesMaximizationMetric = FeaturesMaximizationMetric(
    data_vectors=matrix_of_vectors,
    data_classes=df_data_for_fmc["cluster"],
    list_of_possible_features=list_of_possible_vectors_features,
    amplification_factor=1,
)

# Get most active linguistic patterns accoding to FMC for each cluster.
dict_of_cluster_most_active_linguistic_patterns: Dict[str, List[str]] = {
    cluster_id: [
        linguistic_pattern
        for linguistic_pattern in fmc_computer.get_most_active_features_by_a_classe(
            classe=cluster_id,
            activation_only=True,
            sort_by="fmeasure",  # "fmeasure" or "contrast"
            max_number=50,  # number of linguistic patterns to retain.
        )
        if fmc_computer.get_most_activated_classes_by_a_feature(linguistic_pattern) == [cluster_id]
    ]
    for cluster_id in LIST_OF_CLUSTER_IDS
}

# Display selected linguistic patterns for each cluster
for cluster_id, most_active_linguistic_patterns in dict_of_cluster_most_active_linguistic_patterns.items():
    print(f"Most relevant linguistic patterns for cluster '{cluster_id}':\n>> {most_active_linguistic_patterns}")

Most relevant linguistic patterns for cluster '-1':
>> ['combien eleve', 'eleve', 'eleve le solde', 'eleve le', 'combien eleve le', 'les disponibilites sur', 'les disponibilites', 'eleve les disponibilites', 'eleve les', 'disponibilites sur mes', 'combien eleve les', 'disponibilites sur', 'disponibilites', 'sur mes comptes', 'solde de mon', 'sur mes', 'de mon compte', 'le solde de', 'le solde', 'solde de', 'de mon', 'mes comptes', 'comptes']
Most relevant linguistic patterns for cluster '0':
>> ['de', 'de carte', 'carte de', 'de paiement', 'de credit', 'credit', 'de ma', 'carte de paiement', 'carte de credit', 'plafond de', 'de ma carte', 'ma carte de', 'nfc', 'de decouvert', 'numero', 'un numero', 'gerer', 'perte', 'augmenter', 'perte de', 'numero de carte', 'numero de', 'vol', 'changer de carte', 'changer de', 'sur ma', 'un numero de', 'de carte virtuelle', 'vol de', 'sur ma carte', 'de carte de']
Most relevant linguistic patterns for cluster '4':
>> ['combien ai je', 'combien ai', '

In [73]:
# Define translation of token in text according to FMC activation.
def translate_token_in_text(text: str, dict_of_translation: Dict[str, str]) -> str:
    """
        In a text, replace all token according to a translation dictionnary.
        
        Args:
            text (str): The text to translate.
            dict_of_translation (Dict[str, str]): The list of translations. Fro exemple, `card` can be translated in `<b>card</b>`.
        
        Return:
            str: The translated text.
    """
    text_tokenized: List[str] = text.split(" ")
    for token_text, token_translated in dict_of_translation.items():
        text_tokenized = [
            token_translated if (token == token_text) else token
            for token in text_tokenized
        ]
    return " ".join(text_tokenized)

# Highlight relevant linguistic patterns for the current cluster that are present.
df_data_for_fmc["text_with_appropriate_highlighting"] = df_data_for_fmc.apply(
    lambda row: translate_token_in_text(
        text=row["text_preprocessed"],
        dict_of_translation={
            token: f"<i style='color: #00DD00;'>{token}</i>"
            for token in dict_of_cluster_most_active_linguistic_patterns[row["cluster"]]
        }
    ),
    axis=1,
)

# Highlight relevant linguistic patterns for other clusters that are present.
df_data_for_fmc["text_with_inappropriate_highlighting"] = df_data_for_fmc.apply(
    lambda row: translate_token_in_text(
        text=row["text_preprocessed"],
        dict_of_translation={
            token: f"<i style='color: #DD0000;'>{token}</i>"
            for other_cluster_id in LIST_OF_CLUSTER_IDS
            for token in dict_of_cluster_most_active_linguistic_patterns[other_cluster_id]
            if row["cluster"] != other_cluster_id
        }
    ),
    axis=1,
)

> ### Sum up clustering topics by a Large Language Model.
> _To be able to call a model from OpenAI, you need to register and create a token api key on https://platform.openai.com/account/api-keys, then store it in a file named `credentials.py` near this notebook. The content of this file should look like : `OPENAI_API_TOKEN: str = "..."`._

In [74]:
# Define OpenAI model settings.
import credentials  # The file near this notebook that contains credentials.
openai.api_key = credentials.OPENAI_API_TOKEN  # Given by https://platform.openai.com/account/api-keys.
OPENAI_MODEL: str = "gpt-3.5-turbo"
MAX_RETRY: int = 3  # Retry when timeout.
COOLDOWN_RETRY: int = 21  # # To avoid Error "Rate limit reached" => "default-gpt-3.5-turbo" limited at 3 request per minute.

# Define prompt (adapt according your context).
#PROMPT_SYSTEM: str = """
#Tu es un expert des secteurs banque, assurance et finance.
#Ton objectif est de résumer les thématiques contenues dans les textes suivants.
#Tu répondras en une description concise contenant les informations clées.
#Par exemples: 'Perte et vol de carte bancaire' ou 'Simulation d'un prêt immobilier'.
#"""
PROMPT_SYSTEM: str = """
Tu es un expert du domaine autombile, de la mécanique et de la relation client.
Ton objectif est de résumer les thématiques contenues dans les textes suivants.
Tu répondras en une description concise contenant les informations clées.
Par exemples: 'Réalisation du contrôle technique' ou 'Changement des pneus'.
"""

In [75]:
# Prepare data and filter deleted data.
df_data_for_llm: pd.DataFrame = pd.DataFrame()
df_data_for_llm["text"] = df_texts["text"]
df_data_for_llm["text_formatted"] = df_data_for_llm.apply(lambda row: f"- {row['text']}", axis=1)
df_data_for_llm["cluster"] = df_clusterings[ITERATION_ID]
df_data_for_llm = df_data_for_llm[LIST_OF_TEXTS_IDS]

# Prepare storage of clustering summary.
dict_of_clustering_summary: Dict[str, Optional[str]] = {
    cluster_id: None
    for cluster_id in LIST_OF_CLUSTER_IDS
}

In [None]:
# Loop on cluster and sum up their topics.
for cluster_id in LIST_OF_CLUSTER_IDS:

    # Call the LLM to summarize the document (use loop to by-pass timeout).
    nb_of_try: int = 0
    last_error: Exception = None
    while nb_of_try<MAX_RETRY:
        time.sleep(21)  # To avoid "Rate limit reached" => "default-gpt-3.5-turbo" limited at 3 request per minute.
        # Try to get a response from the model.
        try : 
            nb_of_try += 1
            # Get model completion.
            chat_answers = openai.ChatCompletion.create(
                model=OPENAI_MODEL,
                messages=[
                    {
                        "role": "system",
                        "content": f"{PROMPT_SYSTEM}"
                    },
                    {
                        "role": "user",
                        "content": f"{'\n'.join(df_data_for_llm[df_data_for_llm["cluster"]==cluster_id]["text_formatted"])}"
                    }
                ]
            )
            break
        # Catch error.
        except Exception as err:
            last_error = err
            continue
    # If error: continue...
    if it==MAX_RETRY:
        print(last_err)
        continue
    # Otherwise: store summary.
    dict_of_clustering_summary[cluster_id] = chat_answers.choices[0].message.content
    

> ### Display clustering with its summary and its content highlighted according to FMC analysis.

In [77]:
# Display summary of clusters.
for cluster_id in LIST_OF_CLUSTER_IDS:
	display(HTML(f"""
		<hr>
        <details>
			<summary style="font-weight: bold; font-size: 1.2em;">Display summary of cluster '{cluster_id}':</summary>
            <p style='text-align:center;'>
                <b>Topics:</b>
				<i>{dict_of_clustering_summary[cluster_id]}</i>
            </p>
			<table>
				<thead>
					<tr>
						<th>Text ID</th>
						<th>Text with appropriate highlighting</th>
						<th>Text with inappropriate highlighting</th>
					</tr>
				</thead>
				<tbody>
					{
						''.join([
							f'''
							<tr>
								<th>{text_id}</th>
								<td>{row["text_with_appropriate_highlighting"]}</td>
								<td>{row["text_with_inappropriate_highlighting"]}</td>
							</tr>
							'''
							for text_id, row in df_data_for_fmc.iterrows()
							if row['cluster'] == cluster_id
						])
					}
				</tbody>
			</table>
		</details>
	"""))

Text ID,Text with appropriate highlighting,Text with inappropriate highlighting
2,a quoi sert les numeros de cartes virtuelles,a quoi sert les numeros de cartes virtuelles
3,activer le moyen de paiement nfc sur ma carte gold,activer le moyen de paiement nfc sur ma carte gold
4,activer le paiement nfc sur une mastercard,activer le paiement nfc sur une mastercard
5,activer les achats avec un numero virtuel,activer les achats avec un numero virtuel
6,activer les numeros de carte virtuelle,activer les numeros de carte virtuelle
7,activer les paiements avec mon telephone,activer les paiements avec mon telephone
8,activer les paiements sans contact avec mon telephone,activer les paiements sans contact avec mon telephone
9,activer l option nfc sur ma mastercard,activer l option nfc sur ma mastercard
10,activer option nfc,activer option nfc
11,activer paiement nfc,activer paiement nfc


Text ID,Text with appropriate highlighting,Text with inappropriate highlighting
21,ai je l obligation d avoir un compte bancaire crediteur,ai je l obligation d avoir un compte bancaire crediteur
26,ai je une extension de garantie avec ma carte gold pour un achat d eletromenager,ai je une extension de garantie avec ma carte gold pour un achat d eletromenager
27,ai je une extension de garantie pour ma machine a laver,ai je une extension de garantie pour ma machine a laver
28,ai je une extension de garantie pour ma television,ai je une extension de garantie pour ma television
29,ajouter une carte online,ajouter une carte online
32,apres plusieurs mauvais codes ma carte a ete bloquee que faire,apres plusieurs mauvais codes ma carte a ete bloquee que faire
33,apres plusieurs mauvais codes ma carte a ete bloquee,apres plusieurs mauvais codes ma carte a ete bloquee
34,assurance comprises avec ma carte bancaire pour un voyage a l etranger,assurance comprises avec ma carte bancaire pour un voyage a l etranger
35,assure moi que je ne suis pas debiteur,assure moi que je ne suis pas debiteur
36,au secours j ai fais trois mauvais code avec ma carte,au secours j ai fais trois mauvais code avec ma carte


Text ID,Text with appropriate highlighting,Text with inappropriate highlighting
110,changement de plafond carte bancaire,changement de plafond carte bancaire


Text ID,Text with appropriate highlighting,Text with inappropriate highlighting
117,changer ma limite de paiements par carte,changer ma limite de paiements par carte
118,changer plafond carte,changer plafond carte
119,changer pour une carte gold,changer pour une carte gold


Text ID,Text with appropriate highlighting,Text with inappropriate highlighting
12,activer paiement sans contact,activer paiement sans contact
120,choisir une nouvelle carte,choisir une nouvelle carte
121,choisir une nouvelle carte bancaire,choisir une nouvelle carte bancaire
122,combien ai je d argent de disponible,combien ai je d argent de disponible
123,combien ai je depense ce mois ci,combien ai je depense ce mois ci
124,combien ai je sur mon compte,combien ai je sur mon compte
125,combien ai je sur mon compte courant,combien ai je sur mon compte courant
126,combien d argent me reste t il,combien d argent me reste t il
