In [None]:
#! pip install mistralai

In [1]:
#Imports
import pandas as pd # type: ignore
from pathlib import Path
import os
import numpy as np #type: ignore
from tqdm import tqdm #type: ignore
import json
import demjson3 # type: ignore
from IPython.display import clear_output # type: ignore
from dotenv import load_dotenv #type: ignore
from mistralai import Mistral #type: ignore

In [2]:
f = open("keys.txt")
key = f.readline().strip('\n')
f.close()
key = key[12:]

In [3]:
api_key = key
model = "open-mistral-nemo"

client = Mistral(api_key=api_key)

In [4]:
# Reprise de la même architecture que celle utilisée par Luciano

PROMPT_FOLDER_OPENAI = 'prompts_openai'
PROMPT_FOLDER_MISTRAL = 'prompts_mistral'
PROMPT_FOLDER_MISTRAL_VOUVOIEMENT = 'prompts_mistral_vouvoiement'
PROMPT_FOLDER_MISTRAL_FORMV2 = 'prompts_mistral_form_v2'
OUT_FILE = 'dialogue_acts_example' + '.csv'
OUT_FOLDER = 'labeled_conversations'
LABEL_COLUMN_1 = 'forme'
LABEL_COLUMN_2 = 'ton'
LABEL_COLUMN_3 = 'content'
LABEL_COLUMN_4 = 'nature'
LABEL_COLUMN_5 = 'formv2'
#ajouter ici les autres labels des attributs
CATEGORY_COLUMN = 'type'

In [None]:
def convert_to_dataframe(list_of_json: list, 
                        label_columns: list):

    '''
    Converts a list of JSON strings into a flattened Pandas DataFrame.

    This function processes a list of JSON strings, each representing a conversation with
    messages and annotations. It flattens the nested structure of the JSON and constructs
    a DataFrame with columns for activity ID, user ID, message metadata, and annotations.

    Args:
        list_of_json (list): A list of JSON strings, where each string represents a conversation.
        label_column (str): The name of the column to store annotations (e.g., labels).

    Returns:
        pd.DataFrame: A DataFrame containing the flattened conversation data with columns:
            - activity_id: The ID of the activity.
            - user_id: The ID of the user.
            - role: The role of the speaker (e.g., user, model).
            - message_num: The message number in the conversation.
            - utterance_num: The utterance number within the same message and role.
            - sentence_en: The English sentence of the message.
            - <label_column>: The annotation or label for the message.

    Raises:
        JSONDecodeError: If a JSON string cannot be decoded using `json.loads`.
        demjson3.JSONDecodeError: If a JSON string cannot be decoded using `demjson3.decode`.
    '''
    # A list to save every entry (row for our dataframe)
    flattened_data = []

    for entry in list_of_json:
        # First try to decode the string using vanilla JSON module
        try: 
            # Try to do the conversion 
            json_entry = json.loads(entry)

        # In case of error use demjson3 which is capable to deal
        # with JSON-like string (that use single quotes instead of double quotes)
        except:
            # Print the entry that trigger the exception for manual inspection 
            print(entry)
            json_entry = demjson3.decode(entry)

        # Get the IDs of the activity and the conversation
        seance_id = json_entry['seance_ID']
        groupe_id = json_entry['groupe_ID']
        # For every utterance in the conversation
        for message in json_entry['conversation']:
            # Create an entry (row) to use then in the dataframe
            flattened_data.append({
                'seance_ID' : seance_id,
                'groupe_ID': groupe_id,
                'messageID': message['messageID'],
                'role': message['role'],
                'message': message['message'],
                label_columns[0] : message['annotation']['form'],    #ajouter ici les autres attributs
                label_columns[1] : message['annotation']['tone'],
                label_columns[2] : message['annotation']['content'],
                label_columns[3] : message['annotation']['nature'],
                label_columns[4] : message['annotation']['form2']
            })

    # Convert the entries in a dataframe
    out_df = pd.DataFrame(flattened_data)
    # Calculate an index for each utterance
    out_df['utterance_num'] = out_df.groupby(['groupe_ID', 'messageID', 'role']).cumcount()
    # Sort the columns of the dataframe
    out_df = out_df[['seance_ID', 'groupe_ID', 'role', 
                        'messageID', 'utterance_num','message', label_columns[0],label_columns[1],label_columns[2],label_columns[3],label_columns[4]]]
    return out_df


In [19]:
def read_prompts(folder: str,name:str):
    '''
    Reads system, user, and agent prompts from text files in a specified folder.

    This function constructs file paths for `prompt_system.txt`, `prompt_user.txt`, and
    `prompt_agent.txt` within the given folder, reads their contents, and returns them
    as a tuple of strings.

    Args:
        folder (str): The name of the folder (relative to the parent of the current working
                        directory under the "coding_schemes" directory) containing the prompt files.

    Returns:
        tuple[str, str, str]: A tuple containing three strings:
            - PROMPT_SYSTEM: The content of `prompt_system.txt`.
            - PROMPT_USER: The content of `prompt_user.txt`.
            - PROMPT_AGENT: The content of `prompt_agent.txt`.

    '''
    path = Path(os.getcwd()) / folder
    # Read the three prompts
    PROMPT_SYSTEM_NAME = path / f"prompt_system_{name}.txt"
    PROMPT_USER_NAME   = path / f"prompt_user_from_corpus_{name}.txt"
    PROMPT_AGENT_NAME  = path / f"prompt_agent_from_corpus_{name}.txt"

    
    # Get the system, user and agent prompts as a string
    with open(PROMPT_SYSTEM_NAME) as f:
        PROMPT_SYSTEM= f.read()

    with open(PROMPT_USER_NAME) as f:
        PROMPT_USER = f.read()

    with open(PROMPT_AGENT_NAME) as f:
        PROMPT_AGENT = f.read()
    # Return a tuple with the prompts
    return (PROMPT_SYSTEM, PROMPT_USER, PROMPT_AGENT)

# For error capturing
errors = []

In [7]:
def classify_sentences_mistral(client: Mistral, 
                        message: str,
                        prompt_system: str,
                        prompt_user: str,
                        prompt_agent: str):
    
    '''
    Classifies a given message using a Mistral model with predefined prompts.

    This function sends a message to the Mistral API along with system, user, and agent prompts
    to generate a classification response. It handles errors by logging them and returning `None`
    if the classification fails.

    Args:
        client (Mistral): An instance of the Mistral client used to interact with the API.
        message (str): The message to be classified.
        prompt_system (str): The system prompt to guide the model's behavior.
        prompt_user (str): The user prompt to provide context or instructions.
        prompt_agent (str): The agent prompt to simulate the assistant's role.

    Returns:
        str: The content of the model's response if the classification is successful.
        None: If an error occurs during the classification process.
    '''
    
    prompt = prompt_system + "\n Un exemple d'entrée est :\n" + prompt_user +"\n et un exemple de sortie est :\n"+ prompt_agent+".\n Les messages que tu dois annoter sont :\n " +message
    #print(prompt)
    

    response = client.chat.complete(
        model="open-mistral-nemo",
        messages=[
            {"role": "user", "content": prompt}
        ],
        temperature=0,
        stream=False,
        response_format = {
            "type": "json_object",
        }
    )


    # Capture errors
    try:
        # If not error, we get the response
        return response.choices[0].message.content
    except:
        # If error, the function returns None, but the error message is appended
        # to a global array
        errors.append({'message': message, 'response': response})
        return None


def classify_conversation_mistral(mistral_client: Mistral,
                        prompt_system: str,
                        prompt_user: str,
                        prompt_agent: str,
                        conversations: list):
    '''
    Classifies a list of conversations using an Mistral model with predefined prompts.

    This function iterates over a list of conversations, classifies each one using the
    `classify_sentences` function, and collects the results. A progress bar is displayed
    to track the classification process.

    Args:
        openai_client (OpenAI): An instance of the OpenAI client used to interact with the API.
        prompt_system (str): The system prompt to guide the model's behavior.
        prompt_user (str): The user prompt to provide context or instructions.
        prompt_agent (str): The agent prompt to simulate the assistant's role.
        conversations (list): A list of conversations (strings) to be classified.

    Returns:
        list: A list of classification results corresponding to each conversation. Each result
                is the output of the `classify_sentences` function, which may be a string or `None`
                if an error occurred during classification.
    '''

    out = []
    for conversation in tqdm(conversations, desc='Classifying...'):
        response = classify_sentences_mistral(mistral_client, 
                                        str(conversation), 
                                        prompt_system, 
                                        prompt_user, 
                                        prompt_agent)
        out.append(response)
    return out


# Recuperation des messages

In [15]:
df = pd.read_csv("SousCorpusMessagesAgreement.csv")
df.head()

Unnamed: 0,messageID,user,role,message,groupID,fileID,date,time,seanceID,nothing,...,Unnamed: 14,Contenu Jose,Contenu Chloe,Contenu Sebastien,Contenu Julien,Unnamed: 19,Forme Jose,Forme Chloe,Forme Sebastien,Forme Julien
0,24,145797.0,Driver,tu pense si je fais un for in in range avec un...,4,1db9dee2-2702-457f-aa07-6b60589446ce,2024-10-09,04:25:31,1,,...,,relatedToProgramming_relatedToTask,relatedToProgramming_relatedToTask,relatedToProgramming_relatedToTask,relatedToProgramming_relatedToTask,,demande_validation,demande_validation,demande_conseil,demande_validation
1,30,200807.0,Navigator,"en vrai c'est pas mieux de faire un ""if len(a)...",4,1db9dee2-2702-457f-aa07-6b60589446ce,2024-10-09,04:28:50,1,,...,,relatedToProgramming_relatedToTask,relatedToProgramming_relatedToTask,relatedToProgramming_relatedToTask,relatedToProgramming_relatedToTask,,proposition_conseil,proposition_validation,proposition_validation,proposition_conseil
2,34,200807.0,Navigator,quand je teste le code il me disent y'a des er...,4,1db9dee2-2702-457f-aa07-6b60589446ce,2024-10-09,04:30:08,1,,...,,relatedToProgramming_relatedToTask,relatedToProgramming_relatedToTask,relatedToProgramming_relatedToTask,relatedToProgramming_relatedToTask,,proposition_validation,proposition_validation,proposition_validation,proposition_validation
3,56,182622.0,Navigator,"def coincide(tableau1, tableau2): # On s'a...",7,2855677e-e74f-4a3f-82d7-be74864de417,2024-10-09,04:30:42,1,,...,,relatedToProgramming_relatedToTask,relatedToProgramming_relatedToTask,relatedToProgramming_relatedToTask,relatedToProgramming_relatedToTask,,proposition_conseil,proposition_conseil,proposition_conseil,proposition_conseil
4,73,199277.0,Navigator,ta pas besoin du return [],7,2855677e-e74f-4a3f-82d7-be74864de417,2024-10-09,04:37:01,1,,...,,relatedToProgramming_relatedToTask,relatedToProgramming_relatedToTask,relatedToProgramming_relatedToTask,relatedToProgramming_relatedToTask,,proposition_conseil,proposition_conseil,proposition_validation,proposition_conseil


In [17]:

# Ensure that the message num is an integer for sorting
df['messageID'] = df['messageID'].astype(int) 
df['groupID'] = df['groupID'].astype(int) 
df['seanceID'] = df['seanceID'].astype(int)  

# Group the dataframe by conversation
grouped = df.groupby(['seanceID','groupID'])

# We'll save the conversations as an array of dictionaries were every conversation is an entry
conversations_dict = []
# For every conversation in the dataframe
for activity_id, group in grouped:
    # Convert the utterances into a list of dictionaries, then sort entries by message num
    conversation_text = group.sort_values('messageID').apply(
        lambda row: {"messageID": row['messageID'], "role": row['role'], "message": row['message']},
        axis=1
    ).tolist()
    
    # Create a conversation
    conversation = {
        "seance_ID": activity_id[0],
        "groupe_ID": activity_id[1],
        "conversation": conversation_text
    }
    # Append the conversations as a dictionary
    conversations_dict.append(conversation)

# This is not a JSON, it will be important later
conversations_dict[:5]

[{'seance_ID': np.int64(-1),
  'groupe_ID': np.int64(-1),
  'conversation': [{'messageID': -1,
    'role': '/',
    'message': 'Tu fais un while et dedans tu incrementes'}]},
 {'seance_ID': np.int64(1),
  'groupe_ID': np.int64(4),
  'conversation': [{'messageID': 24,
    'role': 'Driver',
    'message': 'tu pense si je fais un for in in range avec un for j in range dedans �a peut marcher? '},
   {'messageID': 30,
    'role': 'Navigator',
    'message': 'en vrai c\'est pas mieux de faire un "if len(a) != len(b)"'},
   {'messageID': 34,
    'role': 'Navigator',
    'message': "quand je teste le code il me disent y'a des erreurs de type"}]},
 {'seance_ID': np.int64(1),
  'groupe_ID': np.int64(7),
  'conversation': [{'messageID': 56,
    'role': 'Navigator',
    'message': 'def coincide(tableau1, tableau2):     # On s\'assure que les deux tableaux ont la m�me longueur     if len(tableau1) != len(tableau2):         return []      # On cr�e un tableau vide pour stocker les indices o� les �l�

# Nature

In [20]:
client = Mistral(api_key=api_key)
prompt_system, prompt_user, prompt_agent = read_prompts("prompts_by_category","nature")
labeled_data = classify_conversation_mistral(client,
                                    prompt_system,
                                    prompt_user,
                                    prompt_agent,
                                    conversations_dict[:5])

Classifying...: 100%|██████████| 5/5 [00:06<00:00,  1.40s/it]


In [21]:
labeled_data

['{"annotations": [{"message_num": -1, "utterances": [{"nature": "Informations sur comment proceder"}]}]}',
 '{\n  "annotations": [\n    {\n      "message_num": 24,\n      "utterances": [\n        {\n          "nature": "Demande d\'aide"\n        }\n      ]\n    },\n    {\n      "message_num": 30,\n      "utterances": [\n        {\n          "nature": "Informations sur comment proceder"\n        }\n      ]\n    },\n    {\n      "message_num": 34,\n      "utterances": [\n        {\n          "nature": "Erreurs, idees fausses"\n        }\n      ]\n    }\n  ]\n}',
 '{\n  "annotations": [\n    {\n      "message_num": 56,\n      "utterances": [\n        {\n          "nature": "Reponse correcte"\n        }\n      ]\n    },\n    {\n      "message_num": 73,\n      "utterances": [\n        {\n          "nature": "Validite de la reponse"\n        }\n      ]\n    },\n    {\n      "message_num": 75,\n      "utterances": [\n        {\n          "nature": "Validite de la reponse"\n        }\n      ]

In [34]:
out_df = convert_to_dataframe_nature(labeled_data)

KeyError: 'message'

# Contenu

In [None]:
client = Mistral(api_key=api_key)
prompt_system, prompt_user, prompt_agent = read_prompts("prompts_by_category","contenu")
labeled_data = classify_conversation_mistral(client,
                                    prompt_system,
                                    prompt_user,
                                    prompt_agent,
                                    conversations_dict[:5])

# Forme

In [None]:
client = Mistral(api_key=api_key)
prompt_system, prompt_user, prompt_agent = read_prompts("prompts_by_category","forme")
labeled_data = classify_conversation_mistral(client,
                                    prompt_system,
                                    prompt_user,
                                    prompt_agent,
                                    conversations_dict[:5])