In [108]:
#Imports
import pandas as pd # type: ignore
from pathlib import Path
import os
from openai import OpenAI #type: ignore
import numpy as np #type: ignore
from tqdm import tqdm #type: ignore
import json
import demjson3 # type: ignore
from IPython.display import clear_output # type: ignore
from dotenv import load_dotenv #type: ignore
from mistralai import Mistral #type: ignore

# Nombre de mots, nombre de caractères et source du message

In [151]:
all_messages = pd.read_csv('all_messages.csv')
all_messages.head(2)

Unnamed: 0,messageID,user,role,message,groupID,fileID,date,time,seanceID
0,0,186202.0,Navigator,"Tu es qui ,",0,023a3090-7b0a-48c4-93f0-c200d9afce41,2024-10-09,04:38:10,1
1,1,186202.0,Navigator,On se met 5 partout ok ?,0,023a3090-7b0a-48c4-93f0-c200d9afce41,2024-10-09,04:38:23,1


In [3]:
attributed = all_messages.copy()

In [4]:
attributed['taille'] = [len(str(m)) for m in attributed['message']]
attributed['nombre mots'] = [len(list(filter(None,str(m).split(' ')))) for m in attributed['message']] #Code complexe pour éviter le comptage des chaines vides dues au split (ex message 4)
attributed['source'] = attributed['role']
attributed.head(5)

Unnamed: 0,messageID,user,role,message,groupID,fileID,date,time,seanceID,taille,nombre mots,source
0,0,186202.0,Navigator,"Tu es qui ,",0,023a3090-7b0a-48c4-93f0-c200d9afce41,2024-10-09,04:38:10,1,11,4,Navigator
1,1,186202.0,Navigator,On se met 5 partout ok ?,0,023a3090-7b0a-48c4-93f0-c200d9afce41,2024-10-09,04:38:23,1,24,7,Navigator
2,2,183897.0,Driver,Opé,1,0435458c-dc49-4094-b970-1db125fc235c,2024-10-09,04:21:22,1,3,1,Driver
3,3,183897.0,Driver,Opé lmkt,1,0435458c-dc49-4094-b970-1db125fc235c,2024-10-09,04:21:39,1,8,2,Driver
4,4,193392.0,Navigator,A PAR SA LE COUZ,1,0435458c-dc49-4094-b970-1db125fc235c,2024-10-09,04:21:46,1,17,5,Navigator


# Classification automatique : données et fonctions générales

In [152]:
# Read the .csv file with the messages
df = pd.read_csv('all_messages.csv')

df.head(3)

Unnamed: 0,messageID,user,role,message,groupID,fileID,date,time,seanceID
0,0,186202.0,Navigator,"Tu es qui ,",0,023a3090-7b0a-48c4-93f0-c200d9afce41,2024-10-09,04:38:10,1
1,1,186202.0,Navigator,On se met 5 partout ok ?,0,023a3090-7b0a-48c4-93f0-c200d9afce41,2024-10-09,04:38:23,1
2,2,183897.0,Driver,Opé,1,0435458c-dc49-4094-b970-1db125fc235c,2024-10-09,04:21:22,1


### Structuration des données

In [153]:
# Ensure that the message num is an integer for sorting
df['messageID'] = df['messageID'].astype(int) 
df['groupID'] = df['groupID'].astype(int) 
df['seanceID'] = df['seanceID'].astype(int)
df['fileID'] = df['fileID'].astype('string') 

# Group the dataframe by conversation
grouped = df.groupby(['seanceID','groupID'])

map_messageId_fileId = {}

# We'll save the conversations as an array of dictionaries were every conversation is an entry
conversations_dict = []
# For every conversation in the dataframe
for activity_id, group in grouped:
    # Convert the utterances into a list of dictionaries, then sort entries by message num
    conversation_text = group.sort_values('messageID').apply(
        lambda row: {"messageID": row['messageID'], "role": row['role'], "message": row['message']},
        axis=1
    ).tolist()
    
    for idx, row in group.iterrows():
        map_messageId_fileId[row['messageID']] = row['fileID']

    # Create a conversation
    conversation = {
        "seance_ID": activity_id[0],
        "groupe_ID": activity_id[1],
        "conversation": conversation_text
    }
    # Append the conversations as a dictionary
    conversations_dict.append(conversation)

# This is not a JSON, it will be important later
conversations_dict[0]

{'seance_ID': 1,
 'groupe_ID': 0,
 'conversation': [{'messageID': 0,
   'role': 'Navigator',
   'message': 'Tu es qui ,'},
  {'messageID': 1,
   'role': 'Navigator',
   'message': 'On se met 5 partout ok ?'}]}

In [155]:
#conversations_dict[4]
map_messageId_fileId[4]

'0435458c-dc49-4094-b970-1db125fc235c'

### Lecture des prompts

In [156]:
def read_prompts(folder: str):
    '''
    Reads system, user, and agent prompts from text files in a specified folder.

    This function constructs file paths for `prompt_system.txt`, `prompt_user.txt`, and
    `prompt_agent.txt` within the given folder, reads their contents, and returns them
    as a tuple of strings.

    Args:
        folder (str): The name of the folder (relative to the parent of the current working
                        directory under the "coding_schemes" directory) containing the prompt files.

    Returns:
        tuple[str, str, str]: A tuple containing three strings:
            - PROMPT_SYSTEM: The content of `prompt_system.txt`.
            - PROMPT_USER: The content of `prompt_user.txt`.
            - PROMPT_AGENT: The content of `prompt_agent.txt`.

    '''
    path = Path(os.getcwd()) / folder
    # Read the three prompts
    PROMPT_SYSTEM_NAME = path / 'prompt_system.txt'
    PROMPT_USER_NAME = path / 'prompt_user.txt'
    PROMPT_AGENT_NAME = path / 'prompt_user.txt'
    
    # Get the system, user and agent prompts as a string
    with open(PROMPT_SYSTEM_NAME) as f:
        PROMPT_SYSTEM= f.read()

    with open(PROMPT_USER_NAME) as f:
        PROMPT_USER = f.read()

    with open(PROMPT_AGENT_NAME) as f:
        PROMPT_AGENT = f.read()
    # Return a tuple with the prompts
    return (PROMPT_SYSTEM, PROMPT_USER, PROMPT_AGENT)

# For error capturing
errors = []

### Transformation json en df

In [None]:
def convert_to_dataframe(list_of_json: list, 
                        label_columns: list):

    '''
    Converts a list of JSON strings into a flattened Pandas DataFrame.

    This function processes a list of JSON strings, each representing a conversation with
    messages and annotations. It flattens the nested structure of the JSON and constructs
    a DataFrame with columns for activity ID, user ID, message metadata, and annotations.

    Args:
        list_of_json (list): A list of JSON strings, where each string represents a conversation.
        label_column (str): The name of the column to store annotations (e.g., labels).

    Returns:
        pd.DataFrame: A DataFrame containing the flattened conversation data with columns:
            - activity_id: The ID of the activity.
            - user_id: The ID of the user.
            - role: The role of the speaker (e.g., user, model).
            - message_num: The message number in the conversation.
            - utterance_num: The utterance number within the same message and role.
            - sentence_en: The English sentence of the message.
            - <label_column>: The annotation or label for the message.

    Raises:
        JSONDecodeError: If a JSON string cannot be decoded using `json.loads`.
        demjson3.JSONDecodeError: If a JSON string cannot be decoded using `demjson3.decode`.
    '''
    # A list to save every entry (row for our dataframe)
    flattened_data = []

    for entry in list_of_json:
        # First try to decode the string using vanilla JSON module
        try: 
            # Try to do the conversion 
            json_entry = json.loads(entry)

        # In case of error use demjson3 which is capable to deal
        # with JSON-like string (that use single quotes instead of double quotes)
        except:
            # Print the entry that trigger the exception for manual inspection 
            print(entry)
            try:
                json_entry = demjson3.decode(entry)
            except:
                continue

        # Get the IDs of the activity and the conversation
        seance_id = json_entry['seance_ID']
        groupe_id = json_entry['groupe_ID']
        # For every utterance in the conversation
        for message in json_entry['conversation']:
            # Create an entry (row) to use then in the dataframe
            try:
                flattened_data.append({
                    'seance_ID' : seance_id,
                    'groupe_ID': groupe_id,
                    'file_ID': map_messageId_fileId[message['messageID']],
                    'messageID': message['messageID'],
                    'role': message['role'],
                    'message': message['message'],
                    label_columns[0] : message['annotation']['form'],    #ajouter ici les autres attributs
                    label_columns[1] : message['annotation']['tone'],
                    label_columns[2] : message['annotation']['content'],
                    label_columns[3] : message['annotation']['nature'],
                    label_columns[4] : message['annotation']['form2']
                })

            except KeyError as key_error:
                # DEBUG
                print(str(message['messageID']) + " is missing a key : " + str(key_error))

    # Convert the entries in a dataframe
    out_df = pd.DataFrame(flattened_data)
    # Calculate an index for each utterance
    out_df['utterance_num'] = out_df.groupby(['groupe_ID', 'messageID', 'role']).cumcount()
    # Sort the columns of the dataframe
    out_df = out_df[['seance_ID', 'groupe_ID', 'file_ID', 'role', 
                        'messageID', 'utterance_num','message', label_columns[0],label_columns[1],label_columns[2],label_columns[3],label_columns[4]]]
    return out_df


### Variables

In [115]:
# Reprise de la même architecture que celle utilisée par Luciano

PROMPT_FOLDER_OPENAI = 'prompts_openai'
PROMPT_FOLDER_MISTRAL = 'prompts_mistral'
PROMPT_FOLDER_MISTRAL_VOUVOIEMENT = 'prompts_mistral_vouvoiement'
PROMPT_FOLDER_MISTRAL_FORMV2 = 'prompts_mistral_form_v2'
OUT_FILE = 'dialogue_acts_example' + '.csv'
OUT_FOLDER = 'labeled_conversations'
LABEL_COLUMN_1 = 'forme'
LABEL_COLUMN_2 = 'ton'
LABEL_COLUMN_3 = 'content'
LABEL_COLUMN_4 = 'nature'
LABEL_COLUMN_5 = 'formv2'
#ajouter ici les autres labels des attributs
CATEGORY_COLUMN = 'type'

# Classification automatique : en utilisant une API Open AI

Code de Luciano : https://github.com/Lucky-Hidalgo/IRIT_Few-Shot-Tagging-Example

In [15]:
%pip install pandas
%pip install python-dotenv
%pip install openai
%pip install tqdm
%pip install demjson3

clear_output()

## Fonctions pour la classification

In [116]:
def classify_sentences(client: OpenAI, 
                        message: str,
                        prompt_system: str,
                        prompt_user: str,
                        prompt_agent: str):
    
    '''
    Classifies a given message using an OpenAI model with predefined prompts.

    This function sends a message to the OpenAI API along with system, user, and agent prompts
    to generate a classification response. It handles errors by logging them and returning `None`
    if the classification fails.

    Args:
        client (OpenAI): An instance of the OpenAI client used to interact with the API.
        message (str): The message to be classified.
        prompt_system (str): The system prompt to guide the model's behavior.
        prompt_user (str): The user prompt to provide context or instructions.
        prompt_agent (str): The agent prompt to simulate the assistant's role.

    Returns:
        str: The content of the model's response if the classification is successful.
        None: If an error occurs during the classification process.
    '''
    # Docs: https://platform.openai.com/docs/api-reference/chat/create 
    # It is possible to thinker with different parameters such as  response_format, seed,
    # frequency_penalty and so on, but in this case we specified:
    # Model which is going to do the classification, in this case gpt-4o-mini
    # The messages:
    #   - The system message is the task and role of the model
    #   - The user message is an example of an expected input to feed the model
    #   - The agent message is an example of an expected output for the provided model
    #   - The second user message is the text to label (The dictionary with the entire conversation)
    # Temperature: Set to zero to provide an output a bit more deterministic, but is not guaranteed to
    #              be deterministic (For that you should use a seed, which is in beta right now)
    # Stream: Set to False, to only recieve the entire response message, to avoid processing the stream after
    # Timeout: Set to 25 minutes, because conversations can take a lot of time to process.
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": prompt_system},
            {"role": "user", "content": prompt_user},
            {"role": "assistant", "content": prompt_agent},
            {"role": "user", "content": message}
        ],
        temperature=0,
        stream=False,
        timeout=1500  # Set the timeout to 1500 seconds
    )

    # Capture errors
    try:
        # If not error, we get the response
        return response.choices[0].message.content
    except:
        # If error, the function returns None, but the error message is appended
        # to a global array
        errors.append({'message': message, 'response': response})
        return None


def classify_conversation(openai_client: OpenAI,
                            prompt_system: str,
                            prompt_user: str,
                            prompt_agent: str,
                            conversations: list):
    '''
    Classifies a list of conversations using an OpenAI model with predefined prompts.

    This function iterates over a list of conversations, classifies each one using the
    `classify_sentences` function, and collects the results. A progress bar is displayed
    to track the classification process.

    Args:
        openai_client (OpenAI): An instance of the OpenAI client used to interact with the API.
        prompt_system (str): The system prompt to guide the model's behavior.
        prompt_user (str): The user prompt to provide context or instructions.
        prompt_agent (str): The agent prompt to simulate the assistant's role.
        conversations (list): A list of conversations (strings) to be classified.

    Returns:
        list: A list of classification results corresponding to each conversation. Each result
                is the output of the `classify_sentences` function, which may be a string or `None`
                if an error occurred during classification.
    '''

    out = []
    for conversation in tqdm(conversations, desc='Classifying...'):
        response = classify_sentences(openai_client, 
                                        str(conversation), 
                                        prompt_system, 
                                        prompt_user, 
                                        prompt_agent)
        out.append(response)
    return out


## Instanciation du client Open AI et appel de la fonction pour classifier

In [45]:
f = open("env.txt")
key = f.read()
f.close()
key = key[15:]

In [50]:
# Load my OPEN AI API Key
load_dotenv() 

# Save value of OPEN AI API KEY to a variable
#OPENAI_KEY = os.getenv("OPENAI_API_KEY")

# Create an instance of the OPEN AI client using the API key
openai_client = OpenAI(api_key=key)   

# Read the prompts
prompt_system, prompt_user, prompt_agent = read_prompts(PROMPT_FOLDER_OPENAI)


In [51]:
# Do the classification
labeled_data = classify_conversation(openai_client,
                                    prompt_system,
                                    prompt_user,
                                    prompt_agent,
                                    conversations_dict[:5]) #modifier ici pour la data in 

Classifying...:  20%|██        | 1/5 [00:06<00:26,  6.72s/it]


RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-Hb86CiMZyMYd9WwCXvjB4As5 on requests per min (RPM): Limit 3, Used 3, Requested 1. Please try again in 20s. Visit https://platform.openai.com/account/rate-limits to learn more. You can increase your rate limit by adding a payment method to your account at https://platform.openai.com/account/billing.', 'type': 'requests', 'param': None, 'code': 'rate_limit_exceeded'}}

In [16]:
print(labeled_data[3])

{
  "seance_ID": "np.int64(1)",
  "groupe_ID": "np.int64(3)",
  "conversation": [
    {
      "messageID": 15,
      "role": "Driver",
      "message": "je ne sais pas comment voir le résultat ",
      "annotation": {
        "form": "negativeSentence",
        "tone": "positiveTone",
        "content": "notRelatedToProgramming_Other",
        "nature": "noFeedback"
      }
    },
    {
      "messageID": 16,
      "role": "Navigator",
      "message": "faut mettre print(coincide)",
      "annotation": {
        "form": "imperativeSentence",
        "tone": "positiveTone",
        "content": "relatedToProgramming_relatedToTask",
        "nature": "correctResponse"
      }
    },
    {
      "messageID": 17,
      "role": "Driver",
      "message": "essaye",
      "annotation": {
        "form": "imperativeSentence",
        "tone": "positiveTone",
        "content": "notRelatedToProgramming_Other",
        "nature": "noFeedback"
      }
    },
    {
      "messageID": 18,
      "role":

In [17]:
errors

[]

In [None]:
out_df = convert_to_dataframe(labeled_data, [LABEL_COLUMN_1,LABEL_COLUMN_2,LABEL_COLUMN_3,LABEL_COLUMN_4])

out_df

Unnamed: 0,seance_ID,groupe_ID,role,messageID,utterance_num,message,forme,ton,content,nature
0,1,0,Navigator,0,0,"Tu es qui ,",question,positiveTone,notRelatedToProgramming_Other,noFeedback
1,1,0,Navigator,1,0,On se met 5 partout ok ?,question,positiveTone,notRelatedToProgramming_Other,noFeedback
2,np.int64(1),np.int64(1),Driver,2,0,Opé,positiveSentence,positiveTone,notRelatedToProgramming_Other,noFeedback
3,np.int64(1),np.int64(1),Driver,3,0,Opé lmkt,positiveSentence,positiveTone,notRelatedToProgramming_Other,noFeedback
4,np.int64(1),np.int64(1),Navigator,4,0,A PAR SA LE COUZ,positiveSentence,positiveTone,notRelatedToProgramming_Other,noFeedback
5,np.int64(1),np.int64(1),Navigator,5,0,Opé,positiveSentence,positiveTone,notRelatedToProgramming_Other,noFeedback
6,np.int64(1),np.int64(1),Navigator,6,0,Ou lé jolie ?,question,positiveTone,notRelatedToProgramming_Other,noFeedback
7,np.int64(1),np.int64(1),Navigator,7,0,A ou,positiveSentence,positiveTone,notRelatedToProgramming_Other,noFeedback
8,np.int64(1),np.int64(1),Navigator,8,0,Commencé,positiveSentence,positiveTone,notRelatedToProgramming_Other,noFeedback
9,np.int64(1),np.int64(1),Navigator,9,0,Comment nous fé la,question,positiveTone,notRelatedToProgramming_Other,noFeedback


In [20]:
out_df.to_csv("annotation_openai_conv_0_to_5.csv")

In [21]:
out_df.iloc[34]

seance_ID                                              np.int64(1)
groupe_ID                                              np.int64(4)
role                                                     Navigator
messageID                                                       34
utterance_num                                                    0
message          quand je teste le code il me disent y'a des er...
forme                                             positiveSentence
ton                                                   positiveTone
content                         relatedToProgramming_relatedToTask
nature                                                      errors
Name: 34, dtype: object

In [22]:
out_df.iloc[34]['message']

"quand je teste le code il me disent y'a des erreurs de type"

In [23]:
out_df.iloc[42]

seance_ID                                 np.int64(1)
groupe_ID                                 np.int64(4)
role                                           Driver
messageID                                          42
utterance_num                                       0
message          je crois c'est bon , je suis pas sur
forme                                positiveSentence
ton                                      positiveTone
content                 notRelatedToProgramming_Other
nature                                     noFeedback
Name: 42, dtype: object

In [24]:
out_df.iloc[44]

seance_ID                                              np.int64(1)
groupe_ID                                              np.int64(4)
role                                                     Navigator
messageID                                                       44
utterance_num                                                    0
message          il y a des erreurs sur plusieurs lignes mais l...
forme                                             positiveSentence
ton                                                   positiveTone
content                         relatedToProgramming_relatedToTask
nature                                                      errors
Name: 44, dtype: object

In [25]:
out_df.iloc[44]['message']

"il y a des erreurs sur plusieurs lignes mais le dernier j'ai pas"

In [26]:
out_df.to_csv("classified_test.csv")

## Messages de test

In [27]:
test_message = [{'seance_ID': np.int64(99),
 'groupe_ID': np.int64(99),
 'conversation': [{'messageID': 99,
   'role': 'Navigator',
   'message': 'va plus vite tes trop lent'}]}]

In [None]:
prompt_system, prompt_user, prompt_agent = read_prompts(PROMPT_FOLDER_OPENAI)

In [113]:
test_df = classify_conversation(openai_client,
                                  prompt_system,
                                     prompt_user,
                                     prompt_agent,
                                     [conversations_dict[4]])

Classifying...: 100%|██████████| 1/1 [00:28<00:00, 28.41s/it]


In [114]:
test_df

['{\n  "seance_ID": "np.int64(1)",\n  "groupe_ID": "np.int64(4)",\n  "conversation": [\n    {\n      "messageID": 22,\n      "role": "Driver",\n      "message": "Bonjour",\n      "annotation": {\n        "form": "positiveSentence",\n        "tone": "positiveTone",\n        "content": "notRelatedToProgramming_Other",\n        "nature": "noFeedback"\n      }\n    },\n    {\n      "messageID": 23,\n      "role": "Navigator",\n      "message": "Bonjour",\n      "annotation": {\n        "form": "positiveSentence",\n        "tone": "positiveTone",\n        "content": "notRelatedToProgramming_Other",\n        "nature": "noFeedback"\n      }\n    },\n    {\n      "messageID": 24,\n      "role": "Driver",\n      "message": "tu pense si je fais un for in in range avec un for j in range dedans ça peut marcher? ",\n      "annotation": {\n        "form": "question",\n        "tone": "positiveTone",\n        "content": "relatedToProgramming_relatedToTask",\n        "nature": "noFeedback"\n      }\n 

In [115]:
out_test_df = convert_to_dataframe(test_df, [LABEL_COLUMN_1,LABEL_COLUMN_2,LABEL_COLUMN_3,LABEL_COLUMN_4])
out_test_df

Unnamed: 0,seance_ID,groupe_ID,role,messageID,utterance_num,message,forme,ton,content,nature
0,np.int64(1),np.int64(4),Driver,22,0,Bonjour,positiveSentence,positiveTone,notRelatedToProgramming_Other,noFeedback
1,np.int64(1),np.int64(4),Navigator,23,0,Bonjour,positiveSentence,positiveTone,notRelatedToProgramming_Other,noFeedback
2,np.int64(1),np.int64(4),Driver,24,0,tu pense si je fais un for in in range avec un...,question,positiveTone,relatedToProgramming_relatedToTask,noFeedback
3,np.int64(1),np.int64(4),Navigator,25,0,oui,positiveSentence,positiveTone,notRelatedToProgramming_Other,noFeedback
4,np.int64(1),np.int64(4),Driver,26,0,tu veux editer?,question,positiveTone,notRelatedToProgramming_relativeToRoles,noFeedback
5,np.int64(1),np.int64(4),Navigator,27,0,comme tu veux,positiveSentence,positiveTone,notRelatedToProgramming_Other,noFeedback
6,np.int64(1),np.int64(4),Driver,28,0,je teste si ça marche pas on echange,positiveSentence,positiveTone,notRelatedToProgramming_Other,noFeedback
7,np.int64(1),np.int64(4),Navigator,29,0,vsy,positiveSentence,positiveTone,notRelatedToProgramming_Other,noFeedback
8,np.int64(1),np.int64(4),Navigator,30,0,"en vrai c'est pas mieux de faire un ""if len(a)...",negativeSentence,positiveTone,relatedToProgramming_relatedToTask,howToProceed
9,np.int64(1),np.int64(4),Driver,31,0,tu veux teste?,question,positiveTone,notRelatedToProgramming_relativeToRoles,noFeedback


In [None]:
conversations_dict[6]

{'seance_ID': np.int64(1),
 'groupe_ID': np.int64(7),
 'conversation': [{'messageID': 49, 'role': 'Driver', 'message': 'Yo!'},
  {'messageID': 50, 'role': 'Navigator', 'message': 'OUI'},
  {'messageID': 51, 'role': 'Navigator', 'message': ',????'},
  {'messageID': 52, 'role': 'Driver', 'message': 'Oui'},
  {'messageID': 53, 'role': 'Navigator', 'message': "C'est bon pour toi"},
  {'messageID': 54, 'role': 'Driver', 'message': "C'est bon"},
  {'messageID': 55,
   'role': 'Driver',
   'message': 'Est ce que faut vérifié si les listes sont de la même longueur ?'},
  {'messageID': 56,
   'role': 'Navigator',
   'message': 'def coincide(tableau1, tableau2):     # On s\'assure que les deux tableaux ont la même longueur     if len(tableau1) != len(tableau2):         return []      # On crée un tableau vide pour stocker les indices où les éléments sont identiques     result = []          # On parcourt les deux tableaux et on compare les éléments     for i in range(len(tableau1)):         if ta

In [None]:
prompt_system, prompt_user, prompt_agent = read_prompts(PROMPT_FOLDER_OPENAI)

In [96]:
test_df = classify_conversation(openai_client,
                                  prompt_system,
                                     prompt_user,
                                     prompt_agent,
                                     [conversations_dict[6]])

Classifying...: 100%|██████████| 1/1 [00:37<00:00, 37.08s/it]


In [159]:
test_df

['{\n  "seance_ID": "np.int64(1)",\n  "groupe_ID": "np.int64(4)",\n  "conversation": [\n    {\n      "messageID": 22,\n      "role": "Driver",\n      "message": "Bonjour",\n      "annotation": {\n        "form": "positiveSentence",\n        "tone": "positiveTone",\n        "content": "notRelatedToProgramming_Other",\n        "nature": "noFeedback"\n      }\n    },\n    {\n      "messageID": 23,\n      "role": "Navigator",\n      "message": "Bonjour",\n      "annotation": {\n        "form": "positiveSentence",\n        "tone": "positiveTone",\n        "content": "notRelatedToProgramming_Other",\n        "nature": "noFeedback"\n      }\n    },\n    {\n      "messageID": 24,\n      "role": "Driver",\n      "message": "tu pense si je fais un for in in range avec un for j in range dedans ça peut marcher? ",\n      "annotation": {\n        "form": "question",\n        "tone": "positiveTone",\n        "content": "relatedToProgramming_relatedToTask",\n        "nature": "noFeedback"\n      }\n 

In [161]:
out_test_df = convert_to_dataframe(test_df, [LABEL_COLUMN_1,LABEL_COLUMN_2,LABEL_COLUMN_3,LABEL_COLUMN_4])
out_test_df

Unnamed: 0,seance_ID,groupe_ID,role,messageID,utterance_num,message,forme,ton,content,nature
0,np.int64(1),np.int64(4),Driver,22,0,Bonjour,positiveSentence,positiveTone,notRelatedToProgramming_Other,noFeedback
1,np.int64(1),np.int64(4),Navigator,23,0,Bonjour,positiveSentence,positiveTone,notRelatedToProgramming_Other,noFeedback
2,np.int64(1),np.int64(4),Driver,24,0,tu pense si je fais un for in in range avec un...,question,positiveTone,relatedToProgramming_relatedToTask,noFeedback
3,np.int64(1),np.int64(4),Navigator,25,0,oui,positiveSentence,positiveTone,notRelatedToProgramming_Other,noFeedback
4,np.int64(1),np.int64(4),Driver,26,0,tu veux editer?,question,positiveTone,notRelatedToProgramming_relativeToRoles,noFeedback
5,np.int64(1),np.int64(4),Navigator,27,0,comme tu veux,positiveSentence,positiveTone,notRelatedToProgramming_Other,noFeedback
6,np.int64(1),np.int64(4),Driver,28,0,je teste si ça marche pas on echange,positiveSentence,positiveTone,notRelatedToProgramming_Other,noFeedback
7,np.int64(1),np.int64(4),Navigator,29,0,vsy,positiveSentence,positiveTone,notRelatedToProgramming_Other,noFeedback
8,np.int64(1),np.int64(4),Navigator,30,0,"en vrai c'est pas mieux de faire un ""if len(a)...",negativeSentence,positiveTone,relatedToProgramming_relatedToTask,howToProceed
9,np.int64(1),np.int64(4),Driver,31,0,tu veux teste?,question,positiveTone,notRelatedToProgramming_relativeToRoles,noFeedback


## Classification sur le coprus pour calcul du kappa

In [None]:
df = pd.read_csv("SousCorpusMessages.csv")
# Ensure that the message num is an integer for sorting
df['messageID'] = df['messageID'].astype(int) 
df['groupID'] = df['groupID'].astype(int) 
df['seanceID'] = df['seanceID'].astype(int)  

# Group the dataframe by conversation
grouped = df.groupby(['seanceID','groupID'])

# We'll save the conversations as an array of dictionaries were every conversation is an entry
conversations_dict = []
# For every conversation in the dataframe
for activity_id, group in grouped:
    # Convert the utterances into a list of dictionaries, then sort entries by message num
    conversation_text = group.sort_values('messageID').apply(
        lambda row: {"messageID": row['messageID'], "role": row['role'], "message": row['message']},
        axis=1
    ).tolist()
    
    # Create a conversation
    conversation = {
        "seance_ID": activity_id[0],
        "groupe_ID": activity_id[1],
        "conversation": conversation_text
    }
    # Append the conversations as a dictionary
    conversations_dict.append(conversation)

# This is not a JSON, it will be important later
conversations_dict[0]

{'seance_ID': 1,
 'groupe_ID': 4,
 'conversation': [{'messageID': 24,
   'role': 'Driver',
   'message': 'tu pense si je fais un for in in range avec un for j in range dedans �a peut marcher? ',
   'fileId': '1db9dee2-2702-457f-aa07-6b60589446ce'},
  {'messageID': 30,
   'role': 'Navigator',
   'message': 'en vrai c\'est pas mieux de faire un "if len(a) != len(b)"',
   'fileId': '1db9dee2-2702-457f-aa07-6b60589446ce'},
  {'messageID': 34,
   'role': 'Navigator',
   'message': "quand je teste le code il me disent y'a des erreurs de type",
   'fileId': '1db9dee2-2702-457f-aa07-6b60589446ce'}]}

In [19]:
len(conversations_dict)

102

In [None]:
prompt_system, prompt_user, prompt_agent = read_prompts(PROMPT_FOLDER_OPENAI)
classified = classify_conversation(openai_client,
                                    prompt_system,
                                    prompt_user,
                                    prompt_agent,
                                    conversations_dict[:5])

Classifying...:   0%|          | 0/5 [00:02<?, ?it/s]


RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-Hb86CiMZyMYd9WwCXvjB4As5 on tokens per min (TPM): Limit 100000, Used 100000, Requested 1772. Please try again in 12h45m30.24s. Visit https://platform.openai.com/account/rate-limits to learn more. You can increase your rate limit by adding a payment method to your account at https://platform.openai.com/account/billing.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}

In [154]:
classified

['{\n  "seance_ID": 1,\n  "groupe_ID": 4,\n  "conversation": [\n    {\n      "messageID": 24,\n      "role": "Driver",\n      "message": "tu pense si je fais un for in in range avec un for j in range dedans ça peut marcher? ",\n      "form": {\n        "annotation": "question"\n      },\n      "tone": {\n        "annotation": "positiveTone"\n      },\n      "content": {\n        "annotation": "relatedToProgramming_relatedToTask"\n      },\n      "nature": {\n        "annotation": "howToProceed"\n      }\n    },\n    {\n      "messageID": 30,\n      "role": "Navigator",\n      "message": "en vrai c\'est pas mieux de faire un \\"if len(a) != len(b)\\"",\n      "form": {\n        "annotation": "question"\n      },\n      "tone": {\n        "annotation": "positiveTone"\n      },\n      "content": {\n        "annotation": "relatedToProgramming_relatedToTask"\n      },\n      "nature": {\n        "annotation": "howToProceed"\n      }\n    },\n    {\n      "messageID": 34,\n      "role": "Nav

In [162]:
out_df = convert_to_dataframe(classified, [LABEL_COLUMN_1,LABEL_COLUMN_2,LABEL_COLUMN_3,LABEL_COLUMN_4])
out_df

KeyError: 'annotation'

In [157]:
classified2 = classify_conversation(openai_client,
                                  prompt_system,
                                     prompt_user,
                                     prompt_agent,
                                     conversations_dict[8:])

Classifying...: 100%|██████████| 12/12 [03:18<00:00, 16.52s/it]


In [None]:
out_df2 = convert_to_dataframe(classified2, [LABEL_COLUMN_1,LABEL_COLUMN_2,LABEL_COLUMN_3,LABEL_COLUMN_4])
out_df2

## Classifications avec prompt en français

In [17]:
prompt_system, prompt_user, prompt_agent = read_prompts(PROMPT_FOLDER_MISTRAL) #Prompts en français
classified = classify_conversation(openai_client,
                                    prompt_system,
                                    prompt_user,
                                    prompt_agent,
                                    conversations_dict[:5])

NameError: name 'openai_client' is not defined

In [48]:
out_df = convert_to_dataframe(classified, [LABEL_COLUMN_1,LABEL_COLUMN_2,LABEL_COLUMN_3,LABEL_COLUMN_4])
out_df

Unnamed: 0,seance_ID,groupe_ID,role,messageID,utterance_num,message,forme,ton,content,nature
0,np.int64(1),np.int64(0),Navigator,0,0,"Tu es qui ,",question,positiveTone,notRelatedToProgramming_Other,noFeedback
1,np.int64(1),np.int64(0),Navigator,1,0,On se met 5 partout ok ?,question,positiveTone,notRelatedToProgramming_Other,noFeedback
2,np.int64(1),np.int64(1),Driver,2,0,Opé,positiveSentence,positiveTone,notRelatedToProgramming_Other,noFeedback
3,np.int64(1),np.int64(1),Driver,3,0,Opé lmkt,positiveSentence,positiveTone,notRelatedToProgramming_Other,noFeedback
4,np.int64(1),np.int64(1),Navigator,4,0,A PAR SA LE COUZ,positiveSentence,positiveTone,notRelatedToProgramming_Other,noFeedback
5,np.int64(1),np.int64(1),Navigator,5,0,Opé,positiveSentence,positiveTone,notRelatedToProgramming_Other,noFeedback
6,np.int64(1),np.int64(1),Navigator,6,0,Ou lé jolie ?,question,positiveTone,notRelatedToProgramming_Other,noFeedback
7,np.int64(1),np.int64(1),Navigator,7,0,A ou,positiveSentence,positiveTone,notRelatedToProgramming_Other,noFeedback
8,np.int64(1),np.int64(1),Navigator,8,0,Commencé,positiveSentence,positiveTone,notRelatedToProgramming_Other,noFeedback
9,np.int64(1),np.int64(1),Navigator,9,0,Comment nous fé la,question,positiveTone,notRelatedToProgramming_Other,noFeedback


In [49]:
out_df.to_csv("annotation_openai_conv_0_to_5_french_prompt.csv")

# Classification automatique : API Mistral AI

## Instanciation du modèle

In [11]:
#! pip install mistralai

In [117]:
f = open("keys.txt")
key = f.readline().strip('\n')
f.close()
#key = key[12:]

In [118]:
api_key = key
model = "open-mistral-nemo"

client = Mistral(api_key=api_key)

## Fonctions de classification

In [135]:
def classify_sentences_mistral(client: Mistral, 
                        message: str,
                        prompt_system: str,
                        prompt_user: str,
                        prompt_agent: str):
    
    '''
    Classifies a given message using a Mistral model with predefined prompts.

    This function sends a message to the Mistral API along with system, user, and agent prompts
    to generate a classification response. It handles errors by logging them and returning `None`
    if the classification fails.

    Args:
        client (Mistral): An instance of the Mistral client used to interact with the API.
        message (str): The message to be classified.
        prompt_system (str): The system prompt to guide the model's behavior.
        prompt_user (str): The user prompt to provide context or instructions.
        prompt_agent (str): The agent prompt to simulate the assistant's role.

    Returns:
        str: The content of the model's response if the classification is successful.
        None: If an error occurs during the classification process.
    '''
    
    prompt = prompt_system + "\n Un exemple d'entrée est :\n" + prompt_user +"\n et un exemple de sortie est :\n"+ prompt_agent+".\n Les messages que tu dois annoter sont :\n " +message
    #print(prompt)
    

    response = client.chat.complete(
        model="open-mistral-nemo",
        messages=[
            {"role": "user", "content": prompt}
        ],
        temperature=0,
        stream=False,
        response_format = {
            "type": "json_object",
        }
    )


    # Capture errors
    try:
        # If not error, we get the response
        return response.choices[0].message.content
    except:
        # If error, the function returns None, but the error message is appended
        # to a global array
        errors.append({'message': message, 'response': response})
        return None


def classify_conversation_mistral(mistral_client: Mistral,
                        prompt_system: str,
                        prompt_user: str,
                        prompt_agent: str,
                        conversations: list):
    '''
    Classifies a list of conversations using an OpenAI model with predefined prompts.

    This function iterates over a list of conversations, classifies each one using the
    `classify_sentences` function, and collects the results. A progress bar is displayed
    to track the classification process.

    Args:
        openai_client (OpenAI): An instance of the OpenAI client used to interact with the API.
        prompt_system (str): The system prompt to guide the model's behavior.
        prompt_user (str): The user prompt to provide context or instructions.
        prompt_agent (str): The agent prompt to simulate the assistant's role.
        conversations (list): A list of conversations (strings) to be classified.

    Returns:
        list: A list of classification results corresponding to each conversation. Each result
                is the output of the `classify_sentences` function, which may be a string or `None`
                if an error occurred during classification.
    '''

    out = []
    for conversation in tqdm(conversations, desc='Classifying...'):     
        response = classify_sentences_mistral(mistral_client, 
                                        str(conversation), 
                                        prompt_system, 
                                        prompt_user, 
                                        prompt_agent)

        out.append(response)
    return out


## Classification

In [120]:
conversations_dict[0]

{'seance_ID': 1,
 'groupe_ID': 0,
 'conversation': [{'messageID': 0,
   'role': 'Navigator',
   'message': 'Tu es qui ,'},
  {'messageID': 1,
   'role': 'Navigator',
   'message': 'On se met 5 partout ok ?'}]}

In [121]:
client = Mistral(api_key=api_key)
prompt_system, prompt_user, prompt_agent = read_prompts(PROMPT_FOLDER_MISTRAL)
labeled_data = classify_conversation_mistral(client,
                                    prompt_system,
                                    prompt_user,
                                    prompt_agent,
                                    conversations_dict[:5])

Classifying...: 100%|██████████| 5/5 [00:25<00:00,  5.18s/it]


In [122]:
labeled_data

['{"seance_ID": 1, "groupe_ID": 0, "conversation": [{"messageID": 0, "role": "Navigator", "message": "Tu es qui ,", "annotation": {"form": "question", "tone": "positiveTone", "content": "notRelatedToProgramming_Other", "nature": "noFeedback"}}, {"messageID": 1, "role": "Navigator", "message": "On se met 5 partout ok ?", "annotation": {"form": "imperativeSentence", "tone": "positiveTone", "content": "notRelatedToProgramming_Other", "nature": "noFeedback"}}]}',
 '{"seance_ID": 1, "groupe_ID": 1, "conversation": [{"messageID": 2, "role": "Driver", "message": "Opé", "annotation": {"form": "positiveSentence", "tone": "positiveTone", "content": "notRelatedToProgramming_Other", "nature": "noFeedback"}}, {"messageID": 3, "role": "Driver", "message": "Opé lmkt", "annotation": {"form": "positiveSentence", "tone": "positiveTone", "content": "notRelatedToProgramming_Other", "nature": "noFeedback"}}, {"messageID": 4, "role": "Navigator", "message": "A PAR SA LE COUZ ", "annotation": {"form": "posit

In [107]:
out_df = convert_to_dataframe(labeled_data, [LABEL_COLUMN_1,LABEL_COLUMN_2,LABEL_COLUMN_3,LABEL_COLUMN_4])

out_df.head()

IndexError: list index out of range

In [68]:
out_df.to_csv("annotation_mistral_conv_0_to_5.csv")

## Classification avec vouvoiement dans le prompt

In [69]:
client = Mistral(api_key=api_key)
prompt_system, prompt_user, prompt_agent = read_prompts(PROMPT_FOLDER_MISTRAL_VOUVOIEMENT)
labeled_data = classify_conversation_mistral(client,
                                    prompt_system,
                                    prompt_user,
                                    prompt_agent,
                                    conversations_dict[:5])

Classifying...: 100%|██████████| 5/5 [00:47<00:00,  9.53s/it]


In [70]:
out_df = convert_to_dataframe(labeled_data, [LABEL_COLUMN_1,LABEL_COLUMN_2,LABEL_COLUMN_3,LABEL_COLUMN_4])

out_df.head()

Unnamed: 0,seance_ID,groupe_ID,role,messageID,utterance_num,message,forme,ton,content,nature
0,1,0,Navigator,0,0,Tu es qui ?,question,positiveTone,notRelatedToProgramming_Other,askingForHelp
1,1,0,Navigator,1,0,On se met 5 partout ok ?,positiveSentence,positiveTone,notRelatedToProgramming_Other,noFeedback
2,1,1,Driver,2,0,Opé,positiveSentence,positiveTone,notRelatedToProgramming_Other,noFeedback
3,1,1,Driver,3,0,Opé lmkt,positiveSentence,positiveTone,notRelatedToProgramming_Other,noFeedback
4,1,1,Navigator,4,0,A PAR SA LE COUZ,positiveSentence,positiveTone,notRelatedToProgramming_Other,noFeedback


In [71]:
out_df.to_csv("annotation_mistral_vouvoiement_conv_0_to_5.csv")

## Form v2

In [148]:
client = Mistral(api_key=api_key)
prompt_system, prompt_user, prompt_agent = read_prompts(PROMPT_FOLDER_MISTRAL_FORMV2)
labeled_data = classify_conversation_mistral(client,
                                    prompt_system,
                                    prompt_user,
                                    prompt_agent,
                                    conversations_dict[:5])

Classifying...:   0%|          | 0/5 [00:00<?, ?it/s]

Classifying...: 100%|██████████| 5/5 [00:25<00:00,  5.13s/it]


In [149]:
labeled_data

['{\n  "seance_ID": 1,\n  "groupe_ID": 0,\n  "conversation": [\n    {\n      "messageID": 0,\n      "role": "Navigator",\n      "message": "Tu es qui ,",\n      "annotation": {\n        "form": "question",\n        "tone": "neutral",\n        "content": "notRelatedToProgramming_Other",\n        "nature": "askingForHelp",\n        "form2": "question"\n      }\n    },\n    {\n      "messageID": 1,\n      "role": "Navigator",\n      "message": "On se met 5 partout ok ?",\n      "annotation": {\n        "form": "positiveSentence",\n        "tone": "positiveTone",\n        "content": "notRelatedToProgramming_Other",\n        "nature": "noFeedback",\n        "form2": "neutral"\n      }\n    }\n  ]\n}',
 '{\n  "seance_ID": 1,\n  "groupe_ID": 1,\n  "conversation": [\n    {\n      "messageID": 2,\n      "role": "Driver",\n      "message": "Opé",\n      "annotation": {\n        "form": "positiveSentence",\n        "tone": "positiveTone",\n        "content": "notRelatedToProgramming_Other",\n    

In [159]:
out_df = convert_to_dataframe(labeled_data, [LABEL_COLUMN_1,LABEL_COLUMN_2,LABEL_COLUMN_3,LABEL_COLUMN_4,LABEL_COLUMN_5])

out_df.head()

Unnamed: 0,seance_ID,groupe_ID,file_ID,role,messageID,utterance_num,message,forme,ton,content,nature,formv2
0,1,0,023a3090-7b0a-48c4-93f0-c200d9afce41,Navigator,0,0,"Tu es qui ,",question,neutral,notRelatedToProgramming_Other,askingForHelp,question
1,1,0,023a3090-7b0a-48c4-93f0-c200d9afce41,Navigator,1,0,On se met 5 partout ok ?,positiveSentence,positiveTone,notRelatedToProgramming_Other,noFeedback,neutral
2,1,1,0435458c-dc49-4094-b970-1db125fc235c,Driver,2,0,Opé,positiveSentence,positiveTone,notRelatedToProgramming_Other,noFeedback,neutral
3,1,1,0435458c-dc49-4094-b970-1db125fc235c,Driver,3,0,Opé lmkt,positiveSentence,positiveTone,notRelatedToProgramming_Other,noFeedback,neutral
4,1,1,0435458c-dc49-4094-b970-1db125fc235c,Navigator,4,0,A PAR SA LE COUZ,positiveSentence,negativeTone,notRelatedToProgramming_Other,noFeedback,neutral


In [160]:
out_df.to_csv("form2.csv")

## Classification complète

In [161]:
client = Mistral(api_key=api_key)
prompt_system, prompt_user, prompt_agent = read_prompts(PROMPT_FOLDER_MISTRAL_FORMV2)
labeled_data = classify_conversation_mistral(client,
                                    prompt_system,
                                    prompt_user,
                                    prompt_agent,
                                    conversations_dict)

Classifying...:   0%|          | 0/102 [00:00<?, ?it/s]

Classifying...: 100%|██████████| 102/102 [25:07<00:00, 14.78s/it]


In [165]:
labeled_data

['{\n  "seance_ID": 1,\n  "groupe_ID": 0,\n  "conversation": [\n    {\n      "messageID": 0,\n      "role": "Navigator",\n      "message": "Tu es qui ,",\n      "annotation": {\n        "form": "question",\n        "tone": "neutral",\n        "content": "notRelatedToProgramming_Other",\n        "nature": "askingForHelp",\n        "form2": "question"\n      }\n    },\n    {\n      "messageID": 1,\n      "role": "Navigator",\n      "message": "On se met 5 partout ok ?",\n      "annotation": {\n        "form": "positiveSentence",\n        "tone": "positiveTone",\n        "content": "notRelatedToProgramming_Other",\n        "nature": "noFeedback",\n        "form2": "imperativeSentence"\n      }\n    }\n  ]\n}',
 '{\n  "seance_ID": 1,\n  "groupe_ID": 1,\n  "conversation": [\n    {\n      "messageID": 2,\n      "role": "Driver",\n      "message": "Opé",\n      "annotation": {\n        "form": "positiveSentence",\n        "tone": "positiveTone",\n        "content": "notRelatedToProgramming_Ot

In [175]:
out_df = convert_to_dataframe(labeled_data, [LABEL_COLUMN_1,LABEL_COLUMN_2,LABEL_COLUMN_3,LABEL_COLUMN_4,LABEL_COLUMN_5])

out_df.head()

56 is missing a key : 'annotation'
184 is missing a key : 'form'
185 is missing a key : 'form'
186 is missing a key : 'form'
187 is missing a key : 'form'
188 is missing a key : 'form'
189 is missing a key : 'form'
190 is missing a key : 'form'
191 is missing a key : 'form'
192 is missing a key : 'form'
193 is missing a key : 'form'
194 is missing a key : 'form'
195 is missing a key : 'form'
196 is missing a key : 'form'
197 is missing a key : 'form'
198 is missing a key : 'form'
199 is missing a key : 'form'
200 is missing a key : 'form'
498 is missing a key : 'form'
1264 is missing a key : 'form'
1268 is missing a key : 'form'
1381 is missing a key : 'form'
{
  "seance_ID": 2,
  "groupe_ID": 75,
  "conversation": [
    {
      "messageID": 1540,
      "role": "Driver",
      "message": "c qui?",
      "annotation": {
        "form": "negativeSentence",
        "tone": "positiveTone",
        "content": "relatedToProgramming_notRelatedToTask",
        "nature": "askingForHelp",
      

Unnamed: 0,seance_ID,groupe_ID,file_ID,role,messageID,utterance_num,message,forme,ton,content,nature,formv2
0,1,0,023a3090-7b0a-48c4-93f0-c200d9afce41,Navigator,0,0,"Tu es qui ,",question,neutral,notRelatedToProgramming_Other,askingForHelp,question
1,1,0,023a3090-7b0a-48c4-93f0-c200d9afce41,Navigator,1,0,On se met 5 partout ok ?,positiveSentence,positiveTone,notRelatedToProgramming_Other,noFeedback,imperativeSentence
2,1,1,0435458c-dc49-4094-b970-1db125fc235c,Driver,2,0,Opé,positiveSentence,positiveTone,notRelatedToProgramming_Other,noFeedback,neutral
3,1,1,0435458c-dc49-4094-b970-1db125fc235c,Driver,3,0,Opé lmkt,positiveSentence,positiveTone,notRelatedToProgramming_Other,noFeedback,neutral
4,1,1,0435458c-dc49-4094-b970-1db125fc235c,Navigator,4,0,A PAR SA LE COUZ,positiveSentence,negativeTone,notRelatedToProgramming_Other,noFeedback,neutral


In [176]:
out_df.to_csv("complete.csv")