### Nombre de mots, nombre de caractères et source du message

In [220]:
import pandas as pd

In [221]:
all_messages = pd.read_csv('all_messages.csv')
all_messages.head(2)

Unnamed: 0,messageID,user,role,message,groupID,fileID,date,time,seanceID
0,0,186202.0,Navigator,"Tu es qui ,",0,023a3090-7b0a-48c4-93f0-c200d9afce41,2024-10-09,04:38:10,1
1,1,186202.0,Navigator,On se met 5 partout ok ?,0,023a3090-7b0a-48c4-93f0-c200d9afce41,2024-10-09,04:38:23,1


In [222]:
attributed = all_messages.copy()

In [223]:
attributed['taille'] = [len(str(m)) for m in attributed['message']]
attributed['nombre mots'] = [len(list(filter(None,str(m).split(' ')))) for m in attributed['message']] #Code complexe pour éviter le comptage des chaines vides dues au split (ex message 4)
attributed['source'] = attributed['role']
attributed.head(5)

Unnamed: 0,messageID,user,role,message,groupID,fileID,date,time,seanceID,taille,nombre mots,source
0,0,186202.0,Navigator,"Tu es qui ,",0,023a3090-7b0a-48c4-93f0-c200d9afce41,2024-10-09,04:38:10,1,11,4,Navigator
1,1,186202.0,Navigator,On se met 5 partout ok ?,0,023a3090-7b0a-48c4-93f0-c200d9afce41,2024-10-09,04:38:23,1,24,7,Navigator
2,2,183897.0,Driver,Opé,1,0435458c-dc49-4094-b970-1db125fc235c,2024-10-09,04:21:22,1,3,1,Driver
3,3,183897.0,Driver,Opé lmkt,1,0435458c-dc49-4094-b970-1db125fc235c,2024-10-09,04:21:39,1,8,2,Driver
4,4,193392.0,Navigator,A PAR SA LE COUZ,1,0435458c-dc49-4094-b970-1db125fc235c,2024-10-09,04:21:46,1,17,5,Navigator


### Forme Grammaticale, Groupe et Ton du message : en utilisant une API Open AI

Code grandement inspiré par le code de Luciano : https://github.com/Lucky-Hidalgo/IRIT_Few-Shot-Tagging-Example

In [224]:
from IPython.display import clear_output # type: ignore

%pip install pandas
%pip install python-dotenv
%pip install openai
%pip install tqdm
%pip install demjson3

clear_output()

#### Lecture du DataFrame avec les messages

In [225]:
import pandas as pd # type: ignore
from pathlib import Path
import os

# Read the .csv file with the messages
df = pd.read_csv('all_messages.csv')

df.head(3)

Unnamed: 0,messageID,user,role,message,groupID,fileID,date,time,seanceID
0,0,186202.0,Navigator,"Tu es qui ,",0,023a3090-7b0a-48c4-93f0-c200d9afce41,2024-10-09,04:38:10,1
1,1,186202.0,Navigator,On se met 5 partout ok ?,0,023a3090-7b0a-48c4-93f0-c200d9afce41,2024-10-09,04:38:23,1
2,2,183897.0,Driver,Opé,1,0435458c-dc49-4094-b970-1db125fc235c,2024-10-09,04:21:22,1


#### Structuration des données

In [226]:
# Ensure that the message num is an integer for sorting
df['messageID'] = df['messageID'].astype(int) 
df['groupID'] = df['groupID'].astype(int) 
df['seanceID'] = df['seanceID'].astype(int)  

# Group the dataframe by conversation
grouped = df.groupby(['seanceID','groupID'])

# We'll save the conversations as an array of dictionaries were every conversation is an entry
conversations_dict = []
# For every conversation in the dataframe
for activity_id, group in grouped:
    # Convert the utterances into a list of dictionaries, then sort entries by message num
    conversation_text = group.sort_values('messageID').apply(
        lambda row: {"messageID": row['messageID'], "role": row['role'], "message": row['message']},
        axis=1
    ).tolist()
    
    # Create a conversation
    conversation = {
        "seance_ID": activity_id[0],
        "groupe_ID": activity_id[1],
        "conversation": conversation_text
    }
    # Append the conversations as a dictionary
    conversations_dict.append(conversation)

# This is not a JSON, it will be important later
conversations_dict[0]

{'seance_ID': np.int64(1),
 'groupe_ID': np.int64(0),
 'conversation': [{'messageID': 0,
   'role': 'Navigator',
   'message': 'Tu es qui ,'},
  {'messageID': 1,
   'role': 'Navigator',
   'message': 'On se met 5 partout ok ?'}]}

In [227]:
conversations_dict[4]

{'seance_ID': np.int64(1),
 'groupe_ID': np.int64(4),
 'conversation': [{'messageID': 22, 'role': 'Driver', 'message': 'Bonjour'},
  {'messageID': 23, 'role': 'Navigator', 'message': 'Bonjour'},
  {'messageID': 24,
   'role': 'Driver',
   'message': 'tu pense si je fais un for in in range avec un for j in range dedans ça peut marcher? '},
  {'messageID': 25, 'role': 'Navigator', 'message': 'oui '},
  {'messageID': 26, 'role': 'Driver', 'message': 'tu veux editer?'},
  {'messageID': 27, 'role': 'Navigator', 'message': 'comme tu veux'},
  {'messageID': 28,
   'role': 'Driver',
   'message': 'je teste si ça marche pas on echange'},
  {'messageID': 29, 'role': 'Navigator', 'message': 'vsy '},
  {'messageID': 30,
   'role': 'Navigator',
   'message': 'en vrai c\'est pas mieux de faire un "if len(a) != len(b)"'},
  {'messageID': 31, 'role': 'Driver', 'message': 'tu veux teste?'},
  {'messageID': 32, 'role': 'Driver', 'message': "demande l'édition"},
  {'messageID': 33, 'role': 'Navigator', '

#### Fonctions pour la classification

In [228]:
import os
from pathlib import Path
from openai import OpenAI #type: ignore
import numpy as np #type: ignore
from tqdm import tqdm #type: ignore


def read_prompts(folder: str):
    '''
    Reads system, user, and agent prompts from text files in a specified folder.

    This function constructs file paths for `prompt_system.txt`, `prompt_user.txt`, and
    `prompt_agent.txt` within the given folder, reads their contents, and returns them
    as a tuple of strings.

    Args:
        folder (str): The name of the folder (relative to the parent of the current working
                     directory under the "coding_schemes" directory) containing the prompt files.

    Returns:
        tuple[str, str, str]: A tuple containing three strings:
            - PROMPT_SYSTEM: The content of `prompt_system.txt`.
            - PROMPT_USER: The content of `prompt_user.txt`.
            - PROMPT_AGENT: The content of `prompt_agent.txt`.

    '''
    path = Path(os.getcwd()) / folder
    # Read the three prompts
    PROMPT_SYSTEM_NAME = path / 'prompt_system.txt'
    PROMPT_USER_NAME = path / 'prompt_user.txt'
    PROMPT_AGENT_NAME = path / 'prompt_user.txt'
    
    # Get the system, user and agent prompts as a string
    with open(PROMPT_SYSTEM_NAME) as f:
        PROMPT_SYSTEM= f.read()

    with open(PROMPT_USER_NAME) as f:
        PROMPT_USER = f.read()

    with open(PROMPT_AGENT_NAME) as f:
        PROMPT_AGENT = f.read()
    # Return a tuple with the prompts
    return (PROMPT_SYSTEM, PROMPT_USER, PROMPT_AGENT)

# For error capturing
errors = []

def classify_sentences(client: OpenAI, 
                       message: str,
                       prompt_system: str,
                       prompt_user: str,
                       prompt_agent: str):
    
    '''
    Classifies a given message using an OpenAI model with predefined prompts.

    This function sends a message to the OpenAI API along with system, user, and agent prompts
    to generate a classification response. It handles errors by logging them and returning `None`
    if the classification fails.

    Args:
        client (OpenAI): An instance of the OpenAI client used to interact with the API.
        message (str): The message to be classified.
        prompt_system (str): The system prompt to guide the model's behavior.
        prompt_user (str): The user prompt to provide context or instructions.
        prompt_agent (str): The agent prompt to simulate the assistant's role.

    Returns:
        str: The content of the model's response if the classification is successful.
        None: If an error occurs during the classification process.
    '''
    # Docs: https://platform.openai.com/docs/api-reference/chat/create 
    # It is possible to thinker with different parameters such as  response_format, seed,
    # frequency_penalty and so on, but in this case we specified:
    # Model which is going to do the classification, in this case gpt-4o-mini
    # The messages:
    #   - The system message is the task and role of the model
    #   - The user message is an example of an expected input to feed the model
    #   - The agent message is an example of an expected output for the provided model
    #   - The second user message is the text to label (The dictionary with the entire conversation)
    # Temperature: Set to zero to provide an output a bit more deterministic, but is not guaranteed to
    #              be deterministic (For that you should use a seed, which is in beta right now)
    # Stream: Set to False, to only recieve the entire response message, to avoid processing the stream after
    # Timeout: Set to 25 minutes, because conversations can take a lot of time to process.
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": prompt_system},
            {"role": "user", "content": prompt_user},
            {"role": "assistant", "content": prompt_agent},
            {"role": "user", "content": message}
        ],
        temperature=0,
        stream=False,
        timeout=1500  # Set the timeout to 1500 seconds
    )

    # Capture errors
    try:
        # If not error, we get the response
        return response.choices[0].message.content
    except:
        # If error, the function returns None, but the error message is appended
        # to a global array
        errors.append({'message': message, 'response': response})
        return None


def classify_conversation(openai_client: OpenAI,
                          prompt_system: str,
                          prompt_user: str,
                          prompt_agent: str,
                          conversations: list):
    '''
    Classifies a list of conversations using an OpenAI model with predefined prompts.

    This function iterates over a list of conversations, classifies each one using the
    `classify_sentences` function, and collects the results. A progress bar is displayed
    to track the classification process.

    Args:
        openai_client (OpenAI): An instance of the OpenAI client used to interact with the API.
        prompt_system (str): The system prompt to guide the model's behavior.
        prompt_user (str): The user prompt to provide context or instructions.
        prompt_agent (str): The agent prompt to simulate the assistant's role.
        conversations (list): A list of conversations (strings) to be classified.

    Returns:
        list: A list of classification results corresponding to each conversation. Each result
              is the output of the `classify_sentences` function, which may be a string or `None`
              if an error occurred during classification.
    '''

    out = []
    for conversation in tqdm(conversations, desc='Classifying...'):
        response = classify_sentences(openai_client, 
                                      str(conversation), 
                                      prompt_system, 
                                      prompt_user, 
                                      prompt_agent)
        out.append(response)
    return out


#### Variables

In [229]:
# Reprise de la même architecture que celle utilisée par Luciano

PROMPT_FOLDER = 'prompts'
OUT_FILE = 'dialogue_acts_example' + '.csv'
OUT_FOLDER = 'labeled_conversations'
LABEL_COLUMN_1 = 'forme'
LABEL_COLUMN_2 = 'ton'
#ajouter ici les autres labels des attributs
CATEGORY_COLUMN = 'type'

#### Instanciation du client Open AI et appel de la fonction pour classifier

In [230]:
from dotenv import load_dotenv #type: ignore
import os 

# Load my OPEN AI API Key
load_dotenv() 

# Save value of OPEN AI API KEY to a variable
#OPENAI_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_KEY = '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'

# Creat an instance of the OPEN AI client using the API key
openai_client = OpenAI(api_key=OPENAI_KEY)

# Read the prompts
prompt_system, prompt_user, prompt_agent = read_prompts(PROMPT_FOLDER)

# Do the classification
labeled_data = classify_conversation(openai_client,
                                  prompt_system,
                                     prompt_user,
                                     prompt_agent,
                                     conversations_dict[0:5]) #modifier ici pour la data in 
                                    

Classifying...: 100%|██████████| 5/5 [01:08<00:00, 13.65s/it]


In [231]:
print(labeled_data[4]) #Ne classifie pas bien pour l'instant, il faut plus d'exemples

{
  "seance_ID": "1",
  "groupe_ID": "4",
  "conversation": [
    {
      "messageID": 22,
      "role": "Driver",
      "message": "Bonjour",
      "annotation": {
        "form": "positiveSentence",
        "tone": "positiveTone"
      }
    },
    {
      "messageID": 23,
      "role": "Navigator",
      "message": "Bonjour",
      "annotation": {
        "form": "positiveSentence",
        "tone": "positiveTone"
      }
    },
    {
      "messageID": 24,
      "role": "Driver",
      "message": "tu pense si je fais un for in in range avec un for j in range dedans ça peut marcher? ",
      "annotation": {
        "form": "question",
        "tone": "positiveTone"
      }
    },
    {
      "messageID": 25,
      "role": "Navigator",
      "message": "oui ",
      "annotation": {
        "form": "positiveSentence",
        "tone": "positiveTone"
      }
    },
    {
      "messageID": 26,
      "role": "Driver",
      "message": "tu veux editer?",
      "annotation": {
        "form

In [232]:
errors

[]

In [233]:
import json
import pandas as pd # type: ignore
import demjson3 # type: ignore

def convert_to_dataframe(list_of_json: list, 
                         label_columns: list):

    '''
    Converts a list of JSON strings into a flattened Pandas DataFrame.

    This function processes a list of JSON strings, each representing a conversation with
    messages and annotations. It flattens the nested structure of the JSON and constructs
    a DataFrame with columns for activity ID, user ID, message metadata, and annotations.

    Args:
        list_of_json (list): A list of JSON strings, where each string represents a conversation.
        label_column (str): The name of the column to store annotations (e.g., labels).

    Returns:
        pd.DataFrame: A DataFrame containing the flattened conversation data with columns:
            - activity_id: The ID of the activity.
            - user_id: The ID of the user.
            - role: The role of the speaker (e.g., user, model).
            - message_num: The message number in the conversation.
            - utterance_num: The utterance number within the same message and role.
            - sentence_en: The English sentence of the message.
            - <label_column>: The annotation or label for the message.

    Raises:
        JSONDecodeError: If a JSON string cannot be decoded using `json.loads`.
        demjson3.JSONDecodeError: If a JSON string cannot be decoded using `demjson3.decode`.
    '''
    # A list to save every entry (row for our dataframe)
    flattened_data = []

    for entry in list_of_json:
        # First try to decode the string using vanilla JSON module
        try: 
            # Try to do the conversion 
            json_entry = json.loads(entry)

        # In case of error use demjson3 which is capable to deal
        # with JSON-like string (that use single quotes instead of double quotes)
        except:
            # Print the entry that trigger the exception for manual inspection 
            print(entry)
            json_entry = demjson3.decode(entry)

        # Get the IDs of the activity and the conversation
        seance_id = json_entry['seance_ID']
        groupe_id = json_entry['groupe_ID']
        # For every utterance in the conversation
        for message in json_entry['conversation']:
            # Create an entry (row) to use then in the dataframe
            flattened_data.append({
                'seance_ID' : seance_id,
                'groupe_ID': groupe_id,
                'messageID': message['messageID'],
                'role': message['role'],
                'message': message['message'],
                label_columns[0] : message['annotation']['form'],    #ajouter ici les autres attributs
                label_columns[1] : message['annotation']['tone']
            })

    # Convert the entries in a dataframe
    out_df = pd.DataFrame(flattened_data)
    # Calculate an index for each utterance
    out_df['utterance_num'] = out_df.groupby(['groupe_ID', 'messageID', 'role']).cumcount()
    # Sort the columns of the dataframe
    out_df = out_df[['seance_ID', 'groupe_ID', 'role', 
                           'messageID', 'utterance_num','message', label_columns[0],label_columns[1]]]
    return out_df


In [234]:
out_df = convert_to_dataframe(labeled_data, [LABEL_COLUMN_1,LABEL_COLUMN_2])

out_df.head()

Unnamed: 0,seance_ID,groupe_ID,role,messageID,utterance_num,message,forme,ton
0,1,0,Navigator,0,0,"Tu es qui ,",question,positiveTone
1,1,0,Navigator,1,0,On se met 5 partout ok ?,question,positiveTone
2,1,1,Driver,2,0,Opé,positiveSentence,positiveTone
3,1,1,Driver,3,0,Opé lmkt,positiveSentence,positiveTone
4,1,1,Navigator,4,0,A PAR SA LE COUZ,positiveSentence,positiveTone


In [235]:
out_df

Unnamed: 0,seance_ID,groupe_ID,role,messageID,utterance_num,message,forme,ton
0,1,0,Navigator,0,0,"Tu es qui ,",question,positiveTone
1,1,0,Navigator,1,0,On se met 5 partout ok ?,question,positiveTone
2,1,1,Driver,2,0,Opé,positiveSentence,positiveTone
3,1,1,Driver,3,0,Opé lmkt,positiveSentence,positiveTone
4,1,1,Navigator,4,0,A PAR SA LE COUZ,positiveSentence,positiveTone
5,1,1,Navigator,5,0,Opé,positiveSentence,positiveTone
6,1,1,Navigator,6,0,Ou lé jolie ?,question,positiveTone
7,1,1,Navigator,7,0,A ou,positiveSentence,positiveTone
8,1,1,Navigator,8,0,Commencé,positiveSentence,positiveTone
9,1,1,Navigator,9,0,Comment nous fé la,question,positiveTone


In [236]:
out_df.iloc[34]

seance_ID                                                        1
groupe_ID                                                        4
role                                                     Navigator
messageID                                                       34
utterance_num                                                    0
message          quand je teste le code il me disent y'a des er...
forme                                             negativeSentence
ton                                                   positiveTone
Name: 34, dtype: object

In [237]:
out_df.iloc[34]['message']

"quand je teste le code il me disent y'a des erreurs de type"

In [238]:
out_df.iloc[42]

seance_ID                                           1
groupe_ID                                           4
role                                           Driver
messageID                                          42
utterance_num                                       0
message          je crois c'est bon , je suis pas sur
forme                                negativeSentence
ton                                      positiveTone
Name: 42, dtype: object

In [239]:
out_df.iloc[44]

seance_ID                                                        1
groupe_ID                                                        4
role                                                     Navigator
messageID                                                       44
utterance_num                                                    0
message          il y a des erreurs sur plusieurs lignes mais l...
forme                                             negativeSentence
ton                                                   positiveTone
Name: 44, dtype: object

In [240]:
out_df.iloc[44]['message']

"il y a des erreurs sur plusieurs lignes mais le dernier j'ai pas"

In [241]:
test_message = [{'seance_ID': np.int64(99),
 'groupe_ID': np.int64(99),
 'conversation': [{'messageID': 99,
   'role': 'Navigator',
   'message': 'va plus vite tes trop lent'}]}]

In [242]:
test_df = classify_conversation(openai_client,
                                  prompt_system,
                                     prompt_user,
                                     prompt_agent,
                                     test_message)

Classifying...:   0%|          | 0/1 [00:21<?, ?it/s]


RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-Hb86CiMZyMYd9WwCXvjB4As5 on tokens per min (TPM): Limit 100000, Used 99559, Requested 906. Please try again in 3h20m52.799s. Visit https://platform.openai.com/account/rate-limits to learn more. You can increase your rate limit by adding a payment method to your account at https://platform.openai.com/account/billing.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}

In [None]:
out_test_df = convert_to_dataframe(test_df, [LABEL_COLUMN_1,LABEL_COLUMN_2])
out_test_df

Unnamed: 0,seance_ID,groupe_ID,role,messageID,utterance_num,message,forme,ton
0,np.int64(99),np.int64(99),Navigator,99,0,va plus vite tes trop lent,negativeSentence,negativeTone


In [None]:
conversations_dict[6]

{'seance_ID': np.int64(1),
 'groupe_ID': np.int64(7),
 'conversation': [{'messageID': 49, 'role': 'Driver', 'message': 'Yo!'},
  {'messageID': 50, 'role': 'Navigator', 'message': 'OUI'},
  {'messageID': 51, 'role': 'Navigator', 'message': ',????'},
  {'messageID': 52, 'role': 'Driver', 'message': 'Oui'},
  {'messageID': 53, 'role': 'Navigator', 'message': "C'est bon pour toi"},
  {'messageID': 54, 'role': 'Driver', 'message': "C'est bon"},
  {'messageID': 55,
   'role': 'Driver',
   'message': 'Est ce que faut vérifié si les listes sont de la même longueur ?'},
  {'messageID': 56,
   'role': 'Navigator',
   'message': 'def coincide(tableau1, tableau2):     # On s\'assure que les deux tableaux ont la même longueur     if len(tableau1) != len(tableau2):         return []      # On crée un tableau vide pour stocker les indices où les éléments sont identiques     result = []          # On parcourt les deux tableaux et on compare les éléments     for i in range(len(tableau1)):         if ta