In [1]:
from IPython.display import clear_output # type: ignore

%pip install pandas
%pip install python-dotenv
%pip install openai
%pip install tqdm
%pip install demjson3

clear_output()

## Read the dataframe with all the SIMBA conversations

In [2]:
import pandas as pd # type: ignore
from pathlib import Path
import os

# The data folder is the "data" directory next to the current working directory
# (i.e. <parent of cwd>/data)
path = Path(os.getcwd()).parent.joinpath("data")

# Load the SIMBA conversations from the .csv export into a dataframe
chats_csv = path.joinpath('df_chats.csv')
df = pd.read_csv(chats_csv)

# Preview the first rows
df.head()

Unnamed: 0,activity_id,user_id,group,role,message_num,sentence_num,sentence_en
0,asst_Get3WE5ozTjkHcdGhnqGGqgL,s140,1,model,0.0,0,"Hello! 😸 I am SIMBA, and I will help you think..."
1,asst_Get3WE5ozTjkHcdGhnqGGqgL,s140,2,user,1.0,0,"The main ideas are: T-V diagrams, P-V diagrams..."
2,asst_Get3WE5ozTjkHcdGhnqGGqgL,s140,3,model,2.0,0,Great start! 🌟 You mentioned important concept...
3,asst_Get3WE5ozTjkHcdGhnqGGqgL,s140,4,user,3.0,0,T-V diagrams show a curve where we can distinc...
4,asst_Get3WE5ozTjkHcdGhnqGGqgL,s140,5,model,4.0,0,Very well explained! 🥳 You have well understoo...


## Convert conversations to a structured format

In [3]:
# Ensure that the message number is an integer so that sorting is numeric
df['message_num'] = df['message_num'].astype(int)

# Group the dataframe by conversation: one (activity_id, user_id) pair per conversation
grouped = df.groupby(['activity_id', 'user_id'])

# The conversations are stored as a list of dictionaries, one entry per conversation
conversations_dict = []
# Grouping by two columns yields a (activity_id, user_id) tuple as the group key
for (activity_id, user_id), group in grouped:
    # Order the utterances by message number. A stable sort keeps rows that share the
    # same message_num in their original order (the default algorithm does not
    # guarantee the order of ties).
    ordered = group.sort_values('message_num', kind='stable')

    # Keep only the fields sent to the model, one plain dictionary per row
    conversation_text = ordered[['message_num', 'role', 'sentence_en']].to_dict('records')

    # Append the conversation as a dictionary
    conversations_dict.append({
        "activity_id": activity_id,
        "user_id": user_id,
        "conversation": conversation_text
    })

# Note: these are Python dictionaries, not JSON strings; this will be important later
conversations_dict[0]

{'activity_id': 'asst_Get3WE5ozTjkHcdGhnqGGqgL',
 'user_id': 's103',
 'conversation': [{'message_num': 0,
   'role': 'model',
   'sentence_en': "Hello! 😸 I am SIMBA, and I will help you think about the following questions:\n**Question 1:** In your opinion, what are the 2 to 5 main ideas to remember from the course entitled 'Transformations and first principle'?\n**Question 2:** What are the points of the course that still seem obscure to you?\nYour turn! 🌟"},
  {'message_num': 1,
   'role': 'user',
   'sentence_en': "In this course, we saw the notions of critical point, when P increases, the liquid-vapor equilibrium line narrows until it becomes a point called the critical point.\nFor pressures above the critical point, there is no more phase change, the transition between the liquid state and the gaseous state occurs smoothly.\nWe also saw the dew and boiling curves, these two curves meet at the critical point.\nFor a boiling curve, adding energy leads to the appearance of the first v

## Functions for doing the classification

In [4]:
import os
from pathlib import Path
from openai import OpenAI #type: ignore
import numpy as np #type: ignore
from tqdm import tqdm #type: ignore


def read_prompts(folder: str):
    '''
    Reads the system, user, and agent prompts from text files in a folder.

    The folder is resolved relative to the parent of the current working directory
    (`<parent of cwd>/<folder>`). The files `prompt_system.txt`, `prompt_user.txt`
    and `prompt_agent.txt` are read from it and returned as strings.

    Args:
        folder (str): Name (or relative path) of the folder containing the prompt
                      files, relative to the parent of the current working directory.

    Returns:
        tuple[str, str, str]: A tuple containing three strings:
            - PROMPT_SYSTEM: The content of `prompt_system.txt`.
            - PROMPT_USER: The content of `prompt_user.txt`.
            - PROMPT_AGENT: The content of `prompt_agent.txt`.

    Raises:
        FileNotFoundError: If any of the three prompt files does not exist.
    '''
    path = Path(os.getcwd()).parent / folder

    # Order matters: it defines the order of the returned tuple.
    # The agent prompt lives in its own file (it must not be read from prompt_user.txt).
    prompt_files = ('prompt_system.txt', 'prompt_user.txt', 'prompt_agent.txt')

    prompts = []
    for file_name in prompt_files:
        # Read explicitly as UTF-8 so the result does not depend on the platform's
        # default encoding
        with open(path / file_name, encoding='utf-8') as f:
            prompts.append(f.read())

    # Return a tuple with the prompts: (system, user, agent)
    return tuple(prompts)

# Failed classifications are recorded here so they can be inspected after a run
errors = []


def classify_sentences(client: OpenAI, 
                       message: str,
                       prompt_system: str,
                       prompt_user: str,
                       prompt_agent: str,
                       model: str = "gpt-4o-mini",
                       timeout: float = 1500):
    
    '''
    Classifies a given message using an OpenAI model with predefined prompts.

    This function sends a message to the OpenAI API along with system, user, and agent prompts
    to generate a classification response. Any failure (the request itself or an unexpected
    response shape) is recorded in the module-level `errors` list and `None` is returned,
    so one failed request does not abort a whole batch.

    Args:
        client (OpenAI): An instance of the OpenAI client used to interact with the API.
        message (str): The message to be classified.
        prompt_system (str): The system prompt to guide the model's behavior.
        prompt_user (str): The user prompt, used as an example input.
        prompt_agent (str): The agent prompt, used as an example output.
        model (str): Name of the model that does the classification.
                     Defaults to "gpt-4o-mini".
        timeout (float): Request timeout in seconds. Defaults to 1500 (25 minutes),
                         because conversations can take a long time to process.

    Returns:
        str: The content of the model's response if the classification is successful.
        None: If an error occurs during the classification process. In that case a
              dictionary with the keys 'message', 'response' (None when the request
              itself failed) and 'error' is appended to `errors`.
    '''
    # Docs: https://platform.openai.com/docs/api-reference/chat/create 
    # It is possible to tinker with other parameters such as response_format, seed,
    # frequency_penalty and so on, but in this case we specify:
    # Model: the model that does the classification
    # The messages:
    #   - The system message is the task and role of the model
    #   - The user message is an example of an expected input to feed the model
    #   - The agent message is an example of an expected output for that input
    #   - The second user message is the text to label (the entire conversation)
    # Temperature: Set to zero to make the output more deterministic (not guaranteed)
    # Stream: Set to False to receive the entire response at once
    # Timeout: Maximum time to wait for the response, in seconds

    # Stays None if the request itself raises, so the error entry can still be built
    response = None
    try:
        # The request is inside the try block so that API failures are captured too
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": prompt_system},
                {"role": "user", "content": prompt_user},
                {"role": "assistant", "content": prompt_agent},
                {"role": "user", "content": message}
            ],
            temperature=0,
            stream=False,
            timeout=timeout
        )
        return response.choices[0].message.content
    except Exception as exc:
        # `Exception` (not a bare except) lets KeyboardInterrupt stop a long run.
        # The function returns None and the failure is appended to the global list.
        errors.append({'message': message, 'response': response, 'error': repr(exc)})
        return None


def classify_conversation(openai_client: OpenAI,
                          prompt_system: str,
                          prompt_user: str,
                          prompt_agent: str,
                          conversations: list):
    '''
    Runs `classify_sentences` over every conversation in a list.

    Each conversation is converted to text with `str()` and sent, together with the
    three prompts, to `classify_sentences`. A tqdm progress bar tracks the loop.

    Args:
        openai_client (OpenAI): The OpenAI client passed on to `classify_sentences`.
        prompt_system (str): The system prompt.
        prompt_user (str): The user prompt.
        prompt_agent (str): The agent prompt.
        conversations (list): The conversations to classify; each item is converted
                              with `str()` before being sent.

    Returns:
        list: One result per conversation, in input order. Each result is whatever
              `classify_sentences` returned for that conversation.
    '''
    # Wrap the input so the progress bar advances once per conversation
    progress = tqdm(conversations, desc='Clasifying...')
    return [
        classify_sentences(openai_client,
                           str(item),
                           prompt_system,
                           prompt_user,
                           prompt_agent)
        for item in progress
    ]


## Variables for each run

In [5]:
# Input: folder that holds the prompt files
PROMPT_FOLDER = 'prompts'

# Output: destination folder and file name for the labeled conversations
OUT_FOLDER = 'labeled_conversations'
OUT_FILE = 'dialogue_acts_example' + '.csv'

# Names of the columns added to the output dataframe
CATEGORY_COLUMN = 'type'
LABEL_COLUMN = 'label'

## OpenAI client instantiation and function call for classification

In [6]:
from dotenv import load_dotenv #type: ignore
import os 

# Load environment variables (including the OpenAI API key) via python-dotenv
load_dotenv()

# Read the OpenAI API key from the environment
OPENAI_KEY = os.environ.get("OPENAI_API_KEY")

# Create the OpenAI client with that key
openai_client = OpenAI(api_key=OPENAI_KEY)

# Load the system, user and agent prompts
prompt_system, prompt_user, prompt_agent = read_prompts(PROMPT_FOLDER)

# Classify only the first five conversations
sample_conversations = conversations_dict[:5]
labeled_data = classify_conversation(
    openai_client,
    prompt_system,
    prompt_user,
    prompt_agent,
    sample_conversations,
)
                                    

Clasifying...: 100%|██████████| 5/5 [02:02<00:00, 24.54s/it]


In [7]:
# Show the result returned for the first classified conversation
print(labeled_data[0])

{
  "activity_id": "asst_Get3WE5ozTjkHcdGhnqGGqgL",
  "user_id": "s103",
  "conversation": [
    {
      "message_num": 0,
      "role": "model",
      "sentence_en": "Hello! 😸 I am SIMBA, and I will help you think about the following questions:",
      "annotation": "greeting"
    },
    {
      "message_num": 0,
      "role": "model",
      "sentence_en": "In your opinion, what are the 2 to 5 main ideas to remember from the course entitled 'Transformations and first principle'?",
      "annotation": "setQuestion"
    },
    {
      "message_num": 0,
      "role": "model",
      "sentence_en": "What are the points of the course that still seem obscure to you?",
      "annotation": "setQuestion"
    },
    {
      "message_num": 0,
      "role": "model",
      "sentence_en": "Your turn! 🌟",
      "annotation": "inform"
    },
    {
      "message_num": 1,
      "role": "user",
      "sentence_en": "In this course, we saw the notions of critical point, when P increases, the liquid-vapor

In [8]:
# Entries appended by classify_sentences when a classification fails; an empty list means none were recorded
errors

[]

In [9]:
import json
import pandas as pd # type: ignore
import demjson3 # type: ignore

def convert_to_dataframe(list_of_json: list, 
                         label_column: str):

    '''
    Converts a list of JSON strings into a flattened Pandas DataFrame.

    This function processes a list of JSON strings, each representing a conversation with
    messages and annotations. It flattens the nested structure of the JSON and constructs
    a DataFrame with columns for activity ID, user ID, message metadata, and annotations.

    Args:
        list_of_json (list): A list of JSON strings, where each string represents a conversation.
                             `None` entries (failed classifications) are skipped with a notice.
        label_column (str): The name of the column to store annotations (e.g., labels).

    Returns:
        pd.DataFrame: A DataFrame containing the flattened conversation data with columns:
            - activity_id: The ID of the activity.
            - user_id: The ID of the user.
            - role: The role of the speaker (e.g., user, model).
            - message_num: The message number in the conversation.
            - utterance_num: The utterance number within the same conversation, message and role.
            - sentence_en: The English sentence of the message.
            - <label_column>: The annotation or label for the message.
        If there is nothing to flatten, an empty DataFrame with these columns is returned.

    Raises:
        demjson3.JSONDecodeError: If a string cannot be decoded by `json.loads` nor by
                                  the more lenient `demjson3.decode`.
        KeyError: If a decoded conversation or message lacks one of the expected keys.
    '''
    # Final column order of the returned dataframe
    column_order = ['activity_id', 'user_id', 'role',
                    'message_num', 'utterance_num', 'sentence_en', label_column]

    # A list to save every entry (row for our dataframe)
    flattened_data = []

    for entry in list_of_json:
        # A failed classification is stored as None: there is nothing to decode
        if entry is None:
            print('Skipping an entry without a response (None)')
            continue

        # First try to decode the string using the vanilla JSON module
        try:
            json_entry = json.loads(entry)

        # In case of a decoding error use demjson3, which is able to deal with
        # JSON-like strings (that use single quotes instead of double quotes)
        except json.JSONDecodeError:
            # Print the entry that triggered the exception for manual inspection
            print(entry)
            json_entry = demjson3.decode(entry)

        # Get the IDs of the activity and the user
        activity_id = json_entry['activity_id']
        user_id = json_entry['user_id']
        # For every utterance in the conversation
        for message in json_entry['conversation']:
            # Create an entry (row) for the dataframe
            flattened_data.append({
                'activity_id' : activity_id,
                'user_id': user_id,
                'message_num': message['message_num'],
                'role': message['role'],
                'sentence_en': message['sentence_en'],
                label_column : message['annotation']    
            })

    # Without rows the grouping columns would not exist; return an empty, well-formed frame
    if not flattened_data:
        return pd.DataFrame(columns=column_order)

    # Convert the entries into a dataframe
    out_df = pd.DataFrame(flattened_data)
    # Number the utterances inside each message. activity_id is part of the key because a
    # conversation is identified by (activity_id, user_id); without it the counter would
    # keep running across different activities of the same user.
    out_df['utterance_num'] = out_df.groupby(
        ['activity_id', 'user_id', 'message_num', 'role']
    ).cumcount()
    # Sort the columns of the dataframe
    return out_df[column_order]


In [10]:
# Flatten the classification results into a dataframe, storing the annotations in LABEL_COLUMN
out_df = convert_to_dataframe(labeled_data, LABEL_COLUMN)

# Preview the first rows of the labeled dataframe
out_df.head()

Unnamed: 0,activity_id,user_id,role,message_num,utterance_num,sentence_en,label
0,asst_Get3WE5ozTjkHcdGhnqGGqgL,s103,model,0,0,"Hello! 😸 I am SIMBA, and I will help you think...",greeting
1,asst_Get3WE5ozTjkHcdGhnqGGqgL,s103,model,0,1,"In your opinion, what are the 2 to 5 main idea...",setQuestion
2,asst_Get3WE5ozTjkHcdGhnqGGqgL,s103,model,0,2,What are the points of the course that still s...,setQuestion
3,asst_Get3WE5ozTjkHcdGhnqGGqgL,s103,model,0,3,Your turn! 🌟,inform
4,asst_Get3WE5ozTjkHcdGhnqGGqgL,s103,user,1,0,"In this course, we saw the notions of critical...",inform


## Check that the model only uses the labels provided in the system prompt

In [11]:
import json
# Read the coding scheme (tags.json) stored in the prompt folder
path = Path(os.getcwd()).parent.joinpath(PROMPT_FOLDER)
tags_path = path.joinpath('tags.json')
with open(tags_path) as tags_file:
    code = json.loads(tags_file.read())

# Display the coding scheme
code


{'Information-seeking': ['propositionalQuestion (Yes/No)',
  'propositionalQuestion',
  'setQuestion (Who/What/Where/How)',
  'setQuestion',
  'choiceQuestion',
  'checkQuestion'],
 'Information-providing': ['inform (Statement)',
  'inform',
  'answer',
  'agreement',
  'disagreement',
  'correction',
  'confirm',
  'disconfirm'],
 'Commissive': ['offer',
  'conditionalAccept (Consider/Address a Request/Suggestion/Offer)',
  'conditionalAccept',
  'accept (Request/Suggestion/Offer)',
  'accept',
  'decline (Request/Suggestion/Offer)',
  'decline'],
 'Directive': ['request', 'suggest'],
 'Feedback': ['autoPositive (Positive Understanding/Feedback)',
  'autoPositive',
  'autoNegative (Negative Understanding/Feedback)',
  'autoNegative'],
 'Time Management': ['stalling (Pausing)', 'stalling'],
 'Own and Partner Communication Management': ['retraction (Abandon)',
  'retraction'],
 'Social Obligations Management': ['greeting',
  'goodbye',
  'thanking',
  'acceptThanking',
  'apology',
  'a

# If the model hallucinates a label, it will be printed here

In [12]:
# Flatten the coding scheme into a single list with every allowed tag
tags = [tag for category in code for tag in code[category]]

# Distinct labels present in the labeled dataframe
labels = out_df[LABEL_COLUMN].unique()

print('Checking tags...')
# Labels that are not part of the coding scheme, in order of appearance
unknown_labels = [label for label in labels if label not in tags]
for label in unknown_labels:
    print(label)

hallucinations = len(unknown_labels) > 0
print('The model hallucinated some tags' if hallucinations else 'No problems detected')

Checking tags...
No problems detected


# If you need to fix some labels it is possible to do it like this

In [13]:
# If there is any error, you can replace the broken tags like this:
#out_df[LABEL_COLUMN] = out_df[LABEL_COLUMN].str.replace('HALLUCINATED LABEL', 'CORRECT LABEL')

# Once the labels have been corrected, we can assign a first-level label (or a type, if one exists)

In [14]:
# Invert the coding scheme: {category: [labels]} becomes {label: category}
reversed_coding_scheme = {}
for category, category_labels in code.items():
    for label in category_labels:
        reversed_coding_scheme[label] = category

# Map every label to its category; labels without a category get "Undefined"
label_categories = out_df[LABEL_COLUMN].map(reversed_coding_scheme)
out_df[CATEGORY_COLUMN] = label_categories.fillna('Undefined')

# Show the rows that ended up without a category (if any)
out_df[out_df[CATEGORY_COLUMN] == 'Undefined']

Unnamed: 0,activity_id,user_id,role,message_num,utterance_num,sentence_en,label,type


In [15]:

from pathlib import Path
import os

# Output folder: OUT_FOLDER inside the parent of the current working directory
path = Path(os.getcwd()).parent.joinpath(OUT_FOLDER)

# Make sure the folder (and any missing parents) exists; no error if it already does
path.mkdir(parents=True, exist_ok=True)

# Write the labeled dataframe as .csv, without the index column
out_file_path = path.joinpath(OUT_FILE)
out_df.to_csv(out_file_path, index=False)