# GPT-predict Labels
This notebook includes the prompt engineering process of predicting event-related information using GPT-4o

In [1]:
import openai
import os
import json
import time
import glob
import pandas as pd
import spacy
import itertools
from dotenv import load_dotenv
import shutil
import ast


import utils

## 1. Set up the OpenAI API Key

In [2]:
openai.api_key = '' # Enter OpenAI API key
print("OpenAI API Key: {}".format(openai.api_key))

OpenAI API Key: 


## 2. Prompt GPT to Generate Labels

In [4]:
# Prompt gpt to generate index in the result (Input the index information of each token)

def gpt_label_zeroshot(conversations, MODEL="gpt-4o"):
    activity_dict = {}

    for i, con in enumerate(conversations):
            conversation_index = i + 1
            print(f"Processing conversation {conversation_index}/{len(conversations)}")
            # print(con)
            query = [
                {"role": "system", "content": "You are an expert in healthcare. I will provide you some conversations between\
                 a chatbot and an elderly person. Please extract the information contained in each conversation: \
                'activity index', 'activity', 'participants', 'place', and 'time', and format every activity as a list."},
                {"role": "system", "content": "Each token in the conversation is provided with three information:\
                 conversation id, sentence id, and token id."},
                {"role": "system", "content": "When extracting the information, please use the words and phrases appeared \
                in the original conversation. Please also indicate the sentence id and token id of the activity information."},
                {"role": "system", "content": f"The conversation index for this conversation is {conversation_index}. \
                If the conversation contains more than one activity, generate a list for each activity, using the\
                 same conversation index but different activity indices. For a new activity, increment the activity index by 1."},
                {"role": "system", "content": "Generate the activity information only based on the conversation.\
                 Do not use any external information."},
                {"role": "system", "content": "If there are no participants, place, or time of the activity mentioned\
                 in the conversation, please mark as 'None' in the output."},
                {"role": "user", "content": f"Conversation: {con}"},
                {"role": "system", "content": "Please provide the output in the following JSON format:\
                 [{'activity_index': 1, 'activity': 'activity 1', 'activity_sentence_id': 1, 'activity_token_ids': [1, 2], \
                'participants': 'participant 1', 'participants_sentence_id': 1, 'participants_token_ids': [3], \
                'place': 'place 1', 'place_sentence_id': 2, 'place_token_ids': [5], 'time': 'time 1', 'time_sentence_id': 3, \
                'time_token_ids': [7]}, {...}]. Please provide the output without Markdown code blocks, \
                and do not include the newline marker \\n in the output."},
            ]

            response_query = openai.ChatCompletion.create(
                model=MODEL,
                messages=query,
                temperature=0,
                max_tokens=1500
            )

            response_text = response_query.choices[0].message['content']
            activity_dict[f"conversation_{conversation_index}"] = response_text
    return activity_dict

## 3. Generate BIO Labels
Map the results back to the tokens in the conversation, and generate BIO format labels.

### Process all the files

In [7]:
if not os.path.exists("../response_data/files_with_id"):
    os.makedirs("../response_data/files_with_id")


 # Process all the files
for file in glob.glob("../response_data/resized_files/*.tsv"):
    print(f"Processing file: {file}")
    data_with_id = utils.tsv_add_id(file)

    
    output_file = f"../response_data/files_with_id/{os.path.basename(file)}"


    df = pd.DataFrame(data_with_id, columns=['conversation', 'sent_id', 'token_id', 'token'])
    df.to_csv(output_file, sep='\t', index=False)

    conversation_list = utils.extract_conversations(output_file)

    activity_dict = gpt_label_zeroshot(conversation_list, MODEL = "gpt-4o")

    # Remove empty rows from the DataFrame
    df = df[df['conversation'] != '']
    # Add a label column to the DataFrame, and initialize it with 'O'
    df['event1']= 'O'
    df['event2']= 'O'
    df['event3']= 'O'
    df['event4']= 'O'
    df['event5']= 'O'


    df = utils.get_label(df, activity_dict)
    output_file = f"../response_data/dataset/{os.path.basename(file).replace('.tsv', '_gpt_4o_labelled.tsv')}"
    df.to_csv(output_file, sep='\t', index=False)
    
    # save the file to jsonl (when input as conversation)
    utils.tsv_to_jsonl(output_file, output_file.split('/')[-1].split('_')[0])

Processing file: ./response_data/resized_files/test.tsv
Processing conversation 1/54
Processing conversation 2/54
Processing conversation 3/54
Processing conversation 4/54
Processing conversation 5/54
Processing conversation 6/54
Processing conversation 7/54
Processing conversation 8/54
Processing conversation 9/54
Processing conversation 10/54
Processing conversation 11/54
Processing conversation 12/54
Processing conversation 13/54
Processing conversation 14/54
Processing conversation 15/54
Processing conversation 16/54
Processing conversation 17/54
Processing conversation 18/54
Processing conversation 19/54
Processing conversation 20/54
Processing conversation 21/54
Processing conversation 22/54
Processing conversation 23/54
Processing conversation 24/54
Processing conversation 25/54
Processing conversation 26/54
Processing conversation 27/54
Processing conversation 28/54
Processing conversation 29/54
Processing conversation 30/54
Processing conversation 31/54
Processing conversation

### Trying to give BERT sentence by sentence as input

In [19]:
# Dataset directories

if not os.path.exists("../response_data/jsonl_for_finetune"):
    os.makedirs("../response_data/jsonl_for_finetune")

tsv_dir = '../response_data/dataset/'
jsonl_dir = '../response_data/jsonl_for_finetune'

# Convert TSV files to JSONL files
utils.convert_tsv_to_jsonl(tsv_dir, jsonl_dir)

## 4. Combine Training Set Files

In [20]:
# Dataset directories
dataset_directory = "../response_data/jsonl_for_finetune/"

# Merged file names
train_merged_file_name =  "../response_data/dataset/train.jsonl"

# Delete train.jsonl file if it exists
if os.path.exists(train_merged_file_name):
    os.remove(train_merged_file_name)
    print("Deleted existing train.jsonl file.")


# Initialize lists to store train and test JSONL files
train_jsonl_files = []

# Get all JSONL files in the train directory
for file in os.listdir(dataset_directory):
    if file.endswith(".jsonl") and "test" not in file and "dev" not in file:
        train_jsonl_files.append(os.path.join(dataset_directory, file))

print(f"Train JSONL files: {train_jsonl_files}")

Train JSONL files: ['./response_data/jsonl_for_finetune/self-care_clean_1_gpt_4o_labelled.jsonl', './response_data/jsonl_for_finetune/self-care_clean_3_gpt_4o_labelled.jsonl', './response_data/jsonl_for_finetune/domestic life_clean_4_gpt_4o_labelled.jsonl', './response_data/jsonl_for_finetune/domestic life_clean_2_gpt_4o_labelled.jsonl', './response_data/jsonl_for_finetune/mobility_clean_1_gpt_4o_labelled.jsonl', './response_data/jsonl_for_finetune/mobility_clean_3_gpt_4o_labelled.jsonl', './response_data/jsonl_for_finetune/mobility_clean_4_gpt_4o_labelled.jsonl', './response_data/jsonl_for_finetune/self-care_clean_2_gpt_4o_labelled.jsonl', './response_data/jsonl_for_finetune/self-care_clean_4_gpt_4o_labelled.jsonl', './response_data/jsonl_for_finetune/domestic life_clean_3_gpt_4o_labelled.jsonl', './response_data/jsonl_for_finetune/mobility_clean_2_gpt_4o_labelled.jsonl', './response_data/jsonl_for_finetune/domestic life_clean_1_gpt_4o_labelled.jsonl']


In [21]:
# Process Train Files
with open(train_merged_file_name, "w", encoding="utf-8") as train_merged_file:
    # Iterate over each JSONL file in train set
    for file_name in train_jsonl_files:
        # Open the current file in train set
        with open(file_name, "r", encoding="utf-8") as file:
            # Read the content of the current file line by line and write it to the new file for train set
            for line in file:
                train_merged_file.write(line)


## Extra: Check Ambiguity: Overlapped Spans

Check if there are any events that share the same tokens for different activity information.  
   
E.g. A token is labelled as "time" for both "event 1" and "event 2"

In [23]:
# Define the directory path
directory = '../response_data/dataset/'


# Iterate through all files in the directory
for filename in os.listdir(directory):
    if filename.endswith('.tsv') and "annotation" not in filename:
        # Construct the file path
        filepath = os.path.join(directory, filename)
        
        # Read the TSV file
        df = pd.read_csv(filepath, sep='\t')
        
        # Flag to track if there are rows with more than one non-'O' and non-NaN event columns
        found_non_O_non_NaN = False
        
        # Check if there are more than one non-'O' and non-NaN event columns for each row
        for index, row in df.iterrows():
            # Get the columns starting with 'event'
            event_columns = [col for col in df.columns if col.startswith('event')]
            
            # Calculate the count of non-'O' and non-NaN events
            non_O_count = sum(row[column] != 'O' and not pd.isna(row[column]) for column in event_columns)
            
            # If the count of non-'O' and non-NaN events is greater than 1, print the corresponding token column
            if non_O_count > 1:
                found_non_O_non_NaN = True
                print(f'Processed file: {filename}, Token: {row["token"]}')
        
        # If no rows with more than one non-'O' and non-NaN event columns were found, print a message
        if not found_non_O_non_NaN:
            print(f'Processed file: {filename}, No tokens appear in more than one events found')




Processed file: self-care_clean_2_gpt_4o_labelled.tsv, No tokens appear in more than one events found
Processed file: dev_gpt_4o_labelled.tsv, No tokens appear in more than one events found
Processed file: self-care_clean_3_gpt_4o_labelled.tsv, No tokens appear in more than one events found
Processed file: self-care_clean_1_gpt_4o_labelled.tsv, No tokens appear in more than one events found
Processed file: test_gpt_4o_labelled.tsv, No tokens appear in more than one events found
Processed file: self-care_clean_4_gpt_4o_labelled.tsv, No tokens appear in more than one events found
Processed file: domestic life_clean_4_gpt_4o_labelled.tsv, No tokens appear in more than one events found
Processed file: mobility_clean_1_gpt_4o_labelled.tsv, No tokens appear in more than one events found
Processed file: mobility_clean_3_gpt_4o_labelled.tsv, No tokens appear in more than one events found
Processed file: mobility_clean_2_gpt_4o_labelled.tsv, No tokens appear in more than one events found
Proces