In [1]:
import openai
import os
# import IPython
import time
import glob
import pandas as pd
import itertools
from dotenv import load_dotenv
# Populate the process environment via python-dotenv (by default it looks for
# a local ".env" file), so the API key is not written into the notebook.
load_dotenv()
# os.getenv returns None when OPENAI_API_KEY is unset; the key is then left as None.
openai.api_key = os.getenv("OPENAI_API_KEY")

In [3]:
def split_list(lst, val):
    """Cut ``lst`` into sub-lists at every element equal to ``val``.

    Separator elements are dropped. Runs of separators, as well as leading or
    trailing separators, never yield empty sub-lists.

    Args:
        lst: iterable of items to partition.
        val: separator value, compared with ``==``.

    Returns:
        list of lists holding the items found between separators, in order.
    """
    # Idea from https://www.geeksforgeeks.org/python-split-list-into-lists-by-particular-value/
    chunks = []
    current = []
    for item in lst:
        if item == val:
            # Close the running chunk, but only if it collected something.
            if current:
                chunks.append(current)
                current = []
        else:
            current.append(item)
    if current:
        chunks.append(current)
    return chunks

def convert_format_for_gptlabel(filepath):
    """Read a TSV of utterances and render each conversation as a numbered list.

    The file must contain an ``utterances`` column. Rows whose value is exactly
    twenty dashes separate one conversation from the next.

    Args:
        filepath: path of the tab-separated file to read.

    Returns:
        tuple ``(conversations, l_u_sep)``:
            conversations: one string per conversation, made of lines
                ``"<n>. <utterance>\\n"`` with ``n`` restarting at 1 for each
                conversation.
            l_u_sep: the raw ``utterances`` column as a list, separator rows
                included.
    """
    # Read the utterances as plain text. With pandas' default NA handling an
    # utterance such as "None", "NA" or "null" (or an empty cell) is parsed as
    # a float NaN, which would make the string concatenation below raise
    # TypeError.
    df = pd.read_csv(
        filepath,
        sep="\t",
        dtype={"utterances": str},
        keep_default_na=False,
    )
    l_u_sep = df["utterances"].to_list()
    lst_cons = split_list(l_u_sep, "-" * 20)

    conversations = [
        "".join(
            f"{number}. {utterance}\n"
            for number, utterance in enumerate(con, start=1)
        )
        for con in lst_cons
    ]

    return conversations, l_u_sep

def gpt_label_zeroshot(conversation, MODEL = "gpt-3.5-turbo"):
    """
    Ask a chat model to pick one ICF "activities and participation" label for
    every utterance of a conversation. The prompt lists the label names only
    and contains no labelled examples.

    conversation: text inserted into the prompt right after "Conversation:".
    MODEL: model name passed to the chat-completion call.

    Returns the response object of ``openai.ChatCompletion.create`` unchanged.
    """
    # Persona prompt that is currently disabled:
    # "Think as you are an expert who is very proficient in ICF
    #  (International Classification of Functioning, Disability and Health)."

    # The prompt texts are spelled out line by line so that the embedded
    # indentation and trailing blanks sent to the model stay visible.
    task_instructions = (
        "Pick one most likely label for each utterance in the conversation with subcategories of activities and participation in ICF. The labels includes: \n"
        "             - learning and applying knowledge, \n"
        "             - general tasks and demands, \n"
        "             - communication, \n"
        "             - mobility, \n"
        "             - self-care, \n"
        "             - domestic life areas, \n"
        "             - interpersonal interactions and relationships, \n"
        "             - major life areas, \n"
        "             - community, social and civic life , \n"
        "             - none: if the utterance does not belong to any of the categories above, label it as 'none'\n"
        "            "
    )
    answer_format = (
        "The format of the label is like:\n"
        "            1. label\n"
        "            2. label\n"
        "            3. label\n"
        "                ...\n"
        "            "
    )

    user_turns = [
        answer_format,
        f"Conversation:{conversation}",
        "Let's think utterance by utterance.",
        "labels:",
    ]
    query = [{"role": "system", "content": task_instructions}]
    for turn in user_turns:
        query.append({"role": "user", "content": turn})

    # temperature=0 and a 500-token cap are fixed for every request.
    return openai.ChatCompletion.create(
        model=MODEL,
        messages=query,
        temperature=0,
        max_tokens=500,
    )
        
def label_multiple_conversations(conversations, delay=18):
    """Label every conversation with ``gpt_label_zeroshot``, pausing between calls.

    Args:
        conversations: iterable of conversation strings, sent one per request.
        delay: seconds to sleep after each request. Defaults to 18, the value
            that used to be hard-coded (presumably chosen to stay below the
            API rate limit -- adjust to the limits of the account in use).

    Returns:
        list of str: the message content of the first choice of each
        response, in the same order as ``conversations``.
    """
    raw_labels = []
    count_con = 0  # stays 0 when there is nothing to label
    for count_con, conversation in enumerate(conversations, start=1):
        response = gpt_label_zeroshot(conversation)
        raw_labels.append(response.choices[0]['message']['content'])
        # "\r" keeps the progress counter on a single, overwritten line.
        print(f"finish {count_con} conversations", end="\r")
        time.sleep(delay)
    print(f"finish {count_con} conversations")

    return raw_labels

def process_raw_labels(raw_labels):
    """Flatten raw model answers into one label list with conversation separators.

    Each answer is expected to hold one ``"<n>. <label>"`` entry per line. The
    text between the first and the second ``". "`` of a line is taken as the
    label, and a row of twenty dashes is appended after every answer.

    Args:
        raw_labels: iterable of answer strings, one per conversation.

    Returns:
        list of str: labels of all conversations, each conversation followed
        by ``"-" * 20``.

    Raises:
        IndexError: if a non-blank line does not contain ``". "``.
    """
    labels = []
    for answer in raw_labels:
        for line in answer.split("\n"):
            # A trailing newline or an empty line in the answer produces a
            # blank piece here; it carries no label and used to crash the
            # index lookup below.
            if not line.strip():
                continue
            labels.append(line.split(". ")[1])
        labels.append("-" * 20)

    return labels

def combine_labels_conversations(l_u_sep, labels, filepath):
    """Pair every utterance with its label and save the table as a TSV file.

    Args:
        l_u_sep: list of utterances, separator rows included.
        labels: list of labels; must have the same length as ``l_u_sep``.
        filepath: destination of the tab-separated output. Missing parent
            folders are created.

    Returns:
        bool: True when the file was written, False when the two lists differ
        in length (an error line is printed and nothing is written).
    """
    if len(l_u_sep) != len(labels):
        print("ERROR: num of con DOES NOT match num of labels")
        return False

    df_train = pd.DataFrame({
        "utterances": l_u_sep,
        "labels": labels
    })

    # Create the destination folder on demand so that a fresh checkout does
    # not fail at the very end of a long labelling run. Only done for real
    # paths; anything else is handed to pandas untouched.
    if isinstance(filepath, (str, os.PathLike)):
        parent = os.path.dirname(filepath)
        if parent:
            os.makedirs(parent, exist_ok=True)

    df_train.to_csv(filepath, sep="\t", index=False)
    # Report success only after the write has actually completed.
    print(f"num of con matches num of labels, wrote to {filepath}")
    return True
    
    

In [4]:
# Root folder holding both the input files and the generated training data.
# NOTE: the name shadows the builtin dir(); it is kept because other cells may
# refer to it.
dir = '../response_data/'
files = glob.glob(f"{dir}files_to_ready_to_label/*.tsv")
print("start")
for f in files:
    print(f"processing data in {f}")
    conversations, l_u_sep = convert_format_for_gptlabel(f)
    raw_labels = label_multiple_conversations(conversations)
    labels = process_raw_labels(raw_labels)
    # basename copes with whichever path separator glob returned; splitting on
    # "/" does not isolate the file name when the path contains backslashes.
    filename = os.path.basename(f)
    filepath = f"{dir}train_data/small_category/{filename}"
    combine_labels_conversations(l_u_sep, labels, filepath)
    # Extra pause between files, on top of the per-request delay.
    time.sleep(60)


start
processing data in ./files_to_ready_to_label/trial annotation_2.tsv
finish 5 conversations
num of con matches num of labels, wrote to ./train_data/trial annotation_2.tsv
