In [1]:
from openai import OpenAI
import pandas as pd
import json
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Whitespace-separated vocabulary of the 28 labels. `ems.split()` yields the
# label names in this order; other cells use that list as DataFrame column names.
ems = """
admiration
amusement
anger
annoyance
approval
caring
confusion
curiosity
desire
disappointment
disapproval
disgust
embarrassment
excitement
fear
gratitude
grief
joy
love
nervousness
optimism
pride
realization
relief
remorse
sadness
surprise
neutral
    """

In [3]:
def emotions_to_categorical(df):
    """Turn per-row lists of label indices into a 0/1 indicator table.

    Parameters
    ----------
    df : object indexable by ``'emotions'``
        ``df['emotions']`` must iterate over collections of integer indices,
        each index addressing one of 28 slots.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``df['emotions']`` and one column per name in
        ``ems.split()``; a cell is 1 when the row listed that index, else 0.
    """
    indicator_rows = []

    for label_ids in df['emotions']:
        flags = [0] * 28
        for label_id in label_ids:
            flags[label_id] = 1
        indicator_rows.append(flags)

    return pd.DataFrame(indicator_rows, columns=ems.split())

In [4]:
def emotions_to_ekman(df):
    """Collapse fine-grained indicator rows into seven coarse indicator columns.

    For every position ``p`` of an input row whose value equals 1, output slot
    ``fine_to_coarse[p]`` is switched on. Several input positions share one
    output slot, so an output row is the union of the groups that were hit.

    Parameters
    ----------
    df : iterable of rows
        Each row is an indexable sequence of 0/1 flags (callers in this
        notebook pass the 2-D array returned by ``DataFrame.to_numpy()``).

    Returns
    -------
    pandas.DataFrame
        One row per input row, with columns
        ``angry, disgust, fear, joy, sadness, surprise, neutral``.
    """
    # Output slot order: anger disgust fear joy sadness surprise neutral
    fine_to_coarse = [3, 3, 0, 0, 3, 3, 5, 5, 3, 4, 0, 1, 4, 3, 2, 3, 4, 3, 3, 2, 3, 3, 5, 3, 4, 4, 5, 6]
    column_names = ['angry', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'neutral']

    coarse_rows = []
    for row in df:
        coarse = [0] * 7
        for position, flag in enumerate(row):
            if flag == 1:
                coarse[fine_to_coarse[position]] = 1
        coarse_rows.append(coarse)

    return pd.DataFrame(coarse_rows, columns=column_names)

In [5]:
def data_init(path = "../data/dev.tsv", limit = 2500):
    """Load the tab-separated dataset and return texts with indicator labels.

    The file is read without a header row and its three columns are named
    ``text``, ``emotions`` and ``id``. The comma-separated ``emotions`` field
    is parsed into integer indices and expanded via
    ``emotions_to_categorical``; the ``emotions`` and ``id`` columns are then
    dropped, leaving ``text`` followed by one indicator column per label.

    Parameters
    ----------
    path : str
        Location of the TSV file.
    limit : int or None
        Maximum number of leading rows to return (default 2500, the value that
        used to be hard-coded). ``None`` returns every row.

    Returns
    -------
    pandas.DataFrame
        The first ``limit`` rows of the assembled frame.
    """
    df = pd.read_csv(path, sep="\t", encoding = "utf-8", header=None)
    df.columns = ['text', 'emotions', 'id']
    # str(): if every row holds a single label, pandas infers an integer dtype
    # for this column and the values would have no .split().
    df['emotions'] = [[int(token) for token in str(cell).split(',')] for cell in df['emotions']]
    df = pd.concat([df, emotions_to_categorical(df)], axis=1)
    df = df.drop(columns=['emotions', 'id'])
    # Backslash-escape backslashes and double quotes in the text.
    # NOTE(review): the request-building cell serializes with json.dumps, which
    # escapes again, so these extra backslashes are sent literally. Confirm this
    # is intended.
    df['text'] = [text.replace('\\', '\\\\').replace('"', '\\"') for text in df['text']]
    return df.iloc[:limit, :]

In [6]:
def evaluation(original_df, emotion_res):
    """Score predicted label names against the indicator columns of a frame.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Ground truth; every column after the first is used as a 0/1 label
        column, in the order of ``ems.split()``.
    emotion_res : sequence of iterables of str
        ``emotion_res[i]`` holds the predicted label names for row ``i``.
        Names not present in ``ems.split()`` are ignored; rows without an
        entry count as "nothing predicted".

    Returns
    -------
    tuple
        ``(accuracy, f1_micro, f1_macro, per_label)`` where ``per_label`` is
        the tuple returned by
        ``precision_recall_fscore_support(..., average=None)``.

    Raises
    ------
    ValueError
        If ``emotion_res`` has more entries than ``original_df`` has rows.
    """
    emotions_list = ems.split()
    column_of = {name: position for position, name in enumerate(emotions_list)}

    # Fail with a clear message instead of silently growing the prediction
    # table and tripping over a shape mismatch later.
    if len(emotion_res) > len(original_df):
        raise ValueError(
            f"got {len(emotion_res)} predictions for {len(original_df)} rows"
        )

    # Fill a plain integer matrix; this avoids one DataFrame scalar write per label.
    predicted = np.zeros((len(original_df), len(emotions_list)), dtype=int)
    for row, names in enumerate(emotion_res):
        for name in names:
            if name in column_of:
                predicted[row, column_of[name]] = 1
    original = original_df.iloc[:, 1:].to_numpy()

    accuracy = accuracy_score(original, predicted)

    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(
        original, predicted, average='micro'
    )
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
        original, predicted, average='macro'
    )

    # Computed once and reused for both the report and the return value.
    per_label = precision_recall_fscore_support(original, predicted, average=None)
    precision_per_label, recall_per_label, f1_per_label, _ = per_label

    precision_macro_std = np.std(precision_per_label)
    recall_macro_std = np.std(recall_per_label)
    f1_macro_std = np.std(f1_per_label)

    print("--- 모델 평가 결과 ---")
    print(f"전체 샘플에 대한 정확도 (Exact Match Accuracy): {accuracy:.4f}")
    print("\n--- Micro 평균 지표 ---")
    print(f"Precision (Micro): {precision_micro:.4f}")
    print(f"Recall (Micro): {recall_micro:.4f}")
    print(f"F1-Score (Micro): {f1_micro:.4f}")
    print("\n--- Macro 평균 지표 ---")
    print(f"Precision (Macro): {precision_macro:.4f}")
    print(f"Recall (Macro): {recall_macro:.4f}")
    print(f"F1-Score (Macro): {f1_macro:.4f}")

    print("\n--- 라벨별 지표 ---")
    for i, label in enumerate(emotions_list):
        print(f"{label} - Precision: {precision_per_label[i]:.4f}, Recall: {recall_per_label[i]:.4f}, F1-Score: {f1_per_label[i]:.4f}")

    print(f"\nPrecision (Macro) 표준편차: {precision_macro_std:.4f}")
    print(f"Recall (Macro) 표준편차: {recall_macro_std:.4f}")
    print(f"F1-Score (Macro) 표준편차: {f1_macro_std:.4f}")

    return accuracy, f1_micro, f1_macro, per_label

In [7]:
def evaluation_ekman(original_df, emotion_res):
    """Score predictions after collapsing both sides with ``emotions_to_ekman``.

    Predicted label names are first placed in an indicator matrix with one
    column per name in ``ems.split()``; that matrix and the ground-truth label
    columns are each mapped to seven coarse columns before the metrics are
    computed.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Ground truth; every column after the first is used as a 0/1 label
        column, in the order of ``ems.split()``.
    emotion_res : sequence of iterables of str
        ``emotion_res[i]`` holds the predicted label names for row ``i``.
        Names not present in ``ems.split()`` are ignored.

    Returns
    -------
    tuple
        ``(accuracy, f1_micro, f1_macro, per_label)`` where ``per_label`` is
        the tuple returned by
        ``precision_recall_fscore_support(..., average=None)``.

    Raises
    ------
    ValueError
        If ``emotion_res`` has more entries than ``original_df`` has rows.
    """
    emotions_list = 'anger disgust fear joy sadness surprise neutral'.split()
    # Split the vocabulary once instead of on every membership test.
    fine_labels = ems.split()
    column_of = {name: position for position, name in enumerate(fine_labels)}

    # Fail with a clear message instead of silently growing the prediction
    # table and tripping over a shape mismatch later.
    if len(emotion_res) > len(original_df):
        raise ValueError(
            f"got {len(emotion_res)} predictions for {len(original_df)} rows"
        )

    fine_predicted = np.zeros((len(original_df), len(fine_labels)), dtype=int)
    for row, names in enumerate(emotion_res):
        for name in names:
            if name in column_of:
                fine_predicted[row, column_of[name]] = 1

    predicted = emotions_to_ekman(fine_predicted).to_numpy()
    original = emotions_to_ekman(original_df.iloc[:, 1:].to_numpy()).to_numpy()

    accuracy = accuracy_score(original, predicted)

    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(
        original, predicted, average='micro'
    )
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
        original, predicted, average='macro'
    )

    # Computed once and reused for both the report and the return value.
    per_label = precision_recall_fscore_support(original, predicted, average=None)
    precision_per_label, recall_per_label, f1_per_label, _ = per_label

    precision_macro_std = np.std(precision_per_label)
    recall_macro_std = np.std(recall_per_label)
    f1_macro_std = np.std(f1_per_label)

    print("--- 모델 평가 결과 ---")
    print(f"전체 샘플에 대한 정확도 (Exact Match Accuracy): {accuracy:.4f}")
    print("\n--- Micro 평균 지표 ---")
    print(f"Precision (Micro): {precision_micro:.4f}")
    print(f"Recall (Micro): {recall_micro:.4f}")
    print(f"F1-Score (Micro): {f1_micro:.4f}")
    print("\n--- Macro 평균 지표 ---")
    print(f"Precision (Macro): {precision_macro:.4f}")
    print(f"Recall (Macro): {recall_macro:.4f}")
    print(f"F1-Score (Macro): {f1_macro:.4f}")

    print("\n--- 라벨별 지표 ---")
    for i, label in enumerate(emotions_list):
        print(f"{label} - Precision: {precision_per_label[i]:.4f}, Recall: {recall_per_label[i]:.4f}, F1-Score: {f1_per_label[i]:.4f}")

    print(f"\nPrecision (Macro) 표준편차: {precision_macro_std:.4f}")
    print(f"Recall (Macro) 표준편차: {recall_macro_std:.4f}")
    print(f"F1-Score (Macro) 표준편차: {f1_macro_std:.4f}")

    return accuracy, f1_micro, f1_macro, per_label

In [8]:
def file_init():
    """Read every prompt fragment under ``./prompt/`` into memory.

    Returns
    -------
    dict
        Maps a short fragment key (e.g. ``'persona'``, ``'few_shot_8'``) to the
        full text of the corresponding file.

    Raises
    ------
    OSError
        If any of the listed files cannot be opened or read.
    """
    file_names = {
        'persona': './prompt/persona.txt',
        'guidelines': './prompt/guidelines.txt',
        'output_structure': './prompt/output_structure.txt',
        'few_shot': './prompt/few_shot.txt',
        'few_shot_4': './prompt/few_shot_4.txt',
        'few_shot_8': './prompt/few_shot_8.txt',
        'few_shot_12': './prompt/few_shot_12.txt',
        'few_shot_16': './prompt/few_shot_16.txt',
        'few_shot_20': './prompt/few_shot_20.txt',
        'cot': './prompt/chain_of_thought.txt',
        'description':  './prompt/emotion_description.txt'
    }
    file_dict = {}
    for key, value in file_names.items():
        # The context manager closes the handle even if read() raises.
        with open(value, 'r') as file:
            file_dict[key] = file.read()
    return file_dict

In [9]:
# Load all prompt fragments once; the cell that builds `systems` reads from this dict.
files = file_init()

In [10]:
# One system prompt per few-shot size. Every variant shares the same
# persona / description / guidelines / output-structure prefix and differs
# only in the few-shot block appended at the end.
systems = {
    shots: "".join([
        files['persona'],
        files['description'],
        files['guidelines'],
        files['output_structure'],
        files[f'few_shot_{shots}'],
    ])
    for shots in (4, 8, 12, 16, 20)
}

In [11]:
# Load the evaluation frame with data_init's default arguments.
data = data_init()

In [17]:
# Sampling settings to sweep. The request-building loop reads element 0 of each
# pair as `temperature` and element 1 as `top_p`.
it = [(0.25, 0.75), (0.00, 0.25),(0.50, 1.00)]

In [66]:
# Structured-output specification shared by every request: an object with an
# "analysis" array whose items pair one label from the fixed vocabulary with a
# free-text reason.
response_format = {
    "format": {
        "type": "json_schema",
        "name": "result",
        "strict": True,
        "schema": {
            "type": "object",
            "properties": {
                "analysis": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "emotion": {
                                "type": "string",
                                "enum": [ "admiration", "amusement", "anger", "annoyance", "approval", "caring", "confusion", "curiosity", "desire", "disappointment", "disapproval", "disgust", "embarrassment", "excitement", "fear", "gratitude", "grief", "joy", "love", "nervousness", "optimism", "pride", "realization", "relief", "remorse", "sadness", "surprise", "neutral" ]
                            },
                            "reason": {
                                "type": "string"
                            }
                        },
                        "required": ["emotion", "reason"],
                        "additionalProperties": False
                    }
                }
            },
            "required": ["analysis"],
            "additionalProperties": False
        }
    }
}

# Write one JSONL file per (few-shot size, sampling setting) pair; each line is
# a single request for one text, identified as query<row position>.
for key, value in systems.items():
    for i, (temperature, top_p) in enumerate(it):
        with open(f"./inputs/few_shot_grid/version2/few_shot_grid_{key}_{i}.jsonl", "w") as f:
            for k, record in enumerate(data["text"]):
                baseQuery = {
                    "custom_id": f"query{k}",
                    "method": "POST",
                    "url": "/v1/responses",
                    "body": {
                        "model": "gpt-4o-mini",
                        "temperature": temperature,
                        "top_p": top_p,
                        "input": [
                            {"role": "developer", "content": str(value)},
                            {"role": "user", "content": str(record)},
                        ],
                        "max_output_tokens": 1000,
                        "text": response_format,
                    },
                }
                f.write(json.dumps(baseQuery) + "\n")

In [13]:
# Read the API key from a local file so it is not embedded in the notebook.
with open('./key/openai_key.txt', 'r') as key_file:
    # readline() keeps the trailing newline; strip it so the key is sent as a
    # clean credential.
    api_key = key_file.readline().strip()
client = OpenAI(api_key=api_key)

In [93]:
# Accumulates the id of every batch job created by the submission cell.
batch_list = []

In [96]:
# Upload the selected request files and start one batch job for each.
# The ranges pick which (few-shot size, setting index) files are submitted in
# this run; as written only few_shot_grid_20_2.jsonl is selected.
for i in range(20, 21, 4):
    for j in range(2, 3):
        # The context manager closes the upload handle once the file is sent.
        with open(f"./inputs/few_shot_grid/version2/few_shot_grid_{i}_{j}.jsonl", "rb") as upload_file:
            batch_input_file = client.files.create(
                file=upload_file,
                purpose='batch'
            )
        print(batch_input_file)

        batch_input_file_id = batch_input_file.id
        create_batch=client.batches.create(
            input_file_id=batch_input_file_id,
            endpoint="/v1/responses",
            completion_window="24h",
        )
        batch_list.append(create_batch.id)


FileObject(id='file-Y4uMoNsQfFa87VzGHPpMy7', bytes=29057821, created_at=1762145807, filename='few_shot_grid_20_2.jsonl', object='file', purpose='batch', status='processed', expires_at=1764737807, status_details=None)


In [73]:
# Show the ids of all batches recorded in batch_list, one per line.
for i in batch_list:
    print(i)

batch_690812716c3481909126d96df05787a9
batch_69081289c3648190acf68565aeb6f3c5
batch_690812a18dd881908da20eee396e18b7
batch_690812bd536c8190a484251bdf45bc08
batch_690812d8837881909669e212451a3f9c
batch_690812f417d88190be3cb7974508c624
batch_69081312faa08190a5a4ea0498a3bdf7
batch_690813326a008190a1fa91e968abcb21
batch_690813528d488190a152474917246d77
batch_69081d0790788190acd811022212212a
batch_69081d23e14481908d88e0a2eb8e7ca1
batch_69081d3b698c81909b8818ed1c973aeb
batch_69081d5933a481909b17ef0fbf593c46
batch_69081d73d5bc8190ac048b1741b3e28d
batch_69081d8ea1fc819092550a0a6c9537cc


In [52]:
# One slot per submitted batch; 0 marks a batch whose result has not been
# stored yet (the polling cell tests `batch_res[i] != 0`).
batch_res = [0] * len(batch_list)

In [53]:
# Poll every submitted batch and store whatever has finished. Slots that are
# already filled are skipped, so this cell can be re-run until all are done.
cnt = 0
for i, batch_id in enumerate(batch_list):
    print(i, end=" ")
    if batch_res[i] != 0:
        print("done")
        cnt += 1
        continue
    batch = client.batches.retrieve(batch_id)
    result = None
    if batch.status == 'completed':
        out = batch.output_file_id
        if out is not None:
            cnt += 1
            print('done!')
            # Successful output is kept as the response object; later cells
            # read its `.text`.
            result = client.files.content(out)
            batch_res[i] = result
        elif batch.error_file_id is not None:
            print('error')
            # Error files are stored as plain text.
            result = client.files.content(batch.error_file_id).text
            batch_res[i] = result
        else:
            # Completed, but neither an output nor an error file is available;
            # leave the slot empty rather than requesting a file with id None.
            print('error')
            print('no output or error file available')
    elif batch.status == 'failed':
        print('failed')
        print(batch.errors)
    else:
        print('it does not finish yet')
        print(batch.status)
        print(batch.request_counts)
print(cnt, "/", len(batch_res))

0 done!
1 done!
2 done!
3 done!
4 done!
5 done!
6 done!
7 done!
8 done!
9 done!
10 it does not finish yet
in_progress
BatchRequestCounts(completed=2497, failed=0, total=2500)
11 it does not finish yet
in_progress
BatchRequestCounts(completed=2491, failed=0, total=2500)
12 it does not finish yet
in_progress
BatchRequestCounts(completed=2444, failed=16, total=2500)
13 done!
14 it does not finish yet
in_progress
BatchRequestCounts(completed=1666, failed=12, total=2500)
11 / 15


In [12]:
# Reload previously saved batch outputs from disk as raw JSONL text, ordered by
# few-shot size (4..20) and then by setting index (0..2).
batch_res = []

for i in range(4, 21, 4):
    for j in range(3):
        # The context manager closes each handle even if read() raises.
        with open(f'./output/20251103/adjusted_fewshot_{i}_{j}.jsonl', 'r') as file:
            batch_res.append(file.read())

In [None]:
# Parse the first batch result into one list of predicted label names per request.
# batch_res entries are either response objects (polling cell) or plain strings
# (loaded from disk); accept both.
raw_text = getattr(batch_res[0], 'text', batch_res[0])
# Skip blank lines instead of blindly dropping the last element, so a file
# without a trailing newline does not lose its final record.
json_res = [json.loads(line) for line in raw_text.split('\n') if line.strip()]
emotion_res = []

for i in json_res:
    tmp = []
    try:
        l = json.loads(i['response']['body']['output'][0]['content'][0]['text'])

        for j in l['analysis']:
            tmp.append(j['emotion'])
    except Exception:
        # Best effort: report the malformed record and keep an (empty or
        # partial) prediction so positions stay aligned with the inputs.
        # `except Exception` lets KeyboardInterrupt still stop the cell.
        print("============================error")
        print(i)

    emotion_res.append(tmp)

In [None]:
# Parse every batch result, in the order few-shot size (4..20) x sampling
# setting, into per-request lists of predicted label names.
# `result` ends up with one entry per batch, each a list aligned with that
# batch's requests.
# NOTE(review): the original cell never filled `result`; appending each batch's
# predictions is the assumed intent. Confirm.
k = 0
result = []
for i in range(4, 21, 4):
    for j in it:
        # Entries may be response objects or plain strings; accept both.
        raw_text = getattr(batch_res[k], 'text', batch_res[k])
        json_res = [json.loads(line) for line in raw_text.split('\n') if line.strip()]
        emotion_res = []

        for l in json_res:
            tmp = []
            try:
                # Index the parsed record `l`, not the outer loop counter.
                n = json.loads(l['response']['body']['output'][0]['content'][0]['text'])

                for m in n['analysis']:
                    tmp.append(m['emotion'])
            except Exception:
                # Best effort: report the malformed record and keep alignment.
                print("============================error")
                print(l)

            emotion_res.append(tmp)

        result.append(emotion_res)
        k += 1

[(0.25, 0.75), (0.0, 0.25), (0.5, 1.0)]


In [17]:
# Evaluate the first five batch results and collect per-label precision /
# recall / F1, both for the fine-grained labels and for the collapsed ones.
# NOTE(review): range(5) takes batch_res[0..4], and the DataFrame cell labels
# the resulting rows 4, 8, 12, 16, 20. Confirm batch_res holds one result per
# few-shot size in that order when this runs.
evaluate = {}
precision = []
recall = []
f1 = []
ek_precision = []
ek_recall = []
ek_f1 = []
for k in range(5):
    print("--------------------------", k, end='--------------------------\n')
    # Entries may be response objects or plain strings; accept both.
    raw_text = getattr(batch_res[k], 'text', batch_res[k])
    # Skip blank lines so a missing trailing newline does not drop a record.
    json_res = [json.loads(line) for line in raw_text.split('\n') if line.strip()]
    emotion_res = []

    for i in json_res:
        tmp = []
        try:
            l = json.loads(i['response']['body']['output'][0]['content'][0]['text'])

            for j in l['analysis']:
                tmp.append(j['emotion'])
        except Exception:
            # Best effort: report the malformed record and keep alignment.
            # `except Exception` lets KeyboardInterrupt still stop the cell.
            print("============================error")
            print(i)

        emotion_res.append(tmp)
    e = evaluation(data, emotion_res)
    ek = evaluation_ekman(data, emotion_res)
    # Element 3 of each result is the (precision, recall, f1, support) tuple.
    precision.append(list(e[3][0]))
    recall.append(list(e[3][1]))
    f1.append(list(e[3][2]))
    ek_precision.append(list(ek[3][0]))
    ek_recall.append(list(ek[3][1]))
    ek_f1.append(list(ek[3][2]))
    # evaluate[it[k]] = list(e) + list(ek)
        

-------------------------- 0--------------------------
--- 모델 평가 결과 ---
전체 샘플에 대한 정확도 (Exact Match Accuracy): 0.2356

--- Micro 평균 지표 ---
Precision (Micro): 0.3210
Recall (Micro): 0.2758
F1-Score (Micro): 0.2967

--- Macro 평균 지표 ---
Precision (Macro): 0.3790
Recall (Macro): 0.3203
F1-Score (Macro): 0.2971

--- 라벨별 지표 ---
admiration - Precision: 0.6618, Recall: 0.1974, F1-Score: 0.3041
amusement - Precision: 0.3186, Recall: 0.6824, F1-Score: 0.4344
anger - Precision: 0.2355, Recall: 0.6778, F1-Score: 0.3496
annoyance - Precision: 0.1857, Recall: 0.0929, F1-Score: 0.1238
approval - Precision: 0.2706, Recall: 0.1329, F1-Score: 0.1783
caring - Precision: 0.3056, Recall: 0.3385, F1-Score: 0.3212
confusion - Precision: 0.1561, Recall: 0.4706, F1-Score: 0.2344
curiosity - Precision: 0.2587, Recall: 0.3217, F1-Score: 0.2868
desire - Precision: 0.3500, Recall: 0.2059, F1-Score: 0.2593
disappointment - Precision: 0.1885, Recall: 0.2911, F1-Score: 0.2289
disapproval - Precision: 0.2088, Recall: 0

In [224]:
# Tabulate the collected per-label metrics: one row per few-shot size, one
# column per label (fine-grained vocabulary or the seven collapsed labels).
shot_sizes = [4, 8, 12, 16, 20]
fine_columns = ems.split()
ekman_columns = 'anger disgust fear joy sadness surprise neutral'.split()

precision_df = pd.DataFrame(precision, columns=fine_columns, index=shot_sizes)
recall_df = pd.DataFrame(recall, columns=fine_columns, index=shot_sizes)
f1_df = pd.DataFrame(f1, columns=fine_columns, index=shot_sizes)

ek_precision_df = pd.DataFrame(ek_precision, columns=ekman_columns, index=shot_sizes)
ek_recall_df = pd.DataFrame(ek_recall, columns=ekman_columns, index=shot_sizes)
ek_f1_df = pd.DataFrame(ek_f1, columns=ekman_columns, index=shot_sizes)

In [None]:
# Persist every metric table as <name>.csv under ./output/20251103/.
for csv_stem, metric_frame in (
    ('precision', precision_df),
    ('recall', recall_df),
    ('f1', f1_df),
    ('ek_precision', ek_precision_df),
    ('ek_recall', ek_recall_df),
    ('ek_f1', ek_f1_df),
):
    metric_frame.to_csv(f'./output/20251103/{csv_stem}.csv')