# Benchmarking different ChatGPT models to see whether SOTA chatbots perform well at zero-shot and few-shot sentiment classification

In [None]:
import re
# Load the OpenAI API key from a file on the local machine.
# Every whitespace character (e.g. a trailing newline) is removed from the key.
with open("/home/andrej/Documents/open_ai/made-with-ml-key.txt", "r") as file:
    api_key = re.sub(r'\s+', '', file.read())

import openai
openai.api_key = api_key

from openai import OpenAI

### Load sentiment training dataset from reddit

In [None]:
from datasets import load_dataset
import pandas as pd
import numpy as np

# pandas also relies on numpy for random sampling
# Seeding NumPy's global RNG here is meant to make the `.sample(...)` calls
# further down reproducible between runs.
np.random.seed(42)  # the answer to everything

# Load the "simplified" configuration of the GoEmotions dataset.
dataset = load_dataset("google-research-datasets/go_emotions", "simplified")

In [None]:
def get_single_labeled(df):
    """Return only the rows that carry exactly one label, with the label unwrapped.

    Args:
        df: DataFrame with a 'labels' column whose cells are sequences of
            label ids (e.g. ``[8]`` or ``[3, 12]``).

    Returns:
        A new DataFrame (the input is left untouched) containing the rows
        whose 'labels' cell has exactly one element. In the result the
        'labels' column holds that single element (e.g. ``8``) instead of a
        one-element list. The original index values are preserved.
    """
    # Exactly one label: rows with several labels AND rows with an empty
    # label list are dropped (an empty list has no element to unwrap).
    is_single = df['labels'].apply(len) == 1

    # Boolean-mask selection works for any index, not just a default
    # RangeIndex; the copy avoids writing into a view of the caller's frame.
    single_df = df.loc[is_single].copy()

    # transform the single-labeled data labels from list (e.g. [8]) into int (e.g. 8)
    single_df['labels'] = single_df['labels'].apply(lambda x: x[0])

    return single_df

In [None]:
# Human-readable label names; the position in this list is the integer label id.
labels = dataset['train'].features['labels'].feature.names

# labels are by default in a list, filter them and reassign them to an integer instead
long_train_df = pd.DataFrame(dataset['train'])
train_df = get_single_labeled(long_train_df)

long_test_df = pd.DataFrame(dataset['test'])
test_df = get_single_labeled(long_test_df)
# Flag read by the cell that appends the hand-written grief examples to
# `test_df`, so that re-running that cell does not append them twice.
before_adding = True


In [None]:
# train_df.info()
test_df.info()

# adding a few more made up grief comments to the testing dataset, there are only 2 examples labeled grief...
grief_additionals = [
    "I still can't bring myself to sit in his chair. Every time I walk past it, I just feel this overwhelming emptiness.",
    "Every time I walk through the house, I get hit with memories that make me cry all over again. It's like a never-ending cycle.",
    "I keep reaching for my phone to text him, then remember he's gone. It's like a punch in the gut every single time.",
    "The holidays used to be my favorite time of year, but now they're just painful reminders of the family we've lost.",
    "Sometimes I find myself holding onto his old jacket because it still smells like him. It's comforting and heartbreaking all at once.",
]

# Look the label id up by name instead of hard-coding the magic number.
grief_label_id = labels.index("grief")

# Column-wise description of the extra rows, mirroring the columns of `test_df`.
# Built without a loop variable named `id`, which would shadow the builtin
# for every cell executed afterwards.
new_entries = {
    "text": list(grief_additionals),
    "labels": [grief_label_id] * len(grief_additionals),
    "id": [f"manual_id_{idx}" for idx in range(len(grief_additionals))],
}

# `before_adding` keeps this cell idempotent: the extra rows are appended
# only the first time the cell is executed.
if before_adding:
    new_entries_df = pd.DataFrame(new_entries)
    test_df = pd.concat([test_df, new_entries_df], ignore_index=True)
    before_adding = False

# Cell output: shape of the grief subset after the augmentation.
test_df.query(f'labels == {grief_label_id}').shape


In [None]:
def create_test_dict(df, n_samples_per_category=1):
    """Sample benchmark texts for every label.

    For each entry of the module-level `labels` list, the rows of `df` whose
    'labels' column equals the entry's position are selected and
    `n_samples_per_category` of them are drawn at random.

    Returns:
        dict mapping label name -> list of sampled 'text' values.
    """
    return {
        label_name: list(
            df.query(f'labels == {label_id}').sample(n_samples_per_category)['text']
        )
        for label_id, label_name in enumerate(labels)
    }


def sample_few_shot(df):
    """Samples one random item for each label from a given DataFrame.

    Texts containing the "[NAME]" placeholder are rejected and re-drawn.

    Args:
        df: DataFrame with an integer 'labels' column and a 'text' column.

    Returns:
        dict mapping every name in the module-level `labels` list to one
        sampled text that does not contain "[NAME]".

    Raises:
        ValueError: if a label has no example without "[NAME]" (including
            the case where it has no examples at all).
    """
    few_shot_dict = {}
    for i, label in enumerate(labels):
        # Select the label's rows once instead of re-querying on every retry.
        candidates = df.query(f'labels == {i}')

        # Without at least one acceptable text the rejection loop below
        # would never terminate, so fail loudly instead.
        if not any("[NAME]" not in text for text in candidates['text']):
            raise ValueError(
                f"No example without '[NAME]' available for label '{label}' (id {i})."
            )

        text = candidates.sample(1).text.item()
        while "[NAME]" in text:
            text = candidates.sample(1).text.item()

        few_shot_dict[label] = text
    return few_shot_dict


In [None]:
# One few-shot example per label is drawn from `train_df`;
# seven benchmark texts per label are drawn from `test_df`.
few_shot_train = sample_few_shot(train_df)
test_dict = create_test_dict(test_df, 7)
print(f"Few-shot training data:\n{few_shot_train}")
print(f"Sampled benchmark dataset:\n{test_dict}")

### Prepare the OpenAI API GPT setup

In [None]:

def get_tag(model, system_content="", assistant_content="", user_content=""):
    """Send one chat request to an OpenAI model and return the reply text.

    Args:
        model: Name of the OpenAI chat model to query.
        system_content: Content of the system message.
        assistant_content: Content of the assistant message that is sent
            between the system and the user message.
        user_content: Content of the user message (the text to classify).

    Returns:
        The message content of the first returned choice, or None when the
        request fails with an OpenAI API error.
    """
    try:
        # Get response from OpenAI
        response = openai.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_content},
                {"role": "assistant", "content": assistant_content},
                {"role": "user", "content": user_content},
            ],
            max_tokens=100,
        )
    except openai.APIError as e:
        # `openai.chat.completions` only exists in openai>=1.0, where the old
        # `openai.error` module is gone; referencing it here would raise an
        # AttributeError instead of handling the failure.
        # Best effort: report the problem and let the caller see a None prediction.
        print(f"OpenAI request failed for model '{model}': {e}")
        return None

    return response.choices[0].message.content

# Zero-shot testing of various GPT models by OpenAI for benchmarking.

### GPT 3.5-turbo
> NOTE: This model will soon be deprecated by OpenAI.

In [None]:
# Example usage
# Sets up a model name and the three message contents in the form that
# `get_tag` accepts as arguments.
model = "gpt-3.5-turbo"
# Zero-shot instruction: the full list of label names is interpolated into the prompt.
system_content = f"""
You are an NLP sentiment prediction tool. Your goal is to predict a label given an input sequence by the user.
You must choose between one of the following labels for each input: {labels}.
Only respond with the label name and nothing else."""
print(f"The system content is:\n{system_content}")

assistant_content = ""
# First sampled benchmark text of the 'admiration' label.
user_content = test_dict['admiration'][0]

In [None]:
from tqdm import tqdm

def test_gpt(model, labels, data_dict, few_shot_str=None):
    """Run a model over the benchmark texts and collect its predictions.

    Args:
        model: Name of the OpenAI chat model passed on to `get_tag`.
        labels: Label names; they are listed in the default system prompt
            and determine which entries of `data_dict` are evaluated.
            A label called "other" is skipped.
        data_dict: Mapping label name -> iterable of texts for that label.
        few_shot_str: When given, it is used as the system prompt in place
            of the default zero-shot prompt.

    Returns:
        dict with the parallel lists "original_labels", "text" and
        "prediction_labels", plus the "model" name.
    """
    zero_shot_prompt = f"""
        You are an NLP sentiment prediction tool. Your goal is to predict a label given an input sequence by the user.
        You must choose between one of the following labels for each input: {labels}.
        Only respond with the label name and nothing else."""
    system_content = zero_shot_prompt if few_shot_str is None else few_shot_str

    true_labels = []
    texts = []
    predictions = []
    for label in tqdm(labels):
        if label != "other":
            for instance in data_dict[label]:
                prediction = get_tag(
                    model,
                    system_content=system_content,
                    user_content=instance,
                )
                true_labels.append(label)
                texts.append(instance)
                predictions.append(prediction)

    return {
        "original_labels": true_labels,
        "text": texts,
        "prediction_labels": predictions,
        "model": model,
    }
            

### Evaluate the chosen subset of the testing dataset with GPT4o, GPT4, and GPT3.5

In [None]:
import json

# Model to evaluate; switch by (un)commenting one of the lines below.
# model = "gpt-3.5-turbo-0125"
# model = "gpt-4o-2024-05-13"
model = "gpt-4-turbo-2024-04-09"

# NOTE: Make sure that you are able to save to the given path (Anaconda through Jupyter in VS Code is problematic)
benchmarks_path = "/home/andrej/Code/story-vibe/benchmarks/GPT_evals"

# Empty result container with the same keys that `test_gpt` returns.
dummy_eval = {"original_labels": [],
                  "text": [],
                  "prediction_labels":[],
                  "model": model}



In [None]:
def clean_predictions(y_pred, tags, default="other"):
    """Normalize raw model outputs to known tags, in place.

    Each prediction is stripped of surrounding whitespace and of one pair of
    matching surrounding quotes (GPT 4 likes to place quotes), and is then
    replaced by `default` unless the result is one of `tags`. Non-string
    predictions (e.g. None from a failed request) also become `default`.

    Args:
        y_pred: List of predictions; it is modified in place.
        tags: Collection of valid tag names.
        default: Value stored for anything that is not a valid tag.

    Returns:
        The same list object that was passed in as `y_pred`.
    """
    for i, item in enumerate(y_pred):
        if isinstance(item, str):
            item = item.strip()
            # Un-quote BEFORE the membership test, so that a quoted
            # hallucination is still detected as one.
            if len(item) >= 2 and item[0] == item[-1] and item[0] in "'\"":
                item = item[1:-1].strip()
        y_pred[i] = item if item in tags else default  # hallucinations -> default
    return y_pred

### Plotting

In [None]:
import matplotlib.pyplot as plt
from collections import Counter
import seaborn as sns; sns.set_theme()


def plot_tag_dist(y_true, y_pred, name="Model"):
    """Draw a grouped bar chart of true vs. predicted tag frequencies.

    Args:
        y_true: Iterable of true tags.
        y_pred: Iterable of predicted tags.
        name: Title of the figure.
    """
    # Distribution of tags
    true_tag_freq = Counter(y_true)
    pred_tag_freq = Counter(y_pred)

    df_true = pd.DataFrame({"Label": list(true_tag_freq.keys()), "Freq": list(true_tag_freq.values()), "source": "true"})
    df_pred = pd.DataFrame({"Label": list(pred_tag_freq.keys()), "Freq": list(pred_tag_freq.values()), "source": "pred"})
    df = pd.concat([df_true, df_pred], ignore_index=True)

    # Plot
    plt.figure(figsize=(10, 3))
    plt.title(name, fontsize=14)
    sns.barplot(x="Label", y="Freq", hue="source", data=df)
    # Only restyle the tick labels generated by the plot. Overwriting them
    # with the true tags alone mislabels the axis (or fails) as soon as the
    # predictions contain a tag that never occurs in `y_true`.
    plt.xticks(rotation=45, fontsize=8, ha="right")
    plt.legend()
    plt.tight_layout()
    plt.show()


def plot_confusion_matrix(y_true, y_pred, labels, name="Model"):
    """Count and plot the confusion matrix of true vs. predicted labels.

    Args:
        y_true: Sequence of true label names.
        y_pred: Sequence of predicted label names, aligned with `y_true`.
            Pairs are formed position by position; surplus entries of the
            longer sequence are ignored.
        labels: Label names defining the row/column order. An "other"
            category is added at the end when it is missing; the caller's
            list is not modified.
        name: Model name used in the figure title.

    Returns:
        Square numpy array of counts, rows = true labels, columns =
        predicted labels, in the order of `labels` (plus "other").

    Raises:
        ValueError: if a true or predicted label is not a known label.
    """
    print(f"len(y_true): {len(y_true)}")
    print(f"len(y_pred): {len(y_pred)}")

    # Work on a copy and add "other" BEFORE sizing the matrix; otherwise the
    # matrix has one row/column too few for the extended label list.
    labels = list(labels)
    if "other" not in labels:
        labels.append("other")
    cfm = np.zeros((len(labels), len(labels)))

    for y_t, y_p in zip(y_true, y_pred):
        cfm[labels.index(y_t), labels.index(y_p)] += 1

    conf_matrix_df = pd.DataFrame(cfm, index=labels, columns=labels)

    # Plot the confusion matrix using seaborn
    plt.figure(figsize=(10, 8))
    sns.heatmap(conf_matrix_df, cmap='Blues')
    plt.title(f'Sentiment Confusion Matrix for {name}')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.tight_layout()
    plt.show()

    return cfm


### Eval

In [None]:
import copy

# Load the stored zero-shot evaluation results, one JSON file per model.
with open(f"{benchmarks_path}/zero_shot/gpt-3.5-turbo-0125.json") as gpt35_file:
    gpt35_eval = json.load(gpt35_file)

with open(f"{benchmarks_path}/zero_shot/gpt-4-turbo-2024-04-09.json") as gpt4t_file:
    gpt4t_eval = json.load(gpt4t_file)

with open(f"{benchmarks_path}/zero_shot/gpt-4o-2024-05-13.json") as gpt4o_file:
    gpt4o_eval = json.load(gpt4o_file)

def plot_benchmark(eval_dict, labels, name="Model"):
    """Plot the confusion matrix and the tag distribution of one evaluation.

    Args:
        eval_dict: Evaluation result with the aligned lists
            'prediction_labels' and 'original_labels'. It is not modified.
        labels: Valid label names.
        name: Model name used in the figure titles.
    """
    # load benchmarks; work on copies so that plotting the same evaluation
    # again starts from the unmodified data
    y_pred = clean_predictions(list(eval_dict['prediction_labels']), labels)
    y_true = list(eval_dict['original_labels'])

    # The confusion matrix pairs the lists position by position, so it has
    # to be drawn while they are still aligned.
    plot_confusion_matrix(y_true, y_pred, labels=labels, name=name)

    # Make sure "other" is represented among the true tags of the
    # distribution plot (predictions can be mapped to it by clean_predictions).
    if "other" not in y_true:
        y_true.append("other")

    plot_tag_dist(y_true, y_pred, name=name)

# Plot the zero-shot results of every evaluated model.
plot_benchmark(gpt35_eval, labels, name="GPT 3.5")
plot_benchmark(gpt4t_eval, labels, name="GPT 4-turbo")
plot_benchmark(gpt4o_eval, labels, name="GPT 4o")



# Few-shot benchmark of GPT models

In [None]:
# System prompt for the few-shot runs: the zero-shot instruction followed by
# one "<label>: <example text>" line per label.
few_shot_string = f"""You are an NLP sentiment prediction tool. Your goal is to predict a label given an input sequence by the user.
You must choose between one of the following labels for each input: {labels}.
Only respond with the label name and nothing else.
Now follow some few-shot training examples. There will be one example for each label given before:\n\n"""

# Append the sampled example of every label to the prompt.
for fs_key, fs_data in few_shot_train.items():
    # print(f"{fs_key}: {fs_data}")
    few_shot_string += f"{fs_key}: {fs_data}\n"
print(few_shot_string)

# Model to evaluate; switch by (un)commenting one of the lines below.
# model = "gpt-3.5-turbo-0125"
# model = "gpt-4o-2024-05-13"
model = "gpt-4-turbo-2024-04-09"

# All model names covered by the benchmark.
models = ["gpt-3.5-turbo-0125",
          "gpt-4o-2024-05-13",
          "gpt-4-turbo-2024-04-09"]


### Eval

In [None]:
# Load the stored few-shot evaluation results, one JSON file per model.
with open(f"{benchmarks_path}/few_shot/gpt-3.5-turbo-0125.json") as gpt35_file:
    fs_gpt35_eval = json.load(gpt35_file)

with open(f"{benchmarks_path}/few_shot/gpt-4-turbo-2024-04-09.json") as gpt4t_file:
    fs_gpt4t_eval = json.load(gpt4t_file)

with open(f"{benchmarks_path}/few_shot/gpt-4o-2024-05-13.json") as gpt4o_file:
    fs_gpt4o_eval = json.load(gpt4o_file)

def plot_benchmark(eval_dict, labels, name="Model"):
    """Plot the confusion matrix and the tag distribution of one evaluation.

    Args:
        eval_dict: Evaluation result with the aligned lists
            'prediction_labels' and 'original_labels'. It is not modified.
        labels: Valid label names.
        name: Model name used in the figure titles.
    """
    # load benchmarks; work on copies, because appending to / rewriting the
    # lists stored in `eval_dict` would leave them misaligned the next time
    # the same evaluation is plotted
    y_pred = clean_predictions(list(eval_dict['prediction_labels']), labels)
    y_true = list(eval_dict['original_labels'])

    # The confusion matrix pairs the lists position by position, so it has
    # to be drawn while they are still aligned.
    plot_confusion_matrix(y_true, y_pred, labels=labels, name=name)

    # Make sure "other" is represented among the true tags of the
    # distribution plot (predictions can be mapped to it by clean_predictions).
    if "other" not in y_true:
        y_true.append("other")

    plot_tag_dist(y_true, y_pred, name=name)

# Plot the few-shot results of every evaluated model.
plot_benchmark(fs_gpt35_eval, labels, name="Few Shot GPT 3.5")
plot_benchmark(fs_gpt4t_eval, labels, name="Few Shot GPT 4-turbo")
plot_benchmark(fs_gpt4o_eval, labels, name="Few Shot GPT 4o")