# Benchmarking Different ChatGPTs, to see whether the SOTA chatbots perform well on 0-shot and few-shot learning

In [None]:
import re
# read API key from local machine
with open("/home/andrej/Documents/open_ai/made-with-ml-key.txt", "r") as file:
    api_key = file.read()
    api_key = re.sub(r'\s+', '', api_key)

import openai
openai.api_key = api_key

from openai import OpenAI

### Load sentiment training dataset from reddit

In [None]:
from datasets import load_dataset
import pandas as pd
import numpy as np

# pandas also relies on numpy for random sampling
np.random.seed(42)  # the answer to everything

dataset = load_dataset("google-research-datasets/go_emotions", "simplified")

In [None]:
def get_single_labeled(df):
    single_labels = df['labels'].apply(lambda x: x if len(x) <= 1 else None)
    single_labels = single_labels.dropna()  #  leave out the multilabeled ones

    # extract the singlelabeled data by index via iloc
    single_df = df.iloc[single_labels.index]  

    # transform the singlelabeled data labels from list (e.g. [8]) into int (e.g. 8)
    single_df['labels'] = single_df['labels'].apply(lambda x: x[0])

    return single_df

In [130]:
labels = dataset['train'].features['labels'].feature.names

# labels are by default in a list, filter them and reassign them to an integer instead
long_train_df = pd.DataFrame(dataset['train'])
train_df = get_single_labeled(long_train_df)

long_test_df = pd.DataFrame(dataset['test'])
test_df = get_single_labeled(long_test_df)
before_adding = True



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_df['labels'] = single_df['labels'].apply(lambda x: x[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_df['labels'] = single_df['labels'].apply(lambda x: x[0])


In [143]:
# train_df.info()
test_df.info()

# adding a few more made up grief comments to the testing dataset, there are only 2 examples labeled grief...
grief_additionals = [
    "I still can\'t bring myself to sit in his chair. Every time I walk past it, I just feel this overwhelming emptiness.",
    "Every time I walk through the house, I get hit with memories that make me cry all over again. It\'s like a never-ending cycle.",
    "I keep reaching for my phone to text him, then remember he\'s gone. It\'s like a punch in the gut every single time.",
    "The holidays used to be my favorite time of year, but now they\'re just painful reminders of the family we\'ve lost.",
    "Sometimes I find myself holding onto his old jacket because it still smells like him. It\'s comforting and heartbreaking all at once."
    ]

new_entries = {"text": [],
               "labels": [],
               "id": []
               }

for id, text in enumerate(grief_additionals):
    new_entries["text"].append(text)
    new_entries["labels"].append(16)  # the id for "grief" label
    new_entries["id"].append(f"manual_id_{id}")

if before_adding:
    new_entries_df = pd.DataFrame(new_entries)
    test_df = pd.concat([test_df, new_entries_df], ignore_index=True)
    before_adding = False
    
test_df.query('labels == 16').shape


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4595 entries, 0 to 4594
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    4595 non-null   object
 1   labels  4595 non-null   int64 
 2   id      4595 non-null   object
dtypes: int64(1), object(2)
memory usage: 107.8+ KB


(7, 3)

In [148]:
def create_test_dict(df, n_samples_per_category=1):
    test_dict = {}
    for i, label in enumerate(labels):
        # print(f"Sampling label: {label} with label id: {i}")
        # print(f"There is ", df.query(f'labels == {i}').shape[0], f" of {label} in the Test set.")
        sample = df.query(f'labels == {i}').sample(n_samples_per_category)
        test_dict[label] = list(sample['text'])
    return test_dict


def sample_few_shot(df):
    """Samples one random item for each label from a given DataFrame."""
    few_shot_dict = {}
    for i, label in enumerate(labels):
        sample = df.query(f'labels == {i}').sample(1)
        while "[NAME]" in sample.text.item():
            sample = df.query(f'labels == {i}').sample(1)

        few_shot_dict[label] = sample.text.item()
    return few_shot_dict


In [152]:
few_shot_train = sample_few_shot(train_df)
test_dict = create_test_dict(test_df, 7)
print(f"Few-shot training data:\n{few_shot_train}")
print(f"Sampled benchmark dataset:\n{test_dict}")

Few-shot training data:
{'admiration': "Seattle's train is nice... once you get there. It is an absolute hike from the terminal", 'amusement': 'Haha, only one species. More the Cockatoos', 'anger': 'Oh go fuck yourself with your BS generalization.', 'annoyance': 'Pretty sure it’s going on in the apartment across from me right now. My neighbors are trashy', 'approval': 'I think we can all agree that this is the right answer.', 'caring': 'Most likely it’s not a tumour since cancerous tumours aren’t usually painful. You still should go check it out ASAP.', 'confusion': 'I’m not sure what there is to disagree about. It was obviously a subjective claim.', 'curiosity': 'Can’t wait to see my IG page blow up with it smh', 'desire': 'I wish yall would stay off the fucking roads when youre drinking beer. Mind you, on a bike I may just get that wish after your first night out.', 'disappointment': 'I also posted about this a few days ago. Maybe they have different designers but none of them has fo

### Prepare the OpenAI API GPT setup

In [161]:

def get_tag(model, system_content="", assistant_content="", user_content=""):
    try:
        # Get response from OpenAI
        response = openai.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_content},
                {"role": "assistant", "content": assistant_content},
                {"role": "user", "content": user_content},
            ],
            max_tokens=100,
        )
        # predicted_tag = response.to_dict()["choices"][0].to_dict()["message"]["content"]
        predicted_tag = response.choices[0].message.content
        return predicted_tag

    except (openai.error.ServiceUnavailableError, openai.error.APIError) as e:
        return None

# Zero-shot testing of various GPT models by OpenAI for benchmarking.

### GPT 3.5-turbo
> NOTE: This model will soon be deprecated by OpenAI.

In [162]:
# Get tag
model = "gpt-3.5-turbo"
system_content = f"""
You are an NLP sentiment prediction tool. Your goal is to predict a label given an input sequence by the user.
You must choose between one of the following labels for each input: {labels}.
Only respond with the label name and nothing else."""
print(f"The system content is:\n{system_content}")

assistant_content = ""
user_content = test_dict['admiration'][0]
# print(f"User content: {user_content}")
tag = get_tag(model=model, system_content=system_content, assistant_content=assistant_content, user_content=user_content)
print(tag)

The system content is:

You are an NLP sentiment prediction tool. Your goal is to predict a label given an input sequence by the user.
You must choose between one of the following labels for each input: ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral'].
Only respond with the label name and nothing else.
admiration
