# Imports
Run this block first to import all necessary libraries.

In [1]:
import os
import time
import openai
from IPython.core.display_functions import clear_output
from dotenv import load_dotenv
import pandas as pd
from tqdm import tqdm

# Paths, Variables and Setup
Update paths to point to the correct files if necessary, update variables, and run the setup code blocks.

In [2]:
# Paths
# Template for the per-fold test split; "{}" is filled with the fold number via str.format.
DATA_DIR = "../data/dataset_splits/holdout_training/test{}.csv"
# Directory prefix under which the per-fold result CSVs are written and read back.
OUTPUT_DIR = "../data/model_eval/chatgpt/"

In [None]:
# Variables
# Name of the environment variable that holds the OpenAI API key.
API_KEY_ENV_NAME = "OPENAI_API_KEY"
# Maps the label text on the last line of a model response to its integer class id.
class_mappings = dict(Decreased=0, Neutral=1, Increased=2)

In [5]:
# Setup: Environment
# Load variables from a local .env file into the process environment
# (presumably including the key named by API_KEY_ENV_NAME - confirm the .env contents).
load_dotenv()

True

In [4]:
# Setup: Function that appends a dataset instance to the predefined prompt.
def append_prompt(prompt, target, follow_up):
    """Return a new message list consisting of `prompt` plus one user message.

    The user message presents the instance in the same two-line layout as the
    few-shot examples: Sentence A (the target) and Sentence B (the follow-up),
    each wrapped in double quotes. `prompt` itself is not modified.

    Args:
        prompt: List of chat messages to extend.
        target: Text inserted as Sentence A.
        follow_up: Text inserted as Sentence B.

    Returns:
        A new list: the messages of `prompt` followed by the new user message.
    """
    user_content = 'Sentence A: "{}"\nSentence B: "{}"'.format(target, follow_up)
    user_message = {"role": "user", "content": user_content}
    return prompt + [user_message]

In [None]:
# Setup: Function that extracts the response from the API call into a structured form.
def extract_response(target, follow_up, completion):
    """Turn a chat completion into a flat record for one dataset instance.

    The response is expected to contain an explanation wrapped in triple
    backticks, followed by the class label on its last line.

    Args:
        target: Sentence A of the instance (copied into the record).
        follow_up: Sentence B of the instance (copied into the record).
        completion: Response indexable as
            completion["choices"][0]["message"]["content"].

    Returns:
        dict with the keys "target", "follow_up", "explanation" and "class".
        "class" is the id from `class_mappings`, or -1 when the last non-blank
        line is not a known label. "explanation" is the text of the first
        backtick-fenced section. If the response contains no fence, it falls
        back to the text before the label line (or to the whole response when
        no label was recognized) instead of raising IndexError.
    """
    completion_text = completion["choices"][0]["message"]["content"] or ""
    # Strip first so that a trailing newline does not hide the label line.
    stripped_text = completion_text.strip()
    cls_txt = stripped_text.split("\n")[-1].strip()

    fenced_parts = completion_text.split("```")
    if len(fenced_parts) > 1:
        explanation = fenced_parts[1].strip()
    elif cls_txt in class_mappings:
        # No fence: keep everything that precedes the recognized label line.
        explanation = stripped_text.rpartition("\n")[0].strip()
    else:
        explanation = stripped_text

    return {
        "target": target,
        "follow_up": follow_up,
        "explanation": explanation,
        "class": class_mappings.get(cls_txt, -1),
    }

# Notebook Summary
This notebook contains the logic for obtaining dataset results using OpenAI's API. We define a few-shot chain-of-thought prompt and run every dataset instance through that API. The total cost for GPT-3.5 was around €5. We then evaluate the model's predictions by calculating accuracy and exact-match metrics.

# 1. Define Prompt
In our prompt, we provide a task as well as some few-shot examples to the model.

In [3]:
# Few-shot chain-of-thought prompt: one system instruction followed by three
# user/assistant example pairs (answers: "Increased", "Decreased", "Neutral").
message_prompt = [
    {
        "role": "system",
        "content": """You are a language model specializing in temporal commonsense reasoning. Each prompt contains Sentence A and Sentence B. You should determine whether Sentence B changes the expected temporal validity duration of Sentence A, i.e., the duration for which the information in Sentence A is expected to be relevant to a reader.

To achieve this, in your responses, first, estimate for how long the average reader may expect Sentence A to be relevant on its own. Then, consider if the information introduced in Sentence B increases or decreases this duration. Surround this explanation in triple backticks (```).

After your explanation, respond with one of the three possible classes corresponding to your explanation: "Decreased", "Neutral", or "Increased".""",
    },
    # Example 1: follow-up extends the duration.
    {
        "role": "user",
        "content": """Sentence A: "i'm ready to go to the beach."
Sentence B: "I forgot all the beach towels are still in the dryer, but I'll be ready to go as soon as the dryer's done running.\"""",
    },
    {
        "role": "assistant",
        "content": """```Going to the beach may take a few minutes to an hour, depending on the distance. However, if the author first needs to wait on the dryer to finish in order to retrieve their beach towels, this may take an additional 30-60 minutes.```

Increased""",
    },
    # Example 2: follow-up shortens the duration.
    # NOTE(review): the opening quote of this user message sits before "Sentence A:"
    # instead of after it, unlike the other examples. It is left unchanged here
    # because correcting it alters the prompt sent to the model - confirm before fixing.
    {
        "role": "user",
        "content": """"Sentence A: taking bad thoughts out of my mind thru grinding my assignments"
Sentence B: "I just have to get through a short math homework assignment and memorize a few spelling words so it shouldn't take long.\"""",
    },
    {
        "role": "assistant",
        "content": """```Grinding through assignments may take several hours, depending on the number of assignments to complete. In Sentence B, the author states they only have a few short assignments remaining, so they may only take an hour or less to finish them.```

Decreased""",
    },
    # Example 3: follow-up leaves the duration unchanged.
    {
        "role": "user",
        "content": """Sentence A: "Slide to my dm guys, come on"
Sentence B: "Instagram DMs are such a fun way to communicate.\"""",
    },
    {
        "role": "assistant",
        "content": """```The author encourages people to direct message them, which may be relevant for several minutes to a few hours. Sentence B does not change the duration for which Sentence A is expected to be relevant.```

Neutral""",
    },
]

# 2. Setup OpenAI Model

In [6]:
# Read the API key from the environment variable named by API_KEY_ENV_NAME.
# os.getenv returns None when the variable is unset, so a missing key is not reported here.
openai.api_key = os.getenv(API_KEY_ENV_NAME)

In [7]:
# We can prompt the model as follows (using target and follow-up statement)
# NOTE(review): `openai.ChatCompletion` is the interface of the pre-1.0 `openai`
# package - confirm the installed version before re-running this cell.
completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=append_prompt(message_prompt, "we all need to pray for exam for tomorrow", "And tomorrow's just the first of the exams that last all week.")
)

# Print the raw response so its structure (choices -> message -> content) is visible.
print(completion)

{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "```The need for praying for the exam is relevant until the exam is completed, which is typically a few hours. However, in Sentence B, it is mentioned that the exams will last all week, indicating that the need for praying for exams will exist beyond the immediate exam for tomorrow. Therefore, the duration for which Sentence A is expected to be relevant is increased.```\n\nIncreased",
        "role": "assistant"
      }
    }
  ],
  "created": 1690802277,
  "id": "chatcmpl-7iLCP7b1mAjRIShgv328VWUnxXGIP",
  "model": "gpt-3.5-turbo-0613",
  "object": "chat.completion",
  "usage": {
    "completion_tokens": 76,
    "prompt_tokens": 474,
    "total_tokens": 550
  }
}


In [9]:
# We bring the response into a structured form using the extract_response function
# (the returned dict is the cell's last expression, so it is shown as the cell output).
extract_response("we all need to pray for exam for tomorrow", "And tomorrow's just the first of the exams that last all week.", completion)

{'target': 'we all need to pray for exam for tomorrow',
 'follow_up': "And tomorrow's just the first of the exams that last all week.",
 'explanation': 'The need for praying for the exam is relevant until the exam is completed, which is typically a few hours. However, in Sentence B, it is mentioned that the exams will last all week, indicating that the need for praying for exams will exist beyond the immediate exam for tomorrow. Therefore, the duration for which Sentence A is expected to be relevant is increased.',
 'class': 2}

# 3. Generate Predictions
To test the model, we provide each of the five folds of the dataset as input to the model in turn and evaluate the model's predictions afterwards. Since these calls incur costs and are non-deterministic, we store the results of each fold in a file. The few-shot examples in the prompt are taken from cv2. This means the model has seen 3 examples from that fold in its prompt, which may have a very minor positive impact on the performance for that fold.

In [25]:
def evaluate_fold(input_df, fold_num, output_df=None, from_idx=0, max_retries=None):
    """Query the model for every row of a fold and store the structured results as CSV.

    Results are written to OUTPUT_DIR + "cv{fold_num}_results.csv". The file is
    written in a `finally` block, so rows collected before an interrupt or an
    error are kept and a later call can resume from them.

    Args:
        input_df: Fold data with the columns "context" and "follow_up".
        fold_num: Fold number, used in the progress bar and the output file name.
        output_df: Previously stored results to extend, or None to start fresh.
        from_idx: Number of leading rows of `input_df` to skip because
            `output_df` already holds their results.
        max_retries: Maximum number of retries per row before the last error is
            re-raised. None (the default) retries indefinitely.
    """
    results = [] if output_df is None else output_df.to_dict("records")

    try:
        rows = tqdm(input_df.iterrows(), total=len(input_df), desc=f"Evaluating Fold {fold_num}")
        # Use the row position, not the index label: `from_idx` is a row count.
        for pos, (_, row) in enumerate(rows):
            # skip if we already have a result for this row
            if pos < from_idx:
                continue
            if pos % 10 == 0:
                clear_output()

            # Read the fields outside the retry loop: a missing column is not a
            # transient failure and must not trigger retries.
            target = row["context"]
            follow_up = row["follow_up"]

            failures = 0
            while True:
                try:
                    completion = openai.ChatCompletion.create(
                        model="gpt-3.5-turbo",
                        messages=append_prompt(message_prompt, target, follow_up)
                    )
                    results.append(extract_response(target, follow_up, completion))
                    break
                except Exception as e:
                    failures += 1
                    if max_retries is not None and failures > max_retries:
                        raise
                    print("Error occurred. Retrying after 1 minute...")
                    print(e)
                    time.sleep(60)
    finally:
        # save results to file (also on interruption, so paid-for rows are not lost);
        # an empty list is skipped because it would produce an unreadable empty CSV
        if results:
            os.makedirs(OUTPUT_DIR, exist_ok=True)
            pd.DataFrame(results).to_csv(OUTPUT_DIR + f"cv{fold_num}_results.csv", index=False)

In [26]:
# Iterate over the folds: run missing folds from scratch, resume partially
# evaluated folds, and skip folds whose result file is already complete.
for i in range(5):
    input_df = pd.read_csv(DATA_DIR.format(i))
    results_path = OUTPUT_DIR + f"cv{i}_results.csv"

    # No result file yet: evaluate the entire fold.
    if not os.path.exists(results_path):
        print(f"Evaluating fold {i} from index 0.")
        evaluate_fold(input_df, i)
        continue

    output_df = pd.read_csv(results_path)
    n_done = len(output_df)

    # Every item already has a stored result: nothing to do for this fold.
    if n_done >= len(input_df):
        print(f"Skipping fold {i} because all items have already been evaluated.")
        continue

    # Some items are still missing: continue after the last stored one.
    print(f"Evaluating fold {i} from index {n_done}.")
    evaluate_fold(input_df, i, output_df, from_idx=n_done)

Evaluating Fold 4: 100%|██████████| 1011/1011 [1:14:26<00:00,  4.42s/it]


# 4. Evaluate Model

In [6]:
def get_acc_em(eval_df, group_size=3):
    """Compute accuracy and exact match for a frame of predictions.

    Accuracy is the share of rows whose "predicted_class" equals "class".
    Exact match is the share of distinct "target" values for which the number
    of correctly predicted rows equals the required group size.

    Args:
        eval_df: Frame with the columns "target", "class" and "predicted_class".
        group_size: Number of correct rows a target needs to count as an exact
            match. Defaults to 3. Pass None to require that all rows of a
            target are correct, whatever their number.

    Returns:
        Tuple (accuracy, exact_match) of floats; (nan, nan) for an empty frame.
    """
    n_rows = len(eval_df)
    if n_rows == 0:
        # The metrics are undefined without rows; avoid a ZeroDivisionError.
        return float("nan"), float("nan")

    # Element-wise comparison; a missing label (NaN) never equals a prediction.
    is_correct = (eval_df["class"] == eval_df["predicted_class"]).tolist()
    acc = sum(is_correct) / n_rows

    # target -> [number of correct rows, total number of rows]
    per_target = {}
    for target, hit in zip(eval_df["target"].tolist(), is_correct):
        counts = per_target.setdefault(target, [0, 0])
        counts[0] += int(hit)
        counts[1] += 1

    n_exact = sum(
        1
        for n_hit, n_total in per_target.values()
        if n_hit == (n_total if group_size is None else group_size)
    )
    em = n_exact / len(per_target)

    return acc, em

In [7]:
# Collect one metrics record per fold and build the summary frame once at the end
# (instead of concatenating inside the loop, which left every row with index label 0).
fold_records = []

for i in range(5):
    # Stored predictions: the model's class becomes "predicted_class".
    result_df = pd.read_csv(OUTPUT_DIR + f"cv{i}_results.csv").rename(columns={"class": "predicted_class"})
    # Labelled fold data: align the column name with the results and encode the label.
    fold_df = pd.read_csv(DATA_DIR.format(i)).rename(columns={"context": "target"})
    fold_df["class"] = fold_df["change"].map({"decreased": 0, "neutral": 1, "increased": 2})
    # Left merge on the columns both frames share: keeps every prediction row.
    eval_df = result_df.merge(fold_df, how="left")

    acc, em = get_acc_em(eval_df)

    fold_records.append({
        "fold": f"chatgpt_{i}",
        "accuracy": acc,
        "exact_match": em
    })

results = pd.DataFrame(fold_records)


In [8]:
# Display the per-fold table of accuracy and exact match.
results

Unnamed: 0,fold,accuracy,exact_match
0,chatgpt_0,0.611276,0.246291
0,chatgpt_1,0.693373,0.326409
0,chatgpt_2,0.638971,0.252226
0,chatgpt_3,0.683482,0.31454
0,chatgpt_4,0.687438,0.323442


In [15]:
# Summarise the per-fold metrics: mean and standard deviation, rounded to three decimals.
accuracy_col = results["accuracy"]
exact_match_col = results["exact_match"]

print(f"Mean Accuracy: {round(accuracy_col.mean(), 3)}")
print(f"Mean Exact Match: {round(exact_match_col.mean(), 3)}")
print(f"Accuracy Std.: {round(accuracy_col.std(), 3)}")
print(f"Exact Match Std.: {round(exact_match_col.std(), 3)}")

Mean Accuracy: 0.663
Mean Exact Match: 0.293
Accuracy Std.: 0.036
Exact Match Std.: 0.04
