# Imports
Run this block first to import all necessary libraries.

In [1]:
import csv
import math
import os
import re
import openai
from dotenv import load_dotenv
import pandas as pd
import requests
import tqdm

# Paths, Variables and Setup
Update paths to point to the correct files if necessary, update variables, and run the setup code blocks.

In [2]:
# Paths
# CSV file holding the full dataset; read by get_remaining_items and by the evaluation cell.
DATASET_SOURCE="../data/dataset.csv"
# Directory holding one prediction file per model ("<model name>.csv"). The trailing slash
# is required because file paths are built by plain string concatenation.
OUTPUT_DIR="../data/model_eval/LLM/"

In [3]:
# Variables
# Settings
USE_3SHOT = True # If False, use 1-shot (one sample per class); otherwise use 3-shot (three samples per class)
SHORT_FEWSHOT = True # If True, format the few-shot answers as flowing text. If False, use the structured template.
# API Keys and URLs
# Names of the environment variables that hold the API keys (the keys themselves are never stored here)
OPENAI_KEY_ENV_NAME="OPENAI_API_KEY"
ANYSCALE_KEY_ENV_NAME="ANYSCALE_API_KEY"
ANYSCALE_BASE_URL="https://api.endpoints.anyscale.com/v1"
# Class mappings
# Maps a textual label to its integer class id; "unchanged" and "neutral" share id 1.
CLASS_MAPPINGS = {
    "decreased": 0,
    "unchanged": 1,
    "neutral": 1,
    "increased": 2,
}
# Temporal validity duration labels, listed from shortest to longest. The same labels are
# spelled out in the system prompt; this list is not referenced elsewhere in the visible notebook.
TARGET_CLASSES = ["less than one minute", "1-5 minutes", "5-15 minutes", "15-45 minutes", "45 minutes - 2 hours", "2-6 hours", "more than 6 hours", "1-3 days", "3-7 days", "1-4 weeks", "more than one month"]
# Regex patterns
# Captures the text between a pair of backticks; used to pull the final label out of a model answer.
PTN_BACKTICKS = r'`([^`]*)`'

In [4]:
# Setup: Environment
# Load variables from a local .env file (if present) into the process environment
load_dotenv()
# Setup: Setup requests session
# The Anyscale key is sent as a Bearer token with every request made through REQ_SESSION
ANYSCALE_KEY = os.getenv(ANYSCALE_KEY_ENV_NAME)
REQ_SESSION = requests.Session()
# Setup: Setup OpenAI session
openai.api_key = os.getenv(OPENAI_KEY_ENV_NAME)
# Pass the key explicitly: an OpenAI() client built without arguments only looks at the
# OPENAI_API_KEY environment variable, so a customised OPENAI_KEY_ENV_NAME would be ignored.
# When the lookup yields None the client still falls back to OPENAI_API_KEY, as before.
client = openai.OpenAI(api_key=os.getenv(OPENAI_KEY_ENV_NAME))

In [5]:
# Setup: Helper that resumes an evaluation run where it previously stopped
def get_remaining_items(llm_name):
    """Return the dataset rows that have no stored prediction for `llm_name` yet.

    The full dataset is read from DATASET_SOURCE. If OUTPUT_DIR contains a file named
    "<llm_name>.csv", the number of data rows in that file is taken as the number of
    already processed items and that many leading dataset rows are skipped.

    Args:
        llm_name: Name of the model; also the stem of its output file.

    Returns:
        The whole dataset when no output file exists, otherwise the slice of the
        dataset starting at the first unprocessed row.
    """
    input_df_full = pd.read_csv(DATASET_SOURCE)
    output_path = OUTPUT_DIR + f"{llm_name}.csv"

    if not os.path.exists(output_path):
        print("No output found. Returning full input dataset.")
        return input_df_full

    # Only the row count of the previous output matters here
    num_processed = len(pd.read_csv(output_path, encoding="ISO-8859-1"))
    print("Found an output file. Returning input dataset from index " + str(num_processed))
    return input_df_full[num_processed:]

# Notebook Summary
This notebook contains the logic for obtaining dataset results from LLMs using the OpenAI and Anyscale APIs. We define a few-shot chain-of-thought reasoning prompt and evaluate all dataset instances over the APIs. We then evaluate the models' predictions and calculate accuracy and exact match metrics.

# 1. Define Prompt
In our prompt, we provide a task as well as some few-shot examples to the model.

In [6]:
# Define the system prompt
# It defines "temporal validity duration", states the task, and prescribes three reasoning
# steps: (1) label the target statement alone, (2) label it again with the context statement,
# (3) compare both labels and emit decreased/unchanged/increased wrapped in backticks.
# The backticks matter: the final label is later extracted from the answer with PTN_BACKTICKS.
SYSTEM_PROMPT = """You are a language model specialized in reasoning over temporal common sense. You know that the temporal validity duration of a statement is the duration for which said statement contains relevant and current information after its creation. Information that takes place in the future, such as "I will take a shower at 8 p.m.", is considered valid from the point of statement creation until the information has fully resolved.

Your task is to determine the impact of a context statement on the temporal validity duration of a target statement. The user will provide both statements. When a statement can be interpreted in multiple ways, assume the most likely interpretation is the correct one.

To solve the task effectively, follow the steps outlined below:

1. Ignoring the context statement, determine the temporal validity duration of the target statement. Your estimate must match one of the following labels: [less than one minute, 1-5 minutes, 5-15 minutes, 15-45 minutes, 45 minutes - 2 hours, 2-6 hours, more than 6 hours, 1-3 days, 3-7 days, 1-4 weeks, more than one month]. Select exactly one class and explain why it is the most fitting.

2. Once again, determine the most likely of the above labels to match the temporal validity duration of the target statement, but this time, include any information from the context statement that may influence the class label. Similar to step 1, explain why you chose the class.

3. Compare the two class labels generated in step 1 and step 2 to infer the change in the temporal validity duration. Select the appropriate class label [decreased, unchanged, increased], depending on if the introduction of the context statement decreased, remained neutral to, or increased the temporal validity duration of the target statement. You must surround this class label with backticks (`)."""

# Define the user prompt placeholder
# "[st]" is replaced with the target statement and "[sf]" with the context statement.
# The final quote is escaped (\") so that it does not merge with the closing triple quote.
USER_PROMPT = """Please determine the impact of the context statement on the temporal validity duration of the target statement.
Target Statement: "[st]"
Context Statement: "[sf]\""""

# Define the structured few-shot sample
# Template for an assistant answer with one headed section per reasoning step. "[r1]", "[r2]"
# and "[r3]" are replaced with the three reasoning texts of a few-shot sample. Only used
# when SHORT_FEWSHOT is False.
STRUCTURED_SAMPLE = """To address this task, I'll first evaluate the temporal validity duration of the target statement in isolation, and then in the context of the provided context statement.

1. Temporal Validity Duration of the Target Statement (Without Context)
[r1]

2. Temporal Validity Duration of the Target Statement (With Context)
[r2]

3. Comparison of Temporal Validity Durations
[r3]
"""

In [7]:
# Define the few shot samples as quintuples (s_t, s_f, r1, r2, r3):
#   s_t = target statement, s_f = context statement,
#   r1  = reasoning for the label without context, r2 = reasoning for the label with context,
#   r3  = comparison that ends with the final label in backticks.
# Samples are grouped by final label: indices 0-2 are `decreased`, 3-5 `unchanged`, 6-8 `increased`.
FEW_SHOT_SAMPLES = [
# --- decreased (indices 0-2) ---
("My sister: We should go out for a drink. Just ONE. Us: *end up getting drunk*", "Just found out I have an early morning meeting tomorrow", "The target statement suggests a night out, which has already started. Such an occasion may still continue for several hours after the statement was made. For this reason, the most fitting label is 2-6 hours.", "The context statement notes an early morning meeting on the following day. Upon finding out about this meeting, it is likely that the author would cut their night out short. Although it is not clear how much time has passed between the two statements, a reasonable assumption is that the author would discover such a meeting and subsequently cease the activity within the next 15-45 minutes. For this reason, the most fitting label is 15-45 minutes.", "As the information in the context statement decreases the temporal validity duration of the target statement from 2-6 hours to 15-45 minutes, the final label is `decreased`"),

("Cab was early, so here I am at Midway with an hour to spare. And get Dunkin' Donuts does not actually have any donuts.", "I might just take the earlier train then, it's a flex ticket anyway.", "The target statement states that the author has an hour to spare at Midway. For this reason, the most fitting label is 45 minutes - 2 hours.", "The context statement specifies that the author could be taking an earlier train, reducing the amount of time they have to spare at Midway. As they do not seem to be in a hurry, the most fitting updated label is 15-45 minutes.", "As the information in the context statement decreases the temporal validity duration of the target statement from 45 minutes - 2 hours to 15-45 minutes, the final label is `decreased`"),

("I have 1 extra LA Bakeshop Cheesebread /Spanish Bread Mix. DM if you want it.", "Whoops, sorry…gone in seconds.", "The target statement contains an offer regarding some free bread mix. Depending on the reach of the author, it is likely that someone would be interested in this offer relatively quickly. For this reason, the most fitting label is 15-45 minutes.", "The context statement states that the offer was taken within seconds. If this statement can be taken at face value, it suggests the most fitting label is less than one minute.", "As the information in the context statement decreases the temporal validity duration of the target statement from 15-45 minutes to less than one minute, the final label is `decreased`"),

# --- unchanged (indices 3-5) ---
("taxi going home. back to reality.", "How can a week long vacation feel like a single day…?", "The target statement suggests that the author is on a taxi ride home. The typical duration of a taxi ride may be between 15-45 minutes. For this reason, the most fitting label is 15-45 minutes.", "The context statement provides additional information about where the author is coming from, and about the duration of their vacation. However, the duration of the taxi ride is unaffected by this information. For this reason, the most fitting label is still 15-45 minutes.", "As the information in the context statement does not change the temporal validity duration of the target statement, the final label is `unchanged`"),

("I swear I woke up with the mindset to have a great day", "Meditation really seems to be working…maybe I should stick with it", "The target statement refers to a mindset regarding the current day. As a result, the most fitting label is More than 6 hours.", "The context statement reveals meditation as a possible cause for the author's current mindset. The author contemplates sticking with meditation to improve their mood over a longer duration. However, the context statement is not clear on if the author will follow through on this thought, and the process of meditation is not necessarily linked to the author's long-term feelings. For this reason, the most fitting label is still More than 6 hours.", "As the information in the context statement does not change the temporal validity duration of the target statement, the final label is `unchanged`"),

("BTW I'm playing splatoon almost all day, come poke me", "I can play this game for hours. Maybe I should go pro", "According to the target statement, the author plans on playing the game Splatoon almost all day. For this reason, the most fitting label is More than 6 hours.", "In the context statement, the author further professes their enjoyment for the game and contemplates possible future plans. However, this information is not likely to impact the duration for which the playsession in question will continue. For this reason, the most fitting label is still More than 6 hours.", "As the information in the context statement does not change the temporal validity duration of the target statement, the final label is `unchanged`"),

# --- increased (indices 6-8) ---
("I'm watching Perfect Match and I feel like this chic Francesca is so selflish", "Why did they make her the focus of the double runtime special???", "Perfect Match is a TV series with a typical episode runtime between 38-65 minutes. According to the target statement, the author has already started watching the episode, so some of the episode's runtime has likely already passed. For this reason, the most fitting label is 15-45 minutes.", "The context statement notes that the episode is a double runtime special. In this case, it is likely that the remaining runtime exceeds 45 minutes. For this reason, the most fitting label is 45 minutes - 2 hours.", "As the information in the context statement increases the temporal validity duration of the target statement from 15-45 minutes to 45 minutes - 2 hours, the final label is `increased`"),

("I'm about to read some tech news really quick then enjoy dinner.", "Oh, I forgot we had a reservation at that fancy dinner place later today…I guess I'll get by with chips for now", "Reading tech news and subsequently enjoying dinner, as suggested by the target statement, is likely to take longer than 45 minutes, but unlikely to take longer than 2 hours. For this reason, the most fitting label is 45 minutes - 2 hours.", "The context reveals that the author has a dinner reservation later in the day, and will only be eating chips for the time being. Since the dinner is already mentioned in the target statement, it remains valid until this event takes place. As the reservation takes place later in the day, the most fitting label is 2-6 hours.", "As the information in the context statement increases the temporal validity duration of the target statement from 45 minutes - 2 hours to 2-6 hours, the final label is `increased`"),

("just patiently waiting for my bff to get here", "He'll be flying in from the states next Monday. So excited!!!", "The target statement states that the author is waiting on the arrival of their bff. With no further information, it is reasonable this refers to an imminent situation. For this reason, the most fitting label is 15-45 minutes.", "The context statement reveals that the author's BFF is in fact flying in from abroad in the following week, thus, the event is not imminent. Since the context statement mentions this event takes place on the next Monday, it is not possible for more than 7 full days to pass before this event occurs. As a result, the most fitting label is 3-7 days.", "As the information in the context statement increases the temporal validity duration of the target statement from 15-45 minutes to 3-7 days, the final label is `increased`")
]

In [8]:
# Build few-shot samples according to setup
def format_fewshot(sample):
    """Turn one few-shot quintuple into a user/assistant message pair.

    Args:
        sample: Sequence (s_t, s_f, r1, r2, r3) as stored in FEW_SHOT_SAMPLES.

    Returns:
        A two-element list: the user message (USER_PROMPT with both statements
        filled in) followed by the assistant message. With SHORT_FEWSHOT set, the
        assistant text is the reasoning parts joined by single spaces; otherwise
        the parts are inserted into STRUCTURED_SAMPLE.
    """
    if SHORT_FEWSHOT:
        answer = " ".join(sample[2:])
    else:
        answer = STRUCTURED_SAMPLE
        # Fill [r1], [r2], [r3] in this order from positions 2, 3 and 4 of the sample
        for position, placeholder in enumerate(("[r1]", "[r2]", "[r3]"), start=2):
            answer = answer.replace(placeholder, sample[position])

    question = USER_PROMPT.replace("[st]", sample[0]).replace("[sf]", sample[1])
    return [
        {"role": "user", "content": question},
        {"role": "assistant", "content": answer},
    ]

# Order in which samples from FEW_SHOT_SAMPLES are shown to the model. The 3-shot order
# interleaves the three classes; the 1-shot order uses one sample per class.
if USE_3SHOT:
    FEW_SHOT_INDICES = [0, 3, 1, 6, 7, 4, 2, 8, 5]
else:
    FEW_SHOT_INDICES = [3, 6, 0]

# Flatten the per-sample [user, assistant] pairs into a single list of chat messages
FEW_SHOT_SAMPLES_FULL = [
    message
    for sample_index in FEW_SHOT_INDICES
    for message in format_fewshot(FEW_SHOT_SAMPLES[sample_index])
]

# Define function that returns the full messages array for a given item to classify
def prompt_input(st, sf):
    """Assemble the chat messages for classifying one statement pair.

    Args:
        st: Target statement, substituted for "[st]" in USER_PROMPT.
        sf: Context statement, substituted for "[sf]" in USER_PROMPT.

    Returns:
        A new list: the system message, then all few-shot messages, then the
        user message for this pair.
    """
    system_message = {"role": "system", "content": SYSTEM_PROMPT}
    query_message = {
        "role": "user",
        "content": USER_PROMPT.replace("[st]", st).replace("[sf]", sf),
    }
    return [system_message, *FEW_SHOT_SAMPLES_FULL, query_message]

In [9]:
# Build function that gets a single prediction and returns the output, label, and tokens used
def get_prediction(st, sf, model_key, from_anyscale):
    """Query one model for a single statement pair and parse the predicted label.

    Args:
        st: Target statement text.
        sf: Context statement text.
        model_key: Model identifier sent to the API.
        from_anyscale: If truthy, POST to the Anyscale chat-completions endpoint
            through REQ_SESSION; otherwise call the OpenAI `client`.

    Returns:
        Tuple (prompt_tokens, completion_tokens, label, message_content).
        `label` is the CLASS_MAPPINGS id of the first backtick-quoted span in the
        answer, or -1 when the answer has no such span, the span is not a known
        label, or the response payload lacks an expected field.

    Raises:
        Exception: Any other error is printed and re-raised so the run stops.
            This includes requests.HTTPError for a non-2xx Anyscale response.
    """
    prompt_tokens = 0
    completion_tokens = 0
    message_content = ""
    try:
        prompt = prompt_input(st, sf)

        # Query from anyscale
        if from_anyscale:
            with REQ_SESSION.post(ANYSCALE_BASE_URL+"/chat/completions", headers={"Authorization": f"Bearer {ANYSCALE_KEY}"},
                                  json={
                                      "model": model_key,
                                      "messages": prompt,
                                      "temperature": 0.0,
                                  }) as resp:
                # Fail loudly on HTTP errors (rate limit, auth, server error). Their JSON body
                # has no "usage" field, so the KeyError would otherwise be recorded as an
                # unparsable prediction (-1) and the item would be skipped on resume.
                resp.raise_for_status()
                response = resp.json()
            prompt_tokens += response["usage"]["prompt_tokens"]
            completion_tokens += response["usage"]["completion_tokens"]
            message_content = response["choices"][0]["message"]["content"]
        # Query from openai
        else:
            response = client.chat.completions.create(
                model=model_key,
                messages=prompt,
                temperature=0.0,
            )
            prompt_tokens += response.usage.prompt_tokens
            completion_tokens += response.usage.completion_tokens
            message_content = response.choices[0].message.content

        # The first backtick-quoted span is taken as the final label
        label = CLASS_MAPPINGS[re.findall(PTN_BACKTICKS, message_content)[0]]
        return prompt_tokens, completion_tokens, label, message_content
    except (IndexError, KeyError):
        # No backticks (IndexError) or unknown label / missing payload field (KeyError):
        # keep the token counts and raw answer gathered so far and flag the item with -1
        return prompt_tokens, completion_tokens, -1, message_content
    except Exception as e:
        print("Error occurred. Stopping classification...")
        print(e)
        raise

In [10]:
# Get n predictions and return estimated cost
def get_n_predictions(prediction_df, model_name, model_key, from_anyscale, ppm_input, ppm_output, max_n=math.inf):
    """Classify up to `max_n` rows and append each result to the model's CSV file.

    Every result is appended to OUTPUT_DIR + "<model_name>.csv" right after it is
    obtained, so an interrupted run keeps all finished rows. Afterwards the cost of
    the rows that were not processed is estimated from the average token usage.

    Args:
        prediction_df: Rows to classify. The `context` column is used as the target
            statement and the `follow_up` column as the context statement.
        model_name: Stem of the output file.
        model_key: Model identifier passed to get_prediction.
        from_anyscale: Passed to get_prediction to select the API.
        ppm_input: Price per million input tokens.
        ppm_output: Price per million output tokens.
        max_n: Upper bound on the number of rows to process (default: no bound).

    Returns:
        None. Progress and the cost estimate are printed.
    """
    num_tokens_in = 0
    num_tokens_out = 0
    output_path = OUTPUT_DIR + f"{model_name}.csv"
    # int() lets a finite float max_n work with range(); max() guards against negatives
    items_to_process = max(0, int(min(max_n, len(prediction_df))))
    print(f"Attempting to classify {items_to_process} items.")
    for i in tqdm.tqdm(range(items_to_process)):
        item = prediction_df.iloc[i]
        st = item.context
        sf = item.follow_up
        tokens_in, tokens_out, label, output = get_prediction(st, sf, model_key, from_anyscale)
        num_tokens_in += tokens_in
        num_tokens_out += tokens_out
        # Reopen the file for every row so that each result is on disk before the next
        # API call; the header is written with the same encoding as the data rows
        write_header = not os.path.isfile(output_path)
        with open(output_path, 'a', newline='', encoding='UTF-8') as file:
            writer = csv.writer(file)
            if write_header:
                writer.writerow(["target", "context", "explanation", "class"])
            # Write the data
            writer.writerow([st, sf, output, label])

    print(f"Parsed {items_to_process} items for {num_tokens_in} input tokens and {num_tokens_out} output tokens.")
    remaining_items = len(prediction_df) - items_to_process
    if remaining_items == 0:
        remaining_price = 0.0
    elif items_to_process == 0:
        # No token usage to extrapolate from; avoid dividing by zero
        print("No items were processed, so the cost of the remaining items cannot be estimated.")
        return
    else:
        remaining_price_in = (num_tokens_in / items_to_process * remaining_items) * ppm_input / 1000000
        remaining_price_out = (num_tokens_out / items_to_process * remaining_items) * ppm_output / 1000000
        remaining_price = remaining_price_in + remaining_price_out
    print(f"Estimated cost for remaining items: ${remaining_price}.")

# 2. Anyscale Predictions

In [11]:
# Get predictions for Mixtral: resume after the rows already stored in Mixtral.csv and query
# the model through Anyscale (from_anyscale=True). The last two arguments are the prices per
# million input and output tokens used for the cost estimate.
input_df_remaining = get_remaining_items("Mixtral")
get_n_predictions(input_df_remaining, "Mixtral", "mistralai/Mixtral-8x7B-Instruct-v0.1", True, 0.15, 0.15)

Found an output file. Returning input dataset from index 2909
Attempting to classify 2146 items.
Got prediction with 2496 input tokens and 159 output tokens
Got prediction with 2489 input tokens and 116 output tokens
Got prediction with 2483 input tokens and 101 output tokens
Got prediction with 2482 input tokens and 132 output tokens
Got prediction with 2495 input tokens and 131 output tokens
Got prediction with 2473 input tokens and 114 output tokens
Got prediction with 2481 input tokens and 132 output tokens
Got prediction with 2455 input tokens and 137 output tokens
Got prediction with 2459 input tokens and 150 output tokens
Got prediction with 2456 input tokens and 103 output tokens
Got prediction with 2456 input tokens and 102 output tokens
Got prediction with 2455 input tokens and 126 output tokens
Got prediction with 2470 input tokens and 122 output tokens
Got prediction with 2479 input tokens and 131 output tokens
Got prediction with 2477 input tokens and 121 output tokens
Got

In [11]:
# Get predictions for Llama2: resume after the rows already stored in Llama2.csv and query
# the model through Anyscale (from_anyscale=True). The last two arguments are the prices per
# million input and output tokens used for the cost estimate.
input_df_remaining = get_remaining_items("Llama2")
get_n_predictions(input_df_remaining, "Llama2", "meta-llama/Llama-2-70b-chat-hf", True, 1.0, 1.0)

Found an output file. Returning input dataset from index 3353
Attempting to classify 1702 items.


100%|██████████| 1702/1702 [1:57:47<00:00,  4.15s/it]  

Parsed 1702 items for 4326322 input tokens and 271198 output tokens.
Estimated cost for remaining items: $0.0.





# 3. OpenAI Predictions

In [12]:
# Get predictions for ChatGPT (GPT-3.5-turbo): resume after the rows already stored in
# GPT3-5.csv and query the model through the OpenAI client (from_anyscale=False). The last
# two arguments are the prices per million input and output tokens used for the cost estimate.
input_df_remaining = get_remaining_items("GPT3-5")
get_n_predictions(input_df_remaining, "GPT3-5", "gpt-3.5-turbo-1106", False, 1.0, 2.0)

Found an output file. Returning input dataset from index 2582
Attempting to classify 2473 items.


100%|██████████| 2473/2473 [3:23:47<00:00,  4.94s/it]    

Parsed 2473 items for 5705314 input tokens and 316768 output tokens.
Estimated cost for remaining items: $0.0.





In [15]:
# Get predictions for GPT 4 (GPT-4-turbo): resume after the rows already stored in GPT4.csv
# and query the model through the OpenAI client (from_anyscale=False). The last two
# arguments are the prices per million input and output tokens used for the cost estimate.
input_df_remaining = get_remaining_items("GPT4")
get_n_predictions(input_df_remaining, "GPT4", "gpt-4-1106-preview", False, 10.0, 30.0)

Found an output file. Returning input dataset from index 3963
Attempting to classify 1092 items.


100%|██████████| 1092/1092 [3:00:01<00:00,  9.89s/it] 

Parsed 1092 items for 2518889 input tokens and 221271 output tokens.
Estimated cost for remaining items: $0.0.





# 4. Evaluate Models

In [13]:
def get_acc_em(true_df, out_df):
    """Compute accuracy and exact-match score of a model's predictions.

    Rows are compared by position: row i of `out_df` is checked against row i of
    `true_df`. Only the first len(out_df) rows of `true_df` are used.

    Args:
        true_df: Ground-truth frame with a `change` column holding textual labels
            that are keys of CLASS_MAPPINGS.
        out_df: Prediction frame with a `class` column holding integer class ids.

    Returns:
        Tuple (acc, em) of floats. `acc` is the share of correct rows. `em` is the
        share of consecutive row triples (rows 0-2, 3-5, ...) in which all three
        predictions are correct; trailing rows that do not fill a triple count
        towards `acc` only. Either score is 0.0 when there is nothing to average
        over (no rows, or fewer than three rows for `em`).

    Raises:
        IndexError: If `out_df` has more rows than `true_df`.
        KeyError: If a ground-truth label is not a key of CLASS_MAPPINGS.
    """
    num_samples = len(out_df)
    if num_samples == 0:
        return 0.0, 0.0
    if num_samples > len(true_df):
        raise IndexError(
            f"out_df has {num_samples} rows but true_df only has {len(true_df)}"
        )

    # Mapping through __getitem__ (rather than the dict itself) keeps the KeyError
    # for unknown labels instead of silently producing NaN
    expected = true_df["change"].iloc[:num_samples].map(CLASS_MAPPINGS.__getitem__)
    is_correct = expected.to_numpy() == out_df["class"].to_numpy()

    acc = float(is_correct.mean())

    num_triples = num_samples // 3
    if num_triples == 0:
        return acc, 0.0
    # One row per triple; a triple matches only if all three entries are correct
    triple_correct = is_correct[:num_triples * 3].reshape(num_triples, 3).all(axis=1)
    em = float(triple_correct.mean())

    return acc, em

In [14]:
true_cls = pd.read_csv(DATASET_SOURCE)

# Score every prediction file in OUTPUT_DIR. Records are collected in a list and turned
# into a frame once, which gives a clean 0..n-1 index and avoids repeated concatenation.
# Sorting the file names makes the row order independent of the directory listing order.
result_records = []

for file in sorted(os.listdir(OUTPUT_DIR)):
    if file.endswith(".csv"):
        out_cls = pd.read_csv(OUTPUT_DIR + file, encoding="ISO-8859-1")
        acc, em = get_acc_em(true_cls, out_cls)

        result_records.append({
            "model": file[:-4],  # file name without the ".csv" extension
            "accuracy": acc,
            "exact_match": em,
            "num_samples": len(out_cls)
        })

# Explicit columns keep the table layout stable even when no prediction file exists
results = pd.DataFrame(result_records, columns=["model", "accuracy", "exact_match", "num_samples"])

In [15]:
# Display the per-model results table (accuracy, exact match, number of evaluated samples)
results

Unnamed: 0,model,accuracy,exact_match,num_samples
0,GPT3-5,0.678932,0.310979,5055
0,GPT4,0.693373,0.304451,5055
0,Llama2,0.464688,0.096736,5055
0,Mixtral,0.630267,0.224926,5055
