In [None]:
import datetime
import pandas as pd
import tiktoken

def log(string, file="evaluator_output/evaluator_log.txt"):
    """Echo a message to stdout and append it as one line to a log file.

    Args:
        string: Message text. A newline is appended when it is written to
            the file.
        file: Path of the log file, opened in append mode. Missing parent
            directories are created first.
    """
    import os  # local import: `os` is not imported before this definition

    print(string)

    # Create the target directory on first use so a fresh checkout does not
    # fail with FileNotFoundError. A bare filename has no directory part.
    parent_dir = os.path.dirname(file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit UTF-8 so non-ASCII text is written the same way on every
    # platform instead of depending on the locale's default encoding.
    with open(file, "a", encoding="utf-8") as log_file:
        log_file.write(string + "\n")

# Show full cell contents when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)
df = pd.read_csv('data/evaluator_filtered.csv')

# Timestamp identifying this run; written to the log here.
datetime_str = datetime.datetime.now().isoformat(sep=" ", timespec="seconds")
log("Experiment " + datetime_str)

instruction = "### instruction ###\nAct as a Russian political joke evaluator. Evaluate the funniness with a reason and give an integer rating from 0 to 3, in a format of reason -> rating.\n"

# Row positions (for df.iloc) of the rows used as few-shot examples.
example_ids = [2, 10, 38, 42]

# Canned assistant reply for each rating value, already in the
# "reason -> rating" format requested by the instruction.
score_to_explaination = {
    3: "This joke is very funny because it is easy to understand and plays on the absurdities and contradictions of the Soviet regime and its leadership -> 3",
    2: "This joke can be rated between 3 and 1 -> 2",
    1: "This joke is too opaque for immediate comedic impact or is just a simple wordplay without meaningful satire against the absurdities under Soviet regime -> 1",
    0: "This is a fact, not a joke -> 0"
}

# Build the few-shot section in one join. Column 0 supplies the user text and
# column 2 the rating used to look up the canned reply. A generator expression
# keeps the loop variables local, so nothing named `id` is left behind to
# shadow the builtin in the notebook namespace.
examples = "### examples ###\n" + "\n".join(
    "Example #" + str(number) + "\n<user>: '''" + df.iloc[row, 0]
    + "'''\n<assistant>: '''" + score_to_explaination[df.iloc[row, 2]] + "'''"
    for number, row in enumerate(example_ids, start=1)
)

log("example data ids:" + str(example_ids))
log("system prompt:\n" + instruction + examples)

def num_tokens_from_string(string: str) -> int:
    """Count the tokens of `string` under tiktoken's "cl100k_base" encoding.

    Args:
        string: Text to tokenize.

    Returns:
        Length of the token sequence produced by encoding `string`.
    """
    tokenizer = tiktoken.get_encoding("cl100k_base")
    token_ids = tokenizer.encode(string)
    return len(token_ids)
# Record how many tokens the full system prompt takes.
log("system prompt tokens:" + str(num_tokens_from_string(instruction + examples)))

# Alternative, much smaller hand-picked selection (kept disabled):
#validate_ids = [3, 11, 39, 43]

# Validation rows: positions 0..47, skipping the few-shot example rows and
# positions 9..23.
# NOTE(review): the reason for the 48 limit and for excluding 9..23 is not
# visible here - confirm against the dataset.
validate_ids = [
    x for x in range(48)
    if not (x in example_ids or x in range(9, 24))
]
log("validation data ids:" + str(validate_ids))
display(df.iloc[validate_ids])

In [None]:
import os
import openai
from sklearn.metrics import classification_report, mean_squared_error

# API key comes from the environment, never from the notebook itself.
openai.api_key = os.getenv("OPENAI_API_KEY")

# Request settings. These exact variables are both logged and passed to the
# API call below, so the experiment log always matches what was sent.
model = "gpt-4-1106-preview"
temperature = 1
top_p = 1
log("model: " + model + ", temperature: " + str(temperature) + ", top_p: " + str(top_p))

# One request per validation row: the system message carries the instruction
# plus few-shot examples, the user message carries the text from column 0.
output = []
for i in validate_ids:
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": instruction + examples
            },
            {
                "role": "user",
                "content": df.iloc[i,0]
            }
        ],
        temperature=temperature,
        max_tokens=512,
        top_p=top_p,
        frequency_penalty=0,
        presence_penalty=0
    )
    # Keep only the text of the first returned choice.
    content = response['choices'][0]['message']['content']
    print(content)
    output.append(content)

def parse_rating(output):
    """Extract the integer rating from each "reason -> rating" reply.

    The rating is the run of digits following the LAST "->" that is followed
    by digits, so an arrow appearing inside the reason text does not confuse
    the parser. Whitespace between the arrow and the digits is optional.

    Args:
        output: Iterable of reply strings.

    Returns:
        A list of ints, one per reply, in the same order.

    Raises:
        ValueError: If a reply contains no "->" followed by a number. The
            message names the offending reply instead of failing on an
            unrelated character.
    """
    import re  # local import: `re` is not imported at the top of the notebook

    ratings = []
    for position, reply in enumerate(output):
        found = re.findall(r"->\s*(\d+)", reply)
        if not found:
            raise ValueError(
                "no 'reason -> rating' pattern in reply #" + str(position) + ": " + repr(reply)
            )
        ratings.append(int(found[-1]))
    return ratings

def parse_reason(output):
    """Extract the explanation part of each "reason -> rating" reply.

    Args:
        output: Iterable of reply strings.

    Returns:
        A list with, for each reply, the text before its last "->" with
        trailing whitespace removed. A reply that contains no "->" is
        returned whole rather than being silently truncated.
    """
    reasons = []
    for reply in output:
        # rpartition splits on the LAST arrow, so arrows inside the reason
        # text stay part of the reason.
        head, arrow, _ = reply.rpartition("->")
        reasons.append(head.rstrip() if arrow else reply)
    return reasons

# Work on an explicit copy so the inserted columns never touch `df`.
df2 = df.iloc[validate_ids].copy()
df2.insert(3, 'GPT Rating', parse_rating(output))
df2.insert(4, 'GPT Explaination', parse_reason(output))

# File name derived from the run timestamp, with characters replaced that
# are awkward in file names (space and colon).
df2.to_csv("evaluator_output/" + datetime_str.replace(" ", "_").replace(":", "-") + ".csv", index=False)

# Column 2 holds the reference rating, column 3 the model rating just
# inserted. RMSE is taken as the square root of the MSE rather than through
# the `squared=False` flag, which recent scikit-learn releases have
# deprecated and removed.
rmse = mean_squared_error(df2.iloc[:,2], df2.iloc[:,3]) ** 0.5
log("rmse: " + str(rmse))
log(classification_report(df2.iloc[:,2], df2.iloc[:,3], labels=range(4), zero_division=0.0))
log("\n\n")

