In [4]:
import datetime
import pandas as pd
import tiktoken

def log(string, file="evaluator_output/evaluator_log.txt"):
    print(string)
    with open(file, "a") as log_file:
        log_file.write(string + "\n")

pd.set_option('display.max_colwidth', None)
df = pd.read_csv('data/evaluator_filtered.csv')

datetime_str = datetime.datetime.now().isoformat(sep=" ", timespec="seconds")
log("Experiment " + datetime_str)

instruction = "### instruction ###\nAct as a Russian political joke evaluator. Evaluate the funniness of a political joke using an integer rating from 0 to 3 and provide explainations. 0 means the input is not a joke. 1 means the input is a joke but not funny. 3 means the input is very funny.\n"
example_ids = [2, 10, 38, 42]
examples = "### examples ###\n"
score_to_explaination = {
    3: "This joke is very funny because it is easy to understand and plays on the absurdities and contradictions of the Soviet regime and its leadership, so the rating is 3.",
    2: "This joke cannot be rated as 3 or 1, so the rating is 2.",
    1: "This joke is either too simple or too opaque or heavily reliant on insider knowledge, which undermines the comedic impact, so the rating is 1.",
    0: "This is not a joke at all, so the rating is 0."
}
for i, id in enumerate(example_ids):
    examples += "Example #" + str(i+1) + "\n<user>: '''" + df.iloc[id,0] + "'''\n<assistant>: '''"
    examples += score_to_explaination[df.iloc[id,2]] + "'''"
    if i != len(example_ids) - 1:
        examples += "\n"
log("example data ids:" + str(example_ids))
log("system prompt:\n" + instruction + examples)

def num_tokens_from_string(string: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding("cl100k_base")
    num_tokens = len(encoding.encode(string))
    return num_tokens
log("system prompt tokens:" + str(num_tokens_from_string(instruction + examples)))

#validate_ids = [3, 11, 39, 43]
validate_ids = [x for x in range(48) if x not in example_ids and x not in range(9, 24)]
log("validation data ids:" + str(validate_ids))
display(df.iloc[validate_ids])

Experiment 2023-11-24 23:44:35
example data ids:[2, 10, 38, 42]
system prompt:
### instruction ###
Act as a Russian political joke evaluator. Evaluate the funniness of a political joke using an integer rating from 0 to 3 and provide explainations. 0 means the input is not a joke. 1 means the input is a joke but not funny. 3 means the input is very funny.
### examples ###
Example #1
<user>: '''Q: Is it true that every Soviet soldier dreams of becoming a general? A: No, our soldiers are not that stupid. They know that generals may become MIA even in time of peace.'''
<assistant>: '''This joke is very funny because it is easy to understand and plays on the absurdities and contradictions of the Soviet regime and its leadership, so the rating is 3.'''
Example #2
<user>: '''Q: Could an atomic bomb destroy our beloved town, Yerevan, with its splendid buildings and beautiful gardens? A: In principle, yes. But Moscow is by far a more beautiful city.'''
<assistant>: '''This joke cannot be rated 

Unnamed: 0,Text,Number of Tokens,Rating out of 3,Rating,Gan's Rating,Yimin's Rating
0,"Q: Is it true that there are two kinds of people serving as deputies of the Supreme Soviet of the USSR, as members of the Supreme Court, and as Soviet diplomats? A: Yes, it is true. One kind is those not capable of anything at all, and the other is those capable of anything whatsoever.",64,3,5,4,5
1,"Q: Why do we need two central newspapers, Pravda (Truth) and Izvestiya (News) if both are organs of the same Party? A: Because in Pravda there is no news, and in Izvestiya there is no truth.",54,3,5,5,4
3,"Q: What is the difference between the Constitutions of the USA and USSR? Both guarantee freedom of speech. A: Yes, and the US Constitution also guarantees freedom after speech.",36,3,5,5,4
4,"Q: Can a son of a General become a Marshal? A: No, because every Marshal also has a son.",24,3,5,5,5
5,"Q: Why did you not broadcast for such a long time? A: We had to make some changes to our staff. The previous broadcaster, while reading an article that contained the words, ""Socialism is nothing as compared with communism,"" made a pause too long after the word ""nothing.""",59,2,4,3,4
6,"Q: Does China have rockets powerful enough to reach the moon? A: They don't need rockets. If the Chinese communist Party ordered, their people just would step on each other's shoulders, and this way they could reach even the sun.",49,2,3,3,2
7,"Q: Why do some people say that Hungarians love the Russians and hate the Americans? A: Because Russians helped Hungarians to get rid of one totalitarian rule, but Americans didn't help to get rid of the other.",45,2,4,4,3
8,"Q: Is it true that American skyscrapers are the tallest in the world? A: Yes, it's true, but on the other hand the Soviet-made transistors are the largest in the world.",43,2,4,3,4
24,"Q: Are the bedbugs also builders of socialism? A: Of course, in their veins flows the workers-and-peasants' blood.",29,2,4,5,2
25,"Q: Why Lenin wore regular shoes, but Stalin wore boots? A: At Lenin's time, Russia was still only ankle-high in shit.",29,2,3,4,2


In [5]:
import os
import openai
from sklearn.metrics import classification_report, mean_squared_error

openai.api_key = os.getenv("OPENAI_API_KEY")

model = "gpt-4-1106-preview"
temperature = 1
top_p = 1
log("model: " + model + ", temperature: " + str(temperature) + ", top_p: " + str(top_p))

output = []
for i in validate_ids:
    response = openai.ChatCompletion.create(
        model="gpt-4-1106-preview",
        messages=[
            {
                "role": "system",
                "content": instruction + examples
            },
            {
                "role": "user",
                "content": df.iloc[i,0]
            }
        ],
        temperature=1,
        max_tokens=512,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    content = response['choices'][0]['message']['content']
    print(content)
    output.append(content)

df2 = df.iloc[validate_ids]
df2.insert(3, 'GPT Rating', output[-2])
df2.insert(4, 'GPT Explaination', output)
df2.to_csv("evaluator_output/" + datetime_str.replace(" ", "_").replace(":", "-") + ".csv", index=False)



model: gpt-4-1106-preview, temperature: 1, top_p: 1
This joke uses a play on words related to the competence and moral flexibility of Soviet officials, which can be humorous given the context of perceived corruption and inefficiency in Soviet bureaucracies. However, it relies on some degree of insider knowledge and may not have universal comedic appeal. Therefore, the rating is 2.
This joke is very funny because it cleverly uses wordplay on the names of the newspapers to criticize the lack of reliable information in Soviet-era media, reflecting the reality of propaganda and censorship, so the rating is 3.
This joke highlights the difference in the freedom of expression between the US and the USSR through a clever play on words, which makes it funny and thought-provoking. So the rating is 3.
This joke plays on the idea of nepotism and the notion that high-ranking positions are passed down through family connections, rather than individual merit. While it touches upon a real social issue

ValueError: could not convert string to float: 'This is not a joke at all, so the rating is 0.'

In [14]:
def extract_rating(explaination):
    for s in explaination:
        if s.isdigit():
            return int(s)


df2 = df.iloc[validate_ids]
df2.insert(3, 'GPT Rating', [extract_rating(e) for e in output])
df2.insert(4, 'GPT Explaination', output)
df2.to_csv("evaluator_output/" + datetime_str.replace(" ", "_").replace(":", "-") + ".csv", index=False)

log("rmse: " + str(mean_squared_error(df2.iloc[:,2], df2.iloc[:,3], squared=False)))
log(classification_report(df2.iloc[:,2], df2.iloc[:,3].apply(int), labels=range(4), zero_division=0.0))
log("\n\n")


rmse: 0.6582805886043833
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       0.25      0.20      0.22         5
           2       0.60      0.56      0.58        16
           3       0.33      0.50      0.40         4

    accuracy                           0.57        30
   macro avg       0.55      0.57      0.55        30
weighted avg       0.57      0.57      0.57        30




