## Imports

In [None]:
# Install a pinned version of the OpenAI SDK; later cells call `openai.Model.list` and `openai.ChatCompletion.create`.
!pip3 install openai==0.27.0 --quiet

# Mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Colab's secret store is used to fetch the API key.
from google.colab import userdata

import openai
# NOTE(review): the secret name passed to userdata.get is an empty string here —
# presumably redacted; set it to the name of the Colab secret that holds the key.
openai.api_key = userdata.get("")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/70.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.1/70.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive


In [2]:
import pandas as pd
# Set the pandas display limits for column width, row count and column count to None.
pd.set_option('max_colwidth', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# NOTE(review): `re` does not appear to be used in the visible cells.
import re
import time
import json
from sklearn.metrics import f1_score, cohen_kappa_score
import numpy as np

In [3]:
# Model identifier used for every chat-completion request in this notebook.
MODEL = "gpt-4o-2024-08-06"

# Fail fast, with a readable message, if MODEL is not among the ids returned by the API.
model_ids = [model.id for model in openai.Model.list().data]
assert MODEL in model_ids, f"Model {MODEL!r} is not in the list returned by openai.Model.list()."

In [None]:
# Run configuration. Set FA and ABLATION before each run; when running several
# ablations for the same FA, only ABLATION needs to change.
FA = "2"
ABLATION = "_1_IO"

# Rubric fields read from each model response (each is cast to int and summed into the total score).
FA_COLS = [
    "less_than_condition",
    "less_than_set_absorption",
    "less_than_set_runoff",
    "equal_to_condition",
    "equal_to_set_absorption",
    "equal_to_set_runoff",
    "greater_than_condition",
    "greater_than_set_absorption",
    "greater_than_set_runoff",
]

# Per-response scratch mapping: one slot per rubric field, filled in by the generation loop.
FA_COLS_DICT = dict.fromkeys(FA_COLS)

# Input CSV, prompt text file and output CSV locations (blank here; fill in before running).
DATA_PATH = ""
PROMPT_PATH = ""
RESULTS_PATH = ""

# Separator between the role and the content inside the prompt file.
ROLE_CONTENT_DELIM = "!~*~!"
# Seed passed to the API request and to the bootstrap RNG.
SEED = 312
# Number of bootstrap resamples used for the confidence intervals.
N_BOOTSTRAP = 5000

## Data

In [None]:
# Read the input CSV from DATA_PATH and preview the first rows.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

df.head(n=5)

## Prompt

In [None]:
# Load the whole prompt file as one UTF-8 string, then echo it for inspection.
with open(PROMPT_PATH, mode='r', encoding='utf-8') as prompt_file:
    prompt = prompt_file.read()

prompt

## API Call

In [None]:
def get_openai_response(messages):
  """Send `messages` to the chat-completion endpoint and return the reply with usage stats.

  The request uses the module-level MODEL and SEED, temperature 0, and asks for a
  JSON-object response format.

  Args:
    messages: list of {"role": ..., "content": ...} dicts, forwarded unchanged to the API.

  Returns:
    A tuple (generation, total_time, total_tokens): the stripped content of the first
    choice, the elapsed seconds spent in the API call, and the `total_tokens` value
    reported in the response's `usage` field.
  """
  # perf_counter is monotonic, so the measured duration is not affected by
  # system clock adjustments that happen while the request is in flight.
  start_time = time.perf_counter()
  response = openai.ChatCompletion.create(
      model=MODEL,
      messages=messages,
      temperature=0,
      response_format={"type": "json_object"},
      seed=SEED)
  total_time = time.perf_counter() - start_time

  total_tokens = response["usage"]["total_tokens"]
  generation = response["choices"][0]["message"]["content"].strip()

  return generation, total_time, total_tokens

In [None]:
# One-off sanity check: send a single system message and print the returned tuple.
api_check_messages = [
    {
        "role": "system",
        "content": "Confirm that the api is working. Respond using the following JSON schema: {'working':bool, 'explanation':str}",
    }
]
response = get_openai_response(api_check_messages)
print(response)

('{\n    "working": true,\n    "explanation": "The API is functioning correctly as there are no reported issues or errors in the current system status."\n}', 0.8526895046234131, 65)


## Generations

In [None]:
# `results` starts with a header row; one row of rubric scores plus bookkeeping
# (total score, request time, token count) is appended per input row.
header = FA_COLS + ["total_score", "total_time_s", "total_tokens"]
results = [header]

# The first piece of the prompt file is used as the system role, the second as its content.
prompt_split = prompt.split(ROLE_CONTENT_DELIM)
system_role = prompt_split[0]
system_content = prompt_split[1]

# The system message is the same for every row, so build it once.
system_message = {"role": system_role, "content": system_content}

for idx, row in df.iterrows():
  messages = [system_message, {"role": "user", "content": row['response']}]

  generation, total_time, total_tokens = get_openai_response(messages)
  generation_data = json.loads(generation)

  # Cast every rubric field of the parsed reply to int and store it in FA_COLS_DICT.
  for k in FA_COLS:
    FA_COLS_DICT[k] = int(generation_data[k])

  total_score = sum(FA_COLS_DICT.values())

  row_scores = [FA_COLS_DICT[col] for col in FA_COLS]
  results.append(row_scores + [total_score, total_time, total_tokens])

  print(f"FINISHED FA{FA} STUDENT {idx}.")

FINISHED FA2 STUDENT 0.
FINISHED FA2 STUDENT 1.
FINISHED FA2 STUDENT 2.
FINISHED FA2 STUDENT 3.
FINISHED FA2 STUDENT 4.
FINISHED FA2 STUDENT 5.
FINISHED FA2 STUDENT 6.
FINISHED FA2 STUDENT 7.
FINISHED FA2 STUDENT 8.
FINISHED FA2 STUDENT 9.
FINISHED FA2 STUDENT 10.
FINISHED FA2 STUDENT 11.
FINISHED FA2 STUDENT 12.
FINISHED FA2 STUDENT 13.
FINISHED FA2 STUDENT 14.
FINISHED FA2 STUDENT 15.
FINISHED FA2 STUDENT 16.
FINISHED FA2 STUDENT 17.
FINISHED FA2 STUDENT 18.
FINISHED FA2 STUDENT 19.
FINISHED FA2 STUDENT 20.
FINISHED FA2 STUDENT 21.
FINISHED FA2 STUDENT 22.
FINISHED FA2 STUDENT 23.
FINISHED FA2 STUDENT 24.
FINISHED FA2 STUDENT 25.
FINISHED FA2 STUDENT 26.
FINISHED FA2 STUDENT 27.
FINISHED FA2 STUDENT 28.
FINISHED FA2 STUDENT 29.
FINISHED FA2 STUDENT 30.
FINISHED FA2 STUDENT 31.
FINISHED FA2 STUDENT 32.
FINISHED FA2 STUDENT 33.
FINISHED FA2 STUDENT 34.
FINISHED FA2 STUDENT 35.
FINISHED FA2 STUDENT 36.
FINISHED FA2 STUDENT 37.
FINISHED FA2 STUDENT 38.
FINISHED FA2 STUDENT 39.
FINISHED F

In [None]:
# `results` holds a header row followed by one row per processed input row.
df_results = pd.DataFrame(results[1:],columns=results[0])

# pd.concat(axis=1) aligns on the index and pads missing rows with NaN, so an
# incomplete generation run would otherwise be merged silently. Fail loudly instead.
if len(df_results) != len(df):
  raise ValueError(
      f"Expected one result row per input row, got {len(df_results)} result rows for {len(df)} input rows."
  )

# Attach the model scores next to the original columns and preview the merged frame.
df = pd.concat([df,df_results],axis=1)
df.head()

## Save

In [None]:
# Write the merged frame to RESULTS_PATH without the index column.
df.to_csv(RESULTS_PATH, index=False)

## Metrics

In [5]:
# Reload the saved results so the metrics below can be computed without re-running the generations.
df = pd.read_csv(filepath_or_buffer=RESULTS_PATH)

In [6]:
# The `score` column is used as the reference labels and `total_score` as the predictions,
# both cast to int.
y_true, y_pred = (df[column].astype(int) for column in ("score", "total_score"))

# Micro-averaged F1 and quadratically weighted Cohen's kappa on the full data set.
f1 = f1_score(y_true=y_true, y_pred=y_pred, average="micro")
qwk = cohen_kappa_score(y_true, y_pred, weights="quadratic")

print(f"Micro‑F1: {f1:.4f}")
print(f"QWK:      {qwk:.4f}")

Micro‑F1: 0.6200
QWK:      0.9128


## Bootstrap Confidence Intervals (CIs)

In [7]:
# Seeded generator so the resampling below is repeatable.
rng = np.random.default_rng(SEED)
n = len(df)

f1_samples, kappa_samples = [], []

for _ in range(N_BOOTSTRAP):
    # Draw n row positions with replacement and score the resampled label pairs.
    idx = rng.integers(0, n, n)
    y_t, y_p = y_true.iloc[idx].values, y_pred.iloc[idx].values

    f1_samples.append(f1_score(y_t, y_p, average="micro"))
    kappa_samples.append(cohen_kappa_score(y_t, y_p, weights="quadratic"))

# 2.5th and 97.5th percentiles of the bootstrap distributions.
percentile_bounds = [2.5, 97.5]
ci_f1 = np.percentile(f1_samples, percentile_bounds)
ci_kappa = np.percentile(kappa_samples, percentile_bounds)

# The reported "±" value is half the width of the percentile interval.
moe_f1 = (ci_f1[1] - ci_f1[0]) / 2
moe_kappa = (ci_kappa[1] - ci_kappa[0]) / 2

print(f"Micro‑F1: {f1*100:.2f} ± {moe_f1*100:.2f}")
print(f"QWK:      {qwk*100:.2f} ± {moe_kappa*100:.2f}")

Micro‑F1: 62.00 ± 13.00
QWK:      91.28 ± 7.37
