## Imports

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Colab's userdata module is used below to look up the API key.
from google.colab import userdata

from openai import OpenAI
# NOTE(review): the key name passed to userdata.get is an empty string here —
# presumably redacted; confirm the intended secret name before running.
client = OpenAI(api_key=userdata.get(""))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
import pandas as pd
# Lift pandas' display limits (column width, row count, column count) by
# setting each option to None.
for _option in ('max_colwidth', 'display.max_rows', 'display.max_columns'):
    pd.set_option(_option, None)

import re
import time
import json
from sklearn.metrics import f1_score, cohen_kappa_score
import numpy as np

In [10]:
# Model identifier sent with every chat-completion request in this notebook.
MODEL = "gpt-4o-2024-08-06"

# Fail fast (with a readable message) if the model is not listed for this client.
model_ids = [m.id for m in client.models.list().data]
assert MODEL in model_ids, f"Model {MODEL!r} is not in the list returned by client.models.list()."

In [None]:
# Update FA and ABLATION when switching runs
# (ABLATION changes between ablations of the same FA).
FA = "2"
ABLATION = "_3_CoT"

# Rubric components scored for each response. The generation loop expects each
# name as a key in the model's JSON reply, holding an explanation and a score.
FA_COLS = [
    "less_than_condition",
    "less_than_set_absorption",
    "less_than_set_runoff",
    "equal_to_condition",
    "equal_to_set_absorption",
    "equal_to_set_runoff",
    "greater_than_condition",
    "greater_than_set_absorption",
    "greater_than_set_runoff",
]

# Per-response scratch dict keyed by rubric component; values start as None.
FA_COLS_DICT = dict.fromkeys(FA_COLS)

# NOTE(review): the three paths are blank in this copy — fill them in before running.
DATA_PATH = ""
PROMPT_PATH = ""
RESULTS_PATH = ""

# Separates the role from the content inside one prompt message.
ROLE_CONTENT_DELIM = "!~*~!"

# Add this if going from I/O to ICL
# Separates consecutive messages inside the prompt file.
LINE_DELIM = "\n!@#@!\n"

# Shared seed: passed to the API request and used for the bootstrap RNG.
SEED = 312
# Number of bootstrap resamples used for the confidence intervals.
N_BOOTSTRAP = 5000

# Number of few-shot instances * 2 + 1
# (one system message plus a user/assistant pair per few-shot instance).
N_PROMPT_MESSAGES = 5

## Data

In [None]:
# Load the dataset to be scored.
df = pd.read_csv(DATA_PATH)

# Fail fast, before any API calls are made: the generation loop reads
# row['response'] for every row of this frame.
assert "response" in df.columns, "The data file must contain a 'response' column."

df.head()

## Prompt

In [None]:
# Read the entire prompt file as UTF-8 text.
with open(PROMPT_PATH, mode='r', encoding='utf-8') as prompt_file:
    prompt = prompt_file.read()

# Show the raw prompt string (delimiters included) as the cell output.
prompt

'system!~*~!You are a teacher whose job it is to score middle school student short answer formative assessment question responses in the Earth Science domain.\n\nPrior to this particular formative assessment, students are taught different "rules" about the relationships between rainfall, absorption, absorption limit, and water runoff. Students are then tasked with the following: \n\nWrite down each rule (recall the IF and THEN multiple choice from today for each category of rainfall).\n\nIn total, there are three rules the students are required to identify:\n1. If rainfall is less than absorption limit, set absorption to rainfall and set runoff to 0.\n2. If rainfall is equal to absorption limit, set absorption to either rainfall or absorption limit, and set runoff to 0.\n3. If rainfall is greater than absorption limit, set absorption to absorption limit and set runoff to either rainfall - absorption limit or rainfall - absorption.\n\nYou are to score student responses based on the foll

## API Call

In [None]:
def get_openai_response(messages):
  """Send one chat-completion request and return the reply with usage stats.

  The request uses the module-level ``client``, ``MODEL`` and ``SEED``, with
  temperature 0 and a JSON-object response format.

  Args:
    messages: list of ``{"role": ..., "content": ...}`` dicts forwarded
      unchanged to ``client.chat.completions.create``.

  Returns:
    A ``(generation, total_time, total_tokens)`` tuple: the stripped text of
    the first choice, the elapsed seconds for the call, and the total token
    count reported in ``response.usage``.

  Raises:
    ValueError: if the first choice carries no text content.
  """
  # perf_counter is monotonic, so the measured duration cannot be skewed by
  # system clock adjustments the way time.time() differences can.
  start_time = time.perf_counter()
  response = client.chat.completions.create(
      model=MODEL,
      messages=messages,
      temperature=0,
      response_format={"type": "json_object"},
      seed=SEED)
  total_time = time.perf_counter() - start_time
  total_tokens = response.usage.total_tokens

  message = response.choices[0].message
  if message.content is None:
    # Without this guard the failure would surface as an opaque
    # AttributeError on None.strip().
    raise ValueError(
        "Model returned no text content "
        f"(refusal={getattr(message, 'refusal', None)!r}).")
  generation = message.content.strip()

  return generation, total_time, total_tokens

In [None]:
# Smoke test: send a single system message and print the returned
# (generation, seconds, tokens) tuple.
api_check_messages = [
    {
        "role":"system",
        "content":"Confirm that the api is working. Respond using the following JSON schema: {'working':bool, 'explanation':str}",
    }
]
response = get_openai_response(api_check_messages)
print(response)

('{\n    "working": true,\n    "explanation": "The API is functioning correctly as there are no reported issues or errors in the system. All endpoints are responding as expected."\n}', 0.893195390701294, 70)


## Generations

In [None]:
# Two output columns per rubric component: the model's explanation and its score.
DOUBLED_COLS = []
for col in FA_COLS:
  DOUBLED_COLS.append(col+"_explanation")
  DOUBLED_COLS.append(col+"_score")

# First entry is the header row; one row per student is appended below.
results = [DOUBLED_COLS+["total_score","total_time_s","total_tokens"]]

prompt_messages = prompt.split(LINE_DELIM)
assert len(prompt_messages) == N_PROMPT_MESSAGES, (
    f"Expected {N_PROMPT_MESSAGES} prompt messages, found {len(prompt_messages)}.")

# Parse every "role<ROLE_CONTENT_DELIM>content" chunk, however many there are,
# so changing the number of few-shot examples only requires updating
# N_PROMPT_MESSAGES. The unpacking still fails loudly on a malformed chunk.
few_shot_messages = []
for raw_message in prompt_messages:
  role, content = raw_message.split(ROLE_CONTENT_DELIM)
  few_shot_messages.append({"role":role,"content":content})

for idx,row in df.iterrows():
  # Shared prompt prefix followed by this student's response as the final user turn.
  messages = few_shot_messages + [{"role":"user","content":row['response']}]

  generation, total_time, total_tokens = get_openai_response(messages)
  generation_data = json.loads(generation)

  # Pull the explanation and integer score for each rubric component.
  for k in FA_COLS:
    FA_COLS_DICT[k] = {"explanation":generation_data[k]["explanation"], "score":int(generation_data[k]["score"])}

  total_score = sum(FA_COLS_DICT[k]["score"] for k in FA_COLS)

  # Flatten to the same order as the header row: explanation, score, ... then totals.
  new_result = []
  for k in FA_COLS:
    new_result.append(FA_COLS_DICT[k]["explanation"])
    new_result.append(FA_COLS_DICT[k]["score"])
  new_result.extend([total_score, total_time, total_tokens])

  results.append(new_result)

  print(f"FINISHED FA{FA} STUDENT {idx}.")

FINISHED FA2 STUDENT 0.
FINISHED FA2 STUDENT 1.
FINISHED FA2 STUDENT 2.
FINISHED FA2 STUDENT 3.
FINISHED FA2 STUDENT 4.
FINISHED FA2 STUDENT 5.
FINISHED FA2 STUDENT 6.
FINISHED FA2 STUDENT 7.
FINISHED FA2 STUDENT 8.
FINISHED FA2 STUDENT 9.
FINISHED FA2 STUDENT 10.
FINISHED FA2 STUDENT 11.
FINISHED FA2 STUDENT 12.
FINISHED FA2 STUDENT 13.
FINISHED FA2 STUDENT 14.
FINISHED FA2 STUDENT 15.
FINISHED FA2 STUDENT 16.
FINISHED FA2 STUDENT 17.
FINISHED FA2 STUDENT 18.
FINISHED FA2 STUDENT 19.
FINISHED FA2 STUDENT 20.
FINISHED FA2 STUDENT 21.
FINISHED FA2 STUDENT 22.
FINISHED FA2 STUDENT 23.
FINISHED FA2 STUDENT 24.
FINISHED FA2 STUDENT 25.
FINISHED FA2 STUDENT 26.
FINISHED FA2 STUDENT 27.
FINISHED FA2 STUDENT 28.
FINISHED FA2 STUDENT 29.
FINISHED FA2 STUDENT 30.
FINISHED FA2 STUDENT 31.
FINISHED FA2 STUDENT 32.
FINISHED FA2 STUDENT 33.
FINISHED FA2 STUDENT 34.
FINISHED FA2 STUDENT 35.
FINISHED FA2 STUDENT 36.
FINISHED FA2 STUDENT 37.
FINISHED FA2 STUDENT 38.
FINISHED FA2 STUDENT 39.
FINISHED F

In [None]:
# Build a frame from the accumulated rows; results[0] is the header row.
df_results = pd.DataFrame(results[1:],columns=results[0])
# Drop result columns left over from an earlier run of this cell so that
# re-running it replaces them instead of appending duplicate columns.
df = pd.concat([df.drop(columns=df_results.columns, errors="ignore"), df_results],axis=1)
df.head(25)

## Save

In [None]:
# Write the combined frame to RESULTS_PATH without the index column.
df.to_csv(RESULTS_PATH, index=False)

## Metrics

In [12]:
# Reload the results file written by the Save step; the metrics cells below
# read the `score` and `total_score` columns from it.
df = pd.read_csv(RESULTS_PATH)

In [13]:
# Reference labels ("score") and model totals ("total_score"), both cast to int.
y_true, y_pred = (df[col].astype(int) for col in ("score", "total_score"))

# Agreement over the full dataset: micro-averaged F1 and quadratic-weighted kappa.
qwk = cohen_kappa_score(y_true, y_pred, weights="quadratic")
f1 = f1_score(y_true, y_pred, average="micro")

print(f"Micro‑F1: {f1:.4f}")
print(f"QWK:      {qwk:.4f}")

Micro‑F1: 0.7200
QWK:      0.9603


## CIs

In [14]:
# Percentile bootstrap: resample rows with replacement N_BOOTSTRAP times and
# recompute both metrics on every resample.
rng = np.random.default_rng(SEED)
n = len(df)

# Pull the underlying arrays once instead of on every iteration.
true_values = y_true.values
pred_values = y_pred.values

f1_samples, kappa_samples = [], []

for _ in range(N_BOOTSTRAP):
    draw = rng.integers(0, n, n)
    true_bs, pred_bs = true_values[draw], pred_values[draw]

    f1_samples.append(f1_score(true_bs, pred_bs, average="micro"))
    kappa_samples.append(cohen_kappa_score(true_bs, pred_bs, weights="quadratic"))

# 95% percentile intervals over the bootstrap distributions.
ci_f1 = np.percentile(f1_samples, [2.5, 97.5])
ci_kappa = np.percentile(kappa_samples, [2.5, 97.5])

# Reported margin is half the interval width.
f1_low, f1_high = ci_f1
kappa_low, kappa_high = ci_kappa
moe_f1 = (f1_high - f1_low) / 2
moe_kappa = (kappa_high - kappa_low) / 2

print(f"Micro‑F1: {f1*100:.2f} ± {moe_f1*100:.2f}")
print(f"QWK:      {qwk*100:.2f} ± {moe_kappa*100:.2f}")

Micro‑F1: 72.00 ± 12.00
QWK:      96.03 ± 3.04
