# Query Fine-Tuned GPT Model

## Imports

In [1]:
import os
import json
import pickle
import numpy as np
import openai
from tqdm import tqdm
# Full Autocast question set (from the Autocast dataset). Context managers
# close the file handles deterministically instead of leaking them.
with open('../../autocast_questions.json') as questions_file:
    autocast_questions = json.load(questions_file)
with open('../../autocast_competition_test_set.json') as test_set_file:
    test_questions = json.load(test_set_file)
# IDs of the competition test-set questions.
test_ids = [q['id'] for q in test_questions]

## Test Model
### Get performance on the Autocast train set

Note that the Autocast dataset also contains the questions from the competition test set. Those questions must not be used here, so they are skipped when scoring below.

In [2]:
def brier_score(probabilities, answer_probabilities):
    """Return the Brier score between a forecast and a target distribution.

    The score is the sum of squared element-wise differences divided by 2.
    For two probability vectors it lies in [0, 1], where 0 is a perfect
    forecast.

    Args:
        probabilities: Forecast probabilities, one entry per choice. Any
            array-like is accepted (NumPy array, list, tuple).
        answer_probabilities: Target distribution, broadcastable against
            ``probabilities``.

    Returns:
        The score as a NumPy float scalar.
    """
    # Coerce to float arrays so plain Python sequences and boolean inputs
    # work too (list - list and boolean array subtraction both raise).
    forecast = np.asarray(probabilities, dtype=float)
    target = np.asarray(answer_probabilities, dtype=float)
    return ((forecast - target) ** 2).sum() / 2

### Query Model

In [3]:
def fine_tuned_gpt(q):
    """Query the fine-tuned GPT-3 model for one question and return its answer.

    Args:
        q: Question dict. The keys read here are 'qtype', 'choices',
            'background', 'publish_time' and 'question'.

    Returns:
        The text of the first completion, stripped of surrounding whitespace.

    Raises:
        RuntimeError: If the OPENAI_API_KEY environment variable is not set.
    """
    # Never hard-code credentials in a notebook. A key that was committed to
    # source must be treated as leaked and revoked.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Set the OPENAI_API_KEY environment variable before querying the model."
        )
    openai.api_key = api_key

    if q['qtype'] == 'mc':
        # Letter the choices: the first choice is A, the second is B, etc.
        choices = "".join(
            chr(ord('A') + i) + ") " + choice + "\n"
            for i, choice in enumerate(q['choices'])
        )
    else:
        choices = str(q['choices'])

    # The prompt layout is kept exactly as before; only the choices text
    # differs between multiple-choice and other question types.
    prompt = (
        q['background']
        + "\n\nDate of event: " + q['publish_time']
        + "\n\nWhat is the answer to this question:\n" + q['question']
        + "\n\nDate of question: " + q['publish_time']
        + "\n\nChoices: " + choices
        + "\n\n###\n\n"
    )

    # NOTE(review): max_tokens=1 returns a single token; confirm that this is
    # sufficient for numeric answers.
    response = openai.Completion.create(
        engine="ada:ft-codewise-2023-03-10-04-22-36",
        prompt=prompt,
        temperature=0.0,
        top_p=0,
        max_tokens=1,
        stop=["###"]
    )
    return response.choices[0].text.strip()

In [None]:
# Fresh accumulators for this run.
preds, answers, qtypes, responses = [], [], [], []
# Query the model once per question; tqdm renders a progress bar.
# responses[i] is the model output for autocast_questions[i].
for question in tqdm(autocast_questions):
    responses.append(fine_tuned_gpt(question))

100%|██████████| 6532/6532 [09:56<00:00, 10.95it/s] 


### Save the responses

In [None]:
# save the responses to a pickle file
# open() does not create missing parent directories, so make sure the cache
# directory exists before writing.
os.makedirs('./temp', exist_ok=True)
with open('./temp/gpt3_responses.pkl', 'wb') as f:
    pickle.dump(responses, f)

### Load Previous Responses

In [4]:
# Restore the cached model responses from the pickle file.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open('./temp/gpt3_responses.pkl', 'rb') as cache_file:
    responses = pickle.load(cache_file)

In [5]:
# Reset the accumulators that the scoring loop appends to.
preds, answers, qtypes = [], [], []
# Ground-truth answers, in the same order as autocast_questions.
correct = [question['answer'] for question in autocast_questions]

# Eyeball the first 50 answers against the first 50 model responses and
# check that both lists have the same length.
print(correct[:50])
print(responses[:50])
print("Lengths: ", len(correct), len(responses))

['D', 'A', 'yes', 'yes', 'no', 'yes', 'yes', 'no', 'yes', 'A', 'no', 'no', 'no', 'no', 'yes', 'no', 'no', 'yes', 'C', 'no', 'A', 'yes', 'A', 'B', 'yes', 'yes', 'yes', 'no', 'A', 'yes', 'no', 'no', 'yes', 'no', 'no', 'no', 'no', 'B', 'yes', 'C', 'yes', 'D', 'C', 'no', 'B', 'no', 'no', 'no', 'yes', 'D']
['C', 'A', 'no', 'yes', 'no', 'yes', 'yes', 'no', 'no', 'A', 'no', 'no', 'no', 'no', 'yes', 'no', 'no', 'no', 'A', 'no', 'A', 'yes', 'A', 'B', 'yes', 'no', 'no', 'no', 'C', 'no', 'no', 'no', 'yes', 'no', 'no', 'no', 'no', 'B', 'no', 'A', 'yes', 'A', 'C', 'no', 'B', 'no', 'no', 'no', 'no', 'D']
Lengths:  6532 6532


In [6]:
# Start from empty accumulators so that re-running this cell never appends
# duplicate entries.
preds, answers, qtypes = [], [], []
# Set membership is O(1); scanning the test_ids list for every question is not.
test_id_set = set(test_ids)
for idx, question in enumerate(autocast_questions):
    if question['id'] in test_id_set: # skipping questions in the competition test set
        continue
    if question['answer'] is None: # skipping questions without answer
        continue
    if question['qtype'] == 't/f':
        # Index 0 stands for 'no', index 1 for 'yes'. Any response other than
        # 'no' is counted as 'yes'.
        ans_idx = 0 if question['answer'] == 'no' else 1
        pred_idx = 0 if responses[idx] == 'no' else 1
        # One-hot vectors over the question's choices.
        ans = np.zeros(len(question['choices']))
        pred = np.zeros(len(question['choices']))
        ans[ans_idx] = 1
        pred[pred_idx] = 1
        qtypes.append('t/f')
    elif question['qtype'] == 'mc':
        # Map the letter to a choice index: 'A' -> 0, 'B' -> 1, ...
        # NOTE(review): a response that is not a single in-range letter raises
        # here (or indexes from the end if it sorts before 'A'); decide how
        # such predictions should be scored.
        ans_idx = ord(question['answer']) - ord('A')
        pred_idx = ord(responses[idx]) - ord('A')
        ans = np.zeros(len(question['choices']))
        pred = np.zeros(len(question['choices']))
        ans[ans_idx] = 1
        pred[pred_idx] = 1
        qtypes.append('mc')
    elif question['qtype'] == 'num':
        # if response is not a number, skip the question
        try:
            pred = float(responses[idx])
        except ValueError:
            continue
        # 'nan' and 'inf' parse as floats but would poison the mean error.
        if not np.isfinite(pred):
            continue
        ans = float(question['answer'])
        qtypes.append('num')
    else:
        # Unknown question type: skip it rather than appending stale ans/pred
        # values left over from a previous iteration.
        continue
    answers.append(ans)
    preds.append(pred)
print(len(answers), len(preds), len(qtypes))

2797 2797 2797


## Evaluate the model

In [7]:
# Per-question scores by question type: Brier score for true/false and
# multiple-choice questions, absolute error for everything else.
scored = list(zip(preds, answers, qtypes))
tf_results = [brier_score(p, a) for p, a, qtype in scored if qtype == 't/f']
mc_results = [brier_score(p, a) for p, a, qtype in scored if qtype == 'mc']
num_results = [np.abs(p - a) for p, a, qtype in scored if qtype not in ('t/f', 'mc')]

# Average each bucket once and report the means scaled by 100.
tf_mean = np.mean(tf_results)
mc_mean = np.mean(mc_results)
num_mean = np.mean(num_results)

print(f"T/F: {tf_mean*100:.2f}, MCQ: {mc_mean*100:.2f}, NUM: {num_mean*100:.2f}")
print(f"Combined Metric: {(tf_mean + mc_mean + num_mean)*100:.2f}")

T/F: 16.16, MCQ: 41.67, NUM: 41.94
Combined Metric: 99.77
