In [1]:
import os
import json
import pickle
import numpy as np
import openai

In [2]:
api_key = <YOUR API KEY>
openai.api_key = api_key

In [3]:
autocast_questions = json.load(open('autocast_questions.json')) # from the Autocast dataset
test_questions = json.load(open('autocast_competition_test_set.json'))
test_ids = [q['id'] for q in test_questions]

# Pre-processing 

In [4]:
new_data = []
for item in autocast_questions:
    if item["answer"] != None and item["id"] not in test_ids: # take care of duplicate in training and test set 
        new_item = {"prompt": item["question"] + " choices: " + str(item["choices"]) + " ->", "completion": " " + str(item["answer"]) + ".\n"}
        new_data.append(new_item)
with open("autocast_questions_gpt3.json", "w") as f:
    json.dump(new_data, f)
    
file_name = "autocast_questions_gpt3.jsonl"
with open(file_name, "w") as output_file:
    for entry in new_data:
        json.dump(entry, output_file)
        output_file.write("\n")

In [5]:
!openai tools fine_tunes.prepare_data -f autocast_questions_gpt3.jsonl

Analyzing...

- Your file contains 2797 prompt-completion pairs
- Based on your data it seems like you're trying to fine-tune a model for classification
- For classification, we recommend you try one of the faster and cheaper models, such as `ada`
- For classification, you can estimate the expected model performance by keeping a held out dataset, which is not used for training
- There are 19 duplicated prompt-completion sets. These are rows: [1393, 1394, 1395, 1396, 1397, 1398, 1399, 1400, 1401, 1402, 1403, 1404, 1405, 1406, 1407, 1408, 1409, 1748, 1946]
- All prompts end with suffix ` ->`

Based on the analysis we will perform the following actions:
- [Recommended] Remove 19 duplicate rows [Y/n]: ^C



## Fine-tuning GPT3 model

In [6]:
upload_response = openai.File.create(file=open("autocast_questions_gpt3.jsonl", "rb"), purpose="fine-tune")
file_id = upload_response.id
upload_response

<File file id=file-j290Y5UZP7iccNMInZyzQbbX at 0x7fef2000f9a0> JSON: {
  "bytes": 586222,
  "created_at": 1678246186,
  "filename": "file",
  "id": "file-j290Y5UZP7iccNMInZyzQbbX",
  "object": "file",
  "purpose": "fine-tune",
  "status": "uploaded",
  "status_details": null
}

In [7]:
fine_tune_response = openai.FineTune.create(training_file=file_id, model="ada")
fine_tune_response

<FineTune fine-tune id=ft-Mu85TtcUiHmndBacW6XqfK4I at 0x7fef9415f1d0> JSON: {
  "created_at": 1678246189,
  "events": [
    {
      "created_at": 1678246189,
      "level": "info",
      "message": "Created fine-tune: ft-Mu85TtcUiHmndBacW6XqfK4I",
      "object": "fine-tune-event"
    }
  ],
  "fine_tuned_model": null,
  "hyperparams": {
    "batch_size": null,
    "learning_rate_multiplier": null,
    "n_epochs": 4,
    "prompt_loss_weight": 0.01
  },
  "id": "ft-Mu85TtcUiHmndBacW6XqfK4I",
  "model": "ada",
  "object": "fine-tune",
  "organization_id": "org-0Vb7q2Rndj3zUuGWDYjsJAMr",
  "result_files": [],
  "status": "pending",
  "training_files": [
    {
      "bytes": 586222,
      "created_at": 1678246186,
      "filename": "file",
      "id": "file-j290Y5UZP7iccNMInZyzQbbX",
      "object": "file",
      "purpose": "fine-tune",
      "status": "uploaded",
      "status_details": null
    }
  ],
  "updated_at": 1678246189,
  "validation_files": []
}

In [8]:
import requests

# Replace YOUR_API_KEY with your OpenAI API key
headers = {
    'Content-Type': 'application/json',
    'Authorization': f'Bearer <YOUR API KEY>'
}

# Replace FINE_TUNE_ID with the ID of your fine-tuning job
# previous 
response = requests.get('https://api.openai.com/v1/fine-tunes/<YOUR JOB ID', headers=headers)

# Print the response
print(response.json())


{'object': 'fine-tune', 'id': 'ft-Mu85TtcUiHmndBacW6XqfK4I', 'hyperparams': {'n_epochs': 4, 'batch_size': None, 'prompt_loss_weight': 0.01, 'learning_rate_multiplier': None}, 'organization_id': 'org-0Vb7q2Rndj3zUuGWDYjsJAMr', 'model': 'ada', 'training_files': [{'object': 'file', 'id': 'file-j290Y5UZP7iccNMInZyzQbbX', 'purpose': 'fine-tune', 'filename': 'file', 'bytes': 586222, 'created_at': 1678246186, 'status': 'processed', 'status_details': None}], 'validation_files': [], 'result_files': [], 'created_at': 1678246189, 'updated_at': 1678246189, 'status': 'pending', 'fine_tuned_model': None, 'events': [{'object': 'fine-tune-event', 'level': 'info', 'message': 'Created fine-tune: ft-Mu85TtcUiHmndBacW6XqfK4I', 'created_at': 1678246189}]}


In [24]:
fine_tune_events = openai.FineTune.list_events(id=<YOUR JOB ID>)
fine_tune_events

<OpenAIObject list at 0x7fef94616b80> JSON: {
  "data": [
    {
      "created_at": 1678246189,
      "level": "info",
      "message": "Created fine-tune: ft-Mu85TtcUiHmndBacW6XqfK4I",
      "object": "fine-tune-event"
    },
    {
      "created_at": 1678246614,
      "level": "info",
      "message": "Fine-tune costs $0.21",
      "object": "fine-tune-event"
    },
    {
      "created_at": 1678246614,
      "level": "info",
      "message": "Fine-tune enqueued. Queue number: 3",
      "object": "fine-tune-event"
    },
    {
      "created_at": 1678246661,
      "level": "info",
      "message": "Fine-tune is in the queue. Queue number: 2",
      "object": "fine-tune-event"
    },
    {
      "created_at": 1678246704,
      "level": "info",
      "message": "Fine-tune is in the queue. Queue number: 1",
      "object": "fine-tune-event"
    },
    {
      "created_at": 1678247029,
      "level": "info",
      "message": "Fine-tune is in the queue. Queue number: 0",
      "object": "

In [5]:
retrieve_response = openai.FineTune.retrieve(<YOUR JOB ID>)
fine_tuned_model = retrieve_response.fine_tuned_model
fine_tuned_model

'ada:ft-personal-2023-03-08-04-03-12'

In [6]:
answer = openai.Completion.create(model="<YOUR MODEL ID>", prompt="Before 1 July 2021, will the Chilean government pass legislation that caps administrative fees and/or operating profits of the country's pension fund managers? choices: ['yes', 'no] ->", 
max_tokens=10, temperature=0) 
answer["choices"][0]["text"].strip().split(".\n")[0]

'no'

In [7]:
def gpt3(question):
    answer = openai.Completion.create(model= <YOUR MODEL ID>, prompt=question["question"] + " choices: " + str(question["choices"]) + " ->", max_tokens=10, temperature=0) 
    return answer["choices"][0]["text"].strip().split(".\n")[0]

def calibrated_random_baseline_model(question):
    ans = gpt3(question)
    if question['qtype'] == 't/f':
        if ans == "None" or (ans != "yes" and ans != "no"):
            return np.zeros(2)
        pred_idx = 0 if ans == 'no' else 1        
        pred = np.zeros(2)
        pred[pred_idx] = 1
        return pred 
    elif question['qtype'] == 'mc':
        if ans == "None" or not ("Z" <= ans <= "A"):
            return np.zeros(len(question['choices']))
        pred_idx = ord(ans) - ord('A')
        pred = np.zeros(len(question['choices']))
        if pred_idx < len(pred):
            pred[pred_idx] = 1
        return pred 
    elif question['qtype'] == 'num':
        return float(ans.strip("."))


## Get performance on the Autocast train set

Note that the Autocast dataset contains questions in the competition test set. Those should not be used.

In [8]:
preds = []
answers = []
qtypes = []
for question in autocast_questions:
    if question['id'] in test_ids: # skipping questions in the competition test set
        continue
    if question['answer'] is None: # skipping questions without answer
        continue
    preds.append(calibrated_random_baseline_model(question))
    if question['qtype'] == 't/f':
        ans_idx = 0 if question['answer'] == 'no' else 1
        ans = np.zeros(len(question['choices']))
        ans[ans_idx] = 1
        qtypes.append('t/f')
    elif question['qtype'] == 'mc':
        ans_idx = ord(question['answer']) - ord('A')
        ans = np.zeros(len(question['choices']))
        ans[ans_idx] = 1
        qtypes.append('mc')
    elif question['qtype'] == 'num':
        ans = float(question['answer'])
        qtypes.append('num')
    answers.append(ans)

## Evaluate the model

In [11]:
def brier_score(probabilities, answer_probabilities):
    return ((probabilities - answer_probabilities) ** 2).sum() / 2

In [12]:
tf_results, mc_results, num_results = [],[],[]
for p, a, qtype in zip(preds, answers, qtypes):
    
    if qtype == 't/f':      
        tf_results.append(brier_score(p, a))
    elif qtype == 'mc':
        if len(p) == len(a):
            mc_results.append(brier_score(p, a))
    else:
        num_results.append(np.abs(float(p) - a))
print(f"T/F: {np.mean(tf_results)*100:.2f}, MCQ: {np.mean(mc_results)*100:.2f}, NUM: {np.mean(num_results)*100:.2f}")
print(f"Combined Metric: {(np.mean(tf_results) + np.mean(mc_results) + np.mean(num_results))*100:.2f}")

T/F: 11.07, MCQ: 50.00, NUM: 15.33
Combined Metric: 76.40


## Make predictions on test set

In [13]:
preds = []
for question in test_questions:
    preds.append(calibrated_random_baseline_model(question))

In [14]:
if not os.path.exists('submission'):
    os.makedirs('submission')

with open(os.path.join('submission', 'predictions.pkl'), 'wb') as f:
    pickle.dump(preds, f, protocol=2)

!cd submission && zip ../submission.zip ./* && cd ..

updating: predictions.pkl (deflated 76%)


In [15]:
!ls

README.md                          example_submission_gpt3.ipynb
autocast_competition_test_set.json predictions_correct.pkl
autocast_questions.json            [34msubmission[m[m
autocast_questions_gpt3.json       submission.zip
autocast_questions_gpt3.jsonl      test.ipynb
example_submission.ipynb
