# Imports

In [2]:
import os
import json
import pickle
import numpy as np
import openai

## Import Data

In [3]:
autocast_questions = json.load(open('autocast_questions.json')) # from the Autocast dataset
test_questions = json.load(open('autocast_competition_test_set.json'))
test_ids = [q['id'] for q in test_questions]

## Display Data

In [5]:
def display_data(data):
    # Display the first 3 questions and answers
    for i in range(3):
        print('Keys: ', data[i].keys())
        print('Type: ', data[i]['qtype'])
        print('Question: ', data[i])
        print('Choices: ', data[i]['choices'])
        print('Publish time: ', data[i]['publish_time'])
        print('Close time: ', data[i]['close_time'])
        print('Background: ', data[i]['background'])
        print('Answer: ', data[i]['answer'])
        print()

print('Autocast dataset')
display_data(autocast_questions)

Autocast dataset
Keys:  dict_keys(['question', 'id', 'background', 'publish_time', 'close_time', 'tags', 'source_links', 'prediction_count', 'forecaster_count', 'answer', 'choices', 'status', 'qtype', 'crowd'])
Type:  mc
Question:  {'question': 'What will the end-of-day closing value for the dollar against the renminbi be on 1 January 2016?', 'id': 'G1', 'background': "Outcome will be determined by the end-of-day closing value reported by Bloomberg, at http://www.bloomberg.com/quote/usdcny:cur. For historical trends, see http://www.bloomberg.com/quote/usdcny:cur/chart. For more information on China's economy see http://www.theworldin.com/article/10492.", 'publish_time': '2015-09-01 13:49:29.860000+00:00', 'close_time': '2016-01-01 17:00:01+00:00', 'tags': ['Finance', 'Economic Indicators'], 'source_links': ['http://ftalphaville.ft.com/2015/08/17/2137329/what-are-chinese-capital-controls-really-part-2/', 'http://www.investmentweek.co.uk/investment-week/analysis/2427669/why-investors-nee

# Fine-Tune GPT-3
## Generate Training Data for GPT-3

In [17]:
def generate_gpt3_training_data():
    # generate training data for GPT-3 and save it to a .jsonl file
    gpt3_training_data = []
    for q in autocast_questions:
        gpt3_training_data.append({
            'prompt': q['background'] + "\n\nDate of event: " + q['publish_time'] + "\n\nWhat is the answer to this question:\n" + q['question'] + "\n\nDate of question: " + q['publish_time'] + "\n\nChoices: " + str(q['choices']) + "\n\n###\n\n",
            'completion': " " + str(q['answer']) + "###"
        })
    # save the training data to a file
    with open('gpt3_training_data.jsonl', 'wb') as f:
        for item in gpt3_training_data:
            f.write(json.dumps(item).encode('utf-8'))
            f.write(b'\n')
    # compress

generate_gpt3_training_data()

## Fine-tune GPT-3 on the training data
> Instructions copied from OpenAI

Create a fine-tuned model
The following assumes you've already prepared training data following the above instructions.

Start your fine-tuning job using the OpenAI CLI:
```bash
openai api fine_tunes.create -t <TRAIN_FILE_ID_OR_PATH> -m <BASE_MODEL>
```
Where `BASE_MODEL` is the name of the base model you're starting from (ada, babbage, curie, or davinci). You can customize your fine-tuned model's name using the suffix parameter.

Running the above command does several things:

Uploads the file using the files API (or uses an already-uploaded file)
Creates a fine-tune job
Streams events until the job is done (this often takes minutes, but can take hours if there are many jobs in the queue or your dataset is large)
Every fine-tuning job starts from a base model, which defaults to curie. The choice of model influences both the performance of the model and the cost of running your fine-tuned model. Your model can be one of: ada, babbage, curie, or davinci. Visit our pricing page for details on fine-tune rates.

After you've started a fine-tune job, it may take some time to complete. Your job may be queued behind other jobs on our system, and training our model can take minutes or hours depending on the model and dataset size. If the event stream is interrupted for any reason, you can resume it by running:
```bash
openai api fine_tunes.follow -i <YOUR_FINE_TUNE_JOB_ID>
```
When the job is done, it should display the name of the fine-tuned model.

In addition to creating a fine-tune job, you can also list existing jobs, retrieve the status of a job, or cancel a job.

```bash
# List all created fine-tunes
openai api fine_tunes.list

# Retrieve the state of a fine-tune. The resulting object includes
# job status (which can be one of pending, running, succeeded, or failed)
# and other information
openai api fine_tunes.get -i <YOUR_FINE_TUNE_JOB_ID>

# Cancel a job
openai api fine_tunes.cancel -i <YOUR_FINE_TUNE_JOB_ID>
```

# Test Model
## Get performance on the Autocast train set

Note that the Autocast dataset contains questions in the competition test set. Those should not be used.

In [4]:
def brier_score(probabilities, answer_probabilities):
    return ((probabilities - answer_probabilities) ** 2).sum() / 2

### Query Model

In [35]:
def fine_tuned_gpt(q):
    # query the fine-tuned GPT-3 model
    openai.api_key = "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"
    response = openai.Completion.create(
        engine="ada:ft-codewise-2023-03-03-06-51-46",
        prompt=q['background'] + "\n\nDate of event: " + q['publish_time'] + "\n\nWhat is the answer to this question:\n" + q['question'] + "\n\nDate of question: " + q['publish_time'] + "\n\nChoices: " + str(q['choices']) + "\n\n###\n\n",
        temperature=0.0,
        top_p=0,
        max_tokens=1,
        stop=["###"]
    )
    return response.choices[0].text.strip()

In [36]:
preds = []
answers = []
qtypes = []
responses = []
for question in autocast_questions:
    response = fine_tuned_gpt(question)
    responses.append(response)

### Save the responses

In [37]:
# save the responses to a pickle file
with open('gpt3_responses.pkl', 'wb') as f:
    pickle.dump(responses, f)

### Load Previous Responses

In [5]:
# load the responses from the pickle file
with open('gpt3_responses.pkl', 'rb') as f:
    responses = pickle.load(f)
    preds = []
    answers = []
    qtypes = []

In [39]:
correct = []
for question in autocast_questions:
    correct.append(question['answer'])
    
print(correct[:50])
print(responses[:50])
print("Lengths: ", len(correct), len(responses))

['D', 'A', 'yes', 'yes', 'no', 'yes', 'yes', 'no', 'yes', 'A', 'no', 'no', 'no', 'no', 'yes', 'no', 'no', 'yes', 'C', 'no', 'A', 'yes', 'A', 'B', 'yes', 'yes', 'yes', 'no', 'A', 'yes', 'no', 'no', 'yes', 'no', 'no', 'no', 'no', 'B', 'yes', 'C', 'yes', 'D', 'C', 'no', 'B', 'no', 'no', 'no', 'yes', 'D']
['C', 'A', 'yes', 'yes', 'yes', 'yes', 'yes', 'no', 'yes', 'A', 'no', 'no', 'no', 'no', 'yes', 'no', 'no', 'no', 'C', 'no', 'A', 'yes', 'C', 'B', 'yes', 'yes', 'yes', 'no', 'C', 'yes', 'no', 'no', 'yes', 'yes', 'no', 'no', 'no', 'B', 'yes', 'C', 'yes', 'A', 'C', 'yes', 'B', 'yes', 'no', 'no', 'no', 'D']
Lengths:  6532 6532


In [41]:
wrongq = 0
print(responses)
print(question)
for idx, question in enumerate(autocast_questions):
    if question['id'] in test_ids: # skipping questions in the competition test set
        continue
    if question['answer'] is None: # skipping questions without answer
        continue
    if question['qtype'] == 't/f':
        if responses[idx] not in question['choices']:
            wrongq += 1
            print("\n---\n"+str(wrongq))
            print(responses[idx])
            print(question['choices'])
            pred = np.zeros(len(question['choices']))
            continue
        ans_idx = 0 if question['answer'] == 'no' else 1
        pred_idx = 0 if responses[idx] == 'no' else 1
        ans = np.zeros(len(question['choices']))
        pred = np.zeros(len(question['choices']))
        ans[ans_idx] = 1
        pred[pred_idx] = 1
        qtypes.append('t/f')
    elif question['qtype'] == 'mc':
        if responses[idx] not in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', "K"]:
            wrongq += 1
            print("\n---\n"+str(wrongq))
            print(responses[idx])
            print(question['answer'])
            pred = np.zeros(len(question['choices']))
            continue
        ans_idx = ord(question['answer']) - ord('A')
        pred_idx = ord(responses[idx]) - ord('A')
        ans = np.zeros(len(question['choices']))
        pred = np.zeros(len(question['choices']))
        ans[ans_idx] = 1
        if pred_idx >= len(question['choices']):
            pred = np.zeros(len(question['choices']))
            continue
        pred[pred_idx] = 1
        qtypes.append('mc')
    elif question['qtype'] == 'num':
        # if response is not a number, skip the question
        try:
            pred = float(responses[idx])
        except:
            wrongq += 1
            print("\n---\n"+str(wrongq))
            print(responses[idx])
            print(question['answer'])
            pred = .5
            continue
        ans = float(question['answer'])
        qtypes.append('num')
    answers.append(ans)
    preds.append(pred)

['C', 'A', 'yes', 'yes', 'yes', 'yes', 'yes', 'no', 'yes', 'A', 'no', 'no', 'no', 'no', 'yes', 'no', 'no', 'no', 'C', 'no', 'A', 'yes', 'C', 'B', 'yes', 'yes', 'yes', 'no', 'C', 'yes', 'no', 'no', 'yes', 'yes', 'no', 'no', 'no', 'B', 'yes', 'C', 'yes', 'A', 'C', 'yes', 'B', 'yes', 'no', 'no', 'no', 'D', 'yes', 'no', 'no', 'A', 'no', 'C', 'no', 'no', 'no', 'no', 'no', 'A', 'no', 'no', 'no', 'no', 'no', 'A', 'no', 'no', 'no', 'F', 'no', 'no', 'no', 'B', 'B', 'D', 'no', 'C', 'no', 'no', 'no', 'no', 'no', 'no', 'yes', 'no', 'A', 'no', 'D', 'no', 'A', 'no', 'A', 'yes', 'no', 'no', 'C', 'no', 'A', 'no', 'A', 'A', 'A', 'D', 'no', 'B', 'E', 'no', 'no', 'no', 'no', 'B', 'yes', 'C', 'no', 'A', 'no', 'no', 'A', 'B', 'no', 'C', 'D', 'no', 'B', 'no', 'no', 'no', 'B', 'B', 'no', 'C', 'no', 'no', 'no', 'B', 'B', 'no', 'no', 'B', 'B', 'A', 'no', 'no', 'no', 'D', 'B', 'B', 'no', 'no', 'no', 'no', 'yes', 'yes', 'yes', 'no', 'no', 'D', 'no', 'no', 'no', 'yes', 'no', 'no', 'C', 'no', 'B', 'no', 'no', 'no'

## Evaluate the model

In [42]:
tf_results, mc_results, num_results = [],[],[]
for p, a, qtype in zip(preds, answers, qtypes):
    if qtype == 't/f':
        tf_results.append(brier_score(p, a))
    elif qtype == 'mc':
        mc_results.append(brier_score(p, a))
    else:
        num_results.append(np.abs(p - a))

print(f"T/F: {np.mean(tf_results)*100:.2f}, MCQ: {np.mean(mc_results)*100:.2f}, NUM: {np.mean(num_results)*100:.2f}")
print(f"Combined Metric: {(np.mean(tf_results) + np.mean(mc_results) + np.mean(num_results))*100:.2f}")

T/F: 12.77, MCQ: 32.26, NUM: 42.21
Combined Metric: 87.24


## Make predictions on test set

In [45]:
responses = []
for question in test_questions:
    responses.append(fine_tuned_gpt(question))

In [46]:
# save the responses to a pickle file
with open('gpt3_test_responses.pkl', 'wb') as f:
    pickle.dump(responses, f)

In [24]:
if not os.path.exists('submission'):
    os.makedirs('submission')

with open(os.path.join('submission', 'predictions.pkl'), 'wb') as f:
    pickle.dump(preds, f, protocol=2)

!cd submission && zip ../submission.zip ./* && cd ..

632.17s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


updating: predictions.pkl (deflated 80%)


In [25]:
!ls

637.42s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


autocast_competition_test_set.json  gpt3_training_data_prepared.jsonl
autocast_questions.json		    negated_tf_questions.json
example_submission.ipynb	    README.md
fine_tune_gpt.ipynb		    submission
gpt3_training_data.jsonl	    submission.zip
