In [1]:
import os
import json
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import scipy.spatial

# from transformers import T5Tokenizer, T5ForConditionalGeneration
from tqdm import tqdm
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset
import torch
from pprint import pprint

import sys
import glob

In [3]:
pip install torch 

Collecting torch
  Using cached torch-1.13.1-cp39-none-macosx_10_9_x86_64.whl (135.3 MB)
Installing collected packages: torch
Successfully installed torch-1.13.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch 
VERSION = "nightly"  #@param ["1.5" , "20200325", "nightly"]
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
!python pytorch-xla-env-setup.py --version $VERSION

ModuleNotFoundError: No module named 'torch'

In [12]:
dataset = pd.read_json("autocast_questions.json")
test_questions = json.load(open('autocast_competition_test_set.json'))
test_ids = [q['id'] for q in test_questions]
for i in range(len(dataset)):
    if dataset["id"][i] in test_ids or dataset["answer"][i] == None: # getting rid of the test set and skipping questions without answers
        dataset = dataset.drop([i])
df = dataset[["question", "answer", "choices"]]

In [13]:
dataset.shape

(2797, 14)

## Create baseline models outputting random answers

In [9]:
def random_baseline_model(question):
    response = openai.Completion.create(
        engine="text-davinci-002",
        prompt=question['quesiton'],
        temperature=0.5,
        max_tokens=256,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=0.0
    )
    if question['qtype'] == 't/f':
        return response['choices'][0]['text']
    elif question['qtype'] == 'mc':
        return response['choices'][0]['text']
    elif question['qtype'] == 'num':
        return response['choices'][0]['text']


def calibrated_random_baseline_model(question):
    if question['qtype'] == 't/f':
        pred_idx = np.argmax(np.random.random(size=2))
        pred = np.ones(2)
        pred[pred_idx] += 1e-5
        return pred / pred.sum()
    elif question['qtype'] == 'mc':
        pred_idx = np.argmax(np.random.random(size=len(question['choices'])))
        pred = np.ones(len(question['choices']))
        pred[pred_idx] += 1e-5
        return pred / pred.sum()
    elif question['qtype'] == 'num':
        return 0.5

## Get performance on the Autocast train set

Note that the Autocast dataset contains questions in the competition test set. Those should not be used.

In [10]:
def brier_score(probabilities, answer_probabilities):
    return ((probabilities - answer_probabilities) ** 2).sum() / 2

In [11]:
preds = []
answers = []
qtypes = []
for question in autocast_questions:
    if question['id'] in test_ids: # skipping questions in the competition test set
        continue
    if question['answer'] is None: # skipping questions without answer
        continue
    preds.append(random_baseline_model(question))
    if question['qtype'] == 't/f':
        ans_idx = 0 if question['answer'] == 'no' else 1
        ans = np.zeros(len(question['choices']))
        ans[ans_idx] = 1
        qtypes.append('t/f')
    elif question['qtype'] == 'mc':
        ans_idx = ord(question['answer']) - ord('A')
        ans = np.zeros(len(question['choices']))
        ans[ans_idx] = 1
        qtypes.append('mc')
    elif question['qtype'] == 'num':
        ans = float(question['answer'])
        qtypes.append('num')
    answers.append(ans)

KeyError: 'quesiton'

## Evaluate the model

In [6]:
tf_results, mc_results, num_results = [],[],[]
for p, a, qtype in zip(preds, answers, qtypes):
    if qtype == 't/f':
        tf_results.append(brier_score(p, a))
    elif qtype == 'mc':
        mc_results.append(brier_score(p, a))
    else:
        num_results.append(np.abs(p - a))

print(f"T/F: {np.mean(tf_results)*100:.2f}, MCQ: {np.mean(mc_results)*100:.2f}, NUM: {np.mean(num_results)*100:.2f}")
print(f"Combined Metric: {(np.mean(tf_results) + np.mean(mc_results) + np.mean(num_results))*100:.2f}")

T/F: 25.00, MCQ: 38.05, NUM: 22.63
Combined Metric: 85.67


## Make predictions on test set

In [7]:
preds = []
for question in test_questions:
    preds.append(random_baseline_model(question))

In [8]:
if not os.path.exists('submission'):
    os.makedirs('submission')

with open(os.path.join('submission', 'predictions.pkl'), 'wb') as f:
    pickle.dump(preds, f, protocol=2)

!cd submission && zip ../submission.zip ./* && cd ..

  adding: predictions.pkl (deflated 79%)


In [9]:
!ls

README.md                          negated_tf_questions.json
autocast_competition_test_set.json [34msubmission[m[m
autocast_questions.json            submission.zip
example_submission.ipynb
