In [2]:
#@markdown Check type of GPU and VRAM available.
# Prints one CSV row per GPU: name, total memory, free memory.
# NOTE(review): the recorded output below is "nvidia-smi: command not found",
# i.e. the last run had no nvidia-smi available; nothing later in this
# notebook appears to use a GPU.
!nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader

/bin/bash: nvidia-smi: command not found


In [3]:
# Colab form toggle: when True, Google Drive is mounted at /content/drive.
# NOTE(review): the dataset-loading cell reads from
# '/content/drive/My Drive/...', so it appears to need this mount even
# though the flag is named "save_to_gdrive" - confirm before turning it off.
save_to_gdrive = True #@param {type:"boolean"}
if save_to_gdrive:
    # Imported only when mounting is requested; presumably so the notebook
    # does not require google.colab when the toggle is off - confirm.
    from google.colab import drive
    drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import os
import json
import pickle
import numpy as np

In [14]:
def get_json(file_path):
    """Load and return the parsed contents of a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # The context manager closes the handle deterministically instead of
    # leaving it to garbage collection; UTF-8 is stated explicitly so the
    # result does not depend on the locale's default encoding.
    with open(file_path, encoding='utf-8') as f:
        return json.load(f)

# Dataset directory on the mounted Google Drive (note the trailing slash).
path = '/content/drive/My Drive/CS542 Competition/datasets/'

# Full question set, the negated true/false questions, and the competition test set.
autocast_questions = get_json(os.path.join(path, 'autocast_questions.json'))
negated_tf_questions = get_json(os.path.join(path, 'negated_tf_questions.json'))
test_questions = get_json(os.path.join(path, 'autocast_competition_test_set.json'))

# IDs of the competition test questions, in file order.
test_ids = [test_question['id'] for test_question in test_questions]

# Report how many entries each file contains.
for dataset_label, dataset in (
    ('autocast_questions', autocast_questions),
    ('negated_tf_questions', negated_tf_questions),
    ('test_questions', test_questions),
):
    print(dataset_label, len(dataset))

autocast_questions 6532
negated_tf_questions 3426
test_questions 1364


## Create baseline models outputting random answers

In [7]:
def random_baseline_model(question):
    """Return a uniformly random prediction for one question.

    Args:
        question: Mapping with a 'qtype' key; 'mc' questions must also
            provide 'choices'.

    Returns:
        * 't/f': array of 2 non-negative values summing to 1.
        * 'mc':  array with one value per choice, summing to 1.
        * 'num': a single float drawn from [0, 1).
        * any other qtype: None (no branch matches).
    """
    qtype = question['qtype']
    if qtype == 't/f':
        # Normalise so the two entries form a probability distribution,
        # exactly as the 'mc' branch does; raw uniform draws do not sum to 1.
        probs = np.random.random(size=2)
        return probs / probs.sum()
    elif qtype == 'mc':
        probs = np.random.random(size=len(question['choices']))
        return probs / probs.sum()
    elif qtype == 'num':
        return np.random.random()


def calibrated_random_baseline_model(question):
    """Return a near-uniform prediction with one randomly favoured option.

    For 't/f' (2 options) and 'mc' (one option per entry in
    question['choices']) the result is a uniform vector in which a randomly
    chosen entry gets an extra 1e-5 before normalisation. For 'num' the
    prediction is the constant 0.5. Any other qtype yields None.
    """
    kind = question['qtype']

    if kind == 'num':
        return 0.5

    if kind == 't/f':
        n_options = 2
    elif kind == 'mc':
        n_options = len(question['choices'])
    else:
        return None

    # Pick the favoured option by drawing one uniform sample per option.
    favoured = np.argmax(np.random.random(size=n_options))
    weights = np.ones(n_options)
    weights[favoured] += 1e-5
    return weights / weights.sum()
    


## Get performance on the Autocast train set

Note that the Autocast dataset also contains the questions that make up the competition test set. Those questions must be excluded when evaluating on (or training with) the Autocast data.

In [8]:
def brier_score(probabilities, answer_probabilities):
    """Half the summed squared difference between prediction and target.

    Both arguments are expected to be arrays of the same shape (the result
    of the subtraction must support ``.sum()``).
    """
    error = probabilities - answer_probabilities
    squared_error = error * error
    return squared_error.sum() / 2

In [15]:
# Build three index-aligned lists over the answered Autocast questions that
# are not part of the competition test set:
#   preds   - output of calibrated_random_baseline_model for the question
#   answers - one-hot vector (t/f, mc) or float (num) ground truth
#   qtypes  - the question type, used later to pick the metric
preds = []
answers = []
qtypes = []

# Set membership is O(1) per lookup; scanning the test_ids list for every
# question is not. Assumes question ids are hashable (e.g. str/int).
test_id_set = set(test_ids)

for question in autocast_questions:
    if question['id'] in test_id_set:  # skipping questions in the competition test set
        continue
    if question['answer'] is None:  # skipping questions without answer
        continue

    qtype = question['qtype']
    if qtype == 't/f':
        # Index 0 is 'no'; any other answer is treated as index 1.
        ans_idx = 0 if question['answer'] == 'no' else 1
        ans = np.zeros(len(question['choices']))
        ans[ans_idx] = 1
    elif qtype == 'mc':
        # Answers are letters: 'A' -> 0, 'B' -> 1, ...
        ans_idx = ord(question['answer']) - ord('A')
        ans = np.zeros(len(question['choices']))
        ans[ans_idx] = 1
    elif qtype == 'num':
        ans = float(question['answer'])
    else:
        # Previously an unknown qtype appended the previous question's answer
        # (or raised NameError) and left qtypes shorter than preds/answers,
        # silently misaligning the evaluation. Fail loudly instead.
        raise ValueError(f"Unsupported qtype {qtype!r} for question {question['id']!r}")

    # Append to all three lists together so they can never drift apart.
    preds.append(calibrated_random_baseline_model(question))
    answers.append(ans)
    qtypes.append(qtype)

## Evaluate the model

In [16]:
# Per-question scores, grouped by question type: Brier score for t/f and mc,
# absolute error for everything else (i.e. numeric questions).
tf_results, mc_results, num_results = [], [], []
for pred, answer, qtype in zip(preds, answers, qtypes):
    if qtype in ('t/f', 'mc'):
        bucket = tf_results if qtype == 't/f' else mc_results
        bucket.append(brier_score(pred, answer))
    else:
        num_results.append(np.abs(pred - answer))

# Mean score per type, reported as percentages; the combined metric is their sum.
tf_mean = np.mean(tf_results)
mc_mean = np.mean(mc_results)
num_mean = np.mean(num_results)

print(f"T/F: {tf_mean*100:.2f}, MCQ: {mc_mean*100:.2f}, NUM: {num_mean*100:.2f}")
print(f"Combined Metric: {(tf_mean + mc_mean + num_mean)*100:.2f}")

T/F: 25.00, MCQ: 38.05, NUM: 22.63
Combined Metric: 85.67


## Make predictions on test set

In [11]:
# One prediction per competition test question, in file order.
# Note: this rebinds `preds`, replacing the train-set predictions.
preds = [
    calibrated_random_baseline_model(test_question)
    for test_question in test_questions
]

In [12]:
if not os.path.exists('submission'):
    os.makedirs('submission')

with open(os.path.join('submission', 'predictions.pkl'), 'wb') as f:
    pickle.dump(preds, f, protocol=2)

!cd submission && zip ../submission.zip ./* && cd ..
print("here")

  adding: predictions.pkl (deflated 79%)
here


In [13]:
# List the working directory; the recorded output shows submission/ and submission.zip.
!ls

drive  sample_data  submission	submission.zip
