In [None]:
import os
import json
import pickle
import numpy as np

In [2]:
from google.colab import drive

import os

# Directory on the mounted Google Drive that holds the project data files;
# every relative path used later in the notebook is resolved against it.
PROJECT_DIR = "/content/gdrive/MyDrive/CS542/cs542-autocast/"

# Mount Google Drive into the Colab VM, then make the project folder the cwd.
drive.mount('/content/gdrive')
os.chdir(PROJECT_DIR)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
# Load the full Autocast question set and the competition test set.
# Context managers make sure both file handles are closed once parsed.
with open('autocast_questions.json') as f:
    autocast_questions = json.load(f)  # from the Autocast dataset
with open('autocast_competition_test_set.json') as f:
    test_questions = json.load(f)

# Ids and question types of the competition test questions, in file order.
test_ids = [q['id'] for q in test_questions]
test_types = [q['qtype'] for q in test_questions]

## Create baseline models outputting random answers

In [None]:
def random_baseline_model(question):
    """Return a random prediction for a single question.

    Args:
        question: dict with a 'qtype' key ('t/f', 'mc' or 'num'); 'mc'
            questions must also provide a 'choices' sequence.

    Returns:
        - 't/f': array of 2 random probabilities that sum to 1.
        - 'mc':  array of len(choices) random probabilities that sum to 1.
        - 'num': a single random float in [0, 1).
        - any other qtype: None (no branch matches).
    """
    qtype = question['qtype']
    if qtype == 't/f':
        # Normalise like the 'mc' branch so the result is a valid distribution;
        # two raw uniform draws do not sum to 1.
        probs = np.random.random(size=2)
        return probs / probs.sum()
    elif qtype == 'mc':
        probs = np.random.random(size=len(question['choices']))
        return probs / probs.sum()
    elif qtype == 'num':
        return np.random.random()


def calibrated_random_baseline_model(question):
    """Return a near-uniform ("calibrated") prediction for a single question.

    For 't/f' and 'mc' questions every option starts with equal weight and one
    randomly chosen option is nudged up by 1e-5 before normalising, so the
    distribution is almost uniform but has a unique arg-max. 'num' questions
    always get 0.5. Any other question type yields None.
    """
    kind = question['qtype']
    if kind == 'num':
        return 0.5

    if kind == 't/f':
        n_options = 2
    elif kind == 'mc':
        n_options = len(question['choices'])
    else:
        return None

    # Pick the favoured option at random, then break the tie in its favour.
    favoured = np.argmax(np.random.random(size=n_options))
    weights = np.ones(n_options)
    weights[favoured] += 1e-5
    return weights / weights.sum()

## Get performance on the Autocast train set

Note that the Autocast dataset also contains the questions that make up the competition test set. Those questions must be excluded when measuring performance on the train set.

In [None]:
def brier_score(probabilities, answer_probabilities):
    """Return half the summed squared difference between the two inputs.

    Both arguments are combined element-wise with `-`, so they must support
    array arithmetic (e.g. NumPy arrays of equal shape).
    """
    squared_error = (probabilities - answer_probabilities) ** 2
    return squared_error.sum() / 2

In [None]:
# Collect baseline predictions and ground-truth targets for every answered
# Autocast question that is not part of the competition test set.
preds = []
answers = []
qtypes = []

# Set lookup keeps the exclusion check O(1) per question instead of scanning
# the whole `test_ids` list each time.
held_out_ids = set(test_ids)

for question in autocast_questions:
    if question['id'] in held_out_ids:  # skipping questions in the competition test set
        continue
    if question['answer'] is None:  # skipping questions without answer
        continue

    qtype = question['qtype']
    if qtype == 't/f':
        # One-hot target: index 0 is "no", index 1 is anything else ("yes").
        ans_idx = 0 if question['answer'] == 'no' else 1
        ans = np.zeros(len(question['choices']))
        ans[ans_idx] = 1
    elif qtype == 'mc':
        # Answers are letters; 'A' maps to choice 0, 'B' to choice 1, ...
        ans_idx = ord(question['answer']) - ord('A')
        ans = np.zeros(len(question['choices']))
        ans[ans_idx] = 1
    elif qtype == 'num':
        ans = float(question['answer'])
    else:
        # Unknown question type: skip it entirely. Previously a prediction and a
        # stale `ans` from the preceding question were appended without a
        # matching qtype, leaving the three lists misaligned.
        continue

    # Append all three together so preds / answers / qtypes stay aligned.
    preds.append(calibrated_random_baseline_model(question))
    answers.append(ans)
    qtypes.append(qtype)

## Evaluate the model

In [None]:
# Per-question scores, bucketed by question type.
tf_results, mc_results, num_results = [], [], []
for pred, truth, kind in zip(preds, answers, qtypes):
    if kind in ('t/f', 'mc'):
        # Categorical questions are scored with the Brier score.
        bucket = tf_results if kind == 't/f' else mc_results
        bucket.append(brier_score(pred, truth))
    else:
        # Everything else is scored with the absolute error.
        num_results.append(np.abs(pred - truth))

# Report each mean as a percentage, then their sum as the combined metric.
print(f"T/F: {np.mean(tf_results)*100:.2f}, MCQ: {np.mean(mc_results)*100:.2f}, NUM: {np.mean(num_results)*100:.2f}")
print(f"Combined Metric: {(np.mean(tf_results) + np.mean(mc_results) + np.mean(num_results))*100:.2f}")

## Make predictions on test set

In [4]:
!pip install transformers
!pip install sentencepiece
!pip install torch
!pip install copy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m58.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, http

In [None]:
import regex
import string
import transformers

import time
import sys
import torch
import pickle
from tqdm import tqdm
from torch._C import TensorType
import torch.nn.functional as F
import numpy as np
from pathlib import Path
from torch.utils.data import DataLoader, RandomSampler, DistributedSampler, SequentialSampler
from torch.utils.tensorboard import SummaryWriter

import torch.distributed as dist
import copy
from torch.nn import CrossEntropyLoss, MSELoss
from torch import nn

# Make the current working directory importable.
sys.path.append("")
# NOTE(review): the original statement
#     from ./autocast.cs542.src.options import Options
# is a SyntaxError (import statements take dotted module names, not paths).
# The evaluation code in this notebook refers to `src.evaluation` and
# `src.util`, so `src` is assumed to be a top-level package reachable from
# sys.path. TODO confirm the package location and adjust the path entry above
# if `src` lives in a sub-directory.
from src.options import Options
import src.evaluation  # provides src.evaluation.ems / normalize_answer
import src.util  # provides src.util.weighted_average


In [None]:
def normalize_answer(s):
    """Canonicalise an answer string for comparison.

    Steps, in order: lowercase the text, drop every character found in
    `string.punctuation`, replace the standalone words "a", "an" and "the"
    with a space, and finally collapse all whitespace runs to single spaces
    (which also trims both ends).
    """
    lowered = s.lower()

    punctuation = set(string.punctuation)
    without_punctuation = ''.join(ch for ch in lowered if ch not in punctuation)

    without_articles = regex.sub(r'\b(a|an|the)\b', ' ', without_punctuation)

    return ' '.join(without_articles.split())

In [None]:
# Location of the fine-tuned checkpoint. An empty string resolves to the
# current working directory; set this to the checkpoint folder before running.
model_path = ""
epoch_path = os.path.realpath(model_path)

# Load the T5 checkpoint from disk and move it to the GPU.
model_class = transformers.T5ForConditionalGeneration
model = model_class.from_pretrained(epoch_path)
model = model.cuda()

In [None]:
def evaluate(model, dataset, tokenizer, collator, opt, epoch, device, mode='eval'):
    """Run `model` over `dataset` and score T/F, multiple-choice and numeric questions.

    T/F and MC answers are produced with `model.generate` and scored by exact
    match (`src.evaluation.ems`); numeric answers come from a sigmoid
    regression head applied to the first position of the last decoder hidden
    state and are scored by absolute error.

    NOTE(review): this function relies on the module-level names `logger`,
    `checkpoint_path` and `src`, none of which are defined in the visible part
    of the notebook -- confirm they exist before calling.
    NOTE(review): the regression head is freshly (randomly) initialised here
    and no trained weights are loaded, so numeric predictions are effectively
    random. It is now built once per call instead of once per batch so that at
    least every batch uses the same head. TODO load the trained head.

    Args:
        model: seq2seq model; unwrapped from `.module` if wrapped.
        dataset: dataset exposing `get_example(i)` returning a dict with 'answer'.
        tokenizer: tokenizer used to decode generated ids.
        collator: collate function producing
            (idx, ids, labels, indices, lengths, context_ids, context_mask).
            Assumes row 1 of `indices` holds numeric-question positions, row 2
            T/F positions and row 3 MC positions, each truncated with the
            matching entry of `lengths` -- inferred from usage, TODO confirm.
        opt: options object; reads `per_gpu_batch_size`, `is_distributed`,
            `is_main`, `world_size`.
        epoch: used only in the name of the results file written in eval mode.
        device: torch device the batch tensors and regression head are moved to.
        mode: 'eval' or 'train'; selects the log prefix, and 'eval' also writes
            `results_epoch{epoch}.obj`.

    Returns:
        (exactmatch, ans_list): `exactmatch` is the first value returned by
        `src.util.weighted_average(np.mean(scores) / 2, total, opt)`;
        `ans_list` holds, accumulated over ALL batches, the normalised T/F and
        MC answers of each batch followed by that batch's numeric predictions.
        If a batch has `labels is None`, the function returns early with
        `(logits, previous_outputs, None, results_re)` instead (kept from the
        original code; NOTE(review): confirm this path is wanted here).
    """
    sampler = SequentialSampler(dataset)
    dataloader = DataLoader(dataset,
                            sampler=sampler,
                            batch_size=opt.per_gpu_batch_size,
                            drop_last=False,
                            collate_fn=collator
                            )
    model.eval()
    model = model.module if hasattr(model, "module") else model
    cpu_device = torch.device('cpu')

    # Built once (loop-invariant); see the NOTE(review) in the docstring.
    regressor = nn.Sequential(
        nn.Linear(model.config.d_model, 1),
        nn.Sigmoid()
    ).to(device)

    total = 0
    tf_em, mc_em, re_em, exactmatch = [], [], [], []
    tf_predictions, mc_predictions, re_predictions = [], [], []
    qids, raw_answers = [], []
    # Accumulated across batches. It used to be re-created inside the loop, so
    # only the final batch was returned (and it was undefined for empty loaders).
    ans_list = []

    with torch.no_grad():
        pbar = tqdm(dataloader, total=len(dataloader))
        for batch in pbar:
            (idx, ids, labels, indices, lengths, context_ids, context_mask) = batch

            # Guard the move so the `labels is None` branch below is reachable.
            if labels is not None:
                labels = labels.to(device)
            indices = indices.to(device)
            lengths = lengths.to(device)
            input_ids = context_ids.to(device)
            input_ids = input_ids.view(input_ids.size(0), -1)
            attention_mask = context_mask.to(device)
            attention_mask = attention_mask.view(attention_mask.size(0), -1)

            indices_re = indices[1][:lengths[1]]

            if labels is None:
                decoder_outputs = model.forward(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels,
                    output_hidden_states=True,
                )
                # Without labels there is no loss entry, so every output index
                # is one lower than in the labelled branch below.
                hidden_state = decoder_outputs[2][-1]
                previous_outputs = decoder_outputs[1]
                logits = decoder_outputs[0]
                results_re = torch.index_select(regressor(hidden_state)[:, 0, :], 0, indices_re)
                return logits, previous_outputs, None, results_re

            label_rows_re = torch.index_select(labels, 0, indices_re)

            # Numeric rows carry float targets; zero them out in the decoder
            # labels so the whole label tensor can be cast to int64.
            decoder_labels = copy.deepcopy(labels).to(torch.int64)
            decoder_labels[indices_re, :] = torch.zeros_like(label_rows_re).to(torch.int64).to(device)
            decoder_outputs = model.forward(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=decoder_labels,
                output_hidden_states=True,
            )
            hidden_state = decoder_outputs[3][-1]

            results_re = torch.index_select(regressor(hidden_state)[:, 0, :], 0, indices_re)
            re_outputs = results_re.view(-1, results_re.size(-1))

            tfmc_outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=10
            )

            indices_tf = indices[2][:lengths[2]]
            indices_mc = indices[3][:lengths[3]]

            # Plain Python sets make the per-sample membership tests cheap.
            tf_positions = set(indices_tf.tolist())
            mc_positions = set(indices_mc.tolist())
            re_positions = set(indices_re.tolist())

            # Only the first label column is used for numeric targets.
            labels_re = label_rows_re[:, 0].view(-1).detach().to(cpu_device).tolist()

            tf_scores, mc_scores = [], []
            for k, o in enumerate(tfmc_outputs):
                is_tf = k in tf_positions
                if not is_tf and k not in mc_positions:
                    # Numeric (or unassigned) sample: its generated text is not scored.
                    continue

                ans = tokenizer.decode(o, skip_special_tokens=True)
                gold = [str(dataset.get_example(idx[k])['answer'])]
                score = src.evaluation.ems(ans, gold)
                # Count only samples that contribute a score; numeric samples
                # are counted once below (they used to be counted twice).
                total += 1

                if is_tf:
                    tf_scores.append(score)
                    tf_em.append(score)
                    tf_predictions.append(ans)
                else:
                    mc_scores.append(score)
                    mc_em.append(score)
                    mc_predictions.append(ans)
                ans_list.append(src.evaluation.normalize_answer(ans))

            re_ans = []
            if len(labels_re) > 0:
                re_ans = re_outputs.view(-1).detach().to(cpu_device).tolist()
                ans_list.extend(re_ans)
            re_scores = [np.abs(predicted - target) for predicted, target in zip(re_ans, labels_re)]
            total += len(re_scores)
            re_predictions.extend(re_ans)
            re_em.extend(re_scores)

            # Walk the batch in its original order to collect per-sample scores
            # (numeric errors are negated), question ids and gold answers.
            tf_count, mc_count, re_count = 0, 0, 0
            for i in range(len(idx)):
                if i in tf_positions:
                    exactmatch.append(tf_scores[tf_count])
                    tf_count += 1
                elif i in mc_positions:
                    exactmatch.append(mc_scores[mc_count])
                    mc_count += 1
                elif i in re_positions:
                    exactmatch.append(-re_scores[re_count])
                    re_count += 1
                qids.append(ids[i])
                raw_answers.append(str(dataset.get_example(idx[i])['answer']))

    if opt.is_distributed:
        objects = [tf_em, mc_em, re_em, tf_predictions, mc_predictions, re_predictions, qids, raw_answers]
        all_objects = [None for _ in range(opt.world_size)]
        dist.gather_object(objects, all_objects if dist.get_rank() == 0 else None)

        if opt.is_main:
            main_list = [[] for _ in range(len(objects))]
            for rank, obj_list in enumerate(all_objects):
                for i, obj in enumerate(obj_list):
                    main_list[i] += obj  # extend list to gather
            tf_em, mc_em, re_em, tf_predictions, mc_predictions, re_predictions, qids, raw_answers = main_list

    if mode in ('eval', 'train') and (not opt.is_distributed or opt.is_main):
        # Both modes log the same summary; only the prefix differs.
        tag = 'EVAL:' if mode == 'eval' else 'TRAIN:'
        # The original messages contained a 12-space run (from a line
        # continuation inside the string literal); kept so logs are unchanged.
        gap = ' ' * 12
        if len(tf_em) == 0:
            logger.info(f"{tag} For T/F: Predicted N/A")
        else:
            logger.info(f"{tag} For T/F: Predicted {tf_em.count(1)} Match {tf_em.count(0)} Wrong {gap}"
                        f"({tf_predictions.count('yes')} YES {tf_predictions.count('no')} NO) | EM: {round(tf_em.count(1) / len(tf_em) * 100, 2)}")
        if len(mc_em) == 0:
            logger.info(f"       For MC:  Predicted N/A")
        else:
            logger.info(f"       For MC:  Predicted {mc_em.count(1)} Match {mc_em.count(0)} Wrong | {gap}"
                        f"EM: {round(mc_em.count(1) / len(mc_em) * 100, 2)}")
        if len(re_em) == 0:
            logger.info(f"       For Reg: Predicted N/A")
        else:
            logger.info(f"       For Reg: Dist {np.mean(re_em)}")

    if mode == 'eval' and (not opt.is_distributed or opt.is_main):
        # Persist (question id, gold answer) pairs for this epoch.
        with open(checkpoint_path / f'results_epoch{epoch}.obj', 'wb') as f:
            pickle.dump(list(zip(qids, raw_answers)), f)

    exactmatch, total = src.util.weighted_average(np.mean(exactmatch) / 2, total, opt)
    return exactmatch, ans_list


In [None]:
def _near_uniform_prediction(n_options):
    """Return a near-uniform distribution with one random option nudged up by 1e-5."""
    pred_idx = np.argmax(np.random.random(size=n_options))
    pred = np.ones(n_options)
    pred[pred_idx] += 1e-5
    return pred / pred.sum()


def _choice_index(raw_answer):
    """Map a multiple-choice answer letter to its 0-based index, or None.

    `normalize_answer` removes the article "a", so the answer "A" would
    normalise to an empty string and never be recognised. The letter is
    therefore extracted by lowercasing and stripping punctuation only, with
    `normalize_answer` as a fallback for longer strings.
    """
    text = str(raw_answer)
    letter = ''.join(ch for ch in text.lower() if ch not in string.punctuation).strip()
    if len(letter) != 1:
        letter = normalize_answer(text)
    if len(letter) != 1:
        return None
    return ord(letter) - ord('a')


# Build one prediction per test question, in test-set order.
preds = []
for question in test_questions:
    result = "..."  # placeholder: replace with the model's answer for this question
    if question['qtype'] == 't/f':
        result = normalize_answer(str(result))
        if result == "yes":
            preds.append([0, 1])
        elif result == "no":
            preds.append([1, 0])
        else:
            # Unparseable answer: fall back to a near-uniform guess.
            preds.append(_near_uniform_prediction(2))
    elif question['qtype'] == 'mc':
        n_choices = len(question['choices'])
        ans_idx = _choice_index(result)
        # Explicit bounds check: a negative index would otherwise silently
        # wrap around to a choice at the end of the array.
        if ans_idx is not None and 0 <= ans_idx < n_choices:
            pred = np.zeros(n_choices)
            pred[ans_idx] = 1
            preds.append(pred)
        else:
            preds.append(_near_uniform_prediction(n_choices))
    elif question['qtype'] == 'num':
        try:
            ans = float(result)
        except (TypeError, ValueError, OverflowError):
            ans = 0.5
        # float() accepts "nan"/"inf"; never put those into a submission.
        preds.append(ans if np.isfinite(ans) else 0.5)

# A question with an unknown qtype appends nothing, which would silently shift
# every later prediction; fail loudly instead.
assert len(preds) == len(test_questions), "every test question needs exactly one prediction"

In [None]:
if not os.path.exists('submission'):
    os.makedirs('submission')

with open(os.path.join('submission', 'predictions.pkl'), 'wb') as f:
    pickle.dump(preds, f, protocol=2)

!cd submission && zip ../submission.zip ./* && cd ..

updating: predictions.pkl (deflated 79%)


In [None]:
# List the working directory to check that submission.zip was created.
!ls

autocast_competition_test_set.json [36msubmission[m[m
autocast_questions.json            submission.zip
example_submission.ipynb
