### Before running this notebook, make sure to upload the `CommonsenseQA` folder as a zipped file to the working directory
- `/home/jupyter/` in case of Google AI platform notebooks.
- `/content/` in case of Google Colab

In [20]:
# Confirm GPU
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime → "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Sun May  3 05:22:06 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.87.01    Driver Version: 418.87.01    CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P0    42W /  70W |     10MiB / 15079MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [2]:
import torch
torch.cuda.is_available()

True

In [2]:
# Unzip code
!unzip CommonsenseQA.zip

Archive:  CommonsenseQA.zip
   creating: CommonsenseQA/
  inflating: CommonsenseQA/convert_jsonl2tsv.py  
   creating: CommonsenseQA/data/
   creating: CommonsenseQA/data/CommonsenseQA/
  inflating: CommonsenseQA/data/CommonsenseQA/dict.txt  
  inflating: CommonsenseQA/data/CommonsenseQA/test.jsonl  
  inflating: CommonsenseQA/data/CommonsenseQA/train.jsonl  
  inflating: CommonsenseQA/data/CommonsenseQA/valid-propn.jsonl  
  inflating: CommonsenseQA/data/CommonsenseQA/valid.jsonl  
   creating: CommonsenseQA/fairseq/
   creating: CommonsenseQA/fairseq/checkpoints/
 extracting: CommonsenseQA/fairseq/checkpoints/.gitkeep  
   creating: CommonsenseQA/fairseq/examples/
   creating: CommonsenseQA/fairseq/examples/roberta/
   creating: CommonsenseQA/fairseq/examples/roberta/commonsense_qa/
  inflating: CommonsenseQA/fairseq/examples/roberta/commonsense_qa/commonsense_qa_task.py  
  inflating: CommonsenseQA/fairseq/examples/roberta/commonsense_qa/download_cqa_data.sh  
  inflating: Commonsen

In [3]:
!pip install fairseq



In [3]:
# Download roberta model
!wget -O roberta.large.tar.gz https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.tar.gz
!tar -xvzf roberta.large.tar.gz

# !wget -O /content/CommonsenseQA/roberta.base.tar.gz https://dl.fbaipublicfiles.com/fairseq/models/roberta.base.tar.gz
# !tar -xvzf /content/CommonsenseQA/roberta.base.tar.gz

--2020-03-13 06:14:29--  https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.tar.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.20.22.166, 104.20.6.166, 2606:4700:10::6814:6a6, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.20.22.166|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 655283069 (625M) [application/gzip]
Saving to: ‘roberta.large.tar.gz’


2020-03-13 06:14:51 (29.0 MB/s) - ‘roberta.large.tar.gz’ saved [655283069/655283069]

roberta.large/
roberta.large/dict.txt
roberta.large/model.pt
roberta.large/NOTE


In [1]:
%cd /home/jupyter/CommonsenseQA
!pwd

/home/jupyter/CommonsenseQA
/home/jupyter/CommonsenseQA


In [1]:
%%writefile finetune.sh
#!/bin/bash

## Write the finetuning part to a bash script file
# Modified following from the original script to get it to run on Google AI platform and Colab
# - Set MAX_SENTENCES=8
# - Added --update-freq 4

MAX_UPDATES=3000      # Number of training steps.
WARMUP_UPDATES=150    # Linearly increase LR over this many steps.
LR=1e-05              # Peak LR for polynomial LR scheduler.
MAX_SENTENCES=8      # Batch size.
SEED=23                # Random seed.

BASEDIR=/home/jupyter
# CQA_PATH=/content/CommonsenseQA # For Google Colab
CQA_PATH=$BASEDIR/CommonsenseQA # For Kaggle
ROBERTA_PATH=${BASEDIR}/roberta.large/model.pt
DATA_DIR=${CQA_PATH}/data/CommonsenseQA

# we use the --user-dir option to load the task from
# the examples/roberta/commonsense_qa directory:
FAIRSEQ_PATH=${CQA_PATH}/fairseq
FAIRSEQ_USER_DIR=${FAIRSEQ_PATH}/examples/roberta/commonsense_qa

cd $FAIRSEQ_PATH
CUDA_VISIBLE_DEVICES=0 fairseq-train --fp16 --ddp-backend=no_c10d \
    $DATA_DIR \
    --update-freq 4 \
    --save-dir ./checkpoints \
    --user-dir $FAIRSEQ_USER_DIR \
    --restore-file $ROBERTA_PATH \
    --reset-optimizer --reset-dataloader --reset-meters \
    --no-epoch-checkpoints --no-last-checkpoints --no-save-optimizer-state \
    --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
    --task commonsense_qa --init-token 0 --bpe gpt2 \
    --arch roberta_large --max-positions 512 \
    --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \
    --criterion sentence_ranking --num-classes 5 \
    --optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-06 --clip-norm 0.0 \
    --lr-scheduler polynomial_decay --lr $LR \
    --warmup-updates $WARMUP_UPDATES --total-num-update $MAX_UPDATES \
    --max-sentences $MAX_SENTENCES \
    --max-update $MAX_UPDATES \
    --log-format simple --log-interval 25 \
    --seed $SEED

Writing finetune.sh


In [3]:
# Finetune
!bash finetune.sh

Namespace(activation_dropout=0.0, activation_fn='gelu', adam_betas='(0.9, 0.98)', adam_eps=1e-06, arch='roberta_large', attention_dropout=0.1, best_checkpoint_metric='accuracy', bpe='gpt2', bucket_cap_mb=25, clip_norm=0.0, cpu=False, criterion='sentence_ranking', curriculum=0, data='/home/jupyter/CommonsenseQA/data/CommonsenseQA', dataset_impl=None, ddp_backend='no_c10d', device_id=0, disable_validation=False, distributed_backend='nccl', distributed_init_method=None, distributed_no_spawn=False, distributed_port=-1, distributed_rank=0, distributed_world_size=1, dropout=0.1, empty_cache_freq=0, encoder_attention_heads=16, encoder_embed_dim=1024, encoder_ffn_embed_dim=4096, encoder_layerdrop=0, encoder_layers=24, encoder_layers_to_keep=None, end_learning_rate=0.0, fast_stat_sync=False, find_unused_parameters=False, fix_batches_to_gpus=False, fixed_validation_seed=None, force_anneal=None, fp16=True, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, gpt2_encoder_json='h

In [7]:
%cd /home/jupyter/CommonsenseQA/fairseq
!pwd

/home/jupyter/CommonsenseQA/fairseq
/home/jupyter/CommonsenseQA/fairseq


In [8]:
# Try to resolve import path issues

%cd /home/jupyter/CommonsenseQA/fairseq/examples/roberta/commonsense_qa
import sys
# sys.path.insert(0, '/home/jupyter/CommonsenseQA/fairseq')
# sys.path.insert(0, '/home/jupyter/CommonsenseQA/fairseq/examples/roberta/commonsense_qa')
print(sys.path)
# import examples
# from examples.roberta import commonsense_qa
import commonsense_qa_task

/home/jupyter/CommonsenseQA/fairseq/examples/roberta/commonsense_qa
['/home/jupyter', '/opt/anaconda3/lib/python37.zip', '/opt/anaconda3/lib/python3.7', '/opt/anaconda3/lib/python3.7/lib-dynload', '', '/opt/anaconda3/lib/python3.7/site-packages', '/opt/anaconda3/lib/python3.7/site-packages/IPython/extensions', '/home/jupyter/.ipython']


In [4]:
%cd /home/jupyter/CommonsenseQA/fairseq/examples/roberta/commonsense_qa

import json
import torch
from fairseq.models.roberta import RobertaModel
# from examples.roberta import commonsense_qa  # load the Commonsense QA task
import commonsense_qa_task  # load the Commonsense QA task

base_dir = '/home/jupyter/CommonsenseQA'
roberta = RobertaModel.from_pretrained(base_dir + '/fairseq/checkpoints', 'checkpoint_best.pt', base_dir + '/data/CommonsenseQA')
print(0)
roberta.eval()  # disable dropout
print(1)
roberta.cuda()  # use the GPU (optional)
nsamples, ncorrect = 0, 0
wrong = []
with open(base_dir + '/data/CommonsenseQA/valid.jsonl') as h:
    print(3)
    for line in h:
        example = json.loads(line)
        scores = []
        for choice in example['question']['choices']:
            input = roberta.encode(
                'Q: ' + example['question']['stem'],
                'A: ' + choice['text'],
                no_separator=True
            )
            score = roberta.predict('sentence_classification_head', input, return_logits=True)
            scores.append(score)
#             print(choice['label'], score.data.item())

        pred = torch.cat(scores).argmax()
#         print('pred: ', chr(ord('A') + pred), 'correct: ', example['answerKey'])
        answer = ord(example['answerKey']) - ord('A')
        nsamples += 1
        if pred == answer:
            ncorrect += 1
        else:
            example['predicted'] = chr(ord('A') + pred)
            example['scores'] = {chr(ord('A') + i): s.data.item() for (i, s) in enumerate(scores)}
            wrong.append(json.dumps(example))

print(4)
# Write a file with JSON lines for wrong predictions
with open(base_dir + '/wrong_preds.jsonl', 'w') as f:
    f.write('\n'.join(wrong))

print(5)
print(f'Accuracy: {ncorrect}/{nsamples} = {ncorrect / float(nsamples)}')
# Accuracy from FB AI: 0.7846027846027847

/home/jupyter/CommonsenseQA/fairseq/examples/roberta/commonsense_qa
loading archive file /home/jupyter/CommonsenseQA/fairseq/checkpoints
loading archive file /home/jupyter/CommonsenseQA/data/CommonsenseQA
| dictionary: 50265 types
0
1
3
4
5
Accuracy: 956/1221 = 0.782964782964783


In [None]:
#Accuracy: 0.782964782964783

In [7]:
# wrong_preds jsonl2tsv
import json

choice_chars = ['A', 'B', 'C', 'D', 'E']
tsvlines = ['id\tquestion_concept\tquestion\tchoiceA\tscoreA\tchoiceB\tscoreB\tchoiceC\tscoreC\tchoiceD\tscoreD\tchoiceE\tscoreE\tanswer\tpredicted']

base_dir = '/home/jupyter/CommonsenseQA'
# base_dir = "D:\workspace\ASU\Courses\Spring-2020\CSE-576-Topics-in-Natural-Language-Processing\Project-COMMONSENSEQA\\NLP_CommonsenseQA\CommonsenseQA"
with open(base_dir + '/wrong_preds.jsonl') as f:
    for line in f:
        q = json.loads(line)
        l = []
        l.append(q['id'])
        l.append(q['question']['question_concept'])
        l.append(q['question']['stem'])

        choices = {}
        for c in q['question']['choices']:
            choices[c['label']] = f"{c['text']}\t{round(q['scores'][c['label']], 4)}"
        # To make sure TSV has choices in the order A,B,C,D,E
        for c in choice_chars:
            l.append(choices[c])

        l.append(q['answerKey'])
        l.append(q['predicted'])
        tsvlines.append('\t'.join(l))
        # print('\n'.join(tsvlines))
        # break

with open(base_dir + '/wrong_preds.tsv', 'w') as f:
    f.write('\n'.join(tsvlines))

In [3]:
# analysing proper nouns in the validation dataset
!pip install spacy
!python -m spacy download en_core_web_lg

You are using pip version 19.0.3, however version 20.0.2 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.
Collecting en_core_web_lg==2.2.5 from https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz#egg=en_core_web_lg==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9MB)
Installing collected packages: en-core-web-lg
  Running setup.py install for en-core-web-lg: started
    Running setup.py install for en-core-web-lg: finished with status 'done'
Successfully installed en-core-web-lg-2.2.5
✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_lg')
You are using pip version 19.0.3, however version 20.0.2 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [9]:
import spacy
import en_core_web_lg

nlp = en_core_web_lg.load()

In [24]:
# analysing proper nouns in the validation dataset
import json
from spacy.matcher import Matcher

# def on_match(matcher, doc, id, matches):
#     print('Matched!', matches)

pattern = [{'POS': 'PROPN'}]  # look for proper nouns
matcher = Matcher(nlp.vocab)
# matcher.add("PropNounsInCQA", [pattern], on_match=on_match)  # matcher.add expects a list of list
matcher.add("PropNounsInCQA", [pattern])  # matcher.add expects a list of list

new_valid = []
# base_path = "D:\workspace\ASU\Courses\Spring-2020\CSE-576-Topics-in-Natural-Language-Processing\Project-COMMONSENSEQA\\NLP_CommonsenseQA\CommonsenseQA"
base_path = '/home/jupyter/CommonsenseQA'
with open(base_path + '/data/CommonsenseQA/valid.jsonl') as f:
    for line in f:
        q = json.loads(line)
        l = []
        l.append(q['question']['question_concept'])
        l.append(q['question']['stem'])
        l += [c['text'] for c in q['question']['choices']]
        doc = nlp(' '.join(l))  # get POS tags for concept + question + choices
        matches = matcher(doc)
        if matches:
            # q['has_propn'] = True
            q['proper_nouns'] = []
        print(doc[:])
        for m in matches:
            print(doc[m[1]:m[2]])
            q['proper_nouns'].append(doc[m[1]:m[2]])

        new_valid.append(json.dumps(q))

with open(base_path + '/data/CommonsenseQA/valid-propn.jsonl', 'w') as f:
    f.write('\n'.join(new_valid))


bank
library
department
store
mall
new
york


TypeError: Object of type Span is not JSON serializable

In [9]:
!pip install geotext

Collecting geotext
[?25l  Downloading https://files.pythonhosted.org/packages/25/c5/36351193092cb4c1d7002d2a3babe5e72ae377868473933d6f63b41e5454/geotext-0.4.0-py2.py3-none-any.whl (2.0MB)
[K     |████████████████████████████████| 2.0MB 2.7MB/s eta 0:00:01
[?25hInstalling collected packages: geotext
Successfully installed geotext-0.4.0


In [10]:
from geotext import GeoText

# So geotext doesn't seem to work
places = GeoText("Arizona is a great state")
places.__dict__

{'countries': [],
 'cities': [],
 'nationalities': [],
 'country_mentions': OrderedDict()}

In [12]:
# Find questions with where, what, not
import json
import re

whereq = 0
whatq = 0
notq = 0
# base_path = "D:\workspace\ASU\Courses\Spring-2020\CSE-576-Topics-in-Natural-Language-Processing\Project-COMMONSENSEQA\\NLP_CommonsenseQA\CommonsenseQA"
base_path = '/home/jupyter/CommonsenseQA'
with open(base_path + '/data/CommonsenseQA/valid.jsonl') as f:
    for line in f:
        example = json.loads(line)
        ques = example['question']['stem']
        # This is very crude and not really accurate as both words might appear in the question and will be added to both counts
        if re.search('where', ques, re.IGNORECASE):
            whereq += 1
        if re.search('what', ques, re.IGNORECASE):
            whatq += 1
        if re.search('not', ques, re.IGNORECASE):
            notq += 1

wherew = 0
whatw = 0
notw = 0
second_best = 0
total_wrong = 0
with open(base_path + '/wrong_preds.jsonl') as f:
    for line in f:
        total_wrong += 1
        example = json.loads(line)
        ques = example['question']['stem']
        if re.search('where', ques, re.IGNORECASE):
            wherew += 1
        if re.search('what', ques, re.IGNORECASE):
            whatw += 1
        if re.search('not', ques, re.IGNORECASE):
            notw += 1
        # Questions where 2nd best answer was the correct one
        if example['scores'][example['answerKey']] == list(sorted(example['scores'].values()))[-2]:
            second_best += 1

print(f'Number of questions with "what" predicted wrong: {whatw}/{whatq}')
print(f'Number of questions with "where" predicted wrong: {wherew}/{whereq}')
print(f'Number of questions with "not" predicted wrong: {notw}/{notq}')
print(f'Number of questions with correct answer being 2nd best: {second_best}/{total_wrong}')

Number of questions with "what" predicted wrong: 167/784
Number of questions with "where" predicted wrong: 81/357
Number of questions with "not" predicted wrong: 17/74
Number of questions with correct answer being 2nd best: 172/265


# PHASE 2 EXPERIMENTS

For phase 2, we tried to add external knowledge bases and extract facts for each quetion + answer option combination to train a model with extra knowledge.

We used scripts from [McQueen](https://github.com/ari9dam/McQueen) to ingest KB, IR from them, rerank facts using sentence similarity and finally construct the dataset in the format required by McQueen's MCQ solvers.

Following 3 experiments were tried:
  

In [3]:
!pip install pytorch-transformers==1.1.0

Collecting pytorch-transformers==1.1.0
  Using cached https://files.pythonhosted.org/packages/50/89/ad0d6bb932d0a51793eaabcf1617a36ff530dc9ab9e38f765a35dc293306/pytorch_transformers-1.1.0-py3-none-any.whl
Collecting boto3
  Using cached https://files.pythonhosted.org/packages/c8/1e/587abcd94e8f6dbd42df730f40eb5f7313b6fd7255f5ef5a0db53d116999/boto3-1.13.1-py2.py3-none-any.whl
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/92/9d/dcaaba6fcee6a5c3b36c465557720f088c29cdb5931bc8b4b2556394b3d0/sentencepiece-0.1.86-cp37-cp37m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 2.8MB/s eta 0:00:01
Collecting jmespath<1.0.0,>=0.7.1
  Using cached https://files.pythonhosted.org/packages/a3/43/1e939e1fcd87b827fe192d0c9fc25b48c5b3368902bfb913de7754b0dc03/jmespath-0.9.5-py2.py3-none-any.whl
Collecting botocore<1.17.0,>=1.16.1
  Using cached https://files.pythonhosted.org/packages/46/b8/588f44ac91f280beabd0d5ce192a65f50e32e39ebb2a419359

## Experiment 1: McQueen Roberta Concat Solver

See `logs/mcqueen-robertalg_concat_2e6_009.log` for the training log.

**Accuracy: ~20%**

See README for analysis.

In [6]:
!nohup python -u hf_trainer.py --training_data_path /home/jupyter/CommonsenseQA/data/CommonsenseQA/train_mergeIR.jsonl --validation_data_path /home/jupyter/CommonsenseQA/data/CommonsenseQA/valid_mergeIR.jsonl --num_labels 5  --mcq_model roberta-mcq-concat --bert_model roberta-large --output_dir ./robertalg_concat_2e6_009 --num_train_epochs 5 --train_batch_size 64  --do_eval --do_train --max_seq_length 8 --do_lower_case --gradient_accumulation_steps 1  --learning_rate 2e-6 --weight_decay 0.009  --eval_freq 1000 --warmup_steps 250  --overwrite_output_dir > mcqueen-robertalg_concat_2e6_009.log &

## Experiment 2: Fairseq Roberta Concat with KBs ARC, Webchild, OpenbookQA, and Atomic

See `CommonsenseQA/finetune-arc-web-open-atomic.sh` for the finetuning script and  `logs/finetune-arc-web-open-atomic.log` for training log.

In [3]:
%cd /home/jupyter/CommonsenseQA

/home/jupyter/CommonsenseQA


In [4]:
%%writefile finetune-arc-web-open-atomic.sh
#!/bin/bash

## Write the finetuning part to a bash script file
# Modified following from the original script to get it to run on Google AI platform and Colab
# - Set MAX_SENTENCES=8
# - Added --update-freq 4

MAX_UPDATES=3000      # Number of training steps.
WARMUP_UPDATES=150    # Linearly increase LR over this many steps.
LR=1e-05              # Peak LR for polynomial LR scheduler.
MAX_SENTENCES=2      # Batch size.
SEED=23                # Random seed.

BASEDIR=/home/jupyter
# CQA_PATH=/content/CommonsenseQA # For Google Colab
CQA_PATH=$BASEDIR/CommonsenseQA # For Kaggle
ROBERTA_PATH=${BASEDIR}/roberta.large/model.pt
DATA_DIR=${CQA_PATH}/data/CommonsenseQA/arc-web-open-atomic

# we use the --user-dir option to load the task from
# the examples/roberta/commonsense_qa directory:
FAIRSEQ_PATH=${CQA_PATH}/fairseq
FAIRSEQ_USER_DIR=${FAIRSEQ_PATH}/examples/roberta/commonsense_qa_with_kb

cd $FAIRSEQ_PATH
CUDA_VISIBLE_DEVICES=0 fairseq-train --fp16 --ddp-backend=no_c10d \
    $DATA_DIR \
    --update-freq 4 \
    --save-dir ./checkpoints \
    --user-dir $FAIRSEQ_USER_DIR \
    --restore-file $ROBERTA_PATH \
    --reset-optimizer --reset-dataloader --reset-meters \
    --no-epoch-checkpoints --no-last-checkpoints --no-save-optimizer-state \
    --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
    --task commonsense_qa_with_kb --init-token 0 --bpe gpt2 \
    --arch roberta_large --max-positions 512 \
    --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \
    --criterion sentence_ranking --num-classes 5 \
    --optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-06 --clip-norm 0.0 \
    --lr-scheduler polynomial_decay --lr $LR \
    --warmup-updates $WARMUP_UPDATES --total-num-update $MAX_UPDATES \
    --max-sentences $MAX_SENTENCES \
    --max-update $MAX_UPDATES \
    --log-format simple --log-interval 25 \
    --seed $SEED

Writing finetune-arc-web-open-atomic.sh


In [5]:
%cd /home/jupyter/CommonsenseQA/fairseq/examples/roberta/commonsense_qa_with_kb

import json
import torch
from fairseq.models.roberta import RobertaModel
# from examples.roberta import commonsense_qa  # load the Commonsense QA task
import commonsense_qa_with_kb_task  # load the Commonsense QA task

base_dir = '/home/jupyter/CommonsenseQA'
roberta = RobertaModel.from_pretrained(base_dir + '/fairseq/checkpoints', 'checkpoint_best.pt', base_dir + '/data/CommonsenseQA/arc-web-open-atomic')
# print(0)
roberta.eval()  # disable dropout
# print(1)
roberta.cuda()  # use the GPU (optional)
nsamples, ncorrect = 0, 0
wrong = []
with open(base_dir + '/data/CommonsenseQA/arc-web-open-atomic/valid.jsonl') as h:
    print(3)
    for line in h:
        example = json.loads(line)
        scores = []
        for i, choice in enumerate(example['choices']):
            input = roberta.encode(
                'Q: ' + example['question'],
                'A: ' + choice + '. ' + '. '.join(example['premises'][i]),
                no_separator=True
            )[:512]  # truncate to 512 if necessary
            score = roberta.predict('sentence_classification_head', input, return_logits=True)
            scores.append(score)
#             print(choice['label'], score.data.item())

        pred = torch.cat(scores).argmax()
#         print('pred: ', chr(ord('A') + pred), 'correct: ', example['answerKey'])
        answer = example['gold_label']
        nsamples += 1
        if pred == answer:
            ncorrect += 1
        else:
            example['predicted'] = chr(ord('A') + pred)
            example['scores'] = {chr(ord('A') + i): s.data.item() for (i, s) in enumerate(scores)}
            wrong.append(json.dumps(example))

# print(4)
# Write a file with JSON lines for wrong predictions
with open(base_dir + '/wrong_preds.jsonl', 'w') as f:
    f.write('\n'.join(wrong))

# print(5)
print(f'Accuracy: {ncorrect}/{nsamples} = {ncorrect / float(nsamples)}')
# Accuracy from FB AI: 0.7846027846027847

/home/jupyter/CommonsenseQA/fairseq/examples/roberta/commonsense_qa_with_kb
loading archive file /home/jupyter/CommonsenseQA/fairseq/checkpoints
loading archive file /home/jupyter/CommonsenseQA/data/CommonsenseQA/arc-web-open-atomic
| dictionary: 50265 types
3
Accuracy: 934/1221 = 0.764946764946765


## Experiment 3: Fairseq Roberta Concat with KBs ARC, Webchild, and ConceptNet

See `CommonsenseQA/finetune-web-arc-cn.sh` for the finetuning script and  `logs/finetune-web-arc-cn.log` for training log.

In [10]:
%cd /home/jupyter/CommonsenseQA/fairseq/examples/roberta/commonsense_qa_with_kb

import json
import torch
from fairseq.models.roberta import RobertaModel
# from examples.roberta import commonsense_qa  # load the Commonsense QA task
import commonsense_qa_with_kb_task  # load the Commonsense QA task

base_dir = '/home/jupyter/CommonsenseQA'
roberta = RobertaModel.from_pretrained(base_dir + '/fairseq/checkpoints', 'checkpoint_best.pt', base_dir + '/data/CommonsenseQA/web-arc-cn')
print(0)
roberta.eval()  # disable dropout
print(1)
roberta.cuda()  # use the GPU (optional)
nsamples, ncorrect = 0, 0
wrong = []
with open(base_dir + '/data/CommonsenseQA/web-arc-cn/valid.jsonl') as h:
    print(3)
    for line in h:
        example = json.loads(line)
        scores = []
        for i, choice in enumerate(example['choices']):
            input = roberta.encode(
                'Q: ' + example['question'],
                'A: ' + choice + '. ' + '. '.join(example['premises'][i]),
                no_separator=True
            )[:512]
            score = roberta.predict('sentence_classification_head', input, return_logits=True)
            scores.append(score)
#             print(choice['label'], score.data.item())

        pred = torch.cat(scores).argmax()
#         print('pred: ', chr(ord('A') + pred), 'correct: ', example['answerKey'])
        answer = example['gold_label']
        nsamples += 1
        if pred == answer:
            ncorrect += 1
        else:
            example['predicted'] = chr(ord('A') + pred)
            example['scores'] = {chr(ord('A') + i): s.data.item() for (i, s) in enumerate(scores)}
            wrong.append(json.dumps(example))

print(4)
# Write a file with JSON lines for wrong predictions
with open(base_dir + '/wrong_preds.jsonl', 'w') as f:
    f.write('\n'.join(wrong))

print(5)
print(f'Accuracy: {ncorrect}/{nsamples} = {ncorrect / float(nsamples)}')
# Accuracy from FB AI: 0.7846027846027847

/home/jupyter/CommonsenseQA/fairseq/examples/roberta/commonsense_qa_with_kb
loading archive file /home/jupyter/CommonsenseQA/fairseq/checkpoints
loading archive file /home/jupyter/CommonsenseQA/data/CommonsenseQA/web-arc-cn
| dictionary: 50265 types
0
1
3
4
5
Accuracy: 935/1221 = 0.7657657657657657
