In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
sys.path.append("examples/")

import logging
import argparse
import json
from tqdm import tqdm, trange
import csv

import numpy as np
import torch
import torch.nn as nn

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler

from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import BertModel 
from pytorch_pretrained_bert.optimization import BertAdam

from torch.utils.data import Dataset
import random

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [2]:
from run_autoreg_eval import BERTDataset, RNNModel

In [13]:
# args
gradient_accumulation_steps = 1
train_batch_size = 1
eval_file = "dataset/dev-v2.0.json"
max_seq_length=128
on_memory = True
bert_model = "autoreg_model_lm_noft/pytorch_model9.bin"

In [14]:
device = torch.device("cuda" if torch.cuda.is_available()  else "cpu")
n_gpu = torch.cuda.device_count()

random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if n_gpu > 0:
    torch.cuda.manual_seed_all(42)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

# Load eval_data
eval_dataset_answerable = BERTDataset(eval_file, tokenizer, seq_len=max_seq_length,
                            on_memory=on_memory, answerable=True)
eval_dataset_unanswerable = BERTDataset(eval_file, tokenizer, seq_len=max_seq_length,
                           on_memory=on_memory, answerable=False)

# Prepare model
model_state_dict = torch.load(bert_model, map_location='cpu') #TODO daniter: remove this map_location
## TODO daniter: check if bert model is being loaded correctly
context_model = BertModel.from_pretrained("bert-base-uncased")#, state_dict=model_state_dict)
question_model = BertModel.from_pretrained("bert-base-uncased")#, state_dict=model_state_dict)
context_model.to(device)
question_model.to(device)


# Prepare optimizer
print("Checking the vocab size:", len(tokenizer.vocab))
# 768 is bert hidden size, 256 is GRU hidden size, 1 is the layers in the GRU
model = RNNModel("GRU", len(tokenizer.vocab), 768, 768, 1, context_model, question_model, ngpu=n_gpu)
model.load_state_dict(model_state_dict)
model.to(device)

# eval loader
eval_sampler_ans = SequentialSampler(eval_dataset_answerable)
eval_dataloader_ans = DataLoader(eval_dataset_answerable, sampler=eval_sampler_ans,
                                 batch_size=train_batch_size)
eval_sampler_unans = SequentialSampler(eval_dataset_unanswerable)
eval_dataloader_unans = DataLoader(eval_dataset_unanswerable, sampler=eval_sampler_unans,
                                   batch_size=train_batch_size)


criterion = nn.CrossEntropyLoss()
model.init_hidden(train_batch_size)
pass
# with torch.no_grad():
#     model.eval()

#     eval_loss_ans = 0
#     for batch_i, eval_batch in enumerate(eval_dataloader_ans):
#         assert False
#         if batch_i % 1000 == 0:
#             print("#### DANITER completed answerable", batch_i)
#         eids = eval_batch[-1]
#         eval_batch = tuple(t.to(device) for t in eval_batch[:-1])
#         question_ids, question_mask, context_ids, context_mask, targets = eval_batch
#         output, _ = model(context_ids, context_mask, question_ids, question_mask)
#         loss = criterion(output.view(-1, len(tokenizer.vocab)), question_ids.view(-1))
#         eval_loss_ans += loss.item()
#     print("##### DANITER EVAL LOSS IS (ANSWERABLE) : ", eval_loss_ans)

#     eval_loss_unans = 0
#     for batch_i, eval_batch in enumerate(eval_dataloader_unans):
#         if batch_i % 1000 == 0:
#             print("#### DANITER completed unanswerable", batch_i)
#         eids = eval_batch[-1]
#         eval_batch = tuple(t.to(device) for t in eval_batch[:-1])
#         question_ids, question_mask, context_ids, context_mask, targets = eval_batch
#         output, _ = model(context_ids, context_mask, question_ids, question_mask)
#         loss = criterion(output.view(-1, len(tokenizer.vocab)), question_ids.view(-1))
#         eval_loss_unans += loss.item()
#     print("##### DANITER EVAL LOSS IS (UNANSWERABLE) : ", eval_loss_unans)

02/15/2019 11:12:14 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /Users/daniter/.pytorch_pretrained_bert/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
Loading Squad: 100%|██████████| 35/35 [00:00<00:00, 715.28it/s]
Loading Squad: 100%|██████████| 35/35 [00:00<00:00, 3620.51it/s]
02/15/2019 11:12:16 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /Users/daniter/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
02/15/2019 11:12:16 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /Users/daniter/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1

Checking the vocab size: 30522


  "num_layers={}".format(dropout, num_layers))


In [52]:
with torch.no_grad():
    model.eval()

    eval_loss_ans = 0
    for batch_i, eval_batch in enumerate(eval_dataloader_ans):
        eids = eval_batch[-1]
        eval_batch = tuple(t.to(device) for t in eval_batch[:-1])
        question_ids, question_mask, context_ids, context_mask, targets = eval_batch
        output, _ = model(context_ids, context_mask, question_ids, question_mask)
        loss = criterion(output.view(-1, len(tokenizer.vocab)), targets.view(-1))
        eval_loss_ans += loss.item()
        if batch_i > 10:
            break
#         if loss.item() > 0.01:
#             print(batch_i, eval_loss_ans)
#             break

In [53]:
print(loss.item())

0.38543012738227844


In [54]:
print(tokenizer.convert_ids_to_tokens(context_ids.data.numpy()[0]))

['[CLS]', 'in', 'the', 'course', 'of', 'the', '10th', 'century', ',', 'the', 'initially', 'destructive', 'inc', '##urs', '##ions', 'of', 'norse', 'war', 'bands', 'into', 'the', 'rivers', 'of', 'france', 'evolved', 'into', 'more', 'permanent', 'en', '##camp', '##ments', 'that', 'included', 'local', 'women', 'and', 'personal', 'property', '.', 'the', 'duchy', 'of', 'normandy', ',', 'which', 'began', 'in', '911', 'as', 'a', 'fi', '##ef', '##dom', ',', 'was', 'established', 'by', 'the', 'treaty', 'of', 'saint', '-', 'clair', '-', 'sur', '-', 'ep', '##te', 'between', 'king', 'charles', 'iii', 'of', 'west', 'fran', '##cia', 'and', 'the', 'famed', 'viking', 'ruler', 'roll', '##o', ',', 'and', 'was', 'situated', 'in', 'the', 'former', 'frankish', 'kingdom', 'of', 'ne', '##ust', '##ria', '.', 'the', 'treaty', 'offered', 'roll', '##o', 'and', 'his', 'men', 'the', 'french', 'lands', 'between', 'the', 'river', 'ep', '##te', 'and', 'the', 'atlantic', 'coast', 'in', 'exchange', 'for', 'their', 'prot

In [55]:
print(question_ids)
print(tokenizer.convert_ids_to_tokens(question_ids.data.numpy()[0]))

tensor([[  101,  2040,  2106,  4897,  2080,  3696,  1996,  5036,  1997,  3002,
          1011, 17936,  1011,  7505,  1011,  4958,  2618,  2007,  1029,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [56]:
o = output.data.numpy()
print(tokenizer.convert_ids_to_tokens(np.argmax(o[0], axis=1)))

['who', 'did', 'john', '##o', 'establish', 'the', 'treaty', 'of', 'tie', '-', 'john', 'de', 'de', '-', 'en', '##te', '?', '?', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', 

In [47]:
for i in range(10):
    print(np.argmax(o[0,i,:]), np.exp(np.max(o[0,i,:]))/ np.sum(np.exp(o[0,i,:])))

2013 0.42656258
2029 0.99888664
3032 0.98821425
2106 0.96953803
1996 0.7692266
7658 0.3381953
21754 0.91210896
2013 0.755071
0 0.99999976
0 1.0


In [21]:
from collections import Counter

In [35]:
c = Counter()

In [50]:
for i in range(o.shape[2]):
    c[i] = np.exp(o[0,7,i])/ np.sum(np.exp(o[0,7,:]))

In [51]:
for idx, score in c.most_common()[:25]:
    print(tokenizer.convert_ids_to_tokens([idx]), score)

['from'] 0.755071
['?'] 0.24354506
['between'] 0.00030850538
['and'] 0.00017943446
['in'] 0.00016526289
['originate'] 0.00013509695
['with'] 0.00012702325
['out'] 5.2344498e-05
['to'] 4.9995124e-05
['after'] 4.5036806e-05
['during'] 3.9964147e-05
['for'] 3.017783e-05
['primarily'] 2.6414084e-05
['into'] 2.418289e-05
['instead'] 1.4401603e-05
['or'] 1.1509827e-05
['by'] 9.858294e-06
['north'] 8.104101e-06
[','] 6.5176055e-06
['towards'] 5.350587e-06
['originating'] 5.0490876e-06
['located'] 4.877459e-06
['which'] 3.9955703e-06
['that'] 3.991282e-06
['acquired'] 3.5135242e-06


In [41]:
c[2435] # normandy

0.0006109915

In [60]:
with torch.no_grad():
    model.eval()

    eval_loss_ans = 0
    for batch_i, eval_batch in enumerate(eval_dataloader_unans):
        eids = eval_batch[-1]
        eval_batch = tuple(t.to(device) for t in eval_batch[:-1])
        question_ids, question_mask, context_ids, context_mask, targets = eval_batch
        output, _ = model(context_ids, context_mask, question_ids, question_mask)
        loss = criterion(output.view(-1, len(tokenizer.vocab)), question_ids.view(-1))
        eval_loss_ans += loss.item()
        break
        if loss.item() > 0.01:
            print(batch_i, eval_loss_ans)
            break

In [61]:
eval_loss_ans

3.0606985092163086e-05

In [62]:
print(question_ids)
print(tokenizer.convert_ids_to_tokens(question_ids.data.numpy()[0]))

tensor([[  101,  2040,  2435,  2037,  2171,  2000, 13298,  1999,  1996,  6694,
          1005,  1055,  1998, 22096,  1005,  1055,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [63]:
o = output.data.numpy()
print(o)
print(tokenizer.convert_ids_to_tokens(np.argmax(o[0], axis=1)))

[[[ 4.9845333  -0.5840121  -1.7072299  ... -0.8986035  -0.7701622
   -2.4625373 ]
  [ 5.5687795  -3.1512156  -1.5189477  ... -2.2867513  -0.81157726
   -1.9473257 ]
  [ 1.8056442  -2.4558816  -1.3853037  ... -1.5242887  -1.9538682
   -2.677875  ]
  ...
  [33.50697    -1.898835   -3.0913079  ... -3.4296584  -3.0715473
   -2.3606715 ]
  [33.482018   -1.8573587  -3.0754821  ... -3.4108062  -3.087913
   -2.3764603 ]
  [33.43003    -1.8635046  -3.1077945  ... -3.40165    -3.1059961
   -2.4096801 ]]]
['[CLS]', 'who', 'gave', 'their', 'name', 'to', 'normandy', 'in', 'the', '1000', "'", 's', 'and', '1100', "'", 's', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]

In [66]:
for i in range(20):
    print(np.argmax(o[0,i,:]), np.exp(np.max(o[0,i,:]))/ np.sum(np.exp(o[0,i,:])))

101 1.0
2040 0.99998236
2435 0.99976426
2037 0.99991757
2171 0.999995
2000 0.99999946
13298 0.9999513
1999 0.99999833
1996 0.99999017
6694 0.998527
1005 0.99999917
1055 0.999995
1998 0.99999845
22096 0.99808013
1005 0.9999972
1055 0.99999726
0 0.99986583
0 0.99999994
0 1.0
0 1.0


# Interesting Examples

In [91]:
from collections import Counter
def print_details(context_ids, question_ids, output, loss, tokenizer):
    print("Loss:", loss.item())
    print("CONTEXT")
    print(tokenizer.convert_ids_to_tokens(context_ids.data.numpy()[0]))
    print("~"*30)
    print("QUESTION")
    q_ids = [i for i in question_ids.data.numpy()[0] if i != 0]
    print(q_ids)
    q_toks = [tok for tok in tokenizer.convert_ids_to_tokens(question_ids.data.numpy()[0]) if tok != '[PAD]']
    print(q_toks)
    print("~"*30)
    print("OUTPUT")
    o = output.data.numpy()
    out_ids = [i for i in np.argmax(o[0], axis=1) if i != 0]
    out_toks = [tok for tok in tokenizer.convert_ids_to_tokens(np.argmax(o[0], axis=1)) if tok != '[PAD]']
    scores = [(np.argmax(o[0,i,:]), np.exp(np.max(o[0,i,:]))/ np.sum(np.exp(o[0,i,:]))) for i in range(len(out_toks))]
    print(out_toks)
    print(list(zip(out_toks, scores)))
    print("~"*30)
    print("TOP K FOR INCORRECT TERMS:")
    for tok_i, (tar, out) in enumerate(zip(q_ids[1:], out_ids)):
        if tar != out:
            print ("Output", out_toks[tok_i], "instead of ", q_toks[tok_i+1])
            c = Counter()
            for i in range(o.shape[2]):
                c[i] = np.exp(o[0,tok_i,i])/ np.sum(np.exp(o[0,tok_i,:]))
            for idx, score in c.most_common()[:10]:
                print("- \t",tokenizer.convert_ids_to_tokens([idx]), score)
    print("#"*30)

In [92]:
explore = {eval_dataloader_ans: [0,1], eval_dataloader_unans: [0, 15]}

with torch.no_grad():
    model.eval()

    eval_loss_ans = 0
    for dataloader in explore.keys():
        for batch_i, eval_batch in enumerate(dataloader):
            eids = eval_batch[-1]
            eval_batch = tuple(t.to(device) for t in eval_batch[:-1])
            question_ids, question_mask, context_ids, context_mask, targets = eval_batch
            output, _ = model(context_ids, context_mask, question_ids, question_mask)
            loss = criterion(output.view(-1, len(tokenizer.vocab)), targets.view(-1))
            eval_loss_ans += loss.item()
            if batch_i in explore[dataloader]:
                if dataloader == eval_dataloader_ans:
                    print("Answerable:",batch_i)
                else:
                    print("Unanswerable:",batch_i)
                print_details(context_ids, question_ids, output, loss, tokenizer)
            if batch_i > max(explore[dataloader]):
                break


Answerable: 0
Loss: 0.07345549762248993
CONTEXT
['[CLS]', 'the', 'norman', '##s', '(', 'norman', ':', 'no', '##ur', '##man', '##ds', ';', 'french', ':', 'norman', '##ds', ';', 'latin', ':', 'norman', '##ni', ')', 'were', 'the', 'people', 'who', 'in', 'the', '10th', 'and', '11th', 'centuries', 'gave', 'their', 'name', 'to', 'normandy', ',', 'a', 'region', 'in', 'france', '.', 'they', 'were', 'descended', 'from', 'norse', '(', '"', 'norman', '"', 'comes', 'from', '"', 'norse', '##man', '"', ')', 'raiders', 'and', 'pirates', 'from', 'denmark', ',', 'iceland', 'and', 'norway', 'who', ',', 'under', 'their', 'leader', 'roll', '##o', ',', 'agreed', 'to', 'swear', 'fe', '##al', '##ty', 'to', 'king', 'charles', 'iii', 'of', 'west', 'fran', '##cia', '.', 'through', 'generations', 'of', 'assimilation', 'and', 'mixing', 'with', 'the', 'native', 'frankish', 'and', 'roman', '-', 'gaul', '##ish', 'populations', ',', 'their', 'descendants', 'would', 'gradually', 'merge', 'with', 'the', 'carol', '##ing

- 	 ['john'] 0.33094674
- 	 ['joshua'] 0.21873909
- 	 ['oct'] 0.07884895
- 	 ['charles'] 0.033959374
- 	 ['edward'] 0.03276904
- 	 ['frederic'] 0.031260744
- 	 ['james'] 0.017703975
- 	 ['napoleon'] 0.013841213
- 	 ['alfred'] 0.01328788
- 	 ['ae'] 0.012359366
Output arrive instead of  begin
- 	 ['arrive'] 0.9563557
- 	 ['begin'] 0.022727778
- 	 ['depart'] 0.013512615
- 	 ['start'] 0.003096774
- 	 ['return'] 0.00088552793
- 	 ['take'] 0.0006476474
- 	 ['come'] 0.0006314512
- 	 ['go'] 0.00036981778
- 	 ['become'] 0.00032174477
- 	 ['move'] 0.0002980802
##############################


# Notes
* LM was suprisingly good at guessing question type and general structure of the question. It may be because the BERT representation is leaky in terms of representation  
    - may be interesting result of its own ... Q: How much does the feature of each BERT word contain information about the surrounding BERT words?  
    - this is probably why the model is pretty good at guessing Q type from the [CLS] token  
* the Sentence repr is not enough represent specific entities in the text  
    - gets confused between normandy and france and mongolia and normans   
    - this might be because I don't train the sentence repr which is probably a mistake and we should retrain this one with trained sentence repr and forward masking in the question repr
    - Maybe we can do an attention over entities? 
    - There is a bias towards more common entities right now (ie. john and paris instead of rollo and normandy)
    - the fact that stuff like Rollo and dates were predicted well means there is leakiness in the representation
* rank may be more important than loss since sometimes the #1 option has very high prob but number 2 is really good (eg. arrive vs begin U15)