In [6]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
sys.path.append("examples/")

import logging
import argparse
import json
from tqdm import tqdm, trange
import csv

import numpy as np
import torch
import torch.nn as nn

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler

from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import BertForPreTraining 
from pytorch_pretrained_bert.optimization import BertAdam

from torch.utils.data import Dataset
import random

In [22]:
from run_chunk_lm_finetune import InputExample, random_word, InputFeatures

In [8]:
# args
gradient_accumulation_steps = 1  # not referenced by any other cell visible in this notebook
train_batch_size = 1  # batch size handed to both evaluation DataLoaders
eval_file = "dataset/dev-v2.0.json"  # SQuAD-style JSON file parsed by BERTDataset
max_seq_length=256  # passed to BERTDataset as seq_len
on_memory = True  # BERTDataset raises unless this is True
bert_model = "model_chunk/pytorch_model.bin"  # read with torch.load and passed as state_dict to BertForPreTraining

In [12]:
class BERTDataset(Dataset):
    """Dataset of (context, question) pairs read from a SQuAD-style JSON file.

    Each item is a tuple of five tensors (input_ids, input_mask, segment_ids,
    lm_label_ids, is_next) built by ``convert_example_to_features`` from a
    paragraph context (first segment) and one of its questions (second segment).

    :param corpus_path: path to the SQuAD-style JSON file.
    :param tokenizer: tokenizer exposing ``vocab`` and ``tokenize``.
    :param seq_len: maximum total sequence length, including [CLS] and two [SEP].
    :param encoding: text encoding used to read ``corpus_path``.
    :param on_memory: must be True; lazy loading from disk is not supported.
    :param answerable_flag: True keeps only answerable questions, False keeps only
        questions flagged ``is_impossible``.
    """

    def __init__(self, corpus_path, tokenizer, seq_len, encoding="utf-8", on_memory=True, answerable_flag=True):
        self.vocab = tokenizer.vocab
        self.tokenizer = tokenizer
        self.seq_len = seq_len
        self.on_memory = on_memory
        self.corpus_path = corpus_path
        self.encoding = encoding

        # number of __getitem__ calls so far; used as the guid of each example
        self.sample_counter = 0
        self.line_buffer = None  # unused here; kept so the attribute set is unchanged

        # in-memory storage: raw strings plus (context index, question index) pairs
        self.questions = []
        self.contexts = []
        self.examples = []

        # load samples later lazily from disk is not implemented
        if not on_memory:
            raise Exception("No supported")

        # DANITER: Load Squad data (honour the requested encoding instead of the platform default)
        with open(corpus_path, 'r', encoding=encoding) as handle:
            data = json.load(handle)['data']

        for article in tqdm(data, "Loading Squad", total=len(data)):
            for paragraph in article['paragraphs']:
                self.contexts.append(paragraph['context'])
                context_idx = len(self.contexts) - 1
                for qa in paragraph['qas']:
                    # every question is stored, even when filtered out below
                    self.questions.append(qa['question'])
                    # files without the flag (e.g. SQuAD v1.1) are treated as fully answerable
                    unanswerable = bool(qa.get('is_impossible', False))
                    # keep the question only when its answerability matches the requested subset
                    if unanswerable == bool(answerable_flag):
                        continue
                    self.examples.append((context_idx, len(self.questions) - 1))

    def __len__(self):
        """Return the number of (context, question) pairs kept after filtering."""
        return len(self.examples)

    def __getitem__(self, item):
        """Build the feature tensors for example ``item``.

        If the pair does not fit into ``seq_len`` tokens, the following examples are
        tried instead (wrapping around at the end of the dataset).

        :param item: int, index of the requested example.
        :return: tuple of tensors (input_ids, input_mask, segment_ids, lm_label_ids, is_next).
        :raises ValueError: if no example fits into ``seq_len``.
        """
        cur_id = self.sample_counter
        self.sample_counter += 1
        if not self.on_memory:
            raise Exception("No supported")

        attempts = 0
        while True:
            t1, t2, target, is_next_label = self.get_example(item)

            # tokenize
            tokens_a = self.tokenizer.tokenize(t1)
            tokens_b = self.tokenizer.tokenize(t2)
            # "+ 3" accounts for [CLS], [SEP], [SEP]
            if len(tokens_a) + len(tokens_b) + 3 <= self.seq_len:
                break
            attempts += 1
            if attempts >= len(self.examples):
                raise ValueError("No example fits within seq_len=%d" % self.seq_len)
            # wrap around instead of running past the last example
            item = (item + 1) % len(self.examples)

        # combine to one sample
        cur_example = InputExample(guid=cur_id, tokens_a=tokens_a, tokens_b=tokens_b,
                                   is_next=is_next_label, target=target)

        # transform sample to features
        cur_features = convert_example_to_features(cur_example, self.seq_len, self.tokenizer)

        return (torch.tensor(cur_features.input_ids),
                torch.tensor(cur_features.input_mask),
                torch.tensor(cur_features.segment_ids),
                torch.tensor(cur_features.lm_label_ids),
                torch.tensor(cur_features.is_next))

    def get_example(self, index):
        """
        Get one (context, question) sample. No next-sentence sampling is done, so the
        label is always 1 and the target is a placeholder pair.
        :param index: int, index of sample.
        :return: (str, str, (None, None), int), context, question, placeholder target, label 1
        """
        t1, t2 = self.get_corpus_line(index)

        target = (None, None)  # keep same shape
        # Daniter we do not do next sentence prediction

        assert len(t1) > 0
        assert len(t2) > 0
        return t1, t2, target, 1

    def get_corpus_line(self, item):
        """
        Look up the context and question strings of one example.
        :param item: int, index of sample; must be smaller than len(self.examples).
        :return: (str, str), context and question
        """
        assert item < len(self.examples)
        if not self.on_memory:
            raise Exception("No supported")

        # DANITER - get the context and question pair based on the example indexes
        context_idx, question_idx = self.examples[item]
        return self.contexts[context_idx], self.questions[question_idx]

In [18]:
def convert_example_to_features(example, max_seq_length, tokenizer):
    """
    Turn a tokenized sentence pair into fixed-length model inputs.

    Layout of the produced sequence: [CLS] <first segment> [SEP] <second segment> [SEP],
    followed by padding up to ``max_seq_length``. No truncation happens here; the
    function asserts that the padded inputs have exactly ``max_seq_length`` entries.
    :param example: InputExample, provides tokens_a, tokens_b and is_next
    :param max_seq_length: int, length every returned list is padded to
    :param tokenizer: Tokenizer, used by random_word and for token -> id conversion
    :return: InputFeatures with input_ids, input_mask, segment_ids, lm_label_ids, is_next
    """
    first_tokens = example.tokens_a
    second_tokens = example.tokens_b

    # random_word runs on the first segment, then on the second one
    _unused_first, first_labels = random_word(first_tokens, tokenizer, question=False)
    second_random, second_labels = random_word(second_tokens, tokenizer, question=False)

    # label -1 at the [CLS] / [SEP] / [SEP] positions
    lm_label_ids = [-1] + first_labels + [-1] + second_labels + [-1]

    # first segment (type id 0) is built from example.tokens_a itself
    tokens = ["[CLS]"] + list(first_tokens) + ["[SEP]"]
    segment_ids = [0] * len(tokens)

    # second segment (type id 1) is built from the tokens returned by random_word
    assert len(second_random) > 0
    second_part = list(second_random) + ["[SEP]"]
    tokens.extend(second_part)
    segment_ids.extend([1] * len(second_part))

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # mask: 1 for real tokens, 0 for padding
    input_mask = [1] * len(input_ids)

    # zero-pad inputs; a non-positive pad count adds nothing
    pad_count = max_seq_length - len(input_ids)
    input_ids.extend([0] * pad_count)
    input_mask.extend([0] * pad_count)
    segment_ids.extend([0] * pad_count)
    # labels are padded with -1 independently of the inputs
    lm_label_ids.extend([-1] * (max_seq_length - len(lm_label_ids)))

    assert len(input_ids) == max_seq_length, len(input_ids)
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    return InputFeatures(input_ids=input_ids,
                         input_mask=input_mask,
                         segment_ids=segment_ids,
                         lm_label_ids=lm_label_ids,
                         is_next=example.is_next)

In [14]:
# Device selection
n_gpu = torch.cuda.device_count()
device = torch.device("cuda" if torch.cuda.is_available()  else "cpu")

# Seed every random number generator with the same value
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(42)
if n_gpu > 0:
    torch.cuda.manual_seed_all(42)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

# Load eval_data: answerable questions first, then the unanswerable ones
eval_dataset_answerable, eval_dataset_unanswerable = (
    BERTDataset(eval_file, tokenizer, seq_len=max_seq_length,
                on_memory=on_memory, answerable_flag=keep_answerable)
    for keep_answerable in (True, False)
)

# Prepare model
# TODO daniter: remove this map_location
model_state_dict = torch.load(bert_model, map_location='cpu')
# TODO daniter: check if bert model is being loaded correctly
model = BertForPreTraining.from_pretrained("bert-base-uncased", state_dict=model_state_dict)
model.to(device)

print("Checking the vocab size:", len(tokenizer.vocab))

# eval loaders: examples are visited in dataset order
eval_sampler_ans = SequentialSampler(eval_dataset_answerable)
eval_sampler_unans = SequentialSampler(eval_dataset_unanswerable)
eval_dataloader_ans = DataLoader(eval_dataset_answerable,
                                 sampler=eval_sampler_ans,
                                 batch_size=train_batch_size)
eval_dataloader_unans = DataLoader(eval_dataset_unanswerable,
                                   sampler=eval_sampler_unans,
                                   batch_size=train_batch_size)


02/26/2019 15:46:11 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /Users/daniter/.pytorch_pretrained_bert/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
Loading Squad: 100%|██████████| 35/35 [00:00<00:00, 4334.37it/s]
Loading Squad: 100%|██████████| 35/35 [00:00<00:00, 3928.09it/s]
02/26/2019 15:46:12 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /Users/daniter/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
02/26/2019 15:46:12 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /Users/daniter/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d

Checking the vocab size: 30522


In [109]:
# Probe the LM head on one batch of the unanswerable set: replace a single input
# position with [MASK], run the model, and keep `input_ids` / `output` around for
# the inspection cells that follow.
target = 50  # index of the batch to inspect
mask_position = 143  # position in the sequence that gets replaced by [MASK]
mask_token_id = tokenizer.vocab["[MASK]"]  # looked up instead of hard-coding the id
with torch.no_grad():
    model.eval()

    eval_loss_ans = 0
    for batch_i, eval_batch in enumerate(eval_dataloader_unans):
        # skip before moving anything to the device
        if batch_i != target:
            continue
        eval_batch = tuple(t.to(device) for t in eval_batch)
        input_ids, input_mask, segment_ids, lm_label_ids, is_next = eval_batch
        input_ids[0][mask_position] = mask_token_id
        #input_ids[0][138] = 103
        #input_ids[0][117] = 103
        # .cpu() so the conversion to numpy also works when the batch lives on a GPU
        print(tokenizer.convert_ids_to_tokens(input_ids.data.cpu().numpy()[0]))
        output, _ = model(input_ids, segment_ids, input_mask, None, None)
        break

['[CLS]', 'the', 'norman', '##s', 'were', 'in', 'contact', 'with', 'england', 'from', 'an', 'early', 'date', '.', 'not', 'only', 'were', 'their', 'original', 'viking', 'brethren', 'still', 'ra', '##va', '##ging', 'the', 'english', 'coasts', ',', 'they', 'occupied', 'most', 'of', 'the', 'important', 'ports', 'opposite', 'england', 'across', 'the', 'english', 'channel', '.', 'this', 'relationship', 'eventually', 'produced', 'closer', 'ties', 'of', 'blood', 'through', 'the', 'marriage', 'of', 'emma', ',', 'sister', 'of', 'duke', 'richard', 'ii', 'of', 'normandy', ',', 'and', 'king', 'ethel', '##red', 'ii', 'of', 'england', '.', 'because', 'of', 'this', ',', 'ethel', '##red', 'fled', 'to', 'normandy', 'in', '101', '##3', ',', 'when', 'he', 'was', 'forced', 'from', 'his', 'kingdom', 'by', 'sw', '##ey', '##n', 'fork', '##be', '##ard', '.', 'his', 'stay', 'in', 'normandy', '(', 'until', '1016', ')', 'influenced', 'him', 'and', 'his', 'sons', 'by', 'emma', ',', 'who', 'stayed', 'in', 'normandy

In [110]:
# Highest-scoring token at every position of the first batch item.
# .cpu() keeps this working when `output` lives on a GPU (numpy() needs a CPU tensor).
print(tokenizer.convert_ids_to_tokens(np.argmax(output[0].data.cpu().numpy(), axis=1)))

['?', 'the', 'norman', '##s', 'were', 'in', 'contact', 'with', 'england', 'from', 'an', 'early', 'date', '.', 'not', 'only', 'were', 'their', 'original', 'viking', 'brethren', 'still', 'ra', '##va', '##ging', 'the', 'english', 'coasts', ',', 'they', 'occupied', 'most', 'of', 'the', 'important', 'ports', 'opposite', 'england', 'across', 'the', 'english', 'channel', '.', 'this', 'relationship', 'eventually', 'made', 'closer', 'ties', 'of', 'blood', 'through', 'the', 'marriage', 'of', 'emma', ',', 'sister', 'of', 'duke', 'richard', 'ii', 'of', 'normandy', ',', 'and', 'king', 'ethel', '##red', 'ii', 'of', 'england', '.', 'because', 'of', 'this', ',', 'ethel', '##red', 'fled', 'to', 'normandy', 'in', '101', '##3', ',', 'when', 'he', 'was', 'forced', 'from', 'his', 'kingdom', 'by', 'sw', '##ey', '##n', 'fork', '##be', '##ard', '.', 'his', 'stay', 'in', 'normandy', '(', 'until', '1016', ')', 'influenced', 'him', 'and', 'his', 'sons', 'by', 'emma', ',', 'who', 'stayed', 'in', 'normandy', 'afte

In [100]:
# Pair every input token of the first batch item with its position; enumerate avoids
# hard-coding the sequence length, and .cpu() keeps it working for GPU tensors.
print([(tok, pos) for pos, tok in
       enumerate(tokenizer.convert_ids_to_tokens(input_ids.data.cpu().numpy()[0]))])


[('[CLS]', 0), ('the', 1), ('norman', 2), ('##s', 3), ('were', 4), ('in', 5), ('contact', 6), ('with', 7), ('england', 8), ('from', 9), ('an', 10), ('early', 11), ('date', 12), ('.', 13), ('not', 14), ('only', 15), ('were', 16), ('their', 17), ('original', 18), ('viking', 19), ('brethren', 20), ('still', 21), ('ra', 22), ('##va', 23), ('##ging', 24), ('the', 25), ('english', 26), ('coasts', 27), (',', 28), ('they', 29), ('occupied', 30), ('most', 31), ('of', 32), ('the', 33), ('important', 34), ('ports', 35), ('opposite', 36), ('england', 37), ('across', 38), ('the', 39), ('english', 40), ('channel', 41), ('.', 42), ('this', 43), ('relationship', 44), ('eventually', 45), ('produced', 46), ('closer', 47), ('ties', 48), ('of', 49), ('blood', 50), ('through', 51), ('the', 52), ('marriage', 53), ('of', 54), ('emma', 55), (',', 56), ('sister', 57), ('of', 58), ('duke', 59), ('richard', 60), ('ii', 61), ('of', 62), ('normandy', 63), (',', 64), ('and', 65), ('king', 66), ('ethel', 67), ('##re

In [112]:
# 25 highest-scoring vocabulary entries at position 145 of the first batch item.
# torch.topk replaces a Python loop over the whole vocabulary and does not need
# `Counter`, which is only imported further down in the notebook.
top_scores, top_ids = torch.topk(output[0][145], 25)
for x, val in zip(top_ids.tolist(), top_scores):
    print(tokenizer.convert_ids_to_tokens([x]), val)

['vikings'] tensor(16.1189)
['english'] tensor(13.6539)
['##s'] tensor(13.0520)
['norman'] tensor(12.4569)
['viking'] tensor(10.6945)
['scandinavian'] tensor(10.4082)
['germans'] tensor(9.7384)
['british'] tensor(9.5046)
['danes'] tensor(9.4454)
['danish'] tensor(9.2061)
['angles'] tensor(9.1156)
['invaders'] tensor(9.0145)
['french'] tensor(8.9796)
['irish'] tensor(8.6128)
['norse'] tensor(8.6118)
['conquest'] tensor(8.5325)
['saxons'] tensor(8.5170)
['normandy'] tensor(8.4517)
['sas'] tensor(8.3299)
['welsh'] tensor(8.2382)
['knights'] tensor(7.9403)
['people'] tensor(7.7032)
['humans'] tensor(7.6789)
['spanish'] tensor(7.6453)
['swedish'] tensor(7.6362)


In [74]:
# index of the highest score at position 120 of the first batch item
output[0][120].argmax()

tensor(9586)

In [75]:
# token string for vocabulary id 9586 (the argmax shown in the previous cell)
tokenizer.convert_ids_to_tokens([9586])[0]

'edgar'

In [65]:
from collections import Counter

# TODO
* add end token to model!

In [None]:
# NOTE(review): `loss` is only assigned inside the evaluation-loop cells further
# down, so this cell fails on a fresh top-to-bottom run.
print(loss.item())

In [None]:
# Show the tokens of the first context in the batch.
# NOTE(review): `context_ids` is only assigned inside the evaluation-loop cells
# further down, so this cell depends on one of them having run first.
print(tokenizer.convert_ids_to_tokens(context_ids.data.numpy()[0]))

In [None]:
# Show the raw ids and the tokens of the first question in the batch.
# NOTE(review): `question_ids` is only assigned inside the evaluation-loop cells
# further down, so this cell depends on one of them having run first.
print(question_ids)
print(tokenizer.convert_ids_to_tokens(question_ids.data.numpy()[0]))

In [None]:
# Copy the model output into a numpy array `o` (used by the following cells) and
# print the highest-scoring token per position of the first batch item.
# .cpu() keeps the conversion working when `output` lives on a GPU.
o = output.data.cpu().numpy()
print(tokenizer.convert_ids_to_tokens(np.argmax(o[0], axis=1)))

In [None]:
# For the first 10 positions: winning vocabulary id and its softmax probability.
# The row maximum is subtracted before exponentiating so large scores cannot overflow.
for i in range(10):
    row_exp = np.exp(o[0,i,:] - np.max(o[0,i,:]))
    print(np.argmax(o[0,i,:]), np.max(row_exp) / np.sum(row_exp))

In [None]:
from collections import Counter

In [None]:
# fresh counter; the next cell fills it with one value per vocabulary id
c = Counter()

In [None]:
# Softmax over the vocabulary at position 7 of the first batch item, stored per id in `c`.
# The distribution is computed once (not once per vocabulary entry), and the maximum is
# subtracted before exponentiating so large scores cannot overflow.
position_exp = np.exp(o[0,7,:] - np.max(o[0,7,:]))
position_probs = position_exp / np.sum(position_exp)
for i in range(o.shape[2]):
    c[i] = position_probs[i]

In [None]:
# 25 entries of `c` with the largest values, best first
top_entries = c.most_common(25)
for idx, score in top_entries:
    print(tokenizer.convert_ids_to_tokens([idx]), score)

In [None]:
# value stored in `c` for vocabulary id 2435 (annotated by the author as "normandy")
c[2435] # normandy

In [None]:
# Run the model on the first batch of the unanswerable loader and record its loss.
# NOTE(review): this cell looks like a leftover from a different model/dataset:
#   - `criterion` is not defined in any cell of this notebook;
#   - the batch is unpacked into five tensors after dropping its last element, i.e.
#     six items per batch are expected, while BERTDataset.__getitem__ returns five.
with torch.no_grad():
    model.eval()

    eval_loss_ans = 0
    for batch_i, eval_batch in enumerate(eval_dataloader_unans):
        # last element is kept aside (not moved to the device)
        eids = eval_batch[-1]
        eval_batch = tuple(t.to(device) for t in eval_batch[:-1])
        question_ids, question_mask, context_ids, context_mask, targets = eval_batch
        output, _ = model(context_ids, context_mask, question_ids, question_mask)
        # loss of the flattened per-token scores against the question ids
        loss = criterion(output.view(-1, len(tokenizer.vocab)), question_ids.view(-1))
        eval_loss_ans += loss.item()
        # unconditional break: only the first batch is processed ...
        break
        # ... so the threshold check below is never reached
        if loss.item() > 0.01:
            print(batch_i, eval_loss_ans)
            break

In [None]:
# display the loss accumulated by the most recently run evaluation loop
eval_loss_ans

In [None]:
# Show the raw ids and the tokens of the first question in the batch.
# NOTE(review): `.data.numpy()` presumably requires a CPU tensor - confirm when running on a GPU.
print(question_ids)
print(tokenizer.convert_ids_to_tokens(question_ids.data.numpy()[0]))

In [None]:
# Copy the model output into a numpy array `o`, dump it, and print the highest-scoring
# token per position of the first batch item.
# .cpu() keeps the conversion working when `output` lives on a GPU.
o = output.data.cpu().numpy()
print(o)
print(tokenizer.convert_ids_to_tokens(np.argmax(o[0], axis=1)))

In [None]:
# For the first 20 positions: winning vocabulary id and its softmax probability.
# The row maximum is subtracted before exponentiating so large scores cannot overflow.
for i in range(20):
    row_exp = np.exp(o[0,i,:] - np.max(o[0,i,:]))
    print(np.argmax(o[0,i,:]), np.max(row_exp) / np.sum(row_exp))

# Interesting Examples

In [None]:
from collections import Counter
def print_details(context_ids, question_ids, output, loss, tokenizer):
    """
    Print a report for the first item of a batch: loss, context tokens, question
    tokens, predicted tokens with their probabilities, and the ten most probable
    candidates wherever the prediction differs from the question.

    Prediction ``k`` is compared with question token ``k + 1``. Ids equal to 0 and
    '[PAD]' tokens are dropped from both sides before comparing.
    NOTE(review): this assumes 0 / '[PAD]' only occur as trailing padding; a 0 predicted
    in the middle would shift the alignment - confirm against the model output.

    :param context_ids: tensor whose first row holds the context token ids
    :param question_ids: tensor whose first row holds the question token ids
    :param output: tensor indexed as [batch, position, vocabulary id] with raw scores
    :param loss: object exposing ``item()``
    :param tokenizer: tokenizer exposing ``convert_ids_to_tokens``
    """
    def _softmax(scores):
        # subtract the maximum first so large scores cannot overflow in exp()
        exp_scores = np.exp(scores - np.max(scores))
        return exp_scores / np.sum(exp_scores)

    print("Loss:", loss.item())
    print("CONTEXT")
    # .cpu() keeps the numpy conversion working for tensors that live on a GPU
    print(tokenizer.convert_ids_to_tokens(context_ids.data.cpu().numpy()[0]))
    print("~"*30)
    print("QUESTION")
    question_row = question_ids.data.cpu().numpy()[0]
    q_ids = [i for i in question_row if i != 0]
    print(q_ids)
    q_toks = [tok for tok in tokenizer.convert_ids_to_tokens(question_row) if tok != '[PAD]']
    print(q_toks)
    print("~"*30)
    print("OUTPUT")
    o = output.data.cpu().numpy()
    predicted = np.argmax(o[0], axis=1)
    out_ids = [i for i in predicted if i != 0]
    out_toks = [tok for tok in tokenizer.convert_ids_to_tokens(predicted) if tok != '[PAD]']
    # (winning id, probability of the winning id) for each reported position
    scores = [(np.argmax(o[0,i,:]), np.max(_softmax(o[0,i,:]))) for i in range(len(out_toks))]
    print(out_toks)
    print(list(zip(out_toks, scores)))
    print("~"*30)
    print("TOP K FOR INCORRECT TERMS:")
    for tok_i, (tar, out) in enumerate(zip(q_ids[1:], out_ids)):
        if tar != out:
            print ("Output", out_toks[tok_i], "instead of ", q_toks[tok_i+1])
            # one softmax per mismatching position instead of one per vocabulary entry
            probs = _softmax(o[0,tok_i,:])
            # stable sort on the negated values: descending, ties in id order
            for idx in np.argsort(-probs, kind="stable")[:10]:
                print("- \t",tokenizer.convert_ids_to_tokens([int(idx)]), probs[idx])
    print("#"*30)

In [None]:
# For each loader, the batch indices whose details should be printed.
# NOTE(review): like the single-batch evaluation cell above, this relies on a
# `criterion` that is not defined in this notebook and on batches with six items,
# while BERTDataset.__getitem__ returns five.
explore = {eval_dataloader_ans: [0,1], eval_dataloader_unans: [0, 15]}

with torch.no_grad():
    model.eval()

    # running total over every processed batch of both loaders
    eval_loss_ans = 0
    for dataloader in explore.keys():
        for batch_i, eval_batch in enumerate(dataloader):
            # last element is kept aside (not moved to the device)
            eids = eval_batch[-1]
            eval_batch = tuple(t.to(device) for t in eval_batch[:-1])
            question_ids, question_mask, context_ids, context_mask, targets = eval_batch
            output, _ = model(context_ids, context_mask, question_ids, question_mask)
            loss = criterion(output.view(-1, len(tokenizer.vocab)), targets.view(-1))
            eval_loss_ans += loss.item()
            if batch_i in explore[dataloader]:
                if dataloader == eval_dataloader_ans:
                    print("Answerable:",batch_i)
                else:
                    print("Unanswerable:",batch_i)
                print_details(context_ids, question_ids, output, loss, tokenizer)
            # stop once the batch index is past the largest requested index
            if batch_i > max(explore[dataloader]):
                break


# Notes
* The LM was surprisingly good at guessing the question type and the general structure of the question. This may be because the BERT representation is leaky, i.e. each token's representation carries information about its neighbours  
    - may be interesting result of its own ... Q: How much does the feature of each BERT word contain information about the surrounding BERT words?  
    - this is probably why the model is pretty good at guessing Q type from the [CLS] token  
* the sentence representation is not enough to represent specific entities in the text  
    - gets confused between normandy and france and mongolia and normans   
    - this might be because I don't train the sentence repr which is probably a mistake and we should retrain this one with trained sentence repr and forward masking in the question repr
    - Maybe we can do an attention over entities? 
    - There is a bias towards more common entities right now (ie. john and paris instead of rollo and normandy)
    - the fact that stuff like Rollo and dates were predicted well means there is leakiness in the representation
* rank may be more important than loss since sometimes the #1 option has very high prob but number 2 is really good (eg. arrive vs begin U15)