In [1]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenized input
text = "Who was Jim Henson ? Jim Henson was a puppeteer"
tokenized_text = tokenizer.tokenize(text)

# Mask a token that we will try to predict back with `BertForMaskedLM`
masked_index = 6
tokenized_text[masked_index] = '[MASK]'
assert tokenized_text == ['who', 'was', 'jim', 'henson', '?', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer']

# Convert token to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
segments_ids = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]

# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])


model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.eval()

# Predict all tokens
predictions = model(tokens_tensor, segments_tensors)

# confirm we were able to predict 'henson'
predicted_index = torch.argmax(predictions[0, masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
assert predicted_token == 'henson'

print("Wow it works")

12/12/2018 20:14:52 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /Users/daniter/.pytorch_pretrained_bert/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
12/12/2018 20:14:52 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /Users/daniter/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
12/12/2018 20:14:52 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /Users/daniter/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /var/folders/8q/55ln6_

Wow it works


In [8]:
predictions[0, masked_index][12523]

tensor(-7.3167, grad_fn=<SelectBackward>)

In [12]:
question = "[MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK]"#"Who won the super bowl in 2014?"
q_tokens = tokenizer.tokenize(question)
indexed_q_tokens = tokenizer.convert_tokens_to_ids(q_tokens)

context = '''Elway set several career records for passing attempts and completions 
while at Stanford and also received All-American honors. He was the first selection in 
the 1983 NFL Draft, famously known as the quarterback class of 1983, where he was taken by 
the Baltimore Colts before being traded to the Denver Broncos. In January 1987, Elway 
embarked on one of the most notable performances in sports and in NFL history, helping 
engineer a 98-yard, game-tying touchdown drive in the AFC Championship Game against the 
Cleveland Browns. The moment is known in National Football League lore as The Drive. 
Following that game in Cleveland, Elway and the Broncos lost in Super Bowl XXI to the New York Giants.'''
c_tokens = tokenizer.tokenize(context)
indexed_c_tokens = tokenizer.convert_tokens_to_ids(c_tokens)

In [10]:
indexed_q_tokens

[2040, 2180, 1996, 3565, 4605, 1999, 2297, 1029]

In [14]:
tokens_tensor = torch.tensor([indexed_c_tokens + indexed_q_tokens])
segments_tensors = torch.tensor([0]*len(indexed_c_tokens) + [1]*len(indexed_q_tokens))
predictions = model(tokens_tensor, segments_tensors)

In [35]:
gt_question = "Who won the super bowl in 2014?"
gt_q_tokens = tokenizer.tokenize(gt_question)
gt_indexed_q_tokens = tokenizer.convert_tokens_to_ids(gt_q_tokens)

In [31]:
gt_question = "Is this really a totally irrelevant question?"
gt_q_tokens = tokenizer.tokenize(gt_question)
gt_indexed_q_tokens = tokenizer.convert_tokens_to_ids(gt_q_tokens)

In [39]:
gt_question = "Elway super bowl football words the but also a lot of other words and stuff that are not in the thing?"
gt_q_tokens = tokenizer.tokenize(gt_question)
gt_indexed_q_tokens = tokenizer.convert_tokens_to_ids(gt_q_tokens)

In [40]:
total = 0
for i in range(len(gt_indexed_q_tokens)):
    total += predictions[0,i,gt_indexed_q_tokens[i]]
print (total)

tensor(6.1764, grad_fn=<AddBackward0>)


In [23]:
import random

In [30]:
total = 0
for i in range(len(gt_indexed_q_tokens)):
    total += predictions[0,i,int(random.random() * predictions.shape[2])]
print (total)

tensor(-58.9993, grad_fn=<AddBackward0>)


In [41]:
import json

In [42]:
with open("../squad2/data/dev-v2.0.json", 'r') as handle:
    jdata = json.load(handle)
    data = jdata['data']

In [68]:
def calc_prob(context, question):
    gt_question = question
    gt_q_tokens = tokenizer.tokenize(gt_question)
    gt_indexed_q_tokens = tokenizer.convert_tokens_to_ids(gt_q_tokens)
    
    mask_tokens = ["[MASK]"]*len(gt_indexed_q_tokens)
    indexed_mask_tokens = tokenizer.convert_tokens_to_ids(mask_tokens)
    
    context_tokens = tokenizer.tokenize(context)
    indexed_context_tokens = tokenizer.convert_tokens_to_ids(context_tokens)
    
    tokens_tensor = torch.tensor([indexed_context_tokens + indexed_mask_tokens])
    segments_tensors = torch.tensor([0]*len(indexed_context_tokens) + [1]*len(indexed_mask_tokens))
    predictions = model(tokens_tensor, segments_tensors)
    
    total = 0
    context_len = len(context_tokens)
    q_len = len(indexed_mask_tokens)
    for i in range(q_len):
        total += predictions[0,context_len+i,gt_indexed_q_tokens[i]].item()
    return total

answerable_probs = []
unanswerable_probs = []
counter = 0
for i in range(len(data)):
    section = data[i]['paragraphs']
    for sec in section:
        context = sec['context']
        qas = sec['qas']
        for j in range(len(qas)):
            question = qas[j]['question']
            label = qas[j]['is_impossible']
            prob = calc_prob(context, question)
            if label:
                unanswerable_probs.append(prob)
            else:
                answerable_probs.append(prob)
            counter += 1
            if counter % 1000 == 0:
                print("Processed ", counter)

KeyboardInterrupt: 

In [43]:
type(data)

list

In [None]:
with open("./results.pkl", 'rb')