# Evaluation function

In [1]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
import pandas as pd
import numpy as np

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [2]:
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()
            
def sentencepair2tensor(tokenizer, tokens_a, tokens_b, max_seq_length):
    """Encode a tokenized sentence pair as fixed-length model input tensors.

    The pair is laid out as ``[CLS] tokens_a [SEP] tokens_b [SEP]`` and then
    right-padded with id 0 up to ``max_seq_length``.

    Args:
        tokenizer: object exposing ``convert_tokens_to_ids(tokens)``.
        tokens_a: list of tokens of the first sentence (segment id 0).
        tokens_b: list of tokens of the second sentence (segment id 1); must
            still be non-empty after truncation.
        max_seq_length: length of the returned sequences, including the three
            special tokens.

    Note:
        ``tokens_a`` and ``tokens_b`` are truncated IN PLACE when the pair
        does not fit, so the caller's lists may get shorter.
        No attention mask is returned. Padding positions carry id 0, so a
        caller that needs one can derive it from ``tokens_tensor != 0``
        (assumes 0 is the tokenizer's padding id -- TODO confirm).

    Returns:
        A 4-tuple ``(tokens_tensor, segments_tensors, masked_index, sep_index)``:
        ``tokens_tensor`` and ``segments_tensors`` are tensors of shape
        ``(1, max_seq_length)``; ``masked_index`` is the position of the first
        ``"[MASK]"`` token; ``sep_index`` is the position of the ``"[SEP]"``
        that separates the two sentences.

    Raises:
        AssertionError: if ``tokens_b`` is empty or the encoded pair does not
            end up exactly ``max_seq_length`` long.
        ValueError: if the pair contains no ``"[MASK]"`` token.
    """
    # Reserve room for [CLS], [SEP], [SEP].
    _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    assert len(tokens_b) > 0

    tokens = ["[CLS]"] + list(tokens_a) + ["[SEP]"] + list(tokens_b) + ["[SEP]"]
    # Segment 0 covers [CLS] + sentence A + first [SEP]; segment 1 covers
    # sentence B + final [SEP].
    segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)

    try:
        masked_index = tokens.index("[MASK]")
    except ValueError:
        raise ValueError(
            "no [MASK] token found in the sentence pair "
            "(it may have been removed by truncation)"
        )
    # The separator between the sentences sits right after [CLS] + tokens_a.
    # (The previous code looked up "[MASK]" here, so sep_index always equalled
    # masked_index.)
    sep_index = len(tokens_a) + 1

    input_ids = list(tokenizer.convert_tokens_to_ids(tokens))

    # Zero-pad up to the sequence length.
    padding = [0] * (max_seq_length - len(input_ids))
    input_ids += padding
    segment_ids += padding

    assert len(input_ids) == max_seq_length
    assert len(segment_ids) == max_seq_length

    tokens_tensor = torch.tensor([input_ids])
    segments_tensors = torch.tensor([segment_ids])

    return tokens_tensor, segments_tensors, masked_index, sep_index

In [3]:
# Load the evaluation examples; later cells read the sentence1, sentence2,
# sentence2_masked and label columns of this frame.
df = pd.read_csv('data/generation/BC_test.csv')

In [4]:
# Load pre-trained model tokenizer (vocabulary).
# `modelpath` is reused further down when the model weights are loaded.
modelpath = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(modelpath)

In [5]:
# Pick one random example from the test set and prepare it for the model.
# NOTE(review): the draw is unseeded, so each run selects a different row.
i = np.random.randint(0, df.shape[0])

# Raw values of the chosen row: first sentence, second sentence, label, and
# the masked variant of the second sentence.
a = df.sentence1.values[i]
b = df.sentence2.values[i]
l = df.label.values[i]
mb = df.sentence2_masked.values[i]

# Human-readable versions of the pair, printed by later cells. They must be
# built here, while `a` and `mb` are still the raw values.
text = a + "  " + b
masked_text = a + "  " + mb

# From here on `a` and `mb` hold the tokenizer output instead of the raw values.
a = tokenizer.tokenize(a)
mb = tokenizer.tokenize(mb)

# Convert inputs to PyTorch tensors (sequence length 128). The call may
# truncate `a` and `mb` in place.
tokens_tensor, segments_tensors, masked_index, sep_index = sentencepair2tensor(tokenizer, a, mb, 128)

In [6]:
# Sanity check: display the vocabulary id stored at the masked position.
tokens_tensor[0][masked_index]

tensor(103)

In [7]:
# Load pre-trained model (weights) from the same checkpoint name as the tokenizer.
model = BertForMaskedLM.from_pretrained(modelpath)
# Switch the model to evaluation mode before running inference.
model.eval()


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1)
            )
          )
          (intermediate): BertIntermediate(
       

In [30]:
# Predict the token at the [MASK] position and list the runners-up.
num_alternatives = 10

# Inference only: no_grad avoids recording an autograd graph.
with torch.no_grad():
    predictions = model(tokens_tensor, segments_tensors)

# Rank the vocabulary once by score at the masked position. The previous
# approach overwrote the winning score with 0 and re-ran argmax, which picks
# the same token again whenever the remaining scores are negative, and it
# also modified `predictions` in place.
mask_scores = predictions[0, masked_index]
top_k = min(num_alternatives + 1, mask_scores.size(-1))
# Index [1] selects the indices from the (values, indices) result.
top_indices = torch.topk(mask_scores, top_k)[1].tolist()

predicted_index = top_indices[0]
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])

print("Original:", text)
print("label:", l)

print("Predicted token:", predicted_token)
print("Other options:")
# just curious about what the next few options look like.
for alternative_index in top_indices[1:]:
    print(tokenizer.convert_ids_to_tokens([alternative_index]))

Original: Iris didn't visit Melinda, Katherine has visited Brazil.  Alma didn't visit São Tomé & Prícipe or Katherine has visited Brazil
label: or
Predicted token: [',']
Other options:
['.']
['and']
['...']
['、']
['-']
[';']
['while']
['~']
[':']
['/']


In [29]:
# Predict the most likely token at every position of the padded input.
# Inference only: no_grad avoids recording an autograd graph.
with torch.no_grad():
    predictions = model(tokens_tensor, segments_tensors)
predicted_index = torch.argmax(predictions[0], dim=1)

# NOTE: the input is zero-padded to a fixed length, so the tail of the printed
# prediction corresponds to padding positions and is not meaningful.
predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_index.tolist())
full_prediction = " ".join(predicted_tokens)


print("Original:", text)
print()
print("Masked text:", masked_text)
print("label:", l)
print()
print("model prediction:\n")
print(full_prediction)

Original: Iris didn't visit Melinda, Katherine has visited Brazil.  Alma didn't visit São Tomé & Prícipe or Katherine has visited Brazil

Masked text: Iris didn't visit Melinda, Katherine has visited Brazil.  Alma didn't visit São Tomé & Prícipe [MASK] Katherine has visited Brazil
label: or

model prediction:

. iris didn ' t visit melinda , katherine has visited brazil . . iris didn ' t visit sao tome & pri ##ci ##pe , katherine has visited brazil . ᴬ ##ª ##ª ##ª ##ª ##ª ##ª ᴬ ##ª ##ª ##ª ᴬ ᴬ ᴬ ᴬ ᴬ ᴬ ᴬ ᴬ ᴬ ᴬ ᴬ ᴬ ᴬ ᴬ ᴬ ᴬ ᴬ ᴬ ##irus ᴬ ᴬ ᴬ ᴬ ##ʳ ##─ ##─ ᴬ ᴬ ᴬ ᴬ ᴬ ᴬ ᴬ ##irus ᴬ ᴬ ᴬ ᴬ ##─ ##─ ##─ ᴬ ##─ ##─ ##─ ##─ ##─ ᴬ ##─ ##─ ##─ ##─ ##─ ##─ ##─ ᴬ ᴬ ᴬ ᴬ ᴬ ᴬ ᴬ ##¨ ##─ ᴬ ᴬ ᴬ ##hita ##─ ᴬ ᴬ ᴬ ᴬ ##hita ##¨ ᴬ ᴬ ᴬ ᴬ ᴬ ##¨ ᴬ ##¨ ᴬ ##¨ ##¨
