#### pytorch pretrained-BERT testing

In [1]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

In [2]:
# Load pre-trained model tokenizer (vocabulary).
# `modelname` is reused further down to load the model weights, so the
# tokenizer and the model come from the same checkpoint name.
modelname = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(modelname)

In [3]:
# Example input; `target` is the word that will be masked and predicted back.
text = "he went to the restaurant again. although he had already eaten a large meal, he was still very hungry."
target = "hungry"
# Split the raw text into a list of token strings with the BERT tokenizer.
tokenized_text = tokenizer.tokenize(text)

In [4]:
# Inspect the tokens: punctuation becomes its own token, and every word in
# this sentence is kept whole (no '##' continuation pieces).
print(tokenized_text)

['he', 'went', 'to', 'the', 'restaurant', 'again', '.', 'although', 'he', 'had', 'already', 'eaten', 'a', 'large', 'meal', ',', 'he', 'was', 'still', 'very', 'hungry', '.']


- Word-level representations cannot handle unseen words. Character embeddings are one solution to the out-of-vocabulary (OOV) problem, but they may be too fine-grained. **Subwords** sit between words and characters: they are not too fine-grained, yet they can still handle unseen words.
    - Sennrich et al. (2016) proposed using Byte Pair Encoding (BPE) to build a subword dictionary. GPT-2 (OpenAI, 2019) adopts BPE. It basically generates each new subword by merging the *most frequently occurring* pair of symbols.
    - WordPiece is another word segmentation algorithm. It is similar to BPE; the difference is that a new subword is formed by choosing the pair that maximizes likelihood rather than the next most frequent pair. *BERT* uses WordPiece.

In [5]:
# Sub-word demo: words that are not in the vocabulary (e.g. 'sktelecom',
# 'chanwoo', the phone number) are split into several pieces, and
# continuation pieces are printed with a '##' prefix. The uncased tokenizer
# also lower-cases the input ('My' -> 'my').
test_text = "he went to the sktelecom food market with chanwoo again. My cell number is 010-1111-2222"
test_tokenized_text = tokenizer.tokenize(test_text)
print(test_tokenized_text)

['he', 'went', 'to', 'the', 'sk', '##tel', '##ec', '##om', 'food', 'market', 'with', 'chan', '##wo', '##o', 'again', '.', 'my', 'cell', 'number', 'is', '01', '##0', '-', '111', '##1', '-', '222', '##2']


In [6]:
# Mask a token that we will try to predict back with `BertForMaskedLM`.
# Start from a fresh tokenization of `text` so this cell can be re-run safely:
# masking the shared list in place would make `.index(target)` raise
# ValueError on a second run, because `target` would already be replaced.
tokenized_text = tokenizer.tokenize(text)
masked_index = tokenized_text.index(target)
tokenized_text[masked_index] = '[MASK]'

In [7]:
# Convert tokens to vocabulary indices (one integer id per token).
# In the printed list, the '[MASK]' token shows up as id 103.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
print(indexed_tokens)

[2002, 2253, 2000, 1996, 4825, 2153, 1012, 2348, 2002, 2018, 2525, 8828, 1037, 2312, 7954, 1010, 2002, 2001, 2145, 2200, 103, 1012]


In [8]:
# Define sentence A and B indices associated to 1st and 2nd sentences:
# the first FIRST_SEGMENT_LEN tokens get segment id 0, all later tokens get 1.
# NOTE(review): 15 puts the boundary after 'meal' (token index 14), not after
# the first '.' (token index 6) -- confirm this is the intended split.
FIRST_SEGMENT_LEN = 15
segments_ids = [0 if i < FIRST_SEGMENT_LEN else 1 for i in range(len(tokenized_text))]

In [9]:
# Convert inputs to PyTorch tensors; wrapping each list in another list adds
# a leading batch dimension of size 1.
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])
# Load pre-trained model (weights)
model = BertForMaskedLM.from_pretrained(modelname)
# Switch to evaluation mode so the Dropout layers are inactive; as the last
# expression of the cell this also displays the model structure.
model.eval()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
   

In [10]:
# Predict all tokens. This is inference only, so run the forward pass with
# autograd disabled to avoid building a computation graph.
with torch.no_grad():
    predictions = model(tokens_tensor, segments_tensors)
# Pick the highest-scoring vocabulary id at the masked position (via argmax).
predicted_index = torch.argmax(predictions[0, masked_index]).item()
# convert_ids_to_tokens takes a list of ids and returns a list of tokens.
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])

In [11]:
# Compare the raw input with the token sequence given to the model
# (tokens joined by single spaces, with the target replaced by '[MASK]').
print("Original:", text)
print("Masked:", " ".join(tokenized_text))

Original: he went to the restaurant again. although he had already eaten a large meal, he was still very hungry.
Masked: he went to the restaurant again . although he had already eaten a large meal , he was still very [MASK] .


In [12]:
# `predicted_token` is a list, so this prints a one-element list.
print("Predicted token:", predicted_token)

Predicted token: ['hungry']


In [13]:
# The model scores the whole (reconstructed) sentence, so there is one row of
# vocabulary-sized scores for each of the 22 tokens: (batch, tokens, vocab).
# NOTE(review): these look like unnormalized scores rather than probabilities
# (the top values printed further down exceed 1) -- confirm.
predictions.shape

torch.Size([1, 22, 30522])

In [14]:
# Vocabulary id chosen by argmax for the masked position.
predicted_index

7501

In [15]:
# Sanity check: look up the literal id 7501 (the value displayed for
# `predicted_index`) directly in the vocabulary.
predicted_token = tokenizer.convert_ids_to_tokens([7501])
print(predicted_token)

['hungry']


#### other top-k words

In [16]:
# Scores over the whole vocabulary for the masked position only
# (first and only batch element, row `masked_index`).
# NOTE(review): despite the `prob_` name, these appear to be raw scores, not
# normalized probabilities -- confirm.
prob_masked_token = predictions[0, masked_index]
prob_masked_token.shape

torch.Size([30522])

In [17]:
# Ten highest scores at the masked position (`values`) together with the
# vocabulary ids they belong to (`indices`).
values, indices = torch.topk(prob_masked_token, 10)

print(values)
print(indices)

tensor([5.4957, 5.4566, 5.2205, 4.8334, 4.7215, 4.5953, 4.1050, 3.9485, 3.9407,
        3.9392], grad_fn=<TopkBackward>)
tensor([7501, 1012, 2844, 2204, 2092, 2172, 2402, 2312, 5458, 2502])


In [18]:
# Map the top-k vocabulary ids back to their token strings.
predicted_token = tokenizer.convert_ids_to_tokens(indices.numpy())

In [19]:
# Display the ten candidate tokens, in the same order as `indices`.
predicted_token

['hungry',
 '.',
 'strong',
 'good',
 'well',
 'much',
 'young',
 'large',
 'tired',
 'big']

In [20]:
def generate_topk_words(test_str, k=10):
    """Print the top-k BERT predictions for the blank marked by '_' in ``test_str``.

    The sentence is tokenized with the notebook-level ``tokenizer``, the first
    '_' token is replaced by '[MASK]', and the notebook-level ``model`` scores
    every vocabulary entry for that position.

    Args:
        test_str: Sentence containing a standalone '_' where a word should be
            predicted.
        k: Number of candidate tokens to print (default 10).

    Returns:
        None. The original text, the masked token sequence and the k
        highest-scoring tokens are printed.

    Raises:
        ValueError: If no '_' token is present after tokenization.
    """
    tokenized_text = tokenizer.tokenize(test_str)

    # Fail with an explicit message instead of list.index's generic error.
    if '_' not in tokenized_text:
        raise ValueError(
            "test_str must contain a '_' token marking the word to predict: {!r}".format(test_str)
        )
    masked_index = tokenized_text.index('_')
    tokenized_text[masked_index] = '[MASK]'

    # Convert token to vocabulary indices
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Segment ids: only the very first token (the leading 'hello' in the
    # notebook examples) is assigned to segment 0; every other token,
    # including the '.' right after it, is assigned to segment 1.
    segments_ids = [0] + [1] * (len(tokenized_text) - 1)

    # Convert inputs to PyTorch tensors (the outer list adds a batch dimension)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    # Predict all tokens; inference only, so no autograd graph is needed.
    with torch.no_grad():
        predictions = model(tokens_tensor, segments_tensors)
    masked_scores = predictions[0, masked_index]

    print("Original:", test_str)
    print("Masked:", " ".join(tokenized_text))

    # Only the vocabulary ids are needed; the score values are discarded.
    _, indices = torch.topk(masked_scores, k)
    predicted_token = tokenizer.convert_ids_to_tokens(indices.tolist())
    print('Predicted top-k tokens: ',  predicted_token)

In [21]:
# '_' marks the blank to fill in; the helper swaps it for '[MASK]' before
# asking the model for candidates.
generate_topk_words('hello. i went to the grocery store to buy _ to make an apple pie')

Original: hello. i went to the grocery store to buy _ to make an apple pie
Masked: hello . i went to the grocery store to buy [MASK] to make an apple pie
Predicted top-k tokens:  ['food', 'and', 'something', 'bread', 'enough', 'coffee', 'things', 'or', 'flour', 'milk']


In [22]:
# Same helper on a health-related sentence.
generate_topk_words('hello. you need to have _ to avoid catching the flu in the winter.')

Original: hello. you need to have _ to avoid catching the flu in the winter.
Masked: hello . you need to have [MASK] to avoid catching the flu in the winter .
Predicted top-k tokens:  ['something', 'it', 'one', 'them', 'medication', 'this', 'everything', 'surgery', 'that', 'enough']


In [23]:
# The extra spaces in this input do not survive tokenization: the 'Masked:'
# line printed by the helper shows single spaces between tokens.
generate_topk_words('hello. the happiness is all about _ , not  money  .')

Original: hello. the happiness is all about _ , not  money  .
Masked: hello . the happiness is all about [MASK] , not money .
Predicted top-k tokens:  ['love', 'happiness', 'money', 'me', 'sex', 'people', 'everything', 'life', 'time', 'music']
