### bert_find_word.ipynb

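Original source: https://huggingface.co/transformers/quickstart.html — adapted to predict the word hidden behind an `XX` placeholder using BERT's masked language model.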

2020.01.10

In [1]:
import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM

# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
# import logging
# logging.basicConfig(level=logging.INFO)

In [14]:
# Report the installed PyTorch version so the run environment is visible in the output
print("torch version: ", torch.__version__)

torch version:  1.3.1


In [2]:
# Name of the pre-trained checkpoint; the tokenizer and the model must use the
# same one so that token ids line up with the model's vocabulary.
MODEL_NAME = 'bert-base-uncased'

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
# Load pre-trained model (weights)
model = BertForMaskedLM.from_pretrained(MODEL_NAME)
# Explicitly switch to evaluation mode so dropout is disabled for inference.
# The call is idempotent, so it is harmless if from_pretrained already did it.
# The trailing ';' keeps the notebook from echoing the whole model repr.
model.eval();

In [3]:
# Two-sentence example input without an 'XX' placeholder; `text` is reassigned in a later cell before it is used.
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"

In [4]:
def replace_xx_by_MASK(text):
    """Tokenize ``text`` and replace the first ``'xx'`` token with ``'[MASK]'``.

    Tokenization is delegated to the notebook-level ``tokenizer``. Only the
    first token equal to ``'xx'`` is replaced; any later ones are left as-is.

    NOTE(review): the comparison is against lowercase ``'xx'`` -- presumably
    because the uncased tokenizer lowercases the ``'XX'`` written in the
    input; verify before switching to a cased tokenizer.

    Args:
        text: input string containing the placeholder word.

    Returns:
        Tuple ``(tokens, masked_idx)``: the token list with the placeholder
        replaced, and the position of ``'[MASK]'`` within that list.

    Raises:
        ValueError: if no token equals ``'xx'`` (previously this surfaced as
            an UnboundLocalError on the return statement).
    """
    tokens = tokenizer.tokenize(text)
    try:
        masked_idx = tokens.index('xx')
    except ValueError:
        raise ValueError(
            "no 'XX' placeholder token found in text: {!r}".format(text)
        ) from None
    tokens[masked_idx] = '[MASK]'
    return tokens, masked_idx

In [5]:
def get_segments_ids(tt):
    """Assign each token the integer id of the sentence (segment) it belongs to.

    Ids start at 0 and increase by one after every ``'[SEP]'`` token; the
    ``'[SEP]'`` itself is counted as part of the segment it closes. Tokens
    that follow the last ``'[SEP]'`` receive the next id, so the result always
    has exactly one entry per input token.

    NOTE(review): ids keep growing with every ``'[SEP]'``; the BERT checkpoint
    loaded in this notebook presumably accepts only segment ids 0 and 1 --
    confirm before passing more than two sentences.

    Args:
        tt: sequence of token strings.

    Returns:
        List of ints with the same length as ``tt``.
    """
    segm_ids = []
    code = 0
    for token in tt:
        segm_ids.append(code)
        if token == '[SEP]':
            # The separator closes the current segment; the next token starts a new one.
            code += 1
    return segm_ids

In [6]:
def bert_find_word(text):
    """Predict the word hidden behind the placeholder in ``text``.

    Steps, all using notebook-level objects (``tokenizer``, ``model``,
    ``replace_xx_by_MASK``, ``get_segments_ids``):

    1. tokenize ``text`` and mask the placeholder token,
    2. build per-token segment ids and vocabulary ids,
    3. run the model without gradient tracking,
    4. take the arg-max of the first model output at the masked position and
       map that id back to a token string.

    Args:
        text: input string containing the placeholder word.

    Returns:
        The token string the model scores highest at the masked position.
    """
    tokens, mask_pos = replace_xx_by_MASK(text)
    segment_ids = get_segments_ids(tokens)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The extra list level gives both tensors a leading dimension of size 1.
    input_ids = torch.tensor([token_ids])
    token_types = torch.tensor([segment_ids])

    # Inference only: no gradients are needed.
    with torch.no_grad():
        scores = model(input_ids, token_type_ids=token_types)[0]

    # Highest-scoring vocabulary id at the masked position, mapped back to its token.
    best_id = torch.argmax(scores[0, mask_pos]).item()
    return tokenizer.convert_ids_to_tokens([best_id])[0]

In [7]:
# Candidate inputs. Exactly one assignment is active; uncomment a different one
# to try it. Every sentence except the unmasked reference marks the word to
# predict with 'XX'.

# Unmasked reference sentence (contains no 'XX' placeholder):
#text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"

#text = "[CLS] Who was Jim Henson ? [SEP] Jim XX was a puppeteer [SEP]"
#text =  "[CLS] Alice was looking into the XX hall [SEP] Rabit looked at his watch [SEP]"
#text = "[CLS] Who is XX's wife ? [SEP] Putin is president of Russia [SEP]" 
#text = "[CLS] Putin is president of Russia [SEP] Who is XX's wife ? [SEP]" 

text = "[CLS] Python is a very good programming XX for natural language processing [SEP]"

In [8]:
# Show the input sentence, then the token bert_find_word returns for its 'XX' placeholder
print(text)
predictedword = bert_find_word(text)
print("Bert thinks 'XX' = ", predictedword)

[CLS] Python is a very good programming XX for natural language processing [SEP]
Bert thinks 'XX' =  language


In [None]:
# Commented-out, step-by-step copy of the body of bert_find_word (this cell and the ones below).
# tokenized_text , masked_index = replace_xx_by_MASK(text)
# # print(tokenized_text)
# print(' '.join(tokenized_text), masked_index)
# segments_ids = get_segments_ids(tokenized_text)
# print(segments_ids)
# # Convert token to vocabulary indices
# indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# print(indexed_tokens)

In [None]:
# # Convert inputs to PyTorch tensors
# tokens_tensor = torch.tensor([indexed_tokens])
# segments_tensors = torch.tensor([segments_ids])
# # segments_tensors

In [None]:
# # Predict all tokens
# with torch.no_grad():
#     outputs = model(tokens_tensor, token_type_ids=segments_tensors)
#     predictions = outputs[0]
# # predictions

In [None]:
# # pick the highest-scoring vocabulary token at the masked position
# predicted_index = torch.argmax(predictions[0, masked_index]).item()
# predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]

In [None]:
# predicted_token