In [1]:
import torch
from transformers import BertTokenizer
from IPython . display import clear_output


I0112 10:44:26.544201 14300 file_utils.py:35] PyTorch version 1.1.0 available.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
PRETRAINED_MODEL_NAME = "bert-base-chinese"

tokenizer = BertTokenizer . from_pretrained( PRETRAINED_MODEL_NAME)

clear_output()
print("Pytorch version:", torch.__version__)


Pytorch version: 1.1.0


In [3]:
vocab = tokenizer.vocab

print("字典大小:",len(vocab))

字典大小: 21128


In [4]:
import random

random_tokens = random.sample(list(vocab),10)

random_ids = [vocab [t] for t in random_tokens]

print("{0:20}{1:15}".format("token","index"))

print("-"*25)

for t,id in zip(random_tokens, random_ids):
    print("{0:15}{1:10}".format(t,id))
    



token               index          
-------------------------
fps                 11671
##√                 13532
##廷                 15512
##①                 13556
mk                  11629
email                8307
##牟                 17340
作                     868
疵                    4560
##伪                 13898


In [5]:
indices = list(range(647,657))

some_pairs = [(t,idx) for t, idx in vocab.items() if idx in indices]

for pair in some_pairs:
    
    print(pair)

('ㄅ', 647)
('ㄆ', 648)
('ㄇ', 649)
('ㄉ', 650)
('ㄋ', 651)
('ㄌ', 652)
('ㄍ', 653)
('ㄎ', 654)
('ㄏ', 655)
('ㄒ', 656)


In [6]:
text = "[CLS]等到潮水[MASK]了，就知道谁没穿裤子"

tokens = tokenizer.tokenize(text)

ids = tokenizer.convert_tokens_to_ids(tokens)

print(text)

print(tokens[:10],'...')

print(ids[:10],'...')

[CLS]等到潮水[MASK]了，就知道谁没穿裤子
['[CLS]', '等', '到', '潮', '水', '[MASK]', '了', '，', '就', '知'] ...
[101, 5023, 1168, 4060, 3717, 103, 749, 8024, 2218, 4761] ...


In [15]:

import torch
from transformers import BertForMaskedLM
# 除了 tokens 以外还要辨别 segment ids
tokens_tensor = torch.tensor([ids]) # (1, seq_len)

segments_tensors = torch.zeros_like(tokens_tensor) # (1, seq_len)

maskedLM_model = BertForMaskedLM.from_pretrained(PRETRAINED_MODEL_NAME)

clear_output()

# 使用 masked LM 估计[MASK] 位置代表的实际 token

maskedLM_model.eval()
with torch.no_grad():
    
    outputs = maskedLM_model(tokens_tensor, segments_tensors)
    
    predictions = outputs[0]
    
del maskedLM_model

# 将 [MASK] 位置的概率分布做 top k 最有可能的 tokens 出来

masked_index = 5

k = 3


probs, indices = torch.topk(torch.softmax(predictions[0, masked_index], -1), k)

predicted_tokens = tokenizer.convert_ids_to_tokens(indices.tolist())

# 显示 top k 可能的字， 一般取top 1 当预测值

print("输入的 tokens :", tokens[:10],'...')

print('-'*50)

for i, (t, p) in enumerate(zip(predicted_tokens, probs), 1):
    
    tokens[masked_index] = t
    print("Top {} ({:2}%):{}".format(i, int(p.item()*100), tokens[:10], '...'))
    




输入的 tokens : ['[CLS]', '等', '到', '潮', '水', '过', '了', '，', '就', '知'] ...
--------------------------------------------------
Top 1 (78%):['[CLS]', '等', '到', '潮', '水', '来', '了', '，', '就', '知']
Top 2 ( 4%):['[CLS]', '等', '到', '潮', '水', '到', '了', '，', '就', '知']
Top 3 ( 2%):['[CLS]', '等', '到', '潮', '水', '过', '了', '，', '就', '知']


In [12]:
!jt -t oceans16 -f fira -fs 13