In [1]:
import os
import torch
from datasets.bert_mask_dataset import BertMaskDataset
from models.modeling_glycebert import GlyceBertForMaskedLM

In [2]:
# Local snapshot directory of the ShannonAI/ChineseBERT-base checkpoint.
# Set the CHINESEBERT_PATH environment variable to point at a different copy;
# when it is unset, the original hard-coded location is used unchanged.
pretrain_path = os.environ.get(
    "CHINESEBERT_PATH",
    "/root/.cache/huggingface/hub/models--ShannonAI--ChineseBERT-base/snapshots/aa8b6fa9c3427f77b0911b07ab35f2b1b8bf248b",
)

# step 1: build the masking helper from the checkpoint's vocab file and config directory.
# Despite its name, `tokenizer` is a BertMaskDataset; it exposes `mask_sentence(...)`
# and the underlying tokenizer as `tokenizer.tokenizer`.
vocab_file = os.path.join(pretrain_path, 'vocab.txt')
config_path = os.path.join(pretrain_path, 'config')
tokenizer = BertMaskDataset(vocab_file, config_path)

# step 2: load the masked-LM model weights from the same checkpoint directory
chinese_bert = GlyceBertForMaskedLM.from_pretrained(pretrain_path)

In [3]:
sentence = "我喜欢小猫"

# step 3: mask each character position in turn and let the model fill it in.
# This is inference only, so autograd is disabled: no graph is built for the
# forward passes and the results are unchanged.
with torch.no_grad():
    for i in range(len(sentence)):
        # mask_sentence returns the token ids and pinyin ids with position i masked.
        input_ids, pinyin_ids = tokenizer.mask_sentence(sentence, i)
        length = input_ids.shape[0]
        # Add a leading batch dimension of 1 for the model.
        input_ids = input_ids.view(1, length)
        # NOTE(review): the trailing 8 is presumably the number of pinyin ids per token - confirm.
        pinyin_ids = pinyin_ids.view(1, length, 8)

        # Call the module itself rather than .forward() so nn.Module hooks are honoured.
        output = chinese_bert(input_ids, pinyin_ids)[0]
        predict_labels = torch.argmax(output, dim=-1)[0]
        # Character i sits at token index i + 1 because index 0 holds [CLS].
        predict_label = predict_labels[i + 1].item()

        # Convert the ids to a plain list once; the filled-in copy differs only
        # at the masked position.
        masked_ids = input_ids[0].tolist()
        output_ids = list(masked_ids)
        output_ids[i + 1] = predict_label

        input_sentence = tokenizer.tokenizer.decode(masked_ids, skip_special_tokens=False)
        output_sentence = tokenizer.tokenizer.decode(output_ids, skip_special_tokens=False)
        print("input sentence:", input_sentence)
        print("output sentence:", output_sentence)
        print()



input sentence: [CLS] [MASK] 喜 欢 小 猫 [SEP]
output sentence: [CLS] 我 喜 欢 小 猫 [SEP]

input sentence: [CLS] 我 [MASK] 欢 小 猫 [SEP]
output sentence: [CLS] 我 喜 欢 小 猫 [SEP]

input sentence: [CLS] 我 喜 [MASK] 小 猫 [SEP]
output sentence: [CLS] 我 喜 欢 小 猫 [SEP]

input sentence: [CLS] 我 喜 欢 [MASK] 猫 [SEP]
output sentence: [CLS] 我 喜 欢 熊 猫 [SEP]

input sentence: [CLS] 我 喜 欢 小 [MASK] [SEP]
output sentence: [CLS] 我 喜 欢 小 。 [SEP]

