In [79]:
from transformers import BertTokenizer

In [80]:
# Load the BERT tokenizer (WordPiece-based) for the 'bert-base-chinese' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
# Bare expression on the last line so the notebook displays the tokenizer's repr.
tokenizer

PreTrainedTokenizer(name_or_path='bert-base-chinese', vocab_size=21128, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [81]:
# Five short Chinese sentences used as tokenizer inputs by the cells that follow.
sents = [
    "选择珠江花园的原因就是方便。",
    "笔记本的键盘确实爽。",
    "房间太小。其他的都一般。",
    "今天才知道这书还有第6卷,真有点郁闷.",  # ASCII ',' and '.' are part of the sample text
    "机器背面似乎被撕了张什么标签，残胶还在。",
]

In [60]:
# Token ids of one already-encoded sentence; in the printed output below,
# 101 decodes to [CLS] and 102 decodes to [SEP].
list_of_token = [
    101,
    6848, 2885, 4403, 3736, 5709, 1736, 4638,
    1333, 1728, 2218, 3221, 3175, 912, 511,
    102,
]

# decode() turns ONE sequence of token ids back into a single string
# (batch_decode is the variant that takes a list of sequences).
decoded_with_special = tokenizer.decode(list_of_token)
print(decoded_with_special)

# skip_special_tokens=True removes special tokens such as [CLS]/[SEP] from the decoded text.
decoded_without_special = tokenizer.decode(list_of_token, skip_special_tokens=True)
print(decoded_without_special)

[CLS] 选 择 珠 江 花 园 的 原 因 就 是 方 便 。 [SEP]
选 择 珠 江 花 园 的 原 因 就 是 方 便 。


In [61]:
# encode() converts a string into a list of token ids using the tokenizer's vocabulary;
# the printed result shows [CLS] (101) and [SEP] (102) wrapped around the sentence.
encode_out_1 = tokenizer.encode(sents[0])  # only the first sequence, no text_pair
print(encode_out_1)

# Round-trip the ids through decode() to show which token each id stands for.
print(tokenizer.decode(encode_out_1))

[101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 511, 102]
[CLS] 选 择 珠 江 花 园 的 原 因 就 是 方 便 。 [SEP]


In [62]:
# Encode a sentence pair: text is the first sequence, text_pair the optional second one.
# The printed output shows the layout [CLS] first [SEP] second [SEP].
encode_out_2 = tokenizer.encode(
    sents[0],
    text_pair=sents[1],
)
print(encode_out_2)
print(tokenizer.decode(encode_out_2))

[101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 511, 102, 5011, 6381, 3315, 4638, 7241, 4669, 4802, 2141, 4272, 511, 102]
[CLS] 选 择 珠 江 花 园 的 原 因 就 是 方 便 。 [SEP] 笔 记 本 的 键 盘 确 实 爽 。 [SEP]


In [63]:
# padding='max_length' pads up to max_length, or, when max_length is not given (as here),
# up to the maximum acceptable input length for the model
# (the tokenizer repr shown earlier reports model_max_len=512).
# NOTE(review): this cell prints the whole padded id list and its decoded form;
# consider slicing the output if the wall of zeros / [PAD] tokens is unwanted.
encode_pad = tokenizer.encode(
    sents[0],
    text_pair=sents[1],
    padding='max_length',
)
print(encode_pad)
print(len(encode_pad))  # equals the maximum acceptable input length for the model
print(tokenizer.decode(encode_pad))

[101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 511, 102, 5011, 6381, 3315, 4638, 7241, 4669, 4802, 2141, 4272, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [64]:
# max_length sets the target length used by the padding/truncation parameters;
# combined with padding='max_length' the pair is padded with [PAD] (id 0) up to 30 ids.
encode_pad_max_len = tokenizer.encode(
    sents[0],
    text_pair=sents[1],
    max_length=30,
    padding='max_length',
)
print(encode_pad_max_len)
print(len(encode_pad_max_len))  # equals max_length
print(tokenizer.decode(encode_pad_max_len))

[101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 511, 102, 5011, 6381, 3315, 4638, 7241, 4669, 4802, 2141, 4272, 511, 102, 0, 0, 0]
30
[CLS] 选 择 珠 江 花 园 的 原 因 就 是 方 便 。 [SEP] 笔 记 本 的 键 盘 确 实 爽 。 [SEP] [PAD] [PAD] [PAD]


In [78]:
# Truncation strategies accepted by the `truncation` argument. Each one truncates to
# max_length, or to the maximum acceptable input length for the model when max_length
# is not provided:
#   True / 'longest_first': truncate token by token, removing a token from the longest
#                           sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
#   'only_first':           only truncate the first sequence of a pair.
#   'only_second':          only truncate the second sequence of a pair.
encode_tru = tokenizer.encode(
    sents[0],
    max_length=5,
    truncation=True,
)
print(encode_tru)
print(len(encode_tru))  # equals max_length; the count includes [CLS] and [SEP]
print(tokenizer.decode(encode_tru))

[101, 6848, 2885, 4403, 102]
5
[CLS] 选 择 珠 [SEP]


In [66]:
# return_tensors selects the container type that encode() returns.

# Default (no return_tensors): a plain Python list.
result_list = tokenizer.encode(sents[0])
print(type(result_list))

# 'pt': a torch.Tensor.
result_torch = tokenizer.encode(sents[0], return_tensors='pt')
print(type(result_torch))

# 'tf': a TensorFlow EagerTensor.
result_tf = tokenizer.encode(sents[0], return_tensors='tf')
print(type(result_tf))

<class 'list'>
<class 'torch.Tensor'>
<class 'tensorflow.python.framework.ops.EagerTensor'>


In [87]:
# 增强编码
encode_plus_ = tokenizer.encode_plus(
    text=sents[0],
    text_pair=sents[1],
    truncation=True,
    padding='max_length',
    max_length=30,
    # 返回token_type_ids
    return_token_type_ids=True,
    # 返回attention_mask
    return_attention_mask=True,
    # 返回special_tokens_mask
    return_special_tokens_mask=True,
    # 返回length标识长度
    return_length=True,
)

# input_ids:编码结果
# token_type_ids:第一个句子和特殊符号的位置是0,第二个句子的位置是1
# attention_mask:pad的位置是0,其他位置是1
# special_tokens_mask:特殊符号的位置是1,其他位置是0
# length:句子长度
for k, v in encode_plus_.items():
    print(k, ':', v)

print(tokenizer.decode(encode_plus_['input_ids']))

input_ids : [101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 511, 102, 5011, 6381, 3315, 4638, 7241, 4669, 4802, 2141, 4272, 511, 102, 0, 0, 0]
token_type_ids : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
special_tokens_mask : [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1]
attention_mask : [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
length : 30
[CLS] 选 择 珠 江 花 园 的 原 因 就 是 方 便 。 [SEP] 笔 记 本 的 键 盘 确 实 爽 。 [SEP] [PAD] [PAD] [PAD]


In [98]:
# Batch-encode several single sentences in one call.
# NOTE(review): the transformers documentation marks batch_encode_plus as deprecated in
# favour of calling the tokenizer directly — confirm against the installed version.
single_sentences = [sents[0], sents[1]]
batch_encode_plus_0 = tokenizer.batch_encode_plus(
    single_sentences,
    # padding=True pads to the longest sequence in the batch
    # (or applies no padding if only a single sequence is provided).
    padding=True,
    return_token_type_ids=True,
    return_attention_mask=True,
    return_special_tokens_mask=True,
    return_length=True,
)

for field_name, field_value in batch_encode_plus_0.items():
    print(field_name, ':', field_value)

# After padding, both rows of input_ids have the same length. Note that the printed
# `length` field still reports [16, 12], while both padded rows hold 16 ids.
first_row = batch_encode_plus_0['input_ids'][0]
second_row = batch_encode_plus_0['input_ids'][1]
print(len(first_row))
print(len(second_row))

print(tokenizer.decode(first_row))
print(tokenizer.decode(second_row))

input_ids : [[101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 511, 102], [101, 5011, 6381, 3315, 4638, 7241, 4669, 4802, 2141, 4272, 511, 102, 0, 0, 0, 0]]
token_type_ids : [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
special_tokens_mask : [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]]
length : [16, 12]
attention_mask : [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]]
16
16
[CLS] 选 择 珠 江 花 园 的 原 因 就 是 方 便 。 [SEP]
[CLS] 笔 记 本 的 键 盘 确 实 爽 。 [SEP] [PAD] [PAD] [PAD] [PAD]


In [101]:
# Batch-encode sentence pairs: every batch item is a (first, second) tuple.
sentence_pairs = [
    (sents[0], sents[1]),
    (sents[2], sents[3]),
]
batch_encode_plus_1 = tokenizer.batch_encode_plus(
    sentence_pairs,
    return_token_type_ids=True,
    return_attention_mask=True,
    return_special_tokens_mask=True,
    return_length=True,
)

for field_name, field_value in batch_encode_plus_1.items():
    print(field_name, ':', field_value)

# padding defaults to False, so the two encoded pairs keep different lengths.
first_pair_ids = batch_encode_plus_1['input_ids'][0]
second_pair_ids = batch_encode_plus_1['input_ids'][1]
print(len(first_pair_ids))
print(len(second_pair_ids))

print(tokenizer.decode(first_pair_ids))
print(tokenizer.decode(second_pair_ids))

input_ids : [[101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 511, 102, 5011, 6381, 3315, 4638, 7241, 4669, 4802, 2141, 4272, 511, 102], [101, 2791, 7313, 1922, 2207, 511, 1071, 800, 4638, 6963, 671, 5663, 511, 102, 791, 1921, 2798, 4761, 6887, 6821, 741, 6820, 3300, 5018, 127, 1318, 117, 4696, 3300, 4157, 6944, 7315, 119, 102]]
token_type_ids : [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
special_tokens_mask : [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]
length : [27, 34]
attention_mask : [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [102]:
# 批量解码
tokenizer.batch_decode(batch_encode_plus_1['input_ids'])

['[CLS] 选 择 珠 江 花 园 的 原 因 就 是 方 便 。 [SEP] 笔 记 本 的 键 盘 确 实 爽 。 [SEP]',
 '[CLS] 房 间 太 小 。 其 他 的 都 一 般 。 [SEP] 今 天 才 知 道 这 书 还 有 第 6 卷, 真 有 点 郁 闷. [SEP]']