In [83]:
from transformers import BertTokenizer

In [84]:
# Build the BERT tokenizer from the 'bert-base-chinese' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
# Bare last expression: the notebook displays the tokenizer's repr.
tokenizer

PreTrainedTokenizer(name_or_path='bert-base-chinese', vocab_size=21128, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [85]:
# Sample Chinese sentences reused by every tokenizer example below.
# The fourth entry uses ASCII ',' and '.', the others use full-width punctuation.
sents = [
    "选择珠江花园的原因就是方便。",
    "笔记本的键盘确实爽。",
    "房间太小。其他的都一般。",
    "今天才知道这书还有第6卷,真有点郁闷.",
    "机器背面似乎被撕了张什么标签，残胶还在。",
]

In [86]:
# Encode a single sentence.
# `text` (str, List[str], List[List[str]]): the sequence or batch of
# sequences to be encoded; here it is one sequence.
t_singe = tokenizer(text=sents[0])

print(t_singe)
print(tokenizer.decode(t_singe['input_ids']))

# Single sentence: the printed token_type_ids are all 0.
for field, values in t_singe.items():
    print(f'{field} : {values}')

{'input_ids': [101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 511, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
[CLS] 选 择 珠 江 花 园 的 原 因 就 是 方 便 。 [SEP]
input_ids : [101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 511, 102]
token_type_ids : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
attention_mask : [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [87]:
# Encode a sentence pair.
# `text_pair` (str, List[str], List[List[str]]): the sequence or batch of
# sequences to be encoded as the second member of each pair.
t_pair = tokenizer(
    text=sents[0],
    text_pair=sents[1],
)

print(t_pair)
print(tokenizer.decode(t_pair['input_ids']))

{'input_ids': [101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 511, 102, 5011, 6381, 3315, 4638, 7241, 4669, 4802, 2141, 4272, 511, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
[CLS] 选 择 珠 江 花 园 的 原 因 就 是 方 便 。 [SEP] 笔 记 本 的 键 盘 确 实 爽 。 [SEP]


In [88]:
# Encode a batch of three sentence pairs: sents[0..2] paired with sents[2..4].
t_b = tokenizer(
    text=sents[:3],        # batch of first sequences
    text_pair=sents[2:5],  # batch of second sequences
)

# Sentence pairs: token_type_ids are all 0 over the first sentence
# and all 1 over the second sentence.
for field, values in t_b.items():
    print(field, ':', values)

# Without padding, each encoded pair keeps its own length.
for ids in t_b['input_ids']:
    print(len(ids))

# Decode each pair back to text, one at a time.
for ids in t_b['input_ids']:
    print(tokenizer.decode(ids))

input_ids : [[101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 511, 102, 2791, 7313, 1922, 2207, 511, 1071, 800, 4638, 6963, 671, 5663, 511, 102], [101, 5011, 6381, 3315, 4638, 7241, 4669, 4802, 2141, 4272, 511, 102, 791, 1921, 2798, 4761, 6887, 6821, 741, 6820, 3300, 5018, 127, 1318, 117, 4696, 3300, 4157, 6944, 7315, 119, 102], [101, 2791, 7313, 1922, 2207, 511, 1071, 800, 4638, 6963, 671, 5663, 511, 102, 3322, 1690, 5520, 7481, 849, 725, 6158, 3056, 749, 2476, 784, 720, 3403, 5041, 8024, 3655, 5540, 6820, 1762, 511, 102]]
token_type_ids : [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
attention_mask : [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1

In [89]:
# Decode the whole batch in one call; the returned list of strings is
# shown as the cell's output.
batch_ids = t_b['input_ids']
tokenizer.batch_decode(batch_ids)

['[CLS] 选 择 珠 江 花 园 的 原 因 就 是 方 便 。 [SEP] 房 间 太 小 。 其 他 的 都 一 般 。 [SEP]',
 '[CLS] 笔 记 本 的 键 盘 确 实 爽 。 [SEP] 今 天 才 知 道 这 书 还 有 第 6 卷, 真 有 点 郁 闷. [SEP]',
 '[CLS] 房 间 太 小 。 其 他 的 都 一 般 。 [SEP] 机 器 背 面 似 乎 被 撕 了 张 什 么 标 签 ， 残 胶 还 在 。 [SEP]']

In [90]:
# add_special_tokens (bool, optional, defaults to True):
#     Whether or not to encode the sequences with the special tokens
#     relative to their model.

# With add_special_tokens=False the output has no [CLS]/[SEP] markers.
t_astf = tokenizer(
    text=sents[0],
    text_pair=sents[1],
    add_special_tokens=False,
)

print(t_astf)
print(tokenizer.decode(t_astf['input_ids']), end='\n\n')

# Same pair with the default (add_special_tokens=True) for comparison.
t_astfd = tokenizer(
    text=sents[0],
    text_pair=sents[1],
)

print(t_astfd)
print(tokenizer.decode(t_astfd['input_ids']))

{'input_ids': [6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 511, 5011, 6381, 3315, 4638, 7241, 4669, 4802, 2141, 4272, 511], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
选 择 珠 江 花 园 的 原 因 就 是 方 便 。 笔 记 本 的 键 盘 确 实 爽 。

{'input_ids': [101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 511, 102, 5011, 6381, 3315, 4638, 7241, 4669, 4802, 2141, 4272, 511, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
[CLS] 选 择 珠 江 花 园 的 原 因 就 是 方 便 。 [SEP] 笔 记 本 的 键 盘 确 实 爽 。 [SEP]


In [91]:
# padding options:
#   True or 'longest': pad to the longest sequence in the batch
#       (no padding if only a single sequence is provided).
#   'max_length': pad to the length given by `max_length`, or to the maximum
#       acceptable input length for the model if that argument is not provided.
#   False or 'do_not_pad' (default): no padding, so a batch can contain
#       sequences of different lengths.
t_p_max_length = tokenizer(
    text=sents[0],
    text_pair=sents[1],
    padding="max_length",
)

print(t_p_max_length)
# No max_length given: this is the maximum acceptable input length for the model.
print(len(t_p_max_length['input_ids']))
print(tokenizer.decode(t_p_max_length['input_ids']))

{'input_ids': [101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 511, 102, 5011, 6381, 3315, 4638, 7241, 4669, 4802, 2141, 4272, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [92]:
# `max_length` controls the maximum length used by the truncation/padding
# parameters; combined with padding="max_length" the pair is padded to 30 ids.
t_p_num = tokenizer(
    text=sents[0],
    text_pair=sents[1],
    padding="max_length",
    max_length=30,
)

print(t_p_num)
print(len(t_p_num['input_ids']))

# Decode once, then show the text and its whitespace-separated token count.
decoded_padded = tokenizer.decode(t_p_num['input_ids'])
print(decoded_padded)
print(len(decoded_padded.split()))

{'input_ids': [101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 511, 102, 5011, 6381, 3315, 4638, 7241, 4669, 4802, 2141, 4272, 511, 102, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]}
30
[CLS] 选 择 珠 江 花 园 的 原 因 就 是 方 便 。 [SEP] 笔 记 本 的 键 盘 确 实 爽 。 [SEP] [PAD] [PAD] [PAD]
30


In [93]:
# padding=True pads every sequence to the longest sequence in the batch.
t_p_true = tokenizer(text=sents[:3], padding=True)

print(t_p_true)
# One length per sequence (previously index 0 was printed three times, so
# sequences 1 and 2 were never actually checked). All three should be equal.
print(*(len(ids) for ids in t_p_true['input_ids']))

# Decode each padded sequence; shorter ones end in [PAD] tokens.
for ids in t_p_true['input_ids']:
    print(tokenizer.decode(ids))

{'input_ids': [[101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 511, 102], [101, 5011, 6381, 3315, 4638, 7241, 4669, 4802, 2141, 4272, 511, 102, 0, 0, 0, 0], [101, 2791, 7313, 1922, 2207, 511, 1071, 800, 4638, 6963, 671, 5663, 511, 102, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]}
16 16 16
[CLS] 选 择 珠 江 花 园 的 原 因 就 是 方 便 。 [SEP]
[CLS] 笔 记 本 的 键 盘 确 实 爽 。 [SEP] [PAD] [PAD] [PAD] [PAD]
[CLS] 房 间 太 小 。 其 他 的 都 一 般 。 [SEP] [PAD] [PAD]


In [94]:
# truncation options (each truncates to `max_length`, or to the maximum
# acceptable input length for the model if that argument is not provided):
#   True or 'longest_first': truncate token by token, removing a token from
#       the longest sequence in the pair if a pair (or batch of pairs) is provided.
#   'only_first': only truncate the first sequence of a pair.
#   'only_second': only truncate the second sequence of a pair.
#   False (default): no truncation.
t_t_num = tokenizer(
    text=sents[0],
    truncation=True,
    max_length=5,
)

print(t_t_num)  # truncated to a maximum length of 5
print(len(t_t_num['input_ids']))
print(tokenizer.decode(t_t_num['input_ids']))

{'input_ids': [101, 6848, 2885, 4403, 102], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}
5
[CLS] 选 择 珠 [SEP]


In [95]:
# truncation=True on a pair with a total budget of 9 ids.
t_t_num_pair = tokenizer(
    text=sents[0],
    text_pair=sents[1],
    truncation=True,
    max_length=9,
)

# The special tokens [CLS] [SEP] [SEP] are kept.
# Remaining budget = 9 - 3 = 6.
# The first sentence keeps 6 / 2 = 3 tokens (rounded up if the split is uneven).
# The second sentence keeps 6 / 2 = 3 tokens (rounded down if the split is uneven).
print(t_t_num_pair)
print(len(t_t_num_pair['input_ids']))
print(tokenizer.decode(t_t_num_pair['input_ids']), end='\n\n')

# 'only_first': only the first sentence is truncated (maximum length 20).
t_t_num_pair_first = tokenizer(
    text=sents[0],
    text_pair=sents[1],
    truncation='only_first',
    max_length=20,
)

print(t_t_num_pair_first)
print(len(t_t_num_pair_first['input_ids']))
print(tokenizer.decode(t_t_num_pair_first['input_ids']), end='\n\n')

# 'only_second': only the second sentence is truncated (maximum length 20).
t_t_num_pair_second = tokenizer(
    text=sents[0],
    text_pair=sents[1],
    truncation='only_second',
    max_length=20,
)

print(t_t_num_pair_second)
print(len(t_t_num_pair_second['input_ids']))
print(tokenizer.decode(t_t_num_pair_second['input_ids']))

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


{'input_ids': [101, 6848, 2885, 4403, 102, 5011, 6381, 3315, 102], 'token_type_ids': [0, 0, 0, 0, 0, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}
9
[CLS] 选 择 珠 [SEP] 笔 记 本 [SEP]

{'input_ids': [101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 102, 5011, 6381, 3315, 4638, 7241, 4669, 4802, 2141, 4272, 511, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
20
[CLS] 选 择 珠 江 花 园 的 [SEP] 笔 记 本 的 键 盘 确 实 爽 。 [SEP]

{'input_ids': [101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 511, 102, 5011, 6381, 3315, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
20
[CLS] 选 择 珠 江 花 园 的 原 因 就 是 方 便 。 [SEP] 笔 记 本 [SEP]


In [96]:
# Show the current padding/truncation sides, one per line.
print(tokenizer.padding_side, tokenizer.truncation_side, sep='\n')

# Switch both sides to 'left'. This mutates the shared tokenizer, so every
# later call is affected until the sides are set back.
tokenizer.padding_side = tokenizer.truncation_side = 'left'

# Confirm the new settings.
print(tokenizer.padding_side, tokenizer.truncation_side, sep='\n')

right
right
left
left


In [97]:
# Same max-length padding as before, but with padding_side == 'left'
# the pad ids are placed in front of the encoded pair.
t_p_max_length_left = tokenizer(
    text=sents[0],
    text_pair=sents[1],
    padding="max_length",
)

print(t_p_max_length_left)
# No max_length given: this is the maximum acceptable input length for the model.
print(len(t_p_max_length_left['input_ids']))
print(tokenizer.decode(t_p_max_length_left['input_ids']))

{'input_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [98]:
# Truncate to a maximum length of 5 with truncation_side == 'left':
# tokens are dropped from the start, so the end of the sentence is kept.
t_t_num_left = tokenizer(
    text=sents[0],
    truncation=True,
    max_length=5,
)

print(t_t_num_left)
print(len(t_t_num_left['input_ids']))
print(tokenizer.decode(t_t_num_left['input_ids']))

{'input_ids': [101, 3175, 912, 511, 102], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}
5
[CLS] 方 便 。 [SEP]


In [99]:
# Put both sides back to 'right' so later cells pad/truncate on the right again.
tokenizer.padding_side = tokenizer.truncation_side = 'right'

In [100]:
# Default (no return_tensors): every field is a plain Python list.
result_list = tokenizer(text=sents[0])
for field in ('input_ids', 'token_type_ids', 'attention_mask'):
    print(type(result_list[field]))

# return_tensors='pt': every field is a torch.Tensor.
result_torch = tokenizer(text=sents[0], return_tensors='pt')
print(result_torch)
for field in ('input_ids', 'token_type_ids', 'attention_mask'):
    print(result_torch[field].shape, result_torch[field].dtype)
for field in ('input_ids', 'token_type_ids', 'attention_mask'):
    print(type(result_torch[field]))

# return_tensors='tf': every field is a TensorFlow EagerTensor.
result_tf = tokenizer(text=sents[0], return_tensors='tf')
for field in ('input_ids', 'token_type_ids', 'attention_mask'):
    print(type(result_tf[field]))

<class 'list'>
<class 'list'>
<class 'list'>
{'input_ids': tensor([[ 101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221,
         3175,  912,  511,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
torch.Size([1, 16]) torch.int64
torch.Size([1, 16]) torch.int64
torch.Size([1, 16]) torch.int64
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'tensorflow.python.framework.ops.EagerTensor'>
<class 'tensorflow.python.framework.ops.EagerTensor'>
<class 'tensorflow.python.framework.ops.EagerTensor'>


In [101]:
# Encode a pair to exactly 30 ids and request the optional output fields.
t_return = tokenizer(
    text=sents[0],
    text_pair=sents[1],
    max_length=30,
    padding='max_length',
    truncation=True,
    return_token_type_ids=True,       # default: True
    return_attention_mask=True,       # default: True
    return_special_tokens_mask=True,  # default: False
    return_length=True,               # default: False
)

# input_ids:           the encoded ids.
# token_type_ids:      0 over the first sentence (with [CLS] and its [SEP]),
#                      1 over the second sentence and its [SEP], 0 over padding
#                      (not every tokenizer has this field).
# attention_mask:      0 at [PAD] positions, 1 everywhere else.
# special_tokens_mask: 1 at special-token positions ([PAD] included), 0 elsewhere.
# length:              length of the encoded sequence (here 30, padding included).
for field, values in t_return.items():
    print(field, ':', values)

print(tokenizer.decode(t_return['input_ids']))

input_ids : [101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 511, 102, 5011, 6381, 3315, 4638, 7241, 4669, 4802, 2141, 4272, 511, 102, 0, 0, 0]
token_type_ids : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
special_tokens_mask : [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1]
attention_mask : [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
length : 30
[CLS] 选 择 珠 江 花 园 的 原 因 就 是 方 便 。 [SEP] 笔 记 本 的 键 盘 确 实 爽 。 [SEP] [PAD] [PAD] [PAD]


In [102]:
# tokenize() converts a string into a sequence of tokens using the tokenizer;
# no ids are produced and no special tokens appear in the result.
participle = tokenizer.tokenize(text=sents[0])
participle  # displayed as the cell output: the tokenizer's token list

['选', '择', '珠', '江', '花', '园', '的', '原', '因', '就', '是', '方', '便', '。']

In [103]:
# tokens -> ids
tti = tokenizer.convert_tokens_to_ids(participle)
print(tti)

# ids -> tokens (round trip back to the token list)
itt = tokenizer.convert_ids_to_tokens(tti)
print(itt)

# decode() joins the tokens into one space-separated string.
decoded_tokens = tokenizer.decode(tti)
print(decoded_tokens)
# Splitting on ' ' matches `itt` for this sentence.
# NOTE(review): presumably this only holds when every token is a whole
# character/word, as here -- confirm before relying on it in general.
print(decoded_tokens.split(' '))

[6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 511]
['选', '择', '珠', '江', '花', '园', '的', '原', '因', '就', '是', '方', '便', '。']
选 择 珠 江 花 园 的 原 因 就 是 方 便 。
['选', '择', '珠', '江', '花', '园', '的', '原', '因', '就', '是', '方', '便', '。']
