### 1. 加载模型（tokenizer）

In [80]:
from transformers import BertTokenizer

In [81]:
# Name of the pretrained checkpoint handed to from_pretrained() below.
model_name = "bert-base-uncased"

In [82]:
# Load the BERT tokenizer that belongs to the checkpoint named by model_name.
tokenizer = BertTokenizer.from_pretrained(model_name)

In [83]:
# Show the tokenizer's repr: vocab size, max length, padding side, special tokens.
tokenizer

PreTrainedTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [85]:
# Mapping from special-token role (unk/sep/pad/cls/mask) to its token string.
tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

In [86]:
# Collect just the special-token strings, in the order of special_tokens_map.
special_tokens = [token for token in tokenizer.special_tokens_map.values()]

In [87]:
# Display the collected special-token strings.
special_tokens

['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']

In [88]:
# Plain vocabulary lookup: one id per token, nothing is added around them.
tokenizer.convert_tokens_to_ids(special_tokens)

[100, 102, 0, 101, 103]

In [89]:
# encode() treats the token list as one sequence and wraps it with
# [CLS] (101) ... [SEP] (102), unlike convert_tokens_to_ids() above.
tokenizer.encode(special_tokens)

[101, 100, 102, 0, 101, 103, 102]

In [90]:
# Round-trip check: decode the ids produced by encode() rather than pasting
# them in by hand, so this cell stays correct if model_name is changed.
tokenizer.decode(tokenizer.encode(special_tokens))

'[CLS] [UNK] [SEP] [PAD] [CLS] [MASK] [SEP]'

### 2. 认识文本语料

- newsgroups_train.DESCR
- newsgroups_train.data
- newsgroups_train.target
- newsgroups_train.target_names

In [91]:
from sklearn.datasets import fetch_20newsgroups

In [92]:
# Load the training split of the 20 Newsgroups corpus.
newsgroups_train = fetch_20newsgroups(subset='train')

In [95]:
# The documents are stored in a plain Python list.
type(newsgroups_train.data)

list

In [96]:
# Number of documents in the training split.
len(newsgroups_train.data)

11314

In [98]:
# Number of labels; it should equal the number of documents (one label each).
len(newsgroups_train.target) 

11314

In [99]:
from collections import Counter

In [100]:
# Count how many training documents carry each integer class label.
Counter(newsgroups_train.target)

Counter({7: 594,
         4: 578,
         1: 584,
         14: 593,
         16: 546,
         13: 594,
         3: 590,
         2: 591,
         8: 598,
         19: 377,
         6: 585,
         0: 480,
         12: 591,
         5: 593,
         10: 600,
         9: 597,
         15: 599,
         17: 564,
         18: 465,
         11: 595})

In [101]:
# Human-readable class names; presumably list position i is the name for
# integer label i in `target` -- confirm against the dataset description.
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

### 3. tokenizer 补充

- input_ids, attention_mask
    - attention_mask：1 表示真实 token，0 表示 padding 位置；注意它与 `[MASK]` token 不同，后者用于 bert 的另外一个预训练任务 mlm（masked language model）；
- encode_plus, token_type_ids
    - token_type_ids: 0：表示第一句，1：表示第二句；只传入单个文本时（例如下面的 tokenizer()，即 tokenizer.\_\_call\_\_）全部为 0；传入句子对时（encode_plus 的 text / text_pair，tokenizer.\_\_call\_\_ 同样支持 text_pair 参数）前一句为 0，后一句为 1；
    - 句子对，一般用于 nsp（next sentence prediction，bert 预训练任务）

In [102]:
# Keep the first three documents as a small sample for the tokenizer demos.
test_news = newsgroups_train.data[:3]

In [107]:
# Length of the third sample document (len() of the raw text, not token count).
len(test_news[2])

1981

In [115]:
# Single-text input: encode one document, truncated to at most 32 token ids.
# With only one text, every token_type_id in the result is 0.
tokenizer(test_news[0], truncation=True, max_length=32)

{'input_ids': [101, 2013, 1024, 3393, 2099, 2595, 3367, 1030, 11333, 2213, 1012, 8529, 2094, 1012, 3968, 2226, 1006, 2073, 1005, 1055, 2026, 2518, 1007, 3395, 1024, 2054, 2482, 2003, 2023, 999, 1029, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [118]:
# Sentence-pair input: both texts go into one sequence, [CLS] A [SEP] B [SEP].
# The 32-id budget is shared by the two texts (the logged warning refers to
# the 'longest_first' truncation strategy); token_type_ids are 0 for the
# first text and 1 for the second.
# NOTE(review): newer transformers docs mark encode_plus as deprecated in
# favour of calling tokenizer(text, text_pair, ...) -- confirm for the
# installed version.
tokenizer.encode_plus(text=test_news[0], text_pair=test_news[1], max_length=32, truncation=True)

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


{'input_ids': [101, 2013, 1024, 3393, 2099, 2595, 3367, 1030, 11333, 2213, 1012, 8529, 2094, 1012, 3968, 2226, 102, 2013, 1024, 3124, 5283, 2080, 1030, 9806, 1012, 1057, 1012, 2899, 1012, 3968, 2226, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [119]:
# Decode the pair encoding back to text to see where [CLS]/[SEP] were placed.
# The ids are copied by hand from the 'input_ids' output of the encode_plus
# cell above, so they must be re-pasted if that cell's arguments change.
tokenizer.decode([101, 2013, 1024, 3393, 2099, 2595, 3367, 1030, 11333, 2213, 1012, 8529, 2094, 1012, 3968, 2226, 102, 2013, 1024, 3124, 5283, 2080, 1030, 9806, 1012, 1057, 1012, 2899, 1012, 3968, 2226, 102])

'[CLS] from : lerxst @ wam. umd. edu [SEP] from : guykuo @ carson. u. washington. edu [SEP]'