In [3]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint identifier handed to both from_pretrained() calls below
# (going by its name, a DistilBERT model fine-tuned on SST-2 — confirm on the model card).
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
# The tokenizer and the sequence-classification model are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

## 1.用法示例

In [None]:
# Example sentences encoded throughout this cell.
# NOTE(review): the variable name is misspelled ("senteces"); it is left unchanged
# in case other cells reference it.
test_senteces = ['today is not that bad', 'today is so bad', 'so good']

# Batch encoding: truncation and padding are enabled and the result is returned as
# PyTorch tensors. In the recorded output the shorter row is padded with id 0 and
# gets attention_mask 0 at the padded position.
batch_input = tokenizer(test_senteces, truncation=True, padding=True, return_tensors='pt')
# NOTE(review): the recorded output below lists only two rows although three
# sentences are encoded — presumably captured before 'so good' was added; re-run to refresh.
# >>> {'input_ids': tensor([[ 101, 2651, 2003, 2025, 2008, 2919,  102],[ 101, 2651, 2003, 2061, 2919,  102,    0]]),
# >>> 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 0]])}

# Encoding a single sentence without return_tensors yields plain Python lists.
tokenizer(test_senteces[0])
# >>> {'input_ids': [101, 2651, 2003, 2025, 2008, 2919, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

# encode() returns only the input ids; the recorded ids start with 101 ([CLS]) and end with 102 ([SEP]).
tokenizer.encode(test_senteces[0])
# >>> [101, 2651, 2003, 2025, 2008, 2919, 102]
# tokenize() + convert_tokens_to_ids() gives the same word ids but WITHOUT the
# special tokens, so it is not equivalent to a plain encode() (compare the two
# recorded outputs); it corresponds to encode(..., add_special_tokens=False).
tokenizer.convert_tokens_to_ids(tokenizer.tokenize(test_senteces[0]))
# >>> [2651, 2003, 2025, 2008, 2919]
# decode() maps ids back to text, special tokens included.
tokenizer.decode([101, 2651, 2003, 2025, 2008, 2919, 102])
# >>> '[CLS] today is not that bad [SEP]'

# Special tokens known to this tokenizer, followed by their vocabulary ids.
tokenizer.special_tokens_map.values()
# >>> dict_values(['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]'])
tokenizer.convert_tokens_to_ids(list(tokenizer.special_tokens_map.values()))
# >>> [100, 102, 0, 101, 103]



## 2.模型调用

In [None]:
# Show the loaded model's configuration as this cell's output
# (its id2label mapping is what the output-parsing cell uses to name predictions).
model.config

In [None]:
# Parse the model output: logits -> class probabilities -> class ids -> label names.
with torch.no_grad():  # inference only, so gradient tracking is switched off
    outputs = model(**batch_input)
    print(outputs)
    # Softmax over dim 1 turns each row of logits into probabilities.
    scores = F.softmax(outputs.logits, dim=1)
    print(scores)
    # Index of the highest-scoring class for every row.
    label_ids = torch.argmax(scores, dim=1)
    print(label_ids)
    # .tolist() converts the id tensor into plain Python ints, which are the keys
    # of model.config.id2label. The tensor keeps its own name (label_ids) so that
    # `labels` only ever holds the list of label strings, and the loop variable no
    # longer shadows the builtin `id`.
    labels = [model.config.id2label[label_id] for label_id in label_ids.tolist()]
    print(labels)

## 3.Bert tokenizer

In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import
# cell at the top so all dependencies are visible in one place.
from transformers import BertTokenizer
# NOTE(review): the two assignments below rebind `model_name` and `tokenizer`,
# replacing the values set in the first cell. Any cell executed after this one
# sees the bert-base-uncased tokenizer, while `model` is left untouched.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
# Recorded repr of the tokenizer object (the assignment above displays nothing by itself):
# >>> PreTrainedTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right',
# >>> special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

# Mapping from special-token role to the token string.
tokenizer.special_tokens_map
# >>> {'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}

In [None]:
# NOTE(review): `test_news` is not defined in the cells visible here; it has to
# exist (presumably a list of news texts) before this cell runs — confirm.

# Single-sentence encoding, truncated to at most 32 tokens.
tokenizer(test_news[0], truncation=True, max_length=32)

# >>> {'input_ids': [101, 2013, 1024, 3393, 2099, 2595, 3367, 1030, 11333, 2213, 1012, 8529, 2094, 1012, 3968, 2226, 1006, 2073, 1005, 1055, 2026, 2518, 1007, 3395, 1024, 2054, 2482, 2003, 2023, 999, 1029, 102],
# >>>  'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
# >>>  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

# Sentence-pair encoding, also capped at 32 tokens in total. The tokenizer is
# called directly because encode_plus() is documented as deprecated in favour of
# __call__, which accepts the same text / text_pair arguments.
# In the recorded output, token_type_ids is 0 for the first text (through its
# [SEP], id 102) and 1 for the second text.
tokenizer(text=test_news[0], text_pair=test_news[1], max_length=32, truncation=True)

# >>> {'input_ids': [101, 2013, 1024, 3393, 2099, 2595, 3367, 1030, 11333, 2213, 1012, 8529, 2094, 1012, 3968, 2226, 102, 2013, 1024, 3124, 5283, 2080, 1030, 9806, 1012, 1057, 1012, 2899, 1012, 3968, 2226, 102],
# >>> 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
# >>> 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
