<a href="https://colab.research.google.com/github/gnudennis/hugggingface_demo/blob/main/01_tokenizer_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets evaluate transformers[sentencepiece] accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## tokenizer，构造输入

- tokenizer, model：相匹配，tokenizer output => model input
- Auto\*Tokenizer, AutoModel\*: Generic type
  - len(input_ids)==len(attention_mask)
  - tokenizer(test_sentences[0], )👉🏻 tokenizer.\_\_call\_\_ 👉🏻encode
  - tokenizer.encode == tokenizer.tokenize（分词） + tokenizer.convert_tokens_to_ids + 添加特殊token（首尾的 `[CLS]`、`[SEP]`）
  - tokenizer.decode 
  - tokenizer工作原理：tokenizer.vocab 字典存储了token=>id的映射关系
    - tokenizer.special_tokens_map


In [2]:
# Two short example sentences used by the tokenizer and model demos below.
test_sentences = [
    "today is not that bad",
    "today is so bad",
]
# DistilBERT checkpoint fine-tuned on SST-2 (sentiment classification).
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [4]:
# Load the tokenizer that matches the checkpoint named by `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Bare expression so the notebook displays the tokenizer's repr.
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased-finetuned-sst-2-english', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [5]:
# `padding='max_length'` pads every sequence to exactly `max_length` ids.
# That budget of 16 includes the `[CLS]` and `[SEP]` special tokens.
batch_input = tokenizer(
    test_sentences,
    truncation=True,
    padding='max_length',
    max_length=16,
    return_tensors='pt',
)
batch_input

{'input_ids': tensor([[ 101, 2651, 2003, 2025, 2008, 2919,  102,    0,    0,    0,    0,    0,
            0,    0,    0,    0],
        [ 101, 2651, 2003, 2061, 2919,  102,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}

In [6]:
# With `padding=True` the batch is padded to its longest sequence, so `max_length`
# does not set the padded width (7 here, not 256). Because `truncation=True`,
# `max_length` still caps how long a sequence may be before it is truncated.
batch_input = tokenizer(
    test_sentences,
    truncation=True,
    padding=True,
    max_length=256,
    return_tensors='pt',
)
batch_input

{'input_ids': tensor([[ 101, 2651, 2003, 2025, 2008, 2919,  102],
        [ 101, 2651, 2003, 2061, 2919,  102,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0]])}

In [7]:
# Same call without `max_length`: padding still goes to the longest sequence in the batch.
# This `batch_input` is the one fed to the model further down.
batch_input = tokenizer(
    test_sentences,
    truncation=True,
    padding=True,
    return_tensors='pt',
)
batch_input

{'input_ids': tensor([[ 101, 2651, 2003, 2025, 2008, 2919,  102],
        [ 101, 2651, 2003, 2061, 2919,  102,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0]])}

In [8]:
# Calling the tokenizer on one string (no `return_tensors`) yields plain Python lists
# for `input_ids` and `attention_mask`.
tokenizer(test_sentences[0])

{'input_ids': [101, 2651, 2003, 2025, 2008, 2919, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [9]:
# `encode` returns only the input ids; the special-token ids 101 ([CLS]) and 102 ([SEP])
# are added at the two ends.
tokenizer.encode(test_sentences[0])

[101, 2651, 2003, 2025, 2008, 2919, 102]

In [10]:
# `tokenize` only splits the text into tokens; no special tokens are added at this step.
tokenizer.tokenize(test_sentences[0])

['today', 'is', 'not', 'that', 'bad']

In [11]:
# Map the tokens to vocabulary ids: the result equals `encode` minus the [CLS]/[SEP] ids.
tokenizer.convert_tokens_to_ids(
    tokenizer.tokenize(test_sentences[0])
)

[2651, 2003, 2025, 2008, 2919]

In [12]:
# Round-trip the encoded ids back to tokens to make the added [CLS]/[SEP] visible.
tokenizer.convert_ids_to_tokens(
    tokenizer.encode(test_sentences[0])
)

['[CLS]', 'today', 'is', 'not', 'that', 'bad', '[SEP]']

In [13]:
# `decode` turns the ids back into a single string, special tokens included.
tokenizer.decode(
    tokenizer.encode(test_sentences[0])
)

'[CLS] today is not that bad [SEP]'

In [14]:
# Look up the vocabulary ids of the special tokens, in the order `special_tokens_map`
# stores them (unk, sep, pad, cls, mask for this tokenizer).
tokenizer.convert_tokens_to_ids(list(tokenizer.special_tokens_map.values()))

[100, 102, 0, 101, 103]

## model，调用模型

In [15]:
# Load the sequence-classification model for the same checkpoint as the tokenizer.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Bare expression so the notebook prints the module structure.
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [16]:
# Inspect the configuration; its `id2label` mapping is used later to turn predicted
# class indices into label names.
model.config

DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased-finetuned-sst-2-english",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "finetuning_task": "sst-2",
  "hidden_dim": 3072,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "initializer_range": 0.02,
  "label2id": {
    "NEGATIVE": 0,
    "POSITIVE": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.30.1",
  "vocab_size": 30522
}

In [17]:
import torch
import torch.nn.functional as F

In [18]:
# Check what the tokenizer call returned; the inference cell below relies on this
# object's `.to(device)` method.
type(batch_input)

transformers.tokenization_utils_base.BatchEncoding

In [19]:
# Prefer the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [20]:
# Put the model on the selected device; the inputs are moved to the same device below.
model.to(device)

# Inference only, so gradient tracking is switched off for the whole block.
with torch.no_grad():
    batch_input = batch_input.to(device)
    outputs = model(**batch_input)
    print(outputs)
    # Normalise the logits across the class dimension into probabilities.
    scores = outputs.logits.softmax(dim=1)
    print(scores)
    # Index of the highest-probability class for each sentence.
    labels = scores.argmax(dim=1)
    print(labels)
    # Translate class indices into the label names stored in the model config.
    labels = [model.config.id2label[class_idx] for class_idx in labels.tolist()]
    print(labels)

SequenceClassifierOutput(loss=None, logits=tensor([[-3.4620,  3.6118],
        [ 4.7508, -3.7899]], device='cuda:0'), hidden_states=None, attentions=None)
tensor([[8.4631e-04, 9.9915e-01],
        [9.9980e-01, 1.9531e-04]], device='cuda:0')
tensor([1, 0], device='cuda:0')
['POSITIVE', 'NEGATIVE']


## 加载语料

- newsgroups_train.DESCR(语料描述)
- newsgroups_train.data
- newsgroups_train.target
- newsgroups_train.target_names


In [21]:
from sklearn.datasets import fetch_20newsgroups

In [22]:
# Load only the training split of the 20 Newsgroups corpus.
newsgroups_train = fetch_20newsgroups(subset='train')

In [23]:
# Names of the newsgroup categories in this corpus.
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [24]:
from collections import Counter

In [25]:
# Number of training documents per integer class id found in `target`.
Counter(newsgroups_train.target)

Counter({7: 594,
         4: 578,
         1: 584,
         14: 593,
         16: 546,
         13: 594,
         3: 590,
         2: 591,
         8: 598,
         19: 377,
         6: 585,
         0: 480,
         12: 591,
         5: 593,
         10: 600,
         9: 597,
         15: 599,
         17: 564,
         18: 465,
         11: 595})

In [26]:
# Keep the first three raw documents as a small sample for the tokenizer examples below.
test_news = newsgroups_train.data[:3]

In [27]:
# Length of the first sample document as reported by `len` — presumably characters,
# since `data` holds raw text; this is not a token count.
len(test_news[0])

721

## tokenizer补充
- input_ids, attention_mask：通常bert相关模型都有
- token_type_ids 0:表示第一句，1:表示第二句子 👉🏻 不是所有模型都支持这个字段，主要用于bert NSP任务
  - tokenizer(句子列表)：每个元素都按单句编码，token_type_ids 都是0
  - encode_plus(第一句, 第二句)（句子对输入，tokenizer(第一句, 第二句) 效果相同）： 第一句是0，第二句是1

In [28]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification

# NOTE: from here on `model_name`, `tokenizer` and `model` are rebound to plain BERT,
# replacing the DistilBERT SST-2 objects created earlier in this notebook.
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Loaded through the generic `AutoModel` class (the cell output reports a `BertModel`).
model = AutoModel.from_pretrained(model_name)
# Same checkpoint loaded through the sequence-classification class. The load warning in
# the cell output lists checkpoint weights that were not used for it.
# NOTE(review): the classification head is presumably untrained for this checkpoint —
# confirm before using `cls_model` for predictions.
cls_model = AutoModelForSequenceClassification.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.pr

In [29]:
# A list of separate documents is encoded as a batch of single sentences, so every
# `token_type_ids` entry is 0; `max_length=8` truncates each document to 8 ids.
tokenizer(
    test_news,
    truncation=True,
    padding=True,
    max_length=8,
    return_tensors='pt',
)

{'input_ids': tensor([[  101,  2013,  1024,  3393,  2099,  2595,  3367,   102],
        [  101,  2013,  1024,  3124,  5283,  2080,  1030,   102],
        [  101,  2013,  1024,  1056, 29602,  6856,  1030,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1]])}

In [30]:
# Passing two texts encodes them as one sentence pair: `token_type_ids` is 0 for the
# first segment (through its [SEP]) and 1 for the second. The pair shares the
# `max_length=8` budget, so both documents are truncated.
# NOTE(review): the Hugging Face docs mark `encode_plus` as deprecated in favour of
# calling the tokenizer directly, i.e. `tokenizer(text, text_pair, ...)`.
tokenizer.encode_plus(
    test_news[0],
    test_news[1],
    truncation=True,
    padding=True,
    max_length=8,
    return_tensors='pt',
)

{'input_ids': tensor([[ 101, 2013, 1024,  102, 2013, 1024, 3124,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}

In [31]:
# Each inner list is one (first sentence, second sentence) pair, so the call below
# encodes a batch of two sentence pairs rather than four independent sentences.
texts = [
    'We introduce a new language representation model called BERT,',
    'which stands for Bidirectional Encoder Representations from Transformers.',
]
texts2 = [
    'BERT is conceptually simple and empirically powerful.',
    'Hugging Face is way more fun with friends and colleagues!',
]
# The shorter pair is padded to the length of the longer one.
inputs = tokenizer.batch_encode_plus(
    [texts, texts2],
    padding=True,
    truncation=True,
    return_tensors='pt',
)

In [32]:
# Display the encoded batch: input ids, token type ids and attention mask.
inputs

{'input_ids': tensor([[  101,  2057,  8970,  1037,  2047,  2653,  6630,  2944,  2170, 14324,
          1010,   102,  2029,  4832,  2005,  7226,  7442,  7542,  2389,  4372,
         16044,  2099, 15066,  2013, 19081,  1012,   102],
        [  101, 14324,  2003, 17158,  2135,  3722,  1998, 17537,  2135,  3928,
          1012,   102, 17662,  2227,  2003,  2126,  2062,  4569,  2007,  2814,
          1998,  8628,   999,   102,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         0, 0, 0]])}

In [33]:
# Decode both encoded pairs back to text to show where [CLS], [SEP] and [PAD] were placed.
id_rows = inputs['input_ids'].tolist()
for row in (0, 1):
    print(tokenizer.decode(id_rows[row]))

[CLS] we introduce a new language representation model called bert, [SEP] which stands for bidirectional encoder representations from transformers. [SEP]
[CLS] bert is conceptually simple and empirically powerful. [SEP] hugging face is way more fun with friends and colleagues! [SEP] [PAD] [PAD] [PAD]
