# Brief pipeline

In [28]:
import torch
from IPython.display import Image
from IPython.core.display import HTML

In [1]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis")
classifier(
    [
        "I've been waiting for a HuggingFace course my whole life.",
        "I hate this so much!",
    ]
)
# Raw text -> IDs -> logits -> predictions
# input message -> tokenizer -> model -> post processing

  from .autonotebook import tqdm as notebook_tqdm
No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9598050713539124},
 {'label': 'NEGATIVE', 'score': 0.9994558691978455}]

In [2]:
# Tokenizer is part of the model! The information should be available form the model hub.
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

print(tokenizer)

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased-finetuned-sst-2-english', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


In [22]:
model_input = tokenizer([
        "I've been waiting for a HuggingFace course my whole life.",
        "I hate this so much!",
    ], padding=True, truncation=True, return_tensors="pt")

# padding -> ensure input have the same length
# truncation -> ensure inputs have the same length
# attention_mask -> ignore the chosen word, for example padding word.

# IDs refer to library
print(type(model_input['input_ids'][0]))
print(model_input['input_ids'][0])
print(model_input['input_ids'][1])

print(type(model_input['attention_mask'][0]))
print(model_input['attention_mask'][0])
print(model_input['attention_mask'][1])

<class 'torch.Tensor'>
tensor([  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
         2607,  2026,  2878,  2166,  1012,   102])
tensor([ 101, 1045, 5223, 2023, 2061, 2172,  999,  102,    0,    0,    0,    0,
           0,    0,    0,    0])
<class 'torch.Tensor'>
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0])


In [20]:
# model part, now this model only output the hidden state without the head (for downstream task)
from transformers import AutoModel

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModel.from_pretrained(checkpoint)

Some weights of the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing DistilBertModel: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [26]:
outputs = model(**model_input)
print(outputs)
print(outputs.last_hidden_state.shape)  # batch, time, feture

BaseModelOutput(last_hidden_state=tensor([[[-0.1798,  0.2333,  0.6321,  ..., -0.3017,  0.5008,  0.1481],
         [ 0.2758,  0.6497,  0.3200,  ..., -0.0760,  0.5136,  0.1329],
         [ 0.9046,  0.0985,  0.2950,  ...,  0.3352, -0.1407, -0.6464],
         ...,
         [ 0.1466,  0.5661,  0.3235,  ..., -0.3376,  0.5100, -0.0561],
         [ 0.7500,  0.0487,  0.1738,  ...,  0.4684,  0.0030, -0.6084],
         [ 0.0519,  0.3729,  0.5223,  ...,  0.3584,  0.6500, -0.3883]],

        [[-0.2937,  0.7283, -0.1497,  ..., -0.1187, -1.0227, -0.0422],
         [-0.2206,  0.9384, -0.0951,  ..., -0.3643, -0.6605,  0.2407],
         [-0.1536,  0.8988, -0.0728,  ..., -0.2189, -0.8528,  0.0710],
         ...,
         [-0.3017,  0.9002, -0.0200,  ..., -0.1082, -0.8412, -0.0861],
         [-0.3338,  0.9674, -0.0729,  ..., -0.1952, -0.8181, -0.0634],
         [-0.3454,  0.8824, -0.0426,  ..., -0.0993, -0.8329, -0.1065]]],
       grad_fn=<NativeLayerNormBackward0>), hidden_states=None, attentions=None)
t

In [29]:
Image(url= "https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter2/transformer_and_head.svg")

In [30]:
# Include the classification head
from transformers import AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**model_input)

In [35]:
print(outputs)  # use the model output with softmax to get the final id
print(model.config.id2label)

SequenceClassifierOutput(loss=None, logits=tensor([[-1.5607,  1.6123],
        [ 4.1692, -3.3464]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
{0: 'NEGATIVE', 1: 'POSITIVE'}


# Model

In [43]:
from transformers import BertConfig, BertModel

# Building the config
config = BertConfig()

# Building the model from the config. Init with random weight
model = BertModel(config)

print(config)
outputs = model(**model_input)
print(outputs[0].shape)

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.27.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

torch.Size([2, 16, 768])


In [44]:
from transformers import BertModel

# pretrained model
model = BertModel.from_pretrained("bert-base-cased")

Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 800kB/s]
Downloading pytorch_model.bin: 100%|██████████| 436M/436M [00:08<00:00, 53.0MB/s] 
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassificati

In [45]:
outputs = model(**model_input)

In [46]:
print(outputs)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.3412, -0.1088,  0.1147,  ..., -0.2650,  0.1881, -0.1104],
         [-0.0316, -0.9939,  0.3383,  ..., -0.2290,  0.5234,  0.2093],
         [ 0.1651, -0.8795,  0.5937,  ..., -0.0165, -0.0986,  0.1480],
         ...,
         [-0.1512, -0.5010,  0.1329,  ..., -0.3520, -0.1183,  0.2425],
         [ 0.1559, -0.4795,  0.1415,  ..., -0.4274, -0.2023,  0.2730],
         [ 0.9356, -0.5068,  0.2157,  ..., -0.9903,  0.0920, -0.6285]],

        [[-0.0725,  0.0540, -0.0037,  ...,  0.1450,  0.2381, -0.0164],
         [-0.1619, -0.3062, -0.2282,  ...,  0.3782, -0.1170,  0.1295],
         [-0.1173, -0.1003,  0.1703,  ...,  0.3227, -0.1996,  0.1646],
         ...,
         [-0.1968, -0.3095, -0.2132,  ...,  0.3795, -0.0715,  0.0696],
         [-0.1486, -0.2172, -0.1510,  ...,  0.3222,  0.1034, -0.0320],
         [-0.1927, -0.1105,  0.1996,  ...,  0.4188,  0.2443, -0.0229]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_ou

# Tokenizer
word -> token (int IDs)

In [47]:
# word
tokenized_text = "Jim Henson was a puppeteer".split()
print(tokenized_text)

# char

['Jim', 'Henson', 'was', 'a', 'puppeteer']


In [49]:
# sub word
# annoyingly -> annoying ly

# Byte BPE (GPT2), WordPiece (BERT), SentencePiece, Unigram

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

Downloading (…)/main/tokenizer.json: 100%|██████████| 436k/436k [00:01<00:00, 354kB/s]


In [51]:
# final IDs
tokenizer("Using a Transformer network is simple")

{'input_ids': [101, 7993, 170, 13809, 23763, 2443, 1110, 3014, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [52]:
# to token
tokenizer.tokenize("Using a Transformer network is simple")

['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple']

In [61]:
from transformers.models.bert.tokenization_bert_fast import BertTokenizerFast
tokenizer: BertTokenizerFast
print(tokenizer.vocab_files_names)

{'vocab_file': 'vocab.txt', 'tokenizer_file': 'tokenizer.json'}
