# Behind the pipeline

We will be looking under the hood of the following classification pipeline.

In [1]:
# classification
from transformers import pipeline

classifier = pipeline("sentiment-analysis")
classifier(
    [
        "I've been waiting for a HuggingFace course my whole life.",
        "I hate this so much!",
    ]
)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9598049521446228},
 {'label': 'NEGATIVE', 'score': 0.9994558691978455}]

Under the hood, the pipeline tokenizes the raw text into numeric inputs and passes them to the model. It takes the model's output of logit probabilities, and conducts post-processing to convert the output into predictions. 

In [3]:
# tokenization
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!"
]
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
print(inputs)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [4]:
# generate model outputs
from transformers import AutoModel

model = AutoModel.from_pretrained(checkpoint)
outputs = model(**inputs)

The model consists of embeddings, layers, hidden states and a head, which can be configured for different purposes. 

In [7]:
print(outputs.last_hidden_state.shape)

torch.Size([2, 16, 768])


In [9]:
# generate model outputs for classification
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**inputs)
print(outputs.logits.shape)

torch.Size([2, 2])


In [11]:
# convert outputs to probabilities
import torch

predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)

tensor([[4.0195e-02, 9.5980e-01],
        [9.9946e-01, 5.4418e-04]], grad_fn=<SoftmaxBackward0>)


In [13]:
# get labels from model
print(model.config.id2label)

{0: 'NEGATIVE', 1: 'POSITIVE'}


# Models

Models can be created from the model configs.

In [17]:
# Create model from model configs
from transformers import BertConfig, BertModel

configs = BertConfig()
model = BertModel(configs)

In [16]:
print(configs)

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.32.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [19]:
# load model from checkpoint
checkpoint = "bert-base-cased"
model = BertModel.from_pretrained(checkpoint)

# save model to disk
model.save_pretrained("saved_models")

In [34]:
# tokenize input and generate output
from transformers import AutoTokenizer

sequences = ["Hello!", "Cool.", "Nice!"]
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
encoded_sequences = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
print(encoded_sequences)

{'input_ids': tensor([[  101,  8667,   106,   102],
        [  101, 13297,   119,   102],
        [  101,  8835,   106,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1],
        [1, 1, 1, 1],
        [1, 1, 1, 1]])}




In [35]:
outputs = model(**encoded_sequences)

# Playing with tokenizer and model outputs

In [132]:
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much! Why?",
]
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
outputs = model(**inputs)

In [133]:
outputs.last_hidden_state.shape

torch.Size([2, 16, 768])

In [134]:
from torch.nn.functional import softmax

model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**inputs)
softmax(outputs.logits, dim=-1)

tensor([[4.0195e-02, 9.5980e-01],
        [9.9944e-01, 5.6230e-04]], grad_fn=<SoftmaxBackward0>)

In [135]:
model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}

In [136]:
sequences = ["Hello!", "Cool.", "Nice!"]
inputs = tokenizer(sequences)

In [137]:
print(inputs["input_ids"])

[[101, 7592, 999, 102], [101, 4658, 1012, 102], [101, 3835, 999, 102]]


In [138]:
import torch

input_tensor = torch.tensor(inputs["input_ids"])

In [139]:
input_tensor

tensor([[ 101, 7592,  999,  102],
        [ 101, 4658, 1012,  102],
        [ 101, 3835,  999,  102]])

In [140]:
output = model(input_tensor)

In [141]:
output

SequenceClassifierOutput(loss=None, logits=tensor([[-3.7235,  3.9691],
        [-4.2219,  4.5807],
        [-4.2852,  4.6166]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [142]:
tokenizer("tokenize me this!")

{'input_ids': [101, 19204, 4697, 2033, 2023, 999, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [143]:
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer("tokenize me this!")

{'input_ids': [101, 22559, 3708, 1143, 1142, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [144]:
sequence = "tokenize me this!"
tokenized_sequence = tokenizer.tokenize(sequence)
tokenized_sequence

['token', '##ize', 'me', 'this', '!']

In [145]:
ids = tokenizer.convert_tokens_to_ids(tokenized_sequence)
ids

[22559, 3708, 1143, 1142, 106]

In [152]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.nn.functional import softmax

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "How can something come from nothing?"

tokenized_sequence = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokenized_sequence)
input_ids = torch.tensor([ids])

output = model(input_ids)
print(softmax(output.logits, dim=-1))

tensor([[9.9917e-01, 8.3222e-04]], grad_fn=<SoftmaxBackward0>)


In [153]:
output

SequenceClassifierOutput(loss=None, logits=tensor([[ 3.9118, -3.1788]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

# Putting it all together

In [115]:
from transformers import AutoTokenizer, AutoModelForCausalLM

checkpoint = "openai-gpt"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

sequence = "Who are you?"
input_ids = tokenizer.encode(sequence, return_tensors="pt")

output = model.generate(
    input_ids, 
    max_length=100,
    no_repeat_ngram_size=2,
    temperature=0.7,
    top_p=0.9,
    do_sample=True
)

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

who are you? " 
 " i'm not a reporter. i've just come from the newspaper. " she 'd already gotten her name and number from a friend. she wasn't going to waste the time of the person who 'd written that story. 
 she didn't want to risk any more chances. the press would be after her. they 'd find out about her, and they wouldn't stop until they got to her father. her own father, who was in a coma. he 'd
