In [1]:
from transformers import pipeline


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build a sentiment-analysis pipeline with the checkpoint and revision pinned
# explicitly. Relying on the task's implicit default triggers the library's
# "not recommended in production" warning and lets results drift if the
# default ever changes; the values below are the ones the default resolved to.
classifier = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="714eb0f",
)

# Classify two example sentences; the last expression is displayed as the
# cell output (a list with one {'label', 'score'} dict per sentence).
classifier(
    [
        "I've been waiting for a HuggingFace course my whole life.",
        "I hate this so much!",
    ]
)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


[{'label': 'POSITIVE', 'score': 0.9598045349121094},
 {'label': 'NEGATIVE', 'score': 0.9994558691978455}]

# Preprocessing with a tokenizer

In [3]:
from transformers import AutoTokenizer

# Checkpoint name of the default sentiment-analysis model.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]

# Tokenize both sentences in a single call.
inputs = tokenizer(
    raw_inputs,
    padding=True,         # pad so that all inputs share the same length / shape
    truncation=True,      # cut off inputs that are too long
    return_tensors="pt",  # return PyTorch tensors
)
print(inputs)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [4]:
# Load the checkpoint with AutoModel. The output of this model is inspected
# through `last_hidden_state` in the next cell, i.e. it yields hidden states
# rather than classification logits.
from transformers import AutoModel

# Same checkpoint name as the one used for the tokenizer.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModel.from_pretrained(checkpoint)


In [6]:
# Forward pass: `**inputs` unpacks the tokenizer output (input_ids and
# attention_mask) into keyword arguments of the model call.
outputs = model(**inputs)
# Shape of the hidden states; the recorded run shows [2, 16, 768], matching
# the 2 input sentences padded to 16 token ids each.
print(outputs.last_hidden_state.shape)

torch.Size([2, 16, 768])


In [7]:
# Getting logits for classification:
# classify the sentences as positive or negative.
from transformers import AutoModelForSequenceClassification

# Same checkpoint name as before. Note that `model` and `outputs` are rebound
# here and replace the AutoModel objects created in the earlier cells.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**inputs)
# The recorded shape is [2, 2]: one row per input sentence, one column per label.
print(outputs.logits.shape)

torch.Size([2, 2])


In [None]:
# Postprocessing the output
# Our model predicted [-1.5607, 1.6123] for the first sentence and [ 4.1692, -3.3464] for the second one.
# Those are not probabilities but logits: the raw, unnormalized scores output by the last layer of the model.
# To be converted to probabilities, they need to go through a softmax layer
# (all 🤗 Transformers models output logits, because the loss function used for training generally fuses the
# last activation function, such as softmax, with the actual loss function, such as cross entropy).
print(outputs.logits)

tensor([[-1.5607,  1.6123],
        [ 4.1692, -3.3464]], grad_fn=<AddmmBackward0>)


In [None]:
# Make predictions: turn the logits into probabilities with a softmax over
# the last dimension (one entry per label).
# First sentence: NEGATIVE: 0.0402, POSITIVE: 0.9598
# Second sentence: NEGATIVE: 0.9995, POSITIVE: 0.0005
import torch

predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)



tensor([[4.0195e-02, 9.5980e-01],
        [9.9946e-01, 5.4418e-04]], grad_fn=<SoftmaxBackward0>)


In [10]:
# Mapping from class index to label name, needed to tell which column of
# `predictions` is NEGATIVE and which is POSITIVE.
model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}

In [None]:
# Create a model from a pretrained checkpoint and save it locally.
import os
from pathlib import Path

from transformers import AutoModel

# Save under a configurable, relative directory instead of a machine-specific
# absolute Windows path, so the cell runs on any OS and for any user.
# Set the MODELS_DIR environment variable to redirect the location
# (for example to D:\Learn\models).
MODELS_DIR = Path(os.environ.get("MODELS_DIR", "models"))
file_path = MODELS_DIR / "bert-base-cased"

model = AutoModel.from_pretrained("bert-base-cased")
# save_pretrained accepts a path-like object and creates the directory if needed.
model.save_pretrained(file_path)

# Loading the model back from the local path.
# Not executed here; shown for reference only.

# from transformers import AutoModel
# model = AutoModel.from_pretrained(file_path)



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Transformer models handle text by turning the inputs into numbers.
# Here we look at exactly what happens when text is processed by the tokenizer.

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Calling the tokenizer on a single string; the recorded output contains
# input_ids, token_type_ids and attention_mask as plain Python lists.
encoded_input = tokenizer("This is a test")
print(encoded_input)

{'input_ids': [101, 1188, 1110, 170, 2774, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}


In [4]:
# Decode the input IDs back into text. The result also contains the special
# tokens ([CLS] ... [SEP]) that the tokenizer added around the sentence.
print(tokenizer.decode(encoded_input["input_ids"]))

[CLS] This is a test [SEP]


In [5]:
# Encode multiple inputs in one call. No padding is requested here, so each
# sentence keeps its own length (6 and 8 ids in the recorded output).
texts = ["This is a test", "This is a second longer test"]
encoded_inputs = tokenizer(texts)
print(encoded_inputs)

{'input_ids': [[101, 1188, 1110, 170, 2774, 102], [101, 1188, 1110, 170, 1248, 2039, 2774, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]}


In [None]:
# Note that when passing multiple sentences, the tokenizer returns, for each
# dictionary value, one list per sentence.
# We can also ask the tokenizer to return PyTorch tensors directly:

# If your input sentences have different lengths, use the padding and
# truncation arguments to ensure that they all end up with the same length.
encoded_inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
print(encoded_inputs)

{'input_ids': tensor([[ 101, 1188, 1110,  170, 2774,  102,    0,    0],
        [ 101, 1188, 1110,  170, 1248, 2039, 2774,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1]])}


In [None]:
# Tokenization is the first step in using a transformer model: it converts
# raw text into a format that the model can understand.
# This usually involves splitting the text into smaller units (tokens) and
# mapping those tokens to numerical values (input IDs).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
sequence = "This is a test"
# tokenize() performs only the splitting step and returns the tokens as strings.
tokens = tokenizer.tokenize(sequence)
print(tokens)

['This', 'is', 'a', 'test']


In [9]:
# Tokens to input IDs: map each token string to its vocabulary id.
# No special tokens are added on this path; the recorded ids lack the
# leading 101 / trailing 102 seen when calling the tokenizer directly.
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[1188, 1110, 170, 2774]


In [13]:
# Decoding input IDs back to text.
# Note that the decode method not only converts the indices back to tokens,
# but also groups together the tokens that were part of the same words to produce a readable sentence.
# This behavior will be extremely useful when we use models that predict new text (either text generated
# from a prompt, or for sequence-to-sequence problems like translation or summarization).
decoded_string = tokenizer.decode(ids)
print(decoded_string)


This is a test


In [20]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Manual pipeline for a single sentence: tokenize -> ids -> tensor -> model.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "This is a great course, I am learning a lot!"

# 1. Split the text into token strings.
tokens = tokenizer.tokenize(sequence)
print("token:", tokens)

# 2. Map the tokens to vocabulary ids.
#    NOTE(review): no special-token ids (101 / 102) are added on this manual
#    path, unlike calling tokenizer(sequence) directly — confirm this is intended.
ids = tokenizer.convert_tokens_to_ids(tokens)
print("ids:", ids)

# 3. Wrap the ids in an outer list so the tensor gets a leading batch
#    dimension (a single row holding the whole sequence).
input_ids = torch.tensor([ids])
print("tensor:", input_ids)

# 4. Forward pass; the classification scores are exposed as `logits`.
output = model(input_ids)
print("logits:", output.logits)

token: ['this', 'is', 'a', 'great', 'course', ',', 'i', 'am', 'learning', 'a', 'lot', '!']
ids: [2023, 2003, 1037, 2307, 2607, 1010, 1045, 2572, 4083, 1037, 2843, 999]
tensor: tensor([[2023, 2003, 1037, 2307, 2607, 1010, 1045, 2572, 4083, 1037, 2843,  999]])
logits: tensor([[-4.1740,  4.4881]], grad_fn=<AddmmBackward0>)


In [22]:
# Batching groups multiple inputs together so they are processed at the same time.
# This is more efficient than processing each input individually, because the
# model can take advantage of parallelism.
# It matters most with large datasets, where it can significantly speed up
# both training and inference.
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequences = [
    "This is a great course, I am learning a lot!",
    "I hate this so much!",
    "This is the worst course I have ever taken.",
]

# Tokenize the whole batch in one call.
tokens = tokenizer(
    sequences,
    padding=True,         # pad shorter sentences up to the longest one in the batch
    truncation=True,      # cut off inputs that are too long
    return_tensors="pt",  # return PyTorch tensors
)
print("tokens:", tokens)



tokens: {'input_ids': tensor([[ 101, 2023, 2003, 1037, 2307, 2607, 1010, 1045, 2572, 4083, 1037, 2843,
          999,  102],
        [ 101, 1045, 5223, 2023, 2061, 2172,  999,  102,    0,    0,    0,    0,
            0,    0],
        [ 101, 2023, 2003, 1996, 5409, 2607, 1045, 2031, 2412, 2579, 1012,  102,
            0,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])}
