<a href="https://colab.research.google.com/github/codedm24/Transformers/blob/Transformers-Intro/Multiple-Sequences.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers



In [None]:
import transformers

In [None]:
!pip install transformers[sentencepiece]



In [34]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "What does machine learning do?"

tokens = tokenizer.tokenize(sequence)
print(f"tokens: {tokens}")
ids = tokenizer.convert_tokens_to_ids(tokens)
print(f"ids: {ids}")

input_ids = torch.tensor([ids])
print("Input IDs:", input_ids)

output = model(input_ids)
print("Logits:", output.logits)

print(tokenizer.decode(ids))
print(tokenizer.decode(input_ids[0]))

tokens: ['what', 'does', 'machine', 'learning', 'do', '?']
ids: [2054, 2515, 3698, 4083, 2079, 1029]
Input IDs: tensor([[2054, 2515, 3698, 4083, 2079, 1029]])
Logits: tensor([[ 3.5672, -2.8648]], grad_fn=<AddmmBackward0>)
what does machine learning do?
what does machine learning do?


Padding sequence ids

In [None]:
sequence1_ids = [[200,200,200]]
sequence2_ids = [[200,200]]
batched_ids = [
    [200,200,200],
    [200,200,tokenizer.pad_token_id]
]


Padding the inputs

In [2]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequence_ids1 = [[200,200,200]]
sequence_ids2 = [[200,200]]
batched_ids = [
    [200,200,200],
    [200,200,tokenizer.pad_token_id]
]
print(model(torch.tensor(sequence_ids1)))
print(model(torch.tensor(sequence_ids2)))
print(model(torch.tensor(batched_ids)))

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


SequenceClassifierOutput(loss=None, logits=tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
SequenceClassifierOutput(loss=None, logits=tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
SequenceClassifierOutput(loss=None, logits=tensor([[ 1.5694, -1.3895],
        [ 1.3374, -1.2163]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


Padding the inputs with attention mask

In [10]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequence_ids1 = [[200,200,200]]
sequence_ids2 = [[200,200]]
batched_ids = [
    [200,200,200],
    [200,200,tokenizer.pad_token_id]
]

attention_mask = [
    [1,1,1],
    [1,1,0]
]

print(torch.tensor(sequence_ids1))
print(torch.tensor(sequence_ids2))
print(torch.tensor(batched_ids))
print(torch.tensor(attention_mask))

print(model(torch.tensor(sequence_ids1)))
print(model(torch.tensor(sequence_ids2)))
print(model(torch.tensor(batched_ids), attention_mask=torch.tensor(attention_mask)))


#print(tokenizer.decode(torch.tensor(sequence_ids1)))
#print(tokenizer.decode(sequence_ids1["input_ids"]))

tensor([[200, 200, 200]])
tensor([[200, 200]])
tensor([[200, 200, 200],
        [200, 200,   0]])
tensor([[1, 1, 1],
        [1, 1, 0]])
SequenceClassifierOutput(loss=None, logits=tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
SequenceClassifierOutput(loss=None, logits=tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
SequenceClassifierOutput(loss=None, logits=tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


Using Tokenizer

In [29]:
sequence = ["What does Machine Learning do?"]
model_inputs = tokenizer(sequence, padding=True, return_tensors="pt")
print(f"model inputs: {model_inputs}")
print(f"model inputs: {model_inputs['input_ids']}")
ids = tokenizer.convert_tokens_to_ids(model_inputs)
print(f"ids: {ids}")
print(torch.tensor(model_inputs['input_ids']))
print(tokenizer.decode(model_inputs['input_ids'][0]))

model inputs: {'input_ids': tensor([[ 101, 2054, 2515, 3698, 4083, 2079, 1029,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}
model inputs: tensor([[ 101, 2054, 2515, 3698, 4083, 2079, 1029,  102]])
ids: [100, 100]
tensor([[ 101, 2054, 2515, 3698, 4083, 2079, 1029,  102]])
[CLS] what does machine learning do? [SEP]


  print(torch.tensor(model_inputs['input_ids']))


In [36]:
from transformers import AutoTokenizer
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
sequence = "What are the branches of Machine Learning?"
model_inputs = tokenizer(sequence)
print(f"model_inputs: {model_inputs}")

# pad sequences to max sequqnece length
model_inputs1 = tokenizer(sequence, padding='longest')
print(f"model_inputs1: {model_inputs1}")

#pad sequences to model max length
model_inputs2 = tokenizer(sequence, padding='max_length')
print(f"model_inputs2: {model_inputs2}")

model_inputs: {'input_ids': [101, 2054, 2024, 1996, 5628, 1997, 3698, 4083, 1029, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
model_inputs1: {'input_ids': [101, 2054, 2024, 1996, 5628, 1997, 3698, 4083, 1029, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
model_inputs2: {'input_ids': [101, 2054, 2024, 1996, 5628, 1997, 3698, 4083, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,