# Handling multiple sequences (TensorFlow)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [66]:
!pip install datasets evaluate transformers[sentencepiece]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [67]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = tf.constant(ids)
# This line will fail.
model(input_ids)

Some layers from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized: ['dropout_99']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[-2.7276225,  2.8789392]], dtype=float32)>, hidden_states=None, attentions=None)

In [68]:
tokenized_inputs = tokenizer(sequence, return_tensors="tf")
print(tokenized_inputs["input_ids"])

tf.Tensor(
[[  101  1045  1005  2310  2042  3403  2005  1037 17662 12172  2607  2026
   2878  2166  1012   102]], shape=(1, 16), dtype=int32)


In [69]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

input_ids = tf.constant([ids])
print("Input IDs:", input_ids)

output = model(input_ids)
print("Logits:", output.logits)

Some layers from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized: ['dropout_119']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input IDs: tf.Tensor(
[[ 1045  1005  2310  2042  3403  2005  1037 17662 12172  2607  2026  2878
   2166  1012]], shape=(1, 14), dtype=int32)
Logits: tf.Tensor([[-2.7276225  2.8789392]], shape=(1, 2), dtype=float32)


In [70]:
batched_ids = [ids, ids]
print(batched_ids)

[[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012], [1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]]


✏️ Try it out! Convert this batched_ids list into a tensor and pass it through your model. Check that you obtain the same logits as before (but twice)!

In [71]:
batched_input_ids = tf.constant(batched_ids)
print("Batched input IDs:", batched_input_ids)

Batched input IDs: tf.Tensor(
[[ 1045  1005  2310  2042  3403  2005  1037 17662 12172  2607  2026  2878
   2166  1012]
 [ 1045  1005  2310  2042  3403  2005  1037 17662 12172  2607  2026  2878
   2166  1012]], shape=(2, 14), dtype=int32)


In [72]:
output = model(batched_input_ids)
print("Batched logits:", output.logits)

Batched logits: tf.Tensor(
[[-2.7276185  2.878935 ]
 [-2.7276206  2.878937 ]], shape=(2, 2), dtype=float32)


# Yes

# Padding the inputs

In [73]:
batched_ids = [
    [200, 200, 200],
    [200, 200]
]

In [74]:
padding_id = 100

batched_ids = [
    [200, 200, 200],
    [200, 200, padding_id],
]

In [75]:
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

print(model(tf.constant(sequence1_ids)).logits)
print(model(tf.constant(sequence2_ids)).logits)
print(model(tf.constant(batched_ids)).logits)

Some layers from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized: ['dropout_139']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tf.Tensor([[ 1.5693678 -1.3894578]], shape=(1, 2), dtype=float32)
tf.Tensor([[ 0.58030325 -0.41252738]], shape=(1, 2), dtype=float32)
tf.Tensor(
[[ 1.569367  -1.3894578]
 [ 1.3373486 -1.2163193]], shape=(2, 2), dtype=float32)


In [76]:
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

attention_mask = [
    [1, 1, 1],
    [1, 1, 0],
]

outputs = model(tf.constant(batched_ids), attention_mask=tf.constant(attention_mask))
print(outputs.logits)

tf.Tensor(
[[ 1.569367   -1.3894578 ]
 [ 0.58029795 -0.4125215 ]], shape=(2, 2), dtype=float32)


✏️ Try it out! Apply the tokenization manually on the two sentences used in section 2 (“I’ve been waiting for a HuggingFace course my whole life.” and “I hate this so much!”). Pass them through the model and check that you get the same logits as in section 2. Now batch them together using the padding token, then create the proper attention mask. Check that you obtain the same results when going through the model!

## Apply tokenization separately

In [77]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)

Some layers from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized: ['dropout_159']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Not the same - this is what I got from session 2

tf.Tensor(
[[-1.5606971  1.6122824]
 [ 4.169232  -3.3464472]], shape=(2, 2), dtype=float32)

In [78]:

print("Logits 1:", model(input_ids1).logits, "Logits 2:", model(input_ids2).logits)

Logits 1: tf.Tensor([[-2.7276225  2.8789392]], shape=(1, 2), dtype=float32) Logits 2: tf.Tensor([[ 3.1248865 -2.6449811]], shape=(1, 2), dtype=float32)


# Codes copied from section 2

In [None]:
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]
raw_tokens = tokenizer(raw_inputs)

print(raw_tokens)
# inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="tf")

{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 1045, 5223, 2023, 2061, 2172, 999, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]}


# This one has the 101 and 102 paddings



In [86]:
sequence1 = "I've been waiting for a HuggingFace course my whole life."
sequence2 = "I hate this so much."

tokens1 = tokenizer.tokenize(sequence1)
tokens2 = tokenizer.tokenize(sequence2)

print(tokens1, "\n", tokens2)

['i', "'", 've', 'been', 'waiting', 'for', 'a', 'hugging', '##face', 'course', 'my', 'whole', 'life', '.'] 
 ['i', 'hate', 'this', 'so', 'much', '.']


In [87]:

ids1 = tokenizer.convert_tokens_to_ids(tokens1)
ids2 = tokenizer.convert_tokens_to_ids(tokens2)

input_ids1 = tf.constant(ids1)
input_ids2 = tf.constant(ids2)

print("Input IDs 1:", input_ids1)
print("Input IDs 2:", input_ids2)

Input IDs 1: tf.Tensor(
[ 1045  1005  2310  2042  3403  2005  1037 17662 12172  2607  2026  2878
  2166  1012], shape=(14,), dtype=int32)
Input IDs 2: tf.Tensor([1045 5223 2023 2061 2172 1012], shape=(6,), dtype=int32)


# This one lacks padding

In [91]:
tokens1_sb = tokenizer([sequence1])
tokens2_sb = tokenizer([sequence2])

print(tokens1_sb, "\n", tokens2_sb)

{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} 
 {'input_ids': [[101, 1045, 5223, 2023, 2061, 2172, 1012, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1]]}


# If I put sequence1 and sequence2 inside square brackets, I can preserve the 101 and 102 tokens

# Logits for raw_tokens

In [92]:
print(raw_tokens)

{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 1045, 5223, 2023, 2061, 2172, 999, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]}


In [116]:
print(raw_tokens['input_ids'])

[[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 1045, 5223, 2023, 2061, 2172, 999, 102]]


In [127]:
print(model(tf.constant(raw_tokens['input_ids'][0])).logits)
print(model(tf.constant(raw_tokens['input_ids'][1])).logits)

tf.Tensor([[-1.5606974  1.612282 ]], shape=(1, 2), dtype=float32)
tf.Tensor([[ 4.1692314 -3.3464477]], shape=(1, 2), dtype=float32)


#vs the ones without padding

In [129]:
print(model(input_ids1).logits)
print(model(input_ids2).logits)


tf.Tensor([[-2.7276225  2.8789392]], shape=(1, 2), dtype=float32)
tf.Tensor([[ 3.1248865 -2.6449811]], shape=(1, 2), dtype=float32)


# Both are not quite right. Need to add padding to the shorter sentence

In [147]:

# Pad the shorter sentence
raw_tokens['input_ids'][1]= raw_tokens['input_ids'][1]+[0]*(len(raw_tokens['input_ids'][0])-len(raw_tokens['input_ids'][1]))

print(raw_tokens['input_ids'])

[[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 1045, 5223, 2023, 2061, 2172, 999, 102, 0, 0, 0, 0, 0, 0, 0, 0]]


# Now pad the shorter attention mask

In [148]:
print(raw_tokens['attention_mask'])

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]


In [149]:
print(raw_tokens['attention_mask'][1])

[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]


In [150]:
# Pad the second attention mask
raw_tokens['attention_mask'][1]= raw_tokens['attention_mask'][1]+[0]*(len(raw_tokens['attention_mask'][0])-len(raw_tokens['attention_mask'][1]))

In [151]:
print(raw_tokens['attention_mask'][1])

[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]


In [152]:
print(tf.constant(raw_tokens['input_ids'][0]),"\n")
print(tf.constant(raw_tokens['attention_mask'][0]),"\n")

print(model(tf.constant(raw_tokens['input_ids'][0]), attention_mask = tf.constant(raw_tokens['attention_mask'][0])).logits)


tf.Tensor(
[  101  1045  1005  2310  2042  3403  2005  1037 17662 12172  2607  2026
  2878  2166  1012   102], shape=(16,), dtype=int32) 

tf.Tensor([1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1], shape=(16,), dtype=int32) 

tf.Tensor([[-1.5606974  1.612282 ]], shape=(1, 2), dtype=float32)


In [153]:
print(tf.constant(raw_tokens['input_ids'][1]),"\n")
print(tf.constant(raw_tokens['attention_mask'][1]),"\n")

print(model(tf.constant(raw_tokens['input_ids'][1]), attention_mask = tf.constant(raw_tokens['attention_mask'][1])).logits)


tf.Tensor(
[ 101 1045 5223 2023 2061 2172  999  102    0    0    0    0    0    0
    0    0], shape=(16,), dtype=int32) 

tf.Tensor([1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0], shape=(16,), dtype=int32) 

tf.Tensor([[ 4.1692314 -3.3464475]], shape=(1, 2), dtype=float32)


# Yes, these are the same logits from session 2. That's because I included the 101 and 102 tokens from the model 