<a href="https://colab.research.google.com/github/charlenefrench001/First_repository/blob/master/Handling_multiple_sequences.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [75]:
!pip install transformers

!pip install transformers[sentencepiece]



In [29]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = [
            "I've been waiting for a HuggingFace course my whole life.",
            "I hate winter!"
]

ids_lst = []
for s in sequence:
  print(s)
  t = tokenizer.tokenize(s)
  print(t)
  t = tokenizer.convert_tokens_to_ids(t)
  print(t)
  ids_lst.append(t)
  print(ids_lst)
#tokens = tokenizer.tokenize(sequence)
#ids = tokenizer.convert_tokens_to_ids(tokens)

input_ids = torch.tensor(ids_lst)
print("Input IDs:", input_ids)

output = model(input_ids)
print("Logits:", output.logits)

I've been waiting for a HuggingFace course my whole life.
['i', "'", 've', 'been', 'waiting', 'for', 'a', 'hugging', '##face', 'course', 'my', 'whole', 'life', '.']
[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]
[[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]]
I hate winter!
['i', 'hate', 'winter', '!']
[1045, 5223, 3467, 999]
[[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012], [1045, 5223, 3467, 999]]


ValueError: ignored

In [74]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence1 = "I've been waiting for a HuggingFace course my whole life."
sequence2 = "I hate winter!"

tokens1 = tokenizer.tokenize(sequence1)
ids1 = tokenizer.convert_tokens_to_ids(tokens1)

tokens2 = tokenizer.tokenize(sequence2)
ids2 = tokenizer.convert_tokens_to_ids(tokens2)

batched_ids = [ids1, ids2]
print("Input IDs:", batched_ids)

input_ids = torch.tensor(batched_ids)
print("Input IDs:", input_ids)

output = model(input_ids)
print("Logits:", output.logits)


Input IDs: [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012], [1045, 5223, 3467, 999]]


ValueError: ignored

In [31]:
import torch
from transformers import AutoTokenizer, AutoModelForImageClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

batched_ids = torch.tensor([ids, ids])
print("Input IDs:", batched_ids)

output = model(batched_ids)
print("Logits:", output.logits)

Input IDs: tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012],
        [ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
Logits: tensor([[-2.7276,  2.8789],
        [-2.7276,  2.8789]], grad_fn=<AddmmBackward>)


In [32]:
padding_id = 100

batched_ids = [
               [200, 200, 200],
               [200, 200, padding_id]
]

In [23]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]
batched_ids = [[200, 200, 200], [200, 200, tokenizer.pad_token_id]]

print(model(torch.tensor(sequence1_ids)).logits)
print(model(torch.tensor(sequence2_ids)).logits)
print(model(torch.tensor(batched_ids)).logits)

#There’s something wrong with the logits in our batched predictions: the second row should be the same as the logits for the second sentence, but we’ve got completely different values! This is because the key feature of Transformer models is attention layers that contextualize each token. 

tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward>)
tensor([[ 1.5694, -1.3895],
        [ 1.3373, -1.2163]], grad_fn=<AddmmBackward>)


In [33]:
#Attention masks are tensors with the exact same shape as the input IDs tensor, filled with 0s and 1s: 1s indicate the corresponding tokens should be attended to, and 0s indicate the corresponding tokens should not be attended to (i.e., they should be ignored by the attention layers of the model).
batched_ids = [
               [200, 200, 200],
               [200, 200, padding_id]
]

attention_mask = [
                  [1, 1, 1],
                  [1, 1, 0]
]

outputs = model(torch.tensor(batched_ids), attention_mask=torch.tensor(attention_mask))
print(outputs.logits)

tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward>)


In [50]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = [
            "I've been waiting for a HuggingFace course my whole life.",
            "I hate winter!"
]

ids_lst = []
for s in sequence:
  t = tokenizer.tokenize(s)
  c = tokenizer.convert_tokens_to_ids(t)
  ids_lst.append(c)
  print(ids_lst)
#tokens = tokenizer.tokenize(sequence)
#ids = tokenizer.convert_tokens_to_ids(tokens)

input_ids = torch.tensor(ids_lst)
print(input_ids)

[[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]]
[[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012], [1045, 5223, 2023, 2061, 2172, 999]]


TypeError: ignored

In [81]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequences = [
            "I've been waiting for a HuggingFace course my whole life.",
            "I hate winter!"
]

tokens = [tokenizer.tokenize(sequence) for sequence in sequences]
ids = [tokenizer.convert_tokens_to_ids(token) for token in tokens]

print(ids[0])
print(ids[1])

input_ids1 = torch.tensor([ids[0]])
input_ids2 = torch.tensor([ids[1]])
print(input_ids1)
print(input_ids2)

output1 = model(input_ids1)
output2 = model(input_ids2)
print(output1.logits)
print(output2.logits)


[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]
[1045, 5223, 3467, 999]
tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
tensor([[1045, 5223, 3467,  999]])
tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward>)
tensor([[ 2.0311, -1.6033]], grad_fn=<AddmmBackward>)


In [87]:
def pad_to_max_length(a_list):
  max_length = max(len(x) for x in a_list)
  for idx, x in enumerate(a_list):
    a_list[idx] = x + [0]*(max_length - len(x))
  return a_list 

In [98]:
ids = [
       [1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],
       [1045, 5223, 3467, 999],
       [104, 123]
       
]
rst = pad_to_max_length(ids)
print(ids)

[[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012], [1045, 5223, 3467, 999, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [104, 123, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


In [93]:
l = [1,23]+[4,5]

In [97]:
print(l)
for idx, x in enumerate(l):
  l[idx] = idx
print(l)

[0, 0, 0, 0]
[0, 1, 2, 3]


In [86]:
import torch

def pad_to_max_length(a_list):
  max_length = max(len(x) for x in a_list)
  for idx, x in enumerate(a_list):
    a_list[idx] = x + [0]*(max_length - len(x))
  return a_list 

ids = [
       [1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],
       [1045, 5223, 3467, 999, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
       
]

input_ids = torch.tensor(ids)
outputs = model(input_ids)
print(outputs.logits)

tensor([[-2.7276,  2.8789],
        [ 0.2512, -0.1760]], grad_fn=<AddmmBackward>)


In [84]:
batched_ids = [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012], 
               [1045, 5223, 2023, 2061, 2172, 999, tokenizer.pad_token_id, tokenizer.pad_token_id, tokenizer.pad_token_id, tokenizer.pad_token_id, tokenizer.pad_token_id, tokenizer.pad_token_id, tokenizer.pad_token_id, tokenizer.pad_token_id]
               
]

attention_mask = [
                  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                  [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]
]

outputs = model(torch.tensor(batched_ids), attention_mask=torch.tensor(attention_mask))
print(outputs.logits)

tensor([[-2.7276,  2.8789],
        [ 3.1931, -2.6685]], grad_fn=<AddmmBackward>)


In [None]:
#sequence = sequence[:max_sequence_length]

In [51]:
batch_sentences = [
                   "Hello I'm a single sentence",
                   "And another sentence",
                   "And the very very last one"
                   ]
encoded_inputs = tokenizer(batch_sentences)
print(encoded_inputs)

{'input_ids': [[101, 7592, 1045, 1005, 1049, 1037, 2309, 6251, 102], [101, 1998, 2178, 6251, 102], [101, 1998, 1996, 2200, 2200, 2197, 2028, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]}
