### Loading the tokenizer

In [2]:
from transformers import AutoTokenizer
# Load the tokenizer (vocabulary + splitting rules) of the "bert-base-cased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [3]:
# Mixed-script sample: English words followed by the same Bengali word twice.
sentence = "Bangladesh is my country ঢাকা ঢাকা"
            

In [4]:
# Split the sentence into tokens. In the recorded output the two Bengali
# words come back as [UNK].
tokens = tokenizer.tokenize(sentence)
print(tokens)

['Bangladesh', 'is', 'my', 'country', '[UNK]', '[UNK]']


In [5]:
# Map every token string to its integer vocabulary id
# (the [UNK] tokens show up as id 100 in the recorded output).
input_ids = tokenizer.convert_tokens_to_ids(tokens)
print(input_ids)

[6735, 1110, 1139, 1583, 100, 100]


In [6]:
# Turn the ids back into text. The unknown tokens stay "[UNK]" in the
# recorded output, i.e. the original Bengali words are not recovered.
decoded_string = tokenizer.decode(input_ids)
print(decoded_string)

Bangladesh is my country [UNK] [UNK]


### Handling multiple sequences

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint used for the rest of this section (going by its name: DistilBERT
# fine-tuned on SST-2 — confirm on the model card).
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

# Load the tokenizer and the model from the SAME checkpoint so the ids produced
# by the tokenizer are the ones this model was given.
# NOTE: this rebinds `tokenizer`, replacing the "bert-base-cased" one loaded earlier.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [8]:
# Example input; note the straight ASCII apostrophe in "I've".
sequence = "I've been waiting for a HuggingFace course my whole life."

In [9]:
# Convert the sequence to tokens. In the recorded output the text is lower-cased
# and "HuggingFace" is split into 'hugging' + '##face'.
tokens = tokenizer.tokenize(sequence)
print(tokens)

['i', "'", 've', 'been', 'waiting', 'for', 'a', 'hugging', '##face', 'course', 'my', 'whole', 'life', '.']


In [10]:
# Convert the tokens to their integer vocabulary ids (a plain Python list).
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]


In [11]:
# Wrap the flat id list in a tensor. This gives a 1-D tensor, i.e. a single
# sequence WITHOUT a batch dimension.
input_ids =  torch.tensor(ids)
print(input_ids)

tensor([ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
         2026,  2878,  2166,  1012])


In [12]:
# Expected failure: `input_ids` is a 1-D tensor at this point, and the recorded
# error shows the model cannot index it as a batch of sequences.
# The exception is caught and printed (in the usual "Type: message" form) so the
# demonstration stays visible while "Restart & Run All" can continue past this cell.
try:
    model(input_ids)
except IndexError as err:
    print(f"{type(err).__name__}: {err}")

IndexError: too many indices for tensor of dimension 1

In [13]:
# The proper way: wrap the ids in an outer list (a list of lists) so the tensor
# is 2-D — a batch that contains one sequence.
input_ids = torch.tensor([ids])
print(input_ids)

tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])


In [14]:
# Forward pass on the batch of one; the recorded logits hold one row with two scores.
output = model(input_ids)
print(output.logits)

tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


In [15]:
# Build a batch of two by repeating the same id list.
batched_ids = [ids] * 2
print(batched_ids)

[[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012], [1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]]


In [16]:
# Both rows have the same length, so the nested list converts cleanly into a
# rectangular 2-D tensor (2 rows in the recorded output).
input_ids = torch.tensor(batched_ids)
print(input_ids)

tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012],
        [ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])


In [17]:
# Forward pass on the batch of two identical sequences; the recorded logits
# contain one (identical) row per sequence.
output = model(input_ids)
print(output.logits)

tensor([[-2.7276,  2.8789],
        [-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


In [18]:
# A ragged batch: the first row holds 3 ids while the other two hold 4, so the
# rows are of different sizes and cannot form one rectangular tensor.
batched_ids = [[200, 200, 200], [200, 200, 300, 100], [200, 200, 330, 400]]

In [19]:
# Expected failure: the rows of `batched_ids` have different lengths, so they
# cannot be turned into a single tensor.
# The exception is caught and printed (in the usual "Type: message" form) so the
# demonstration stays visible while "Restart & Run All" can continue past this cell.
# On failure `input_ids` keeps whatever value it had before this cell.
try:
    input_ids = torch.tensor(batched_ids)
    print(input_ids)
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: expected sequence of length 3 at dim 1 (got 4)

### Attention Mask

In [4]:
# Two inputs of different length (14 vs. 6 tokens in the recorded tokenization).
# NOTE: sentence1 uses a typographic apostrophe (’) rather than the straight one (')
# used in `sequence` earlier, so it maps to a different id (1521 instead of 1005)
# and its logits differ slightly from the earlier run of the otherwise same text.
sentence1 = "I’ve been waiting for a HuggingFace course my whole life."
sentence2 = "I hate this so much!"

In [7]:
# Tokenize both sentences, then show the two token lists one per line.
tokens_1, tokens_2 = (tokenizer.tokenize(text) for text in (sentence1, sentence2))
print(tokens_1, tokens_2, sep="\n")

['i', '’', 've', 'been', 'waiting', 'for', 'a', 'hugging', '##face', 'course', 'my', 'whole', 'life', '.']
['i', 'hate', 'this', 'so', 'much', '!']


In [9]:
# Tokens to ids: look up the vocabulary id of every token in each sentence.
ids_1, ids_2 = (tokenizer.convert_tokens_to_ids(toks) for toks in (tokens_1, tokens_2))
print(ids_1, ids_2, sep="\n")

[1045, 1521, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]
[1045, 5223, 2023, 2061, 2172, 999]


In [14]:
# Convert to tensors: each id list is wrapped in an outer list so it becomes a
# batch holding one sequence.
input_ids_1, input_ids_2 = (torch.tensor([seq]) for seq in (ids_1, ids_2))
print(input_ids_1, input_ids_2, sep="\n")

tensor([[ 1045,  1521,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
tensor([[1045, 5223, 2023, 2061, 2172,  999]])


In [18]:
# Score each sentence on its own (batch of one) and print the two logit rows.
output1, output2 = (model(batch) for batch in (input_ids_1, input_ids_2))
print(output1.logits, output2.logits, sep="\n")

tensor([[-2.5720,  2.6852]], grad_fn=<AddmmBackward0>)
tensor([[ 3.1931, -2.6685]], grad_fn=<AddmmBackward0>)


In [None]:
# Manually pad the second sequence's ids so both sequences have the same length.
ids_1 = [1045, 1521, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]

# `pad_token_id` is the vocabulary id of the padding token and is what belongs in
# the id sequence. `pad_token_type_id` is a different attribute (the token-type /
# segment id assigned to padded positions) and must not be used here, even though
# both happen to be 0 for this checkpoint.
pd = tokenizer.pad_token_id
print(f"padding token {pd}")

# Right-pad with exactly as many pad ids as needed instead of hand-counting them.
ids_2_unpadded = [1045, 5223, 2023, 2061, 2172, 999]
ids_2 = ids_2_unpadded + [pd] * (len(ids_1) - len(ids_2_unpadded))

# Display the padded length (14, as in the output recorded under this cell).
len(ids_2)

padding token 0


14

In [None]:
# Run both sequences as one batch WITHOUT an attention mask.
# In the recorded output the first row matches the single-sequence run, but the
# second (padded) row does not: its logits differ from the unpadded run of the
# same sentence because the pad ids are treated like real tokens.
batched_input_ids = torch.tensor([ids_1, ids_2])
output = model(batched_input_ids)
print(output)

SequenceClassifierOutput(loss=None, logits=tensor([[-2.5720,  2.6852],
        [ 2.5423, -2.1265]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


### Now applying attention mask

In [39]:
# One mask row per sequence: 1 marks a real token, 0 marks a padded position.
attention_mask = [
    [1] * 14,           # first sequence: all 14 positions are real tokens
    [1] * 6 + [0] * 8,  # second sequence: 6 real tokens followed by 8 pads
]

In [46]:
# Same padded batch, now passed together with the attention mask. In the recorded
# output the second row's logits match the unpadded single-sequence run again.
output = model(batched_input_ids, attention_mask=torch.tensor(attention_mask))
print(output.logits)

tensor([[-2.5720,  2.6852],
        [ 3.1931, -2.6685]], grad_fn=<AddmmBackward0>)
