# Hugging Face - lesson 2

## Auto Tokenizer

Load the tokenizer that matches the model checkpoint.

In [3]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint whose tokenizer is loaded.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# AutoTokenizer resolves the tokenizer from the checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Tokenize the text and return it as tensors

In [7]:
# Two example sentences that are tokenized together as one batch.
raw_inputs = [
    "The best food in the world is curry.",
    "I hate it vegetables!",
]

# padding=True pads the shorter sentence to the longest one in the batch,
# and return_tensors="pt" asks for PyTorch tensors instead of Python lists.
tokenizer_options = dict(padding=True, truncation=True, return_tensors="pt")
inputs = tokenizer(raw_inputs, **tokenizer_options)
print(inputs)

{'input_ids': tensor([[  101,  1996,  2190,  2833,  1999,  1996,  2088,  2003, 15478,  1012,
           102],
        [  101,  1045,  5223,  2009, 11546,   999,   102,     0,     0,     0,
             0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]])}


Download the checkpoint and instantiate the model

In [8]:
from transformers import AutoModel

# Same checkpoint string that was used to load the tokenizer.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# Instantiate the base model from the pretrained checkpoint; the next cell
# reads `last_hidden_state` from its output.
model = AutoModel.from_pretrained(checkpoint)

In [9]:
# Unpack the tokenizer output (input_ids, attention_mask) as keyword arguments.
outputs = model(**inputs)
# Presumably (batch size, sequence length, hidden size) - compare with the
# printed shape and the 2 x 11 input_ids tensor shown earlier.
print(outputs.last_hidden_state.shape)

torch.Size([2, 11, 768])


Auto Model for Sequence Classification

In [11]:
from transformers import AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# This rebinds `model`; its output exposes `.logits`, which the next cells use.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
# Reuses the `inputs` batch tokenized earlier in the notebook.
outputs = model(**inputs)

In [13]:
# Raw (not yet normalized) scores: one row per input sentence, one column per label.
print(outputs.logits)

tensor([[-3.9114,  4.2566],
        [ 4.1390, -3.3890]], grad_fn=<AddmmBackward0>)


Apply softmax to the logits with PyTorch's softmax function to turn them into probabilities.

In [14]:
import torch

# Normalize over the last dimension so the scores of each sentence sum to 1.
logits = outputs.logits
predictions = torch.softmax(logits, dim=-1)
print(predictions)

tensor([[2.8350e-04, 9.9972e-01],
        [9.9946e-01, 5.3755e-04]], grad_fn=<SoftmaxBackward0>)


Predict results for each statement

In [36]:
# The model config maps each class index to its label name.
label_dict = model.config.id2label

# Walk the per-sentence probability rows and report the most likely label.
for i, sentence_probs in enumerate(predictions):
    print(raw_inputs[i])
    best_class_id = sentence_probs.argmax().item()
    print("Results:", label_dict[best_class_id], "\n")

The best food in the world is curry.
Results: POSITIVE 

I hate it vegetables!
Results: NEGATIVE 



## Creating a Transformer model

In [37]:
from transformers import BertConfig, BertModel

# Building the config
# No arguments are passed, so the library defaults are used (printed in the next cell).
config = BertConfig()

# Building the model from the config
# No checkpoint is loaded here - presumably the weights are randomly
# initialized; confirm against the transformers documentation.
model = BertModel(config)

In [38]:
# Show the settings held by the config (hidden size, number of layers, vocabulary size, ...).
print(config)

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.42.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [39]:
from transformers import BertModel

# Load the "bert-base-cased" pretrained checkpoint; this rebinds `model`,
# replacing the config-only model built above.
model = BertModel.from_pretrained("bert-base-cased")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [40]:
# Persist the loaded model under the local path "bert-model".
model.save_pretrained("bert-model")

## Creating Bert Tokenizer

In [1]:
from transformers import BertTokenizer

# Load the tokenizer for the "bert-base-cased" checkpoint with the BERT-specific class.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [2]:
from transformers import AutoTokenizer

# Same checkpoint, loaded through the generic AutoTokenizer class instead;
# this rebinds `tokenizer` from the previous cell.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [7]:
# Calling the tokenizer directly returns input_ids, token_type_ids and
# attention_mask in one step (see the output below).
tokenizer("I am playing with fire while using a transformer model!")

{'input_ids': [101, 146, 1821, 1773, 1114, 1783, 1229, 1606, 170, 11303, 1200, 2235, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [5]:
# Save the tokenizer under "bert-tokenizer"; the returned tuple lists the file paths.
tokenizer.save_pretrained("bert-tokenizer")

('bert-tokenizer\\tokenizer_config.json',
 'bert-tokenizer\\special_tokens_map.json',
 'bert-tokenizer\\vocab.txt',
 'bert-tokenizer\\added_tokens.json',
 'bert-tokenizer\\tokenizer.json')

In [10]:
# Tokenize text with Bert Tokenizer
# Only the string -> token split is performed here; in the printed result
# "transformer" is split into the word pieces 'transform' and '##er'.
text = "I am playing with fire while using a transformer model!"
token = tokenizer.tokenize(text)
print(token)

['I', 'am', 'playing', 'with', 'fire', 'while', 'using', 'a', 'transform', '##er', 'model', '!']


In [12]:
# Map each token string to its vocabulary id. Unlike the direct tokenizer call
# above, the printed ids have no leading 101 / trailing 102.
ids = tokenizer.convert_tokens_to_ids(token)
print(ids)

[146, 1821, 1773, 1114, 1783, 1229, 1606, 170, 11303, 1200, 2235, 106]


In [14]:
# Turn the ids back into a string; the printed result matches the original
# text, with the word pieces 'transform' + '##er' merged back together.
decoded_text = tokenizer.decode(ids)
print(decoded_text)

I am playing with fire while using a transformer model!


## Handling Multiple Sequences

In [20]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Reload the sentiment checkpoint: `tokenizer` and `model` were rebound to
# BERT objects in the previous sections.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

# Manual pipeline: split into tokens, then map the tokens to ids.
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
# Wrapping `ids` in a list adds a leading dimension, giving a 2-D tensor
# holding a single sequence.
input_ids = torch.tensor([ids])
print(input_ids)

output = model(input_ids)
print(output.logits)

tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


In [22]:
# Batch made of the same sequence twice; the printed logits show two identical rows.
batched_ids =  torch.tensor([ids, ids])

output = model(batched_ids)
print(output.logits)

tensor([[-2.7276,  2.8789],
        [-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


Apply padding to the sequences. All sequences in a batch must have the same length, so the shorter ones are padded with the tokenizer's padding token.

In [27]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Two sequences of different length, each wrapped as its own single-item batch ...
sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]
# ... and both sequences in one batch, the shorter one filled up with the pad token.
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

# Run each variant through the model in turn. No attention mask is passed, so
# the padded row can be compared with the unpadded sequence2_ids result.
for id_lists in (sequence1_ids, sequence2_ids, batched_ids):
    print(model(torch.tensor(id_lists)).logits)

tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 1.3374, -1.2163]], grad_fn=<AddmmBackward0>)


An attention mask tells the model which tokens to attend to (1) and which to ignore (0), so that padding tokens do not change the result.

In [28]:
# Same padded batch as before.
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

# One flag per token: 1 = real token, 0 = padding position.
attention_mask = [
    [1, 1, 1],
    [1, 1, 0],
]

ids_tensor = torch.tensor(batched_ids)
mask_tensor = torch.tensor(attention_mask)

# With the mask supplied, compare the second row of logits with the result
# obtained earlier for the unpadded two-token sequence.
outputs = model(ids_tensor, attention_mask=mask_tensor)
print(outputs.logits)

tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)


In [30]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
# Two sentences of different length, processed as one batch.
sequences = ["I've been playing with toys until transformers existed.", "So have I!"]

# One tokenizer call handles padding, truncation and conversion to PyTorch
# tensors, replacing the manual steps from the cells above.
tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
# `**tokens` forwards every field returned by the tokenizer as a keyword argument.
output = model(**tokens)
print(output)

SequenceClassifierOutput(loss=None, logits=tensor([[ 2.1678, -1.8128],
        [-3.6183,  3.9137]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
