# Pretrained Models with Pytorch from Hugging Faces

## 1. Loading the dataset

In [1]:
from datasets import list_datasets, load_dataset

dataset = load_dataset('tweet_eval', 'emoji')
dataset["train"][20]

Reusing dataset tweet_eval (C:\Users\Administrator\.cache\huggingface\datasets\tweet_eval\emoji\1.1.0\12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343)


  0%|          | 0/3 [00:00<?, ?it/s]

{'text': 'Thank you @user for an incredible night last night @ Shrine Auditorium &amp; Expo Hall',
 'label': 7}

## 2. Tokenize the texts

In [2]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets

Loading cached processed dataset at C:\Users\Administrator\.cache\huggingface\datasets\tweet_eval\emoji\1.1.0\12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343\cache-f7f960e9034d4442.arrow
Loading cached processed dataset at C:\Users\Administrator\.cache\huggingface\datasets\tweet_eval\emoji\1.1.0\12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343\cache-ead8d431705ecafd.arrow


  0%|          | 0/5 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 45000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5000
    })
})

In [3]:
tokenized_datasets["train"][20]["input_ids"][:20]

[101,
 4514,
 1128,
 137,
 4795,
 1111,
 1126,
 10965,
 1480,
 1314,
 1480,
 137,
 20703,
 22511,
 111,
 1821,
 1643,
 132,
 18947,
 1944]

## 3. Training

In [4]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=20)
model

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [5]:
tokenized_datasets = tokenized_datasets.remove_columns(["text"]).rename_column("label", "labels")

In [6]:
tokenized_datasets.set_format("torch")
tokenized_datasets["train"][20]

{'labels': tensor(7),
 'input_ids': tensor([  101,  4514,  1128,   137,  4795,  1111,  1126, 10965,  1480,  1314,
          1480,   137, 20703, 22511,   111,  1821,  1643,   132, 18947,  1944,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,  

In [8]:
# Create a DataLoader

from torch.utils.data import DataLoader

train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=8)
eval_dataloader = DataLoader(tokenized_datasets["test"], shuffle=True, batch_size=8)

In [13]:
import torch
from tqdm.auto import tqdm

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

epochs = 1
max_batches = 10
num_training_steps = epochs * max_batches
progress_bar = tqdm(range(num_training_steps))

model.train()
print_example = True
trained_examples = 0
for epoch in range(epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        if print_example:
            print(batch)
            print_example = False
        outputs = model(**batch) # Equivalent to model()
        loss = outputs.loss
        loss.backward()
        
        optimizer.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        trained_examples += 1
        if trained_examples >= max_batches:
            break

  0%|          | 0/10 [00:00<?, ?it/s]

{'labels': tensor([ 0,  0,  7, 18,  3, 16,  8,  5]), 'input_ids': tensor([[  101, 10684, 16211,  ...,     0,     0,     0],
        [  101,   137,  4795,  ...,     0,     0,     0],
        [  101, 10656,  1195,  ...,     0,     0,     0],
        ...,
        [  101,   108,  2664,  ...,     0,     0,     0],
        [  101, 14159,  6647,  ...,     0,     0,     0],
        [  101,   108,  2106,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [16]:
from datasets import load_metric

metric = load_metric("accuracy")

model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
    break

metric.compute()

{'accuracy': 0.375}

## 4. Inference

In [18]:
from transformers import pipeline

classifier = pipeline(task="text-classification", model=model, tokenizer=tokenizer)

In [20]:
classifier("I am happy at HCMC")

[{'label': 'LABEL_0', 'score': 0.23579677939414978}]

In [21]:
classifier([
    "wonderful, magnificent, outstanding, significant",
    "I love you",
    "I like your house"
])

[{'label': 'LABEL_0', 'score': 0.23579682409763336},
 {'label': 'LABEL_0', 'score': 0.23579680919647217},
 {'label': 'LABEL_0', 'score': 0.23579680919647217}]