## Data Preprocessing

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1

In [2]:
from transformers import AutoTokenizer, DataCollatorWithPadding
from datasets import load_dataset

data= load_dataset("glue","mrpc")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/649k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [3]:
data

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [4]:
checkpoint= "bert-base-uncased"
tokenizer= AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [5]:
def tokenize_function(dataset):
  return tokenizer(dataset["sentence1"], dataset["sentence2"], truncation=True)

In [6]:
data=data.map(tokenize_function, batched=True)

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [7]:
data

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [8]:
data_collator= DataCollatorWithPadding(tokenizer=tokenizer)

## Prepare for training

- Remove the columns corresponding to values the model does not expect (like the `sentence1` and `sentence2` columns).
-Rename the column `label` to `labels` (because the model expects the argument to be named labels).
-Set the format of the datasets so they return PyTorch tensors instead of lists.


In [9]:
data=data.remove_columns(["sentence1","sentence2","idx"])
data = data.rename_column("label", "labels")
data.set_format("torch")

In [10]:
data["train"]

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3668
})

## Data Loader

The dataloaders will be used to iterate over batches.

In [11]:
from torch.utils.data import DataLoader

train_dataoader= DataLoader(data["train"],shuffle=True, batch_size=8, collate_fn=data_collator)

In [12]:
eval_dataoader= DataLoader(data["validation"],batch_size=8, collate_fn=data_collator )

In [31]:
for batch in train_dataoader:
  print({k: v.shape for k, v in batch.items()})
  break

{'labels': torch.Size([8]), 'input_ids': torch.Size([8, 63]), 'token_type_ids': torch.Size([8, 63]), 'attention_mask': torch.Size([8, 63])}


In [32]:
batch

{'labels': tensor([0, 1, 0, 1, 1, 1, 0, 0]), 'input_ids': tensor([[  101,  1000,  1045,  2079,  2025,  4299,  2000,  2022,  2556,  2076,
          2023,  7409,  1010,  1000,  2002,  2409,  9761, 25678,  2221,  6020,
          2457,  3648,  2632, 19226, 10278,  2072,  1012,   102,  1000,  2748,
          1010,  1045,  2079,  2025,  4299,  2000,  2022,  2556,  2076,  2023,
          7409,  1010,  1000, 12001,  1010,  2861,  1010, 12885,  2409,  1996,
          3648,  2004,  2002,  2001,  2513,  2000,  2010,  3526,  1012,   102,
             0,     0,     0],
        [  101,  1037,  3861,  1997,  1996,  3460,  1005,  1055,  2365,  3173,
          1996,  2858,  2596,  1999,  1996,  2120,  4372, 15549, 14544,  2074,
          2048,  3134,  2044,  2577,  2351,  1012,   102,  1037,  9982,  1997,
          1996,  3460,  1005,  1055,  2365,  3173,  1996,  2858,  2596,  1999,
          1996,  2120,  4372, 15549, 14544,  2048,  3134,  2044,  6676,  1005,
          1055,  2331,  1012,   102,     0

## Model Training

In [14]:
from transformers import AutoModelForSequenceClassification

model= AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


To make sure that everything will go smoothly during training, we pass our batch to this model:

In [15]:
# batch is saved from above loop
# this is done to just check the model is working or not
output= model(**batch)
print(output.loss, output.logits.shape)

tensor(0.7143, grad_fn=<NllLossBackward0>) torch.Size([8, 2])


In [16]:
output.logits[0]

tensor([-0.2087,  0.2220], grad_fn=<SelectBackward0>)

We’re almost ready to write our training loop! We’re just missing two things: an optimizer and a learning rate scheduler. Since we are trying to replicate what the Trainer was doing by hand, we will use the same defaults. The optimizer used by the Trainer is AdamW, which is the same as Adam, but with a twist for weight decay regularization.

In [17]:
from transformers import AdamW

optimizer= AdamW(model.parameters(), lr=5e-5)



In [18]:
len(train_dataoader)

459

In [19]:
from transformers import get_scheduler

num_epochs=3
num_training_steps= num_epochs* len(train_dataoader)
lr_scheduler= get_scheduler("linear",
                            optimizer=optimizer,
                            num_warmup_steps=0,
                            num_training_steps=num_training_steps
                            )

print(num_training_steps)

1377


## Training Loop

In [25]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

device(type='cuda')

To get some sense of when training will be finished, we add a progress bar over our number of training steps, using the `tqdm` library:

In [35]:
from tqdm.auto import tqdm

progress_bar= tqdm(range(100))

for epoch in range(num_epochs):
  for batch in train_dataoader:
    # Now, batch['input_ids'] and batch['attention_mask'] are on the specified device
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs= model(**batch)
    loss= outputs.loss
    loss.backward()

    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    progress_bar.update(1)

  0%|          | 0/100 [00:00<?, ?it/s]

In [36]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.2


In [37]:
import evaluate

metric= evaluate.load("glue", "mrpc")
model.eval()

for batch in eval_dataoader:
  batch = {k: v.to(device) for k, v in batch.items()}
  with torch.no_grad():
    outputs = model(**batch)

  logits= output.logits
  prediction = torch.argmax(logits, dim=-1)
  metric.add_batch(predictions=prediction, references=batch["labels"])

metric.compute()


Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

{'accuracy': 0.6838235294117647, 'f1': 0.8122270742358079}

## Using Accelerate

In [39]:
from accelerate import Accelerator
from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler

accelerator = Accelerator()

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
optimizer = AdamW(model.parameters(), lr=3e-5)

train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
     train_dataoader, eval_dataoader, model, optimizer
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

In [41]:
progress_bar = tqdm(range(num_training_steps))

model.train()

  0%|          | 0/1377 [00:00<?, ?it/s]

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [42]:
for epoch in range(num_epochs):
  for batch in train_dataloader:
    outputs = model(**batch)
    loss = outputs.loss
    accelerator.backward(loss)

    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    progress_bar.update(1)