### Processing the data

In [1]:
!uv pip install transformers torch

[2mUsing Python 3.12.9 environment at: /home/vscode/.venv[0m
[2mAudited [1m2 packages[0m [2min 4ms[0m[0m


In [1]:
import dotenv
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load settings from a .env file, if one is present (python-dotenv).
dotenv.load_dotenv()

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
# Pad/truncate the two sentences to a common length and return PyTorch tensors.
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# Attach one label per sequence so the forward pass also returns a loss.
batch["labels"] = torch.tensor([1, 1])

# Use PyTorch's own AdamW: the `AdamW` class exported by `transformers` is
# deprecated and no longer available in recent releases.
# NOTE(review): default hyperparameters may differ slightly between the two
# implementations; pass them explicitly if exact parity matters.
optimizer = torch.optim.AdamW(model.parameters())

# One optimisation step: forward pass, backpropagation, parameter update.
loss = model(**batch).loss
loss.backward()
optimizer.step()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Loading a dataset from the HF Hub

In [3]:
from datasets import load_dataset

In [4]:
# Load the MRPC configuration of the GLUE benchmark from the Hugging Face Hub.
# Displaying the result lists the available splits and their columns.
raw_datasets = load_dataset("glue", "mrpc")
raw_datasets

Generating train split: 100%|██████████| 3668/3668 [00:00<00:00, 288638.24 examples/s]
Generating validation split: 100%|██████████| 408/408 [00:00<00:00, 152018.84 examples/s]
Generating test split: 100%|██████████| 1725/1725 [00:00<00:00, 555014.91 examples/s]


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [5]:
# Inspect the column schema of the training split (column types and label names).
raw_datasets["train"].features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [6]:
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Tokenize each sentence column on its own; the two results are independent
# and are not combined into sentence pairs here.
tok_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
tok_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])

In [10]:
# Passing two strings tokenizes them together as one sentence pair; the
# displayed token_type_ids show which tokens belong to which sentence.
inputs = tokenizer("This is the first sentence", "This is the second one")
inputs

{'input_ids': [101, 2023, 2003, 1996, 2034, 6251, 102, 2023, 2003, 1996, 2117, 2028, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [11]:
# Map the ids back to tokens to see where [CLS] and [SEP] were inserted.
tokenizer.convert_ids_to_tokens(inputs["input_ids"])

['[CLS]',
 'this',
 'is',
 'the',
 'first',
 'sentence',
 '[SEP]',
 'this',
 'is',
 'the',
 'second',
 'one',
 '[SEP]']

In [13]:
# Tokenize the whole training split as sentence pairs in a single tokenizer
# call, with padding and truncation enabled.
# Note: the name `tokenized_ds` is reassigned by the later
# `raw_datasets.map(...)` cell, so this value is only a demonstration.
tokenized_ds = tokenizer(
    raw_datasets["train"]["sentence1"],
    raw_datasets["train"]["sentence2"],
    padding=True,
    truncation=True,
)

In [14]:
def tokenize_func(record):
    """Tokenize the ``sentence1``/``sentence2`` columns of ``record`` as pairs.

    Truncation is enabled and no padding is requested here. Uses the
    notebook-level ``tokenizer`` and returns its output unchanged.
    """
    first, second = record["sentence1"], record["sentence2"]
    return tokenizer(first, second, truncation=True)

In [15]:
# Apply tokenize_func to every split in batches. The result keeps the original
# columns and adds input_ids, token_type_ids and attention_mask.
tokenized_ds = raw_datasets.map(tokenize_func, batched=True)
tokenized_ds

Map: 100%|██████████| 3668/3668 [00:00<00:00, 20822.62 examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 18328.49 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 22058.86 examples/s]


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

#### Dynamic Padding

In [16]:
from transformers import DataCollatorWithPadding

# Collator used for dynamic padding: samples are padded when a batch is
# assembled rather than when the dataset is tokenized.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [18]:
# Take the first eight training rows and keep every column except the index
# and the two raw-text columns.
samples = {
    name: values
    for name, values in tokenized_ds["train"][:8].items()
    if name not in ("idx", "sentence1", "sentence2")
}
# Token count of each of the eight unpadded samples.
list(map(len, samples["input_ids"]))

[50, 59, 47, 67, 59, 50, 62, 32]

In [20]:
# Collate the selected samples into one padded batch and show the shape of
# each resulting tensor.
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'attention_mask': torch.Size([8, 67]),
 'labels': torch.Size([8])}

### Finetuning with the Trainer API

In [1]:
import dotenv

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Load settings from a .env file, if one is present (python-dotenv).
dotenv.load_dotenv()

# Rebuild the inputs for this section: the GLUE/MRPC dataset and the tokenizer
# that matches the checkpoint being fine-tuned.
raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize ``example["sentence1"]`` and ``example["sentence2"]`` as pairs.

    Truncation is enabled; padding is not requested here. Uses the
    notebook-level ``tokenizer`` and returns its output unchanged.
    """
    sentence_pair = (example["sentence1"], example["sentence2"])
    return tokenizer(*sentence_pair, truncation=True)


# Tokenize every split in batches, then build the collator that pads samples
# when a batch is assembled.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 3668/3668 [00:00<00:00, 15510.65 examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 20261.62 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 20614.85 examples/s]


In [None]:
from transformers import TrainingArguments

# Training configuration: write outputs under "test-trainer", use the "epoch"
# evaluation strategy, and do not push anything to the Hub.
training_args = TrainingArguments(
    output_dir="test-trainer",
    eval_strategy="epoch",
    push_to_hub=False,
)

In [4]:
from transformers import AutoModelForSequenceClassification

# num_labels=2 matches the two MRPC classes (not_equivalent / equivalent).
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


This gives the following (expected) warning:
```
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
```
This is because `bert-base-uncased` was not pretrained on a two-label sequence-classification task, so its pretraining head has been discarded and a new, randomly initialized head suited for sequence classification has been instantiated in its place.

In [15]:
import evaluate
import numpy as np


def compute_metrics(eval_preds):
    """Score one evaluation pass with the GLUE/MRPC metric.

    ``eval_preds`` is unpacked as a ``(logits, labels)`` pair. The predicted
    class of each example is the argmax over the last axis of the logits.
    Returns the result of the metric's ``compute`` call for those predictions
    and labels.
    """
    glue_mrpc = evaluate.load("glue", "mrpc")
    scores, references = eval_preds
    predicted_classes = np.argmax(scores, axis=-1)
    return glue_mrpc.compute(predictions=predicted_classes, references=references)

In [16]:
from transformers import Trainer

# Wire the model, training configuration, train/validation splits, padding
# collator and metric function into a Trainer.
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    # `tokenizer=` is deprecated on Trainer and triggers a warning;
    # `processing_class=` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [17]:
# Run fine-tuning. The returned TrainOutput reports the global step count,
# the training loss and runtime metrics.
trainer.train()

Step,Training Loss
500,0.5242
1000,0.2669


TrainOutput(global_step=1377, training_loss=0.3283784709811297, metrics={'train_runtime': 3051.007, 'train_samples_per_second': 3.607, 'train_steps_per_second': 0.451, 'total_flos': 405114969714960.0, 'train_loss': 0.3283784709811297, 'epoch': 3.0})

This took approximately 50 minutes on an Apple M3.
```
TrainOutput(global_step=1377, training_loss=0.3283784709811297, metrics={'train_runtime': 3051.007, 'train_samples_per_second': 3.607, 'train_steps_per_second': 0.451, 'total_flos': 405114969714960.0, 'train_loss': 0.3283784709811297, 'epoch': 3.0})
```

In [18]:
# Run inference on the validation split. `predictions.predictions` holds one
# score per class for each example and `predictions.label_ids` the reference
# labels, as the printed shapes show.
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)

(408, 2) (408,)


In [20]:
# Pick the highest-scoring class for each validation example.
preds = np.argmax(predictions.predictions, axis=-1)

In [21]:
# Score the predicted classes against the reference labels with the GLUE/MRPC
# metric, which reports accuracy and F1.
metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

{'accuracy': 0.8480392156862745, 'f1': 0.8945578231292517}