In [None]:
# Record the version of the `python` executable found on the shell PATH.
!python --version

Python 3.10.12


In [None]:
# NLP Purpose
!pip install "transformers[sentencepiece]"



In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K

In [None]:
import torch
import transformers
from transformers import pipeline
from transformers import AutoModel, AutoTokenizer

import warnings
warnings.filterwarnings('ignore')

## **Fine-Tuning**
**Fine-tune a pretrained model on your own dataset.**

Discussing:
- How to prepare a **large dataset from the Hub**
- How to use **high-level `Trainer` API** to fine-tune a model
- How to use a **custom training loop**
- How to leverage the **Accelerate library** (*custom training loop on any distributed setup*)

## **Processing the Data**

Here is how we would train a sequence classifier on one batch

In [None]:
import torch
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Same as before
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# This is new: attach one target label per sequence so the model returns a loss.
batch["labels"] = torch.tensor([1, 1])

# `transformers.AdamW` is deprecated and is no longer exported by newer
# transformers releases, so the PyTorch optimizer is used instead.
# weight_decay=0.0 keeps the default of the deprecated class (PyTorch's is 0.01).
optimizer = AdamW(model.parameters(), weight_decay=0.0)

# One optimisation step: forward pass -> loss -> gradients -> parameter update.
loss = model(**batch).loss
loss.backward()
optimizer.step()

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Of course, just training the model on two sentences is not going to yield very good results.

To get better results, you will need to prepare a **bigger dataset**.

In this section we will use as an example the **MRPC** (Microsoft Research Paraphrase Corpus) dataset, introduced in a paper by William B. Dolan and Chris Brockett.

The dataset consists of **5,801 pairs of sentences**, with a `label` **indicating if they are paraphrases or not** (i.e., if both sentences mean the same thing).

### **Loading a Dataset from the Hub**
**The MRPC dataset** ➡ This is one of the 10 datasets composing the GLUE benchmark, which is an academic benchmark that is used to measure the performance of ML models across 10 different text classification tasks.

In [None]:
from datasets import load_dataset

# First argument: the benchmark to load ("glue").
# Second argument: the task/configuration inside that benchmark ("mrpc").
# The result is shown below as a DatasetDict with train/validation/test splits.
raw_datasets = load_dataset("glue", "mrpc")
raw_datasets

Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/649k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

As you can see, we get a `DatasetDict` object which contains the training set, the validation set, and the test set.

Each of those contains several columns (sentence1, sentence2, label, and idx) and a variable number of rows, which are the number of elements in each set (so, there are 3,668 pairs of sentences in the training set, 408 in the validation set, and 1,725 in the test set).

In [None]:
## Access one sentence pair in the training split.
## Indexing a split by position returns that example as a dictionary.
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [None]:
# Inspect the data type of each column in the training split.
raw_train_dataset.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

Behind the scenes, `label` is of type `ClassLabel`, and the mapping of integers to label names is stored in its `names` list. *0 corresponds to not_equivalent*, and *1 corresponds to equivalent*.

In [None]:
# Look at the element at position 15 of the training split.
# Note: this is a positional index; the example's own `idx` field is a separate
# column and need not match (the recorded output shows 'idx': 16).
raw_train_dataset[15]

{'sentence1': 'Rudder was most recently senior vice president for the Developer & Platform Evangelism Business .',
 'sentence2': 'Senior Vice President Eric Rudder , formerly head of the Developer and Platform Evangelism unit , will lead the new entity .',
 'label': 0,
 'idx': 16}

In [None]:
## Access one sentence pair in the validation split (position 8).
validation_dataset = raw_datasets["validation"]
validation_dataset[8]

{'sentence1': 'The top rate will go to 4.45 percent for all residents with taxable incomes above $ 500,000 .',
 'sentence2': 'For residents with incomes above $ 500,000 , the income-tax rate will increase to 4.45 percent .',
 'label': 1,
 'idx': 73}

### **Preprocessing a Dataset**
To preprocess the dataset, we need to **convert the text to numbers the model can make sense of**. It is done with a `tokenizer`.

In [None]:
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Tokenize each sentence column on its own. This yields two independent
# encodings and does NOT tell the model which sentences form a pair.
tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])

However, ***we can’t just pass two sequences to the model* and get a prediction of whether the two sentences are paraphrases or not**.

We need to handle the **two sequences as a pair**, and apply the appropriate preprocessing. Fortunately, the `tokenizer` can also take a pair of sequences and prepare it the way our BERT model expects

In [None]:
# Passing two strings encodes them together as a single sentence pair.
inputs = tokenizer("This is the first sentence.", "This is the second one.")
inputs

{'input_ids': [101, 2023, 2003, 1996, 2034, 6251, 1012, 102, 2023, 2003, 1996, 2117, 2028, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

`token_type_ids` ➡ tells the model **which part of the input** is the **first sentence** and which is the **second sentence**.

If we decode the `IDs` inside `input_ids` back to words:

In [None]:
# Map every integer ID back to its token to see the special tokens that were added.
tokenizer.convert_ids_to_tokens(inputs["input_ids"])

['[CLS]',
 'this',
 'is',
 'the',
 'first',
 'sentence',
 '.',
 '[SEP]',
 'this',
 'is',
 'the',
 'second',
 'one',
 '.',
 '[SEP]']

So we see the model expects the inputs to be of the form `[CLS] sentence1 [SEP] sentence2 [SEP]` when there are two sentences.

Aligning this with the `token_type_ids` gives us

In [None]:
# Print tokens and token type IDs on consecutive lines so they can be compared
# position by position.
print(tokenizer.convert_ids_to_tokens(inputs["input_ids"]))
print(inputs['token_type_ids'])

['[CLS]', 'this', 'is', 'the', 'first', 'sentence', '.', '[SEP]', 'this', 'is', 'the', 'second', 'one', '.', '[SEP]']
[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]


📓 `token_type_ids` are not always present in the tokenizer output; it depends on the checkpoint used (e.g., DistilBERT's tokenizer does not return them).

Here, BERT is pretrained with token type IDs, and the goal of its *next sentence prediction* pretraining task is to model the relationship between pairs of sentences.

In general, you don’t need to worry about whether or not there are `token_type_ids` in your tokenized inputs: as long as you use the same checkpoint for the tokenizer and the model, everything will be fine as the tokenizer knows what to provide to its model.

Another way to preprocess the training datasets

In [None]:
# Tokenize every training pair in a single call: the two columns are passed as
# parallel lists, and padding/truncation are applied to the whole result.
tokenized_dataset = tokenizer(
    raw_datasets["train"]["sentence1"],
    raw_datasets["train"]["sentence2"],
    padding=True,
    truncation=True,
)

This works well, but it has the disadvantage of returning a dictionary (with our `keys`, `input_ids`, `attention_mask`, and `token_type_ids`, and values that are lists of lists). It will also only work if you have enough RAM to store your whole dataset during the tokenization.

To **keep the data as a dataset**, we will use the `Dataset.map()` method. This also allows us some extra flexibility, if we need more preprocessing done than just tokenization. The `map()` method works by applying a function on each element of the dataset.

In [None]:
def tokenize_function(example):
    """Tokenize the two sentence columns of `example` as sentence pairs.

    Args:
        example: A mapping (like the items of our dataset) that provides the
            "sentence1" and "sentence2" keys.

    Returns:
        The tokenizer output for the pair, truncated but not padded
        (keys such as input_ids, attention_mask, and token_type_ids).
    """
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(first, second, truncation=True)

# This tokenizer can be very fast, but only if we give it lots of inputs at once.

`padding` ➡ **padding all the samples to the maximum length is not efficient**: it’s better to ***pad the samples when we’re building a batch***, as then we only need to ***pad to the maximum length in that batch***, and not the maximum length in the entire dataset

Apply tokenization function on all datasets at once. And use `batched=True` so the function is applied to multiple elements of dataset at once.

In [None]:
# batched=True hands `tokenize_function` several examples per call instead of one.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

# `map` adds the keys returned by `tokenize_function` as new fields of every split.

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

Our tokenize_function returns a dictionary with the keys `input_ids`, `attention_mask`, and `token_type_ids`, so those three fields are added to all splits of our dataset.

The last thing we will need to do is pad all the examples to the length of the longest element when we batch elements together. [**Dynamic Padding**]

### **Dynamic Padding**
The function that is responsible for putting together samples inside a batch is called a ***collate function***. It’s an argument you can pass when you build a `DataLoader`, the default being a function that will just convert your samples to PyTorch tensors and concatenate them (recursively if your elements are lists, tuples, or dictionaries)

In [None]:
from transformers import DataCollatorWithPadding

# The collator is given the tokenizer so it can pad the samples of a batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

To test this new toy, let’s grab a few samples from our training set that we would like to batch together.

Here, we remove the columns `idx`, `sentence1`, and `sentence2` as they won’t be needed and contain strings (and we can’t create tensors with strings) and have a look at the lengths of each entry in the batch

In [None]:
# Show which columns the tokenized training split currently holds.
tokenized_datasets['train']

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3668
})

In [None]:
# Take the first 8 training examples and keep only the columns that are not
# raw text / bookkeeping (those cannot be turned into tensors).
samples = {
    column: values
    for column, values in tokenized_datasets["train"][:8].items()
    if column not in ("sentence1", "sentence2", "idx")
}

# Token count of each selected example, before any padding.
list(map(len, samples["input_ids"]))

[50, 59, 47, 67, 59, 50, 62, 32]

**Dynamic padding** means the samples in this batch should all be padded to a length of 67, the maximum length inside the batch.

 Without dynamic padding, all of the samples would have to be padded to the maximum length in the whole dataset, or the maximum length the model can accept

In [None]:
# Double-check that our data_collator is dynamically padding the batch properly:
# every tensor's second dimension should equal the longest sample in `samples`.
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'attention_mask': torch.Size([8, 67]),
 'labels': torch.Size([8])}

Looking good! Now that we’ve gone from raw text to batches our model can deal with, we’re ready to fine-tune it!

## **Fine-tuning a Model with the `Trainer` API or Keras**

Transformers provides a `Trainer` class to help you **fine-tune any of the pretrained models** it provides on your dataset. Once you’ve done all the data preprocessing work in the last section, you have just a few steps left to define the Trainer.

In [None]:
## Recap of the previous section: load MRPC, tokenize it, build the collator.
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

## Tokenization function applied to the dataset through `map`
def tokenize_function(example):
    """Tokenize "sentence1"/"sentence2" of `example` as pairs, with truncation and no padding."""
    return tokenizer(example["sentence1"],
                     example["sentence2"],
                     truncation=True)

# batched=True passes several examples per call to `tokenize_function`.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# Padding is deferred to batch-building time via the collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### **Training**

The first step before we can define our `Trainer` is:
1. Define a `TrainingArguments` class that will **contain all the hyperparameters the `Trainer` will use for training and evaluation**.

In [None]:
# Basic fine-tuning setup: only the positional argument ("test-trainer") is
# given; every other hyperparameter keeps its default.
from transformers import TrainingArguments

training_args = TrainingArguments("test-trainer")

2. **Define the model**

In [None]:
from transformers import AutoModelForSequenceClassification

# num_labels=2 sizes the classification head for the two MRPC classes.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


We get a warning here because:

**BERT has not been pretrained on classifying pairs of sentences**, so the head of the pretrained model has been discarded and a **new head suitable for sequence classification has been added instead**.

**The warnings indicate** that **some weights were not used** (the ones corresponding to the dropped pretraining head) and that **some others were randomly initialized** (the ones for the new head). It concludes by encouraging you to train the model, which is exactly what we are going to do now.

Once we have our model, we can define a `Trainer` by passing it all the objects constructed up to now — the `model`, the `training_args`, the training and validation datasets, our `data_collator`, and our `tokenizer`.

In [None]:
from transformers import Trainer

# Assemble everything built so far: the model, its training arguments, the
# tokenized train/validation splits, the padding collator, and the tokenizer.
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

📓 Note that when you pass the `tokenizer` as we did here, the default `data_collator` used by the `Trainer` will be a `DataCollatorWithPadding` as defined previously, so you can skip the line `data_collator=data_collator` in this call.

Fine-tune the model on the dataset

In [None]:
# Start fine-tuning with the Trainer configured above.
trainer.train()

Step,Training Loss


KeyboardInterrupt: 

This will start the fine-tuning (which should take a couple of minutes on a GPU) and report the training loss every 500 steps. It won’t, however, tell you how well (or badly) your model is performing, because:
1. We did not set `evaluation_strategy` to `"epoch"` or `"steps"`, so no evaluation runs during training.
2. We did not provide a `compute_metrics()` function to calculate a metric during evaluation (otherwise, evaluation would just have printed the loss).

### **Evaluation**
To get some predictions from our model, we can use the `Trainer.predict()`

In [None]:
# Run the model over the validation split and show the shapes of the returned
# predictions and of the reference label IDs.
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)

The output of the predict() method is another named tuple with three fields: `predictions`, `label_ids`, and `metrics` (loss as default + time metrics)

As you can see, `predictions` is a two-dimensional array with shape 408 x 2 (408 being the number of elements in the dataset we used). Those are the `logits` for each element of the dataset we passed to `predict()` ( all Transformer models return logits).

To transform them into predictions that we can compare to our labels, we need to take the index with the maximum value on the second axis:

In [None]:
import numpy as np

# For each example, pick the index of the largest value along the last axis.
preds = np.argmax(predictions.predictions, axis=-1)

We can now compare those `preds` to the labels. To build our `compute_metrics()` function, we first load the metric with the `evaluate` library:

In [None]:
!pip install evaluate

In [None]:
import evaluate

# Load the metric for the "mrpc" configuration of "glue", then score the
# arg-maxed predictions against the reference label IDs.
metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

The exact results you get may vary, as the random initialization of the model head might change the metrics it achieved.

Here, we can see our model has an *accuracy* of 85.78% on the validation set and an *F1 score* of 89.97. Those are the two metrics used to evaluate results on the MRPC dataset for the GLUE benchmark

⚠ ⚠ ⚠ `cased` explains better than `uncased`

**Wrap everything in a `compute_metrics()` function for evaluation**

In [None]:
def compute_metrics(eval_preds):
    """Score one evaluation pass with the GLUE/MRPC metric.

    Args:
        eval_preds: A pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of the metric's ``compute`` call on the arg-maxed logits
        and the reference labels.
    """
    glue_mrpc = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    # Turn per-class scores into class indices before comparing with labels.
    return glue_mrpc.compute(
        predictions=np.argmax(logits, axis=-1),
        references=labels,
    )

In [None]:
## Put `compute_metrics` to use so metrics are reported at the end of each epoch.

# NOTE(review): newer transformers releases appear to name this argument
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
# A fresh model is loaded so training does not continue from the earlier run.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Note that we create a new `TrainingArguments` with its `evaluation_strategy` set to `"epoch"` and a new model — otherwise, we would just be continuing the training of the model we have already trained.

## **A Full Training**
Now we’ll see how to achieve the same results as we did in the last section without using the `Trainer` class

In [None]:
## Data preprocessing (same steps as in the previous sections)
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    """Tokenize "sentence1"/"sentence2" of `example` as pairs, with truncation and no padding."""
    return tokenizer(example["sentence1"],
                     example["sentence2"],
                     truncation=True)

# batched=True passes several examples per call to `tokenize_function`.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# Padding happens later, per batch, inside the collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### **Prepare for Training**
Before actually writing our training loop, we will need to define a few objects.
1. The first ones are the `dataloaders` we will use to iterate over batches.

But before we can define those `dataloaders`, we need to apply a bit of postprocessing to our `tokenized_datasets`, to take care of some things that the Trainer did for us automatically

Specifically, we need to:

1. Remove the columns corresponding to values the model does not expect (like the `sentence1` and `sentence2` columns).
2. Rename the column label to `labels` (because the model expects the argument to be named labels).
3. Set the format of the datasets so they return `PyTorch tensors` instead of lists.

In [None]:
# NOTE(review): this cell rebinds `tokenized_datasets`, so running it a second
# time operates on the already-trimmed dataset; re-run the preprocessing cell
# first if you need to execute it again.

# 1. Drop the columns the model does not expect.
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
# 2. The model expects the target argument to be called `labels`.
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
# 3. Return PyTorch tensors instead of Python lists.
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

**Define the `Dataloaders`**

In [None]:
from torch.utils.data import DataLoader

# Training batches: shuffled, 8 examples each, padded per batch by the collator.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=8,
    collate_fn=data_collator
)

# Evaluation batches: same batch size and collator, no shuffling requested.
eval_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=8,
    collate_fn=data_collator
)

In [None]:
# A DataLoader is an iterable of batches, not a mapping, so it has no `.items()`
# (the original call raised AttributeError). Its length is the number of batches
# per epoch, which is what the scheduler set-up needs later on.
len(train_dataloader)

To quickly check there is no mistake in the data processing, we can inspect a batch like this:

In [None]:
# Pull a single batch out of the dataloader: the loop stops after its first
# iteration, leaving that batch bound to `batch`.
for batch in train_dataloader:
    break
# Shape of every tensor in that batch.
{k: v.shape for k, v in batch.items()}

**Turn to the Model**

In [None]:
from transformers import AutoModelForSequenceClassification

# Fresh model for the manual training loop, with a two-class head.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
# Display the model object's representation (can be long).
model

To make sure that everything will go smoothly during training, we pass our batch to this model:

In [None]:
# Sanity check: one forward pass on the inspected batch. Because the batch
# carries `labels`, the output exposes a loss alongside the logits.
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

We’re almost ready to write our training loop! We’re just missing two things: an `optimizer` and a `learning rate scheduler`.

Since we are trying to replicate what the `Trainer` was doing by hand, we will use the same defaults. The optimizer used by the Trainer is `AdamW`, which is the same as Adam, but with a twist for weight decay regularization

In [None]:
# `transformers.AdamW` is deprecated and is no longer exported by newer
# transformers releases; use the PyTorch implementation instead.
from torch.optim import AdamW

# weight_decay=0.0 keeps the default of the deprecated class (PyTorch's is 0.01).
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)

Finally, the `learning rate scheduler` used by default is just a `linear decay` from the `maximum value (5e-5) to 0`.

To **properly define** it, we need to know the *number of training steps* we will take, which is the **number of epochs** we want to run **multiplied** by the **number of training batches** (which is the *length of our training dataloader*). The `Trainer` uses `3 epochs` by default, so we will follow that

In [None]:
from transformers import get_scheduler

num_epochs = 3
# One scheduler step is taken per batch, so total steps = epochs * batches per epoch.
num_training_steps = num_epochs * len(train_dataloader)
# Linear schedule with no warm-up over the full number of training steps.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

print(num_training_steps)

### **The Training Loop**
One last thing: we will want to use the GPU if we have access to one (on a CPU, training might take several hours instead of a couple of minutes). To do this, we define a device we will put our model and our batches on:

In [None]:
import torch

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

We are now ready to train! To get some sense of when training will be finished, we add a progress bar over our number of training steps, using the `tqdm` library

In [None]:
from tqdm.auto import tqdm

# One tick of the bar per optimisation step.
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch to the same device as the model.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Update the weights, advance the learning-rate schedule, then clear
        # the gradients so they do not accumulate into the next step.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

You can see that the core of the training loop looks a lot like the one in the introduction.

We didn’t ask for any reporting, so this training loop will not tell us anything about how the model fares. We need to add an evaluation loop for that.

### **The Evaluation Loop**
We’ve already seen the `metric.compute()` method, but `metrics` can actually **accumulate batches** for us **as we go over the prediction loop** with the method `add_batch()`.

Once we have accumulated all the batches, we can get the final result with `metric.compute()`

In [None]:
import evaluate

metric = evaluate.load("glue", "mrpc")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}

    # torch.no_grad() disables gradient tracking inside the block:
    # evaluation only needs the forward pass, and nothing here calls
    # backward() or updates the weights.
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1) # class index per example, to compare with the labels
    # Accumulate this batch; the final score is computed once after the loop.
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

### **Supercharge Training Loop with `Accelerate`**
The training loop we defined earlier works fine on a single CPU or GPU. But using the 🤗 `Accelerate` library, with just a few adjustments we can **enable distributed training on multiple GPUs or TPUs**.

Starting from the creation of the training and validation dataloaders, here is what our manual training loop looks like:

In [None]:
# `transformers.AdamW` is deprecated and is no longer exported by newer
# transformers releases; use the PyTorch implementation instead.
from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification, get_scheduler

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
# weight_decay=0.0 keeps the default of the deprecated class (PyTorch's is 0.01).
optimizer = AdamW(model.parameters(), lr=3e-5, weight_decay=0.0)

# Manual device placement (this is what Accelerate will take over).
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

num_epochs = 3
# One scheduler step per batch: total steps = epochs * batches per epoch.
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch to the same device as the model.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Update weights, advance the schedule, then reset the gradients.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

**Here are the changes**

In [None]:
from accelerate import Accelerator # importing accelerator
# `transformers.AdamW` is deprecated and is no longer exported by newer
# transformers releases; use the PyTorch implementation instead.
from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification, get_scheduler

accelerator = Accelerator() # instantiates -> look at the env and initialize the proper distributed setup

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
# weight_decay=0.0 keeps the default of the deprecated class (PyTorch's is 0.01).
optimizer = AdamW(model.parameters(), lr=3e-5, weight_decay=0.0)

## Removing Device
## Accelerate handles the device placement, so these lines are dropped:
# - device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# - model.to(device)

## Sends the dataloaders, the model, and the optimizer to accelerator.prepare()
train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
     train_dataloader, eval_dataloader, model, optimizer
) # This will wrap those objects in the proper container to make sure your
  # distributed training works as intended.

num_epochs = 3
# Computed after prepare() so it uses the length of the prepared dataloader.
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # remove (-) | batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        # remove (-) | loss.backward()
        accelerator.backward(loss) ## replacing the loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

Putting this in a `train.py` script will make that script runnable on any kind of distributed setup.

To try it out in your distributed setup, run the command:

In [None]:
accelerate config

which will prompt you to answer a few questions and dump your answers in a configuration file used by this command:

In [None]:
accelerate launch train.py
# which will launch the distributed training.