In [1]:
from datasets import load_dataset
from datasets import DatasetDict
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2"

ds = load_dataset("linxinyuan/cola")


dfs = ds["train"].train_test_split(test_size=0.2)

ds = DatasetDict(
    train=dfs["train"],
    eval=dfs["test"],
    test=ds["test"],
)

ds


Found cached dataset cola (/users/jmperez/.cache/huggingface/datasets/linxinyuan___cola/default/0.0.0/0871d55203d4de46ef1815400998ed8f219236694f0d03786bde849741f04cd4)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 6840
    })
    eval: Dataset({
        features: ['text', 'label'],
        num_rows: 1711
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 527
    })
})

In [2]:
# Load RoBERTa model

from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "roberta-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)



In [3]:
# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding=False, truncation=True)

tokenized_ds = ds.map(tokenize_function, batched=True, remove_columns=["text"])

Map:   0%|          | 0/6840 [00:00<?, ? examples/s]

Map:   0%|          | 0/1711 [00:00<?, ? examples/s]

Loading cached processed dataset at /users/jmperez/.cache/huggingface/datasets/linxinyuan___cola/default/0.0.0/0871d55203d4de46ef1815400998ed8f219236694f0d03786bde849741f04cd4/cache-3f311fb0d5746ca2.arrow


In [4]:
# Set the column names to the format expected by the model

tokenized_ds["train"][0]

{'label': 1,
 'input_ids': [0, 37731, 7973, 7644, 2500, 5, 28450, 4, 2],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [5]:
import torch
from tqdm.auto import tqdm
from accelerate import Accelerator
from torch.utils.tensorboard import SummaryWriter
from transformers import get_linear_schedule_with_warmup, set_seed

from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
# load tensorboard logger
import evaluate






def training_loop(mixed_precision="fp16", seed: int = 42, batch_size: int = 64):
    set_seed(seed)
    writer = SummaryWriter()

    num_epochs = 5
    lr = 5e-5
    warmup_ratio = 0.1

    accelerator = Accelerator(mixed_precision=mixed_precision)
    clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

    collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")
    train_dataloader = DataLoader(tokenized_ds["train"], batch_size=32, collate_fn=collator)
    eval_dataloader = DataLoader(tokenized_ds["eval"], batch_size=16, collate_fn=collator)
    test_dataloader = DataLoader(tokenized_ds["test"], batch_size=16, collate_fn=collator)
    
    num_training_steps = num_epochs * len(train_dataloader)
    num_warmup_steps = int(num_training_steps * warmup_ratio)

    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps,
    )
    loss = torch.nn.CrossEntropyLoss()


    model, optimizer, train_dataloader, eval_dataloader, scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, scheduler
    )

    step = 0

    for epoch in range(num_epochs):
        for batch in tqdm(train_dataloader):
            optimizer.zero_grad()
            outputs = model(**batch)

            loss = outputs.loss
            accelerator.backward(loss)

            #log to tensorboard
            writer.add_scalar("Loss/train", loss, global_step=step)
            # Log learning rate
            writer.add_scalar("Learning rate", scheduler.get_last_lr()[0], global_step=step)

            optimizer.step()
            scheduler.step()
            step += 1

        for eval_batch in tqdm(eval_dataloader):
            with torch.no_grad():
                outputs = model(**eval_batch)

            predictions = outputs.logits.argmax(dim=-1)

            predictions, references = accelerator.gather_for_metrics(
                (predictions, eval_batch["labels"])
            )

            clf_metrics.add_batch(predictions, references)

        metrics = clf_metrics.compute()

        writer.add_scalar("Accuracy/eval", metrics["accuracy"], global_step=step)
        writer.add_scalar("F1/eval", metrics["f1"], global_step=step)
        writer.add_scalar("Precision/eval", metrics["precision"], global_step=step)
        writer.add_scalar("Recall/eval", metrics["recall"], global_step=step)

    

In [6]:

from accelerate import notebook_launcher

notebook_launcher(training_loop, ("fp16", 64), num_processes=2)

Launching training on 2 GPUs.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_

  0%|          | 0/107 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/54 [00:00<?, ?it/s]

  0%|          | 0/54 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

  0%|          | 0/54 [00:00<?, ?it/s]

  0%|          | 0/54 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

  0%|          | 0/54 [00:00<?, ?it/s]

  0%|          | 0/54 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

  0%|          | 0/54 [00:00<?, ?it/s]

  0%|          | 0/54 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

  0%|          | 0/54 [00:00<?, ?it/s]

  0%|          | 0/54 [00:00<?, ?it/s]