In [1]:
from datasets import load_dataset
from datasets import DatasetDict

# Fixed seed so the train/eval partition is identical on every
# Restart & Run All (train_test_split shuffles before splitting).
SEED = 42

ds = load_dataset("linxinyuan/cola")


# Hold out 20% of the original training split to use for evaluation.
dfs = ds["train"].train_test_split(test_size=0.2, seed=SEED)

# Re-assemble the three splits used by the rest of the notebook:
#   train -> 80% of the original train split
#   eval  -> the held-out 20% of the original train split
#   test  -> the dataset's own test split, untouched
ds = DatasetDict(
    train=dfs["train"],
    eval=dfs["test"],
    test=ds["test"],
)

ds


Found cached dataset cola (/users/jmperez/.cache/huggingface/datasets/linxinyuan___cola/default/0.0.0/0871d55203d4de46ef1815400998ed8f219236694f0d03786bde849741f04cd4)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 6840
    })
    eval: Dataset({
        features: ['text', 'label'],
        num_rows: 1711
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 527
    })
})

In [2]:
# Load RoBERTa model

from transformers import AutoModelForSequenceClassification, AutoTokenizer

# The tokenizer and the sequence-classification model are both loaded from
# the same checkpoint name.
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)



Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

In [3]:
# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the notebook's tokenizer.

    Truncation is enabled and padding is disabled in this call.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw text(s).

    Returns:
        The value returned by ``tokenizer(...)`` for those texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding=False)
    return encoded

# Apply the tokenizer to every split in batched mode and drop the raw "text"
# column from the result.
tokenized_ds = ds.map(
    tokenize_function,
    remove_columns=["text"],
    batched=True,
)

Map:   0%|          | 0/6840 [00:00<?, ? examples/s]

Map:   0%|          | 0/1711 [00:00<?, ? examples/s]

Loading cached processed dataset at /users/jmperez/.cache/huggingface/datasets/linxinyuan___cola/default/0.0.0/0871d55203d4de46ef1815400998ed8f219236694f0d03786bde849741f04cd4/cache-3f311fb0d5746ca2.arrow


In [4]:
# Inspect the first tokenized training example to see which columns it has.
# (This cell only displays the example; it does not rename or set any columns.)
tokenized_ds["train"][0]

{'label': 1,
 'input_ids': [0, 19993, 3790, 2708, 4, 2],
 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [5]:
# Build the data loader

from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding


# The collator pads each batch when it is assembled; the tokenized examples
# themselves were produced with padding disabled.
collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

# Shuffle the training data so each epoch sees the examples in a different
# order; without it every epoch replays identical batches.
train_dataloader = DataLoader(
    tokenized_ds["train"],
    batch_size=32,
    shuffle=True,
    collate_fn=collator,
)
# Evaluation and test loaders keep a fixed order.
eval_dataloader = DataLoader(tokenized_ds["eval"], batch_size=16, collate_fn=collator)
test_dataloader = DataLoader(tokenized_ds["test"], batch_size=16, collate_fn=collator)


In [6]:
import torch
from tqdm.auto import tqdm
from torch.utils.tensorboard import SummaryWriter
from transformers import get_linear_schedule_with_warmup
# load tensorboard logger

# TensorBoard writer used to log the training loss and the learning rate.
writer = SummaryWriter()

model = model.to("cuda")

num_epochs = 5
lr = 1e-5
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
warmup_ratio = 0.1
# One scheduler step is taken per batch, so the schedule length is
# (number of epochs) x (batches per epoch).
num_training_steps = num_epochs * len(train_dataloader)

scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=int(num_training_steps * warmup_ratio),
    num_training_steps=num_training_steps,
)

# from_pretrained() hands the model back in evaluation mode (dropout off);
# switch to training mode before fine-tuning.
model.train()

# Counts optimizer updates across all epochs. Used as the TensorBoard x-axis so
# every batch gets its own point instead of all batches of an epoch sharing one.
global_step = 0

for epoch in range(num_epochs):
    for batch in tqdm(train_dataloader):
        optimizer.zero_grad()
        batch = {k: v.to("cuda") for k, v in batch.items()}
        outputs = model(**batch)

        # The loss is taken from the model output for this batch.
        loss = outputs.loss
        loss.backward()

        # Log a plain Python float rather than the live tensor.
        writer.add_scalar("Loss/train", loss.item(), global_step)
        # Learning rate that is applied by the optimizer.step() call below.
        writer.add_scalar("Learning rate", scheduler.get_last_lr()[0], global_step)

        optimizer.step()
        scheduler.step()
        global_step += 1

# Make sure all pending events are written to disk; the writer is left open
# in case later cells log more scalars.
writer.flush()
    



  0%|          | 0/214 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/214 [00:00<?, ?it/s]

  0%|          | 0/214 [00:00<?, ?it/s]

  0%|          | 0/214 [00:00<?, ?it/s]

  0%|          | 0/214 [00:00<?, ?it/s]