<a href="https://colab.research.google.com/github/doanhieung/colab_notebooks/blob/main/Fine_tuning_RoBERTa_on_IMDB_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q datasets evaluate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import torch
import evaluate
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
# Prefer the CUDA device when PyTorch can see one; otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Fetch the IMDB review corpus and report how many examples each labelled split holds
dataset = load_dataset("stanfordnlp/imdb")
print(
    f"Train set size: {len(dataset['train'])}",
    f"Test set size: {len(dataset['test'])}",
    sep="\n",
)

# Tokenizer that matches the roberta-base checkpoint
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

def tokenize_function(examples):
    """Tokenize a batch of reviews, truncating but not padding them.

    Args:
        examples: A batch of dataset rows; only its "text" field is read.

    Returns:
        The tokenizer output for the batch, with every sequence cut to at
        most 512 tokens and left at its natural length.
    """
    # Padding is deliberately disabled here: the DataCollatorWithPadding given
    # to the Trainer pads each training/eval batch to the longest sequence in
    # that batch. Padding at map time would instead store every example padded
    # to the longest review of its map batch, wasting memory and compute.
    return tokenizer(
        examples["text"],
        truncation=True,
        padding=False,
        max_length=512
    )

# Run the tokenizer over every split in batches, then expose the target
# column under the name "labels" instead of "label"
tokenized_datasets = dataset.map(
    tokenize_function, batched=True, desc="Tokenizing datasets"
).rename_column("label", "labels")

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Train set size: 25000
Test set size: 25000


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Tokenizing datasets:   0%|          | 0/25000 [00:00<?, ? examples/s]

Tokenizing datasets:   0%|          | 0/25000 [00:00<?, ? examples/s]

Tokenizing datasets:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
def compute_metrics(eval_pred):
    """
    Turn raw model outputs into accuracy, F1, precision and recall.

    `eval_pred` unpacks into (logits, labels); the predicted class for each
    example is the index of its largest logit along the last axis.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)

    # The fourth returned element is not used here
    prec, rec, f_score, _ = precision_recall_fscore_support(
        gold, predicted, average='binary'
    )

    metrics = {'accuracy': accuracy_score(gold, predicted)}
    metrics['f1'] = f_score
    metrics['precision'] = prec
    metrics['recall'] = rec
    return metrics

# Pretrained roberta-base encoder with a sequence-classification head sized for two labels
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`
    # argument (available in transformers >= 4.41).
    eval_strategy="epoch",
    # Must match the evaluation schedule so load_best_model_at_end can pair
    # each saved checkpoint with an evaluation result.
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none",  # Disable wandb logging
)

# Initialize trainer
# NOTE(review): the test split doubles as the evaluation set here, so any
# checkpoint selection (load_best_model_at_end) is made on test data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    # `processing_class` replaces the deprecated `tokenizer` argument of
    # Trainer (available in transformers >= 4.46).
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    # Pads each batch to its own longest sequence at collation time
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
)
trainer.train()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2145,0.205293,0.94932,0.949512,0.945931,0.95312


TrainOutput(global_step=3125, training_loss=0.2723273309326172, metrics={'train_runtime': 3081.4181, 'train_samples_per_second': 8.113, 'train_steps_per_second': 1.014, 'total_flos': 6577776384000000.0, 'train_loss': 0.2723273309326172, 'epoch': 1.0})

In [None]:
# Run evaluation with the trainer and print every returned entry to four decimals
eval_results = trainer.evaluate()
for key in eval_results:
    value = eval_results[key]
    print(f"{key}: {value:.4f}")

eval_loss: 0.2053
eval_accuracy: 0.9493
eval_f1: 0.9495
eval_precision: 0.9459
eval_recall: 0.9531
eval_runtime: 671.7192
eval_samples_per_second: 37.2180
eval_steps_per_second: 4.6520
epoch: 1.0000


In [None]:
# Write the fine-tuned model and the tokenizer files into the same directory
save_path = "imdb-roberta-finetuned"
for save_to_disk in (trainer.save_model, tokenizer.save_pretrained):
    save_to_disk(save_path)
print(f"Model saved to: {save_path}")

Model saved to: imdb-roberta-finetuned
