In [2]:
# !pip install transformers datasets jupyter notebook

In [1]:
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from datasets import load_dataset
torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

2025-01-18 08:08:58.311034: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-18 08:08:58.323305: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1737187738.338438   58681 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1737187738.343078   58681 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-18 08:08:58.357938: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)  # For binary classification
model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

### Tokenize the dataset

In [3]:
dataset = load_dataset("glue", "sst2")

train_data = dataset["train"]
validation_data = dataset["validation"]
test_data = dataset["test"]

In [4]:
train_data[0]

{'sentence': 'hide new secretions from the parental units ',
 'label': 0,
 'idx': 0}

In [5]:
X_train, X_val, X_test, y_train, y_val, y_test = [[] for _ in range(6)]
for split, xb, yb in zip([train_data, validation_data, test_data], [X_train, X_val, X_test], [y_train, y_val, y_test]):
    for observation in split:
        sentence, label, _ = observation.values()
        xb.append(sentence)
        yb.append(label)
    
    xb = tokenizer(xb, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)

In [6]:
X_train = tokenizer(X_train, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
X_val = tokenizer(X_val, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
X_test = tokenizer(X_test, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)

In [7]:
class SST2Dataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    
    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        x = {
            'input_ids': self.X['input_ids'][idx, :].to(device),
            'attention_mask': self.X['attention_mask'][idx, :].to(device)
        }
        label = self.y[idx]
        y = torch.Tensor([1.0, 0.0]) if label == 0 else torch.Tensor([0.0, 1.0])
        y = y.to(device)
        return x, y

In [8]:
batch_size = 64
num_epochs = 3

In [9]:
train_dataset = SST2Dataset(X_train, y_train)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = SST2Dataset(X_val, y_val)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

### Get accuracy before fine-tuning.

In [10]:
accuracies = []

model.eval()
with torch.no_grad():
    for batch_idx, (x, y) in enumerate(val_dataloader):
        logits = model(**x).logits
        softmax = F.softmax(logits, dim=1)
        y_preds = torch.argmax(softmax, dim=1)
        y = torch.Tensor([x[1] for x in y]).to(device)
        accuracy = (torch.sum(y_preds == y)/len(y)).item()
        accuracies.append(accuracy)

print(torch.mean(torch.Tensor(accuracies)))


tensor(0.4911)


In [11]:
optimizer = Adam(
    params=model.parameters(),
    lr=5e-6,
    betas=(0.9, 0.999),
    weight_decay=0.01,
)

scheduler = StepLR(optimizer, step_size=1, gamma=0.5)

for epoch in range(num_epochs):
    model.train()
    train_loss = 0

    for batch_idx, (x, y) in enumerate(train_dataloader):

        # forward
        logits = model(**x).logits
        loss = F.cross_entropy(logits, y)

        # backprop
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        

        train_loss += loss.item()

        if batch_idx % 100 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx+1}/{len(train_dataloader)}], Loss: {loss.item():.4f}")
    
    train_loss /= len(train_dataloader)
    scheduler.step()
    

    # Get validation loss and accuracy
    val_accuracy = 0
    val_loss = 0

    model.eval()
    with torch.no_grad():
        for batch_idx, (x, y) in enumerate(val_dataloader):
            logits = model(**x).logits
            softmax = F.softmax(logits, dim=1)
            loss = F.cross_entropy(logits, y)
            y_preds = torch.argmax(softmax, dim=1)
            y = torch.Tensor([x[1] for x in y]).to(device)

            accuracy = (torch.sum(y_preds == y)/len(y)).item()
            val_accuracy += accuracy
            val_loss += loss.item()
    
    val_accuracy /= len(val_dataloader)
    val_loss /= len(val_dataloader)

    print(f"Epoch [{epoch+1}/{num_epochs}] summary")
    print(f'    Train loss: {train_loss:.4f}')
    print(f'    Validation loss: {val_loss:.4f}')
    print(f'    Accuracy: {val_accuracy:.4f}')
    print('========================================')

Epoch [1/3], Step [1/1053], Loss: 0.7029
Epoch [1/3], Step [101/1053], Loss: 0.6773
Epoch [1/3], Step [201/1053], Loss: 0.6941
Epoch [1/3], Step [301/1053], Loss: 0.3288
Epoch [1/3], Step [401/1053], Loss: 0.3073
Epoch [1/3], Step [501/1053], Loss: 0.3440
Epoch [1/3], Step [601/1053], Loss: 0.2898
Epoch [1/3], Step [701/1053], Loss: 0.2431
Epoch [1/3], Step [801/1053], Loss: 0.2257
Epoch [1/3], Step [901/1053], Loss: 0.2344
Epoch [1/3], Step [1001/1053], Loss: 0.1962
Epoch [1/3] summary
    Train loss: 0.3540
    Validation loss: 0.1980
    Accuracy: 0.9321
Epoch [2/3], Step [1/1053], Loss: 0.2045
Epoch [2/3], Step [101/1053], Loss: 0.0452
Epoch [2/3], Step [201/1053], Loss: 0.2648
Epoch [2/3], Step [301/1053], Loss: 0.0856
Epoch [2/3], Step [401/1053], Loss: 0.2022
Epoch [2/3], Step [501/1053], Loss: 0.1861
Epoch [2/3], Step [601/1053], Loss: 0.1506
Epoch [2/3], Step [701/1053], Loss: 0.2417
Epoch [2/3], Step [801/1053], Loss: 0.2545
Epoch [2/3], Step [901/1053], Loss: 0.1723
Epoch [2

In [None]:
[
    {
        'learning_rate': 2e-5,
        'batch_size': 64,
        'train_losses': [0.2679, 0.2083, 0.2443],
        'val_losses': [0.2092, 0.2720, 0.3516],
        'val_accuracy': [0.9292, 0.8882, 0.8520]
    },

    # unstable loss even in first epoch => decrease learning rate
    {
        'learning_rate': 5e-6,
        'batch_size': 64,
        'train_losses': [0.3540, 0.1906, 0.1685],
        'val_losses': [0.1980, 0.1990, 0.2054],
        'val_accuracy': [0.9321, 0.9292, 0.9270]
    }
    
    # observe that loss fluctuates more in epoch 2 => implemented step learning rate, hopefully there will be gain in epochs 2 and 3
   {
        'initial learning_rate': 5e-6,
        'stepLR gamma': 0.5, # decrease LR by half after each epoch
        'batch_size': 64,
        'train_losses': [0.3540, 0.1899, 0.1712],
        'val_losses': [0.1980, 0.1990, 0.2005],
        'val_accuracy': [0.9321, 0.9277, 0.9259]
    } 

    # there is not much difference in the metrics of epoch 2 and 3. I will further decrease gamma.
]