In [None]:
# !pip install transformers datasets jupyter notebook

In [1]:
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR, LinearLR, SequentialLR
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from datasets import load_dataset
torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

2025-01-18 23:47:48.243821: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-18 23:47:48.256288: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1737244068.277305   52911 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1737244068.285706   52911 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-18 23:47:48.311681: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

### Tokenize the dataset

In [3]:
dataset = load_dataset("glue", "sst2")

train_data = dataset["train"]
validation_data = dataset["validation"]
test_data = dataset["test"]

In [4]:
train_data[0]

{'sentence': 'hide new secretions from the parental units ',
 'label': 0,
 'idx': 0}

In [5]:
X_train, X_val, X_test, y_train, y_val, y_test = [[] for _ in range(6)]
for split, xb, yb in zip([train_data, validation_data, test_data], [X_train, X_val, X_test], [y_train, y_val, y_test]):
    for observation in split:
        sentence, label, _ = observation.values()
        xb.append(sentence)
        yb.append(label)
    
    xb = tokenizer(xb, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)

In [6]:
X_train = tokenizer(X_train, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
X_val = tokenizer(X_val, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
X_test = tokenizer(X_test, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)

In [7]:
class SST2Dataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    
    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        x = {
            'input_ids': self.X['input_ids'][idx, :].to(device),
            'attention_mask': self.X['attention_mask'][idx, :].to(device)
        }
        label = self.y[idx]
        y = torch.Tensor([1.0, 0.0]) if label == 0 else torch.Tensor([0.0, 1.0])
        y = y.to(device)
        return x, y

In [8]:
train_dataset = SST2Dataset(X_train, y_train)
val_dataset = SST2Dataset(X_val, y_val)

In [None]:
num_epochs = 10

all_runs = []
for batch_size in [16, 32]:
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

    total_steps = num_epochs * len(train_dataloader)

    warmup_ratio = 0.06
    warmup_steps = warmup_ratio * total_steps
    patience = 3 # early stopping after 3 epochs with no improvement
    eval_iters = len(train_dataloader) // 10 # how many batch we will evaluate train loss on

    for learning_rate in [1e-5, 2e-5, 3e-5]:
        print('========= RUN PARAMETERS: ===============')
        print(f'Learning rate: {learning_rate:.1e}, batch size: {batch_size}')
        model = RobertaForSequenceClassification.from_pretrained("roberta-base")
        model.to(device)

        optimizer = Adam(
            params=model.parameters(),
            lr=learning_rate,
            betas=(0.9, 0.999),
            weight_decay=0.01,
        )

        warmup_scheduler = LinearLR(
            optimizer=optimizer,
            start_factor=0.05,
            end_factor=1.0,
            total_iters=warmup_steps,
        )
        decay_scheduler = LinearLR(
            optimizer=optimizer,
            start_factor=1.0,
            end_factor=0.0,
            total_iters=total_steps - warmup_steps,
        )

        scheduler = SequentialLR(
            optimizer=optimizer,
            schedulers=[warmup_scheduler, decay_scheduler],
            milestones=[warmup_steps]
        )

        min_val_loss = float('inf')
        epoch_since_last_loss_improvement = 0

        train_losses, val_losses, accuracies = [], [], []
        # training loop
        for epoch in range(num_epochs):
            model.train()

            for batch_idx, (x, y) in enumerate(train_dataloader):

                # forward
                logits = model(**x).logits
                loss = F.cross_entropy(logits, y)

                # backprop
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
                
                if batch_idx % 200 == 0:
                    print(f"Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx+1}/{len(train_dataloader)}], Loss: {loss.item():.4f}, LR: {scheduler.get_last_lr()[0]:.1e}")
            
            # get train loss over eval_iter batches
            train_loss = 0
            model.eval()
            with torch.no_grad():
                for batch_idx, (xb, yb) in enumerate(train_dataloader):
                    if batch_idx == eval_iters:
                        break
                    logits = model(**x).logits
                    loss = F.cross_entropy(logits, y)
                    train_loss += loss.item()

            train_loss /= eval_iters

            # Get validation loss and accuracy
            val_accuracy = 0
            val_loss = 0

            model.eval()
            with torch.no_grad():
                for batch_idx, (x, y) in enumerate(val_dataloader):
                    logits = model(**x).logits
                    softmax = F.softmax(logits, dim=1)
                    loss = F.cross_entropy(logits, y)
                    y_preds = torch.argmax(softmax, dim=1)
                    y = torch.Tensor([x[1] for x in y]).to(device)

                    accuracy = (torch.sum(y_preds == y)/len(y)).item()
                    val_accuracy += accuracy
                    val_loss += loss.item()
            
            val_accuracy /= len(val_dataloader)
            val_loss /= len(val_dataloader)

            train_losses.append(train_loss)
            val_losses.append(val_loss)
            accuracies.append(val_accuracy)

            print(f"Epoch [{epoch+1}/{num_epochs}] summary")
            print(f'    Train loss: {train_loss:.4f}')
            print(f'    Validation loss: {val_loss:.4f}')
            print(f'    Accuracy: {val_accuracy:.4f}')
            print('========================================')

            if val_loss < min_val_loss:
                min_val_loss = val_loss
                epoch_since_last_loss_improvement = 0
            else:
                epoch_since_last_loss_improvement += 1
                if epoch_since_last_loss_improvement > patience:
                    print("-------- Early stopping --------------")
                    break

        all_runs.append({
            'learning_rate': learning_rate,
            'batch_size': batch_size,
            'train_losses': train_losses,
            'val_losses': val_losses,
            'accuracies': accuracies,
        })

        print(f'Best accuracy is {max(accuracies):.4f}% at epoch {accuracies.index(max(accuracies)) + 1}')

        del model
        torch.cuda.empty_cache()

Epoch [1/10], Step [1/2105], Loss: 0.6995, LR: 5.1e-07
Epoch [1/10], Step [101/2105], Loss: 0.7048, LR: 1.3e-06
Epoch [1/10], Step [201/2105], Loss: 0.7139, LR: 2.0e-06
Epoch [1/10], Step [301/2105], Loss: 0.6833, LR: 2.8e-06
Epoch [1/10], Step [401/2105], Loss: 0.6896, LR: 3.5e-06
Epoch [1/10], Step [501/2105], Loss: 0.3660, LR: 4.3e-06
Epoch [1/10], Step [601/2105], Loss: 0.2384, LR: 5.0e-06
Epoch [1/10], Step [701/2105], Loss: 0.5010, LR: 5.8e-06
Epoch [1/10], Step [801/2105], Loss: 0.2605, LR: 6.5e-06
Epoch [1/10], Step [901/2105], Loss: 0.2366, LR: 7.3e-06
Epoch [1/10], Step [1001/2105], Loss: 0.2270, LR: 8.0e-06
Epoch [1/10], Step [1101/2105], Loss: 0.3123, LR: 8.8e-06
Epoch [1/10], Step [1201/2105], Loss: 0.2511, LR: 9.5e-06




Epoch [1/10], Step [1301/2105], Loss: 0.2074, LR: 1.0e-05
Epoch [1/10], Step [1401/2105], Loss: 0.4326, LR: 9.9e-06
Epoch [1/10], Step [1501/2105], Loss: 0.1063, LR: 9.9e-06
Epoch [1/10], Step [1601/2105], Loss: 0.2263, LR: 9.8e-06
Epoch [1/10], Step [1701/2105], Loss: 0.2408, LR: 9.8e-06
Epoch [1/10], Step [1801/2105], Loss: 0.1689, LR: 9.7e-06
Epoch [1/10], Step [1901/2105], Loss: 0.1037, LR: 9.7e-06
Epoch [1/10], Step [2001/2105], Loss: 0.2621, LR: 9.6e-06
Epoch [1/10], Step [2101/2105], Loss: 0.2470, LR: 9.6e-06
Epoch [1/10] summary
    Train loss: 0.2335
    Validation loss: 0.2075
    Accuracy: 0.9230
Epoch [2/10], Step [1/2105], Loss: 0.2299, LR: 9.6e-06
Epoch [2/10], Step [101/2105], Loss: 0.1641, LR: 9.5e-06
Epoch [2/10], Step [201/2105], Loss: 0.2202, LR: 9.5e-06
Epoch [2/10], Step [301/2105], Loss: 0.0934, LR: 9.4e-06
Epoch [2/10], Step [401/2105], Loss: 0.1458, LR: 9.4e-06
Epoch [2/10], Step [501/2105], Loss: 0.3126, LR: 9.3e-06
Epoch [2/10], Step [601/2105], Loss: 0.1820, 

In [13]:
[
    {
        'learning_rate': 2e-5,
        'batch_size': 64,
        'train_losses': [0.2679, 0.2083, 0.2443],
        'val_losses': [0.2092, 0.2720, 0.3516],
        'val_accuracy': [0.9292, 0.8882, 0.8520]
    },

    # unstable loss even in first epoch => decrease learning rate
    {
        'learning_rate': 5e-6,
        'batch_size': 64,
        'train_losses': [0.3540, 0.1906, 0.1685],
        'val_losses': [0.1980, 0.1990, 0.2054],
        'val_accuracy': [0.9321, 0.9292, 0.9270]
    },
    
    # observe that loss fluctuates more in epoch 2 => implemented step learning rate, hopefully there will be gain in epochs 2 and 3
    {
        'initial learning_rate': 5e-6,
        'stepLR gamma': 0.5, # decrease LR by half after each epoch
        'batch_size': 64,
        'train_losses': [0.3540, 0.1899, 0.1712],
        'val_losses': [0.1980, 0.1990, 0.2005],
        'val_accuracy': [0.9321, 0.9277, 0.9259]
    } 

    # there is not much difference in the metrics of epoch 2 and 3. I think further decreasing gamma will not bring about much changes.
    # I will now focus on increasing accuracy on epoch 1.
    # We reduced the learning rate in one of the experiments above and got nice result. Now I will tune it down a bit further.


]

[{'learning_rate': 2e-05,
  'batch_size': 64,
  'train_losses': [0.2679, 0.2083, 0.2443],
  'val_losses': [0.2092, 0.272, 0.3516],
  'val_accuracy': [0.9292, 0.8882, 0.852]},
 {'learning_rate': 5e-06,
  'batch_size': 64,
  'train_losses': [0.354, 0.1906, 0.1685],
  'val_losses': [0.198, 0.199, 0.2054],
  'val_accuracy': [0.9321, 0.9292, 0.927]},
 {'initial learning_rate': 5e-06,
  'stepLR gamma': 0.5,
  'batch_size': 64,
  'train_losses': [0.354, 0.1899, 0.1712],
  'val_losses': [0.198, 0.199, 0.2005],
  'val_accuracy': [0.9321, 0.9277, 0.9259]}]