In [None]:
# !pip install transformers datasets jupyter notebook

In [1]:
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR, LinearLR, SequentialLR
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from utils import get_loss_and_accuracy, SST2Dataset
import pandas as pd
from datasets import load_dataset
torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

2025-01-19 09:04:23.619392: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-19 09:04:23.636603: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1737277463.653499  141800 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1737277463.658090  141800 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-19 09:04:23.673063: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

### Tokenize the dataset

In [3]:
dataset = load_dataset("glue", "sst2")

train_data = dataset["train"]
validation_data = dataset["validation"]
test_data = dataset["test"]

In [4]:
train_data[0]

{'sentence': 'hide new secretions from the parental units ',
 'label': 0,
 'idx': 0}

In [5]:
X_train, X_val, X_test, y_train, y_val, y_test = [[] for _ in range(6)]
for split, x, y in zip([train_data, validation_data, test_data], [X_train, X_val, X_test], [y_train, y_val, y_test]):
    for observation in split:
        sentence, label, _ = observation.values()
        x.append(sentence)
        y.append(label)
    

In [6]:
X_train, X_val, X_test = [tokenizer(X, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device) for X in [X_train, X_val, X_test]]

In [7]:
train_dataset = SST2Dataset(X_train, y_train)
val_dataset = SST2Dataset(X_val, y_val)

In [14]:
torch.save(train_dataset, './datasets/train_dataset.pth')
torch.save(val_dataset, './datasets/val_dataset.pth')

In [None]:
num_epochs = 10

all_runs = []
for batch_size in [16, 32]:
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

    total_steps = num_epochs * len(train_dataloader)

    warmup_ratio = 0.06
    warmup_steps = warmup_ratio * total_steps
    patience = 3 # early stopping after 3 epochs with no improvement

    for learning_rate in [1e-5, 2e-5, 3e-5]:
        print('========= RUN PARAMETERS: ===============')
        print(f'Learning rate: {learning_rate:.1e}, batch size: {batch_size}')

        model = RobertaForSequenceClassification.from_pretrained("roberta-base")
        model.to(device)

        optimizer = Adam(
            params=model.parameters(),
            lr=learning_rate,
            betas=(0.9, 0.98),
            weight_decay=0.1,
            eps=1e-6
        )

        warmup_scheduler = LinearLR(
            optimizer=optimizer,
            start_factor=0.05,
            end_factor=1.0,
            total_iters=warmup_steps,
        )
        decay_scheduler = LinearLR(
            optimizer=optimizer,
            start_factor=1.0,
            end_factor=0.0,
            total_iters=total_steps - warmup_steps,
        )

        scheduler = SequentialLR(
            optimizer=optimizer,
            schedulers=[warmup_scheduler, decay_scheduler],
            milestones=[warmup_steps]
        )

        min_val_loss = float('inf')
        epoch_since_last_loss_improvement = 0

        # training loop
        train_losses, val_losses, accuracies = [], [], []
        for epoch in range(num_epochs):
            model.train()

            for batch_idx, (x, y) in enumerate(train_dataloader):

                # forward
                logits = model(**x).logits
                loss = F.cross_entropy(logits, y)

                # backprop
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
                
                if batch_idx % 200 == 0:
                    print(f"Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx+1}/{len(train_dataloader)}], Loss: {loss.item():.4f}, LR: {scheduler.get_last_lr()[0]:.1e}")
            
            # get train loss
            train_loss, _ = get_loss_and_accuracy(
                model=model,
                dataset=train_dataset,
                device=device,
                eval_ratio=0.1 # evaluate on 10% of train data
            )

            # Get validation loss and accuracy
            val_loss, val_accuracy = get_loss_and_accuracy(
                model=model,
                dataset=val_dataset,
                device=device
            )

            train_losses.append(train_loss)
            val_losses.append(val_loss)
            accuracies.append(val_accuracy)

            print(f"Epoch [{epoch+1}/{num_epochs}] summary")
            print(f'    Train loss: {train_loss:.4f}')
            print(f'    Validation loss: {val_loss:.4f}')
            print(f'    Accuracy: {val_accuracy:.4f}')
            print('========================================')

            if val_loss < min_val_loss:
                min_val_loss = val_loss
                epoch_since_last_loss_improvement = 0
            else:
                epoch_since_last_loss_improvement += 1
                if epoch_since_last_loss_improvement > patience:
                    print("-------- Early stopping --------------")
                    break

        all_runs.append({
            'learning_rate': learning_rate,
            'batch_size': batch_size,
            'train_losses': train_losses,
            'val_losses': val_losses,
            'accuracies': accuracies,
        })

        print(f'Best accuracy is {max(accuracies):.4f} at epoch {accuracies.index(max(accuracies)) + 1}')

        del model
        torch.cuda.empty_cache()

Learning rate: 1.0e-05, batch size: 16


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1/10], Step [1/4210], Loss: 0.7475, LR: 5.0e-07
Epoch [1/10], Step [201/4210], Loss: 0.6956, LR: 1.3e-06
Epoch [1/10], Step [401/4210], Loss: 0.7214, LR: 2.0e-06
Epoch [1/10], Step [601/4210], Loss: 0.6610, LR: 2.8e-06
Epoch [1/10], Step [801/4210], Loss: 0.6856, LR: 3.5e-06
Epoch [1/10], Step [1001/4210], Loss: 0.6795, LR: 4.3e-06
Epoch [1/10], Step [1201/4210], Loss: 0.6862, LR: 5.0e-06
Epoch [1/10], Step [1401/4210], Loss: 0.7135, LR: 5.8e-06
Epoch [1/10], Step [1601/4210], Loss: 0.4193, LR: 6.5e-06
Epoch [1/10], Step [1801/4210], Loss: 0.4360, LR: 7.3e-06
Epoch [1/10], Step [2001/4210], Loss: 0.2258, LR: 8.0e-06
Epoch [1/10], Step [2201/4210], Loss: 0.3720, LR: 8.8e-06
Epoch [1/10], Step [2401/4210], Loss: 0.3038, LR: 9.5e-06




Epoch [1/10], Step [2601/4210], Loss: 0.4430, LR: 1.0e-05
Epoch [1/10], Step [2801/4210], Loss: 0.2578, LR: 9.9e-06
Epoch [1/10], Step [3001/4210], Loss: 0.0949, LR: 9.9e-06
Epoch [1/10], Step [3201/4210], Loss: 0.2585, LR: 9.8e-06
Epoch [1/10], Step [3401/4210], Loss: 0.5005, LR: 9.8e-06
Epoch [1/10], Step [3601/4210], Loss: 0.4587, LR: 9.7e-06
Epoch [1/10], Step [3801/4210], Loss: 0.5679, LR: 9.7e-06
Epoch [1/10], Step [4001/4210], Loss: 0.4998, LR: 9.6e-06
Epoch [1/10], Step [4201/4210], Loss: 0.5385, LR: 9.6e-06
Epoch [1/10] summary
    Train loss: 0.1188
    Validation loss: 0.4090
    Accuracy: 0.8057
Epoch [2/10], Step [1/4210], Loss: 0.3187, LR: 9.6e-06
Epoch [2/10], Step [201/4210], Loss: 0.7634, LR: 9.5e-06
Epoch [2/10], Step [401/4210], Loss: 0.2093, LR: 9.5e-06
Epoch [2/10], Step [601/4210], Loss: 0.3806, LR: 9.4e-06
Epoch [2/10], Step [801/4210], Loss: 0.4714, LR: 9.4e-06
Epoch [2/10], Step [1001/4210], Loss: 0.2442, LR: 9.3e-06
Epoch [2/10], Step [1201/4210], Loss: 0.3869

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1/10], Step [1/4210], Loss: 0.6943, LR: 1.0e-06
Epoch [1/10], Step [201/4210], Loss: 0.6993, LR: 2.5e-06
Epoch [1/10], Step [401/4210], Loss: 0.6640, LR: 4.0e-06
Epoch [1/10], Step [601/4210], Loss: 0.6952, LR: 5.5e-06
Epoch [1/10], Step [801/4210], Loss: 0.7208, LR: 7.0e-06
Epoch [1/10], Step [1001/4210], Loss: 0.6912, LR: 8.5e-06
Epoch [1/10], Step [1201/4210], Loss: 0.4687, LR: 1.0e-05
Epoch [1/10], Step [1401/4210], Loss: 0.3762, LR: 1.2e-05
Epoch [1/10], Step [1601/4210], Loss: 0.3398, LR: 1.3e-05
Epoch [1/10], Step [1801/4210], Loss: 0.3931, LR: 1.5e-05
Epoch [1/10], Step [2001/4210], Loss: 0.2426, LR: 1.6e-05
Epoch [1/10], Step [2201/4210], Loss: 0.3989, LR: 1.8e-05
Epoch [1/10], Step [2401/4210], Loss: 0.2200, LR: 1.9e-05
Epoch [1/10], Step [2601/4210], Loss: 0.3853, LR: 2.0e-05
Epoch [1/10], Step [2801/4210], Loss: 0.3133, LR: 2.0e-05
Epoch [1/10], Step [3001/4210], Loss: 0.4344, LR: 2.0e-05
Epoch [1/10], Step [3201/4210], Loss: 0.4969, LR: 2.0e-05
Epoch [1/10], Step [3

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1/10], Step [1/4210], Loss: 0.6602, LR: 1.5e-06
Epoch [1/10], Step [201/4210], Loss: 0.6812, LR: 3.8e-06
Epoch [1/10], Step [401/4210], Loss: 0.6463, LR: 6.0e-06
Epoch [1/10], Step [601/4210], Loss: 0.7134, LR: 8.3e-06
Epoch [1/10], Step [801/4210], Loss: 0.5317, LR: 1.1e-05
Epoch [1/10], Step [1001/4210], Loss: 0.4111, LR: 1.3e-05
Epoch [1/10], Step [1201/4210], Loss: 0.5189, LR: 1.5e-05
Epoch [1/10], Step [1401/4210], Loss: 0.4181, LR: 1.7e-05
Epoch [1/10], Step [1601/4210], Loss: 0.1255, LR: 2.0e-05
Epoch [1/10], Step [1801/4210], Loss: 0.3412, LR: 2.2e-05
Epoch [1/10], Step [2001/4210], Loss: 0.2416, LR: 2.4e-05
Epoch [1/10], Step [2201/4210], Loss: 0.4162, LR: 2.6e-05
Epoch [1/10], Step [2401/4210], Loss: 0.5384, LR: 2.9e-05
Epoch [1/10], Step [2601/4210], Loss: 0.3065, LR: 3.0e-05
Epoch [1/10], Step [2801/4210], Loss: 0.3663, LR: 3.0e-05
Epoch [1/10], Step [3001/4210], Loss: 0.4358, LR: 3.0e-05
Epoch [1/10], Step [3201/4210], Loss: 0.4822, LR: 2.9e-05
Epoch [1/10], Step [3

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1/10], Step [1/2105], Loss: 0.6905, LR: 5.1e-07
Epoch [1/10], Step [201/2105], Loss: 0.6804, LR: 2.0e-06
Epoch [1/10], Step [401/2105], Loss: 0.6536, LR: 3.5e-06
Epoch [1/10], Step [601/2105], Loss: 0.6863, LR: 5.0e-06
Epoch [1/10], Step [801/2105], Loss: 0.6745, LR: 6.5e-06
Epoch [1/10], Step [1001/2105], Loss: 0.6039, LR: 8.0e-06
Epoch [1/10], Step [1201/2105], Loss: 0.5036, LR: 9.5e-06
Epoch [1/10], Step [1401/2105], Loss: 0.3109, LR: 9.9e-06
Epoch [1/10], Step [1601/2105], Loss: 0.2093, LR: 9.8e-06
Epoch [1/10], Step [1801/2105], Loss: 0.4837, LR: 9.7e-06
Epoch [1/10], Step [2001/2105], Loss: 0.4262, LR: 9.6e-06
Epoch [1/10] summary
    Train loss: 0.2878
    Validation loss: 0.2704
    Accuracy: 0.9007
Epoch [2/10], Step [1/2105], Loss: 0.2898, LR: 9.6e-06
Epoch [2/10], Step [201/2105], Loss: 0.4114, LR: 9.5e-06
Epoch [2/10], Step [401/2105], Loss: 0.3102, LR: 9.4e-06
Epoch [2/10], Step [601/2105], Loss: 0.3495, LR: 9.3e-06
Epoch [2/10], Step [801/2105], Loss: 0.2725, LR: 9

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1/10], Step [1/2105], Loss: 0.6873, LR: 1.0e-06
Epoch [1/10], Step [201/2105], Loss: 0.6931, LR: 4.0e-06
Epoch [1/10], Step [401/2105], Loss: 0.6867, LR: 7.0e-06
Epoch [1/10], Step [601/2105], Loss: 0.4485, LR: 1.0e-05
Epoch [1/10], Step [801/2105], Loss: 0.2590, LR: 1.3e-05
Epoch [1/10], Step [1001/2105], Loss: 0.2670, LR: 1.6e-05
Epoch [1/10], Step [1201/2105], Loss: 0.2897, LR: 1.9e-05
Epoch [1/10], Step [1401/2105], Loss: 0.2530, LR: 2.0e-05
Epoch [1/10], Step [1601/2105], Loss: 0.3157, LR: 2.0e-05
Epoch [1/10], Step [1801/2105], Loss: 0.3355, LR: 1.9e-05
Epoch [1/10], Step [2001/2105], Loss: 0.3442, LR: 1.9e-05
Epoch [1/10] summary
    Train loss: 0.2729
    Validation loss: 0.3929
    Accuracy: 0.8237
Epoch [2/10], Step [1/2105], Loss: 0.5464, LR: 1.9e-05
Epoch [2/10], Step [201/2105], Loss: 0.3945, LR: 1.9e-05
Epoch [2/10], Step [401/2105], Loss: 0.4191, LR: 1.9e-05
Epoch [2/10], Step [601/2105], Loss: 0.5094, LR: 1.9e-05
Epoch [2/10], Step [801/2105], Loss: 0.3591, LR: 1

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1/10], Step [1/2105], Loss: 0.6871, LR: 1.5e-06
Epoch [1/10], Step [201/2105], Loss: 0.7208, LR: 6.0e-06
Epoch [1/10], Step [401/2105], Loss: 0.7037, LR: 1.1e-05
Epoch [1/10], Step [601/2105], Loss: 0.4366, LR: 1.5e-05
Epoch [1/10], Step [801/2105], Loss: 0.5549, LR: 2.0e-05
Epoch [1/10], Step [1001/2105], Loss: 0.2791, LR: 2.4e-05
Epoch [1/10], Step [1201/2105], Loss: 0.3378, LR: 2.9e-05
Epoch [1/10], Step [1401/2105], Loss: 0.4560, LR: 3.0e-05
Epoch [1/10], Step [1601/2105], Loss: 0.4812, LR: 2.9e-05
Epoch [1/10], Step [1801/2105], Loss: 0.3306, LR: 2.9e-05
Epoch [1/10], Step [2001/2105], Loss: 0.4417, LR: 2.9e-05
Epoch [1/10] summary
    Train loss: 0.3730
    Validation loss: 0.4793
    Accuracy: 0.7690
Epoch [2/10], Step [1/2105], Loss: 0.4485, LR: 2.9e-05
Epoch [2/10], Step [201/2105], Loss: 0.4830, LR: 2.8e-05
Epoch [2/10], Step [401/2105], Loss: 0.3806, LR: 2.8e-05
Epoch [2/10], Step [601/2105], Loss: 0.4889, LR: 2.8e-05
Epoch [2/10], Step [801/2105], Loss: 0.6567, LR: 2

In [None]:
with open('all_runs.json', 'w') as f:
    f.write(str(all_runs).replace("'", '"'))

In [None]:
df = pd.DataFrame(all_runs)
df['train_losses'] = df['train_losses'].apply(lambda x: [round(i, 2) for i in x])
df['val_losses'] = df['val_losses'].apply(lambda x: [round(i, 2) for i in x])
df['accuracies'] = df['accuracies'].apply(lambda x: [round(i, 2) for i in x])

df['learning_rate'] = df['learning_rate'].round(2)
df['batch_size'] = df['batch_size'].astype(int)

df

Unnamed: 0,learning_rate,batch_size,train_losses,val_losses,accuracies
0,0.0,16,"[0.12, 0.4, 0.15, 0.12, 0.57]","[0.41, 0.44, 0.46, 0.48, 0.48]","[0.81, 0.8, 0.78, 0.77, 0.76]"
1,0.0,16,"[0.36, 0.71, 0.34, 0.6, 0.61, 0.63]","[0.49, 0.46, 0.52, 0.58, 0.7, 0.7]","[0.74, 0.78, 0.75, 0.74, 0.51, 0.51]"
2,0.0,16,"[0.44, 0.73, 0.79, 0.72, 0.68]","[0.46, 0.58, 0.7, 0.7, 0.7]","[0.78, 0.66, 0.51, 0.51, 0.51]"
3,0.0,32,"[0.29, 0.48, 0.34, 0.44, 0.2]","[0.27, 0.41, 0.43, 0.51, 0.43]","[0.9, 0.82, 0.81, 0.76, 0.78]"
4,0.0,32,"[0.27, 0.34, 0.78, 0.59, 0.3]","[0.39, 0.44, 0.43, 0.44, 0.44]","[0.82, 0.78, 0.79, 0.78, 0.8]"
5,0.0,32,"[0.37, 0.48, 0.28, 0.42, 0.45, 0.36]","[0.48, 0.45, 0.46, 0.49, 0.5, 0.52]","[0.77, 0.79, 0.79, 0.77, 0.78, 0.73]"


In [None]:
torch.manual_seed(42)
del model
torch.cuda.empty_cache()

model = RobertaForSequenceClassification.from_pretrained("roberta-base")
model.to(device)

batch_size = 64
num_epochs = 2

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

optimizer = Adam(
    params=model.parameters(),
    lr=5e-6,
    betas=(0.9, 0.999),
    weight_decay=0.01,
)

for epoch in range(num_epochs):
    model.train()

    for batch_idx, (x, y) in enumerate(train_dataloader):

        # forward
        logits = model(**x).logits
        loss = F.cross_entropy(logits, y)

        # backprop
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        if batch_idx % 100 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx+1}/{len(train_dataloader)}], Loss: {loss.item():.4f}")
    
    # get train loss
    train_loss, _ = get_loss_and_accuracy(
        model=model,
        dataset=train_dataset,
        device=device,
        eval_ratio=0.1 # evaluate on 10% of train data
    )

    # Get validation loss and accuracy
    val_loss, val_accuracy = get_loss_and_accuracy(
        model=model,
        dataset=val_dataset,
        device=device
    )

    print(f"Epoch [{epoch+1}/{num_epochs}] summary")
    print(f'    Train loss: {train_loss:.4f}')
    print(f'    Validation loss: {val_loss:.4f}')
    print(f'    Accuracy: {val_accuracy:.4f}')
    print('========================================')

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1/2], Step [1/1053], Loss: 0.7357
Epoch [1/2], Step [101/1053], Loss: 0.7078
Epoch [1/2], Step [201/1053], Loss: 0.6649
Epoch [1/2], Step [301/1053], Loss: 0.3061
Epoch [1/2], Step [401/1053], Loss: 0.3070
Epoch [1/2], Step [501/1053], Loss: 0.2225
Epoch [1/2], Step [601/1053], Loss: 0.1832
Epoch [1/2], Step [701/1053], Loss: 0.1393
Epoch [1/2], Step [801/1053], Loss: 0.2323
Epoch [1/2], Step [901/1053], Loss: 0.2204
Epoch [1/2], Step [1001/1053], Loss: 0.2159
Epoch [1/2] summary
    Train loss: 0.2152
    Validation loss: 0.1691
    Accuracy: 0.9362
Epoch [2/2], Step [1/1053], Loss: 0.2890
Epoch [2/2], Step [101/1053], Loss: 0.2561
Epoch [2/2], Step [201/1053], Loss: 0.1747
Epoch [2/2], Step [301/1053], Loss: 0.1304
Epoch [2/2], Step [401/1053], Loss: 0.2262
Epoch [2/2], Step [501/1053], Loss: 0.2732
Epoch [2/2], Step [601/1053], Loss: 0.1889
Epoch [2/2], Step [701/1053], Loss: 0.1037
Epoch [2/2], Step [801/1053], Loss: 0.3003
Epoch [2/2], Step [901/1053], Loss: 0.1361
Epoch [2

In [10]:
model.save_pretrained('./models/best_accuracy')