In [2]:
# !pip install transformers datasets jupyter notebook

In [11]:
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from datasets import load_dataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [51]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)  # For binary classification
model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

### Tokenize the dataset

In [3]:
dataset = load_dataset("glue", "sst2")

train_data = dataset["train"]
validation_data = dataset["validation"]
test_data = dataset["test"]

In [4]:
train_data[0]

{'sentence': 'hide new secretions from the parental units ',
 'label': 0,
 'idx': 0}

In [5]:
X_train, X_val, X_test, y_train, y_val, y_test = [[] for _ in range(6)]
for split, xb, yb in zip([train_data, validation_data, test_data], [X_train, X_val, X_test], [y_train, y_val, y_test]):
    for observation in split:
        sentence, label, _ = observation.values()
        xb.append(sentence)
        yb.append(label)
    
    xb = tokenizer(xb, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)

In [6]:
X_train = tokenizer(X_train, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
X_val = tokenizer(X_val, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
X_test = tokenizer(X_test, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)

In [42]:
class SST2Dataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    
    def __len__(self):
        return self.y.shape[0]

    def __getitem__(self, idx):
        x = {
            'input_ids': self.X['input_ids'][idx, :].to(device),
            'attention_mask': self.X['attention_mask'][idx, :].to(device)
        }
        label = self.y[idx]
        y = torch.Tensor([1.0, 0.0]) if label == 0 else torch.Tensor([0.0, 1.0])
        y = y.to(device)
        return x, y

In [43]:
batch_size = 32
num_epochs = 3

In [52]:
train_dataset = SST2Dataset(X_train, y_train)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = SST2Dataset(X_val, y_val)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

### Get accuracy before fine-tuning.

In [54]:
accuracies = []

model.eval()
with torch.no_grad():
    for batch_idx, (x, y) in enumerate(val_dataloader):
        logits = model(**x).logits
        softmax = F.softmax(logits, dim=1)
        y_preds = torch.argmax(softmax, dim=1)
        y = torch.Tensor([x[1] for x in y]).to(device)
        accuracy = (torch.sum(y_preds == y)/len(y)).item()
        accuracies.append(accuracy)

print(torch.mean(torch.Tensor(accuracies)))


tensor(0.5089)


In [47]:
len(train_dataset)

67349

In [55]:
optimizer = Adam(
    params=model.parameters(),
    lr=5e-5,
    betas=(0.9, 0.999),
    weight_decay=0.01,
)

for epoch in range(num_epochs):
    model.train()
    train_loss = 0

    for batch_idx, (x, y) in enumerate(train_dataloader):
        optimizer.zero_grad()

        # forward
        logits = model(**x).logits
        softmax = F.softmax(logits, dim=1)
        loss = F.cross_entropy(logits, y)

        # backprop
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

        if batch_idx % 100 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx+1}/{len(train_dataloader)}], Loss: {loss.item():.4f}")
    
    train_loss /= len(train_dataloader)
    

    # Get validation loss and accuracy
    val_accuracy = 0
    val_loss = 0

    model.eval()
    with torch.no_grad():
        for batch_idx, (x, y) in enumerate(val_dataloader):
            logits = model(**x).logits
            softmax = F.softmax(logits, dim=1)
            loss = F.cross_entropy(logits, y)
            y_preds = torch.argmax(softmax, dim=1)
            y = torch.Tensor([x[1] for x in y]).to(device)

            accuracy = (torch.sum(y_preds == y)/len(y)).item()
            val_accuracy += accuracy
            val_loss += loss.item()
    
    val_accuracy /= len(val_dataloader)
    val_loss /= len(val_dataloader)

    print(f"Epoch [{epoch+1}/{num_epochs}] summary")
    print(f'    Train loss: {train_loss:.4f}')
    print(f'    Validation loss: {val_loss:.4f}')
    print(f'    Accuracy: {val_accuracy:.4f}')
    print('========================================')

Epoch [1/3], Step [1/2105], Loss: 0.6957
Epoch [1/3], Step [101/2105], Loss: 0.5393
Epoch [1/3], Step [201/2105], Loss: 0.3685
Epoch [1/3], Step [301/2105], Loss: 0.1532
Epoch [1/3], Step [401/2105], Loss: 0.5114
Epoch [1/3], Step [501/2105], Loss: 0.3916
Epoch [1/3], Step [601/2105], Loss: 0.3165
Epoch [1/3], Step [701/2105], Loss: 0.3809
Epoch [1/3], Step [801/2105], Loss: 0.1698
Epoch [1/3], Step [901/2105], Loss: 0.3512
Epoch [1/3], Step [1001/2105], Loss: 0.1893
Epoch [1/3], Step [1101/2105], Loss: 0.4392
Epoch [1/3], Step [1201/2105], Loss: 0.4482
Epoch [1/3], Step [1301/2105], Loss: 0.2900
Epoch [1/3], Step [1401/2105], Loss: 0.2921
Epoch [1/3], Step [1501/2105], Loss: 0.2874
Epoch [1/3], Step [1601/2105], Loss: 0.3353
Epoch [1/3], Step [1701/2105], Loss: 0.3340
Epoch [1/3], Step [1801/2105], Loss: 0.4084
Epoch [1/3], Step [1901/2105], Loss: 0.3014
Epoch [1/3], Step [2001/2105], Loss: 0.3713
Epoch [1/3], Step [2101/2105], Loss: 0.2741
Epoch [1/3] summary
    Train loss: 0.3529
 

In [10]:
batch = next(iter(train_dataloader))

In [None]:
x, y = batch

In [12]:
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [13]:
for parameter in model.parameters():
    parameter.requires_grad = False

In [35]:
# with torch.no_grad():
output = model(**x)

In [10]:
batch_size = 32
eval_iters = 200

In [None]:
def get_batch(split):
    data = X_train if split == 'train' else X_val
    label = y_train if split == 'train' else y_val
    idx = torch.randint(low=0, high=data['input_ids'].shape[0], size=(batch_size,))
    xb = {k: v[idx] for k, v in data.items()}
    yb = torch.tensor(label)[idx].to(device)
    return xb, yb


In [12]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        accuracies = torch.clone(losses)

        for k in range(eval_iters):
            xb, yb = get_batch(split)
            logits = model(**xb)['logits']
            losses[k] = F.cross_entropy(logits, yb)

            prob = F.softmax(logits, dim=1)
            label = torch.argmax(prob, dim=1)
            accuracy = torch.sum(label==yb) / len(yb)
            accuracies[k] = accuracy
        out[split] = losses.mean()
        out[f'{split} accuracy'] = accuracies.mean()
    model.train()
    return out

In [90]:
estimate_loss()

{'train': tensor(0.7785),
 'train accuracy': tensor(0.4484),
 'val': tensor(0.7504),
 'val accuracy': tensor(0.4906)}

In [31]:
xb, yb = get_batch('train')

In [33]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [34]:
del model.classifier

In [None]:
model.roberta.encoder.

RobertaEncoder(
  (layer): ModuleList(
    (0-11): 12 x RobertaLayer(
      (attention): RobertaAttention(
        (self): RobertaSdpaSelfAttention(
          (query): Linear(in_features=768, out_features=768, bias=True)
          (key): Linear(in_features=768, out_features=768, bias=True)
          (value): Linear(in_features=768, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (output): RobertaSelfOutput(
          (dense): Linear(in_features=768, out_features=768, bias=True)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (intermediate): RobertaIntermediate(
        (dense): Linear(in_features=768, out_features=3072, bias=True)
        (intermediate_act_fn): GELUActivation()
      )
      (output): RobertaOutput(
        (dense): Linear(in_features=3072, out_features=768, bias=True)
        (LayerNorm): LayerNorm((768,), eps=1e-05,

In [None]:
kktorch.zeros(eval_iters).copy()

AttributeError: 'Tensor' object has no attribute 'copy'