In [2]:
# !pip install transformers datasets jupyter notebook

In [10]:
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from datasets import load_dataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)  # For binary classification
model.to(device)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

### Tokenize the dataset

In [3]:
dataset = load_dataset("glue", "sst2")

train_data = dataset["train"]
validation_data = dataset["validation"]
test_data = dataset["test"]

README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [5]:
train_data[0]

{'sentence': 'hide new secretions from the parental units ',
 'label': 0,
 'idx': 0}

In [37]:
X_train, X_val, X_test, y_train, y_val, y_test = [[] for _ in range(6)]
for split, xb, yb in zip([train_data, validation_data, test_data], [X_train, X_val, X_test], [y_train, y_val, y_test]):
    for observation in split:
        sentence, label, _ = observation.values()
        xb.append(sentence)
        yb.append(label)
    
    xb = tokenizer(xb, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)

In [38]:
X_train = tokenizer(X_train, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
X_val = tokenizer(X_val, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
X_test = tokenizer(X_test, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)

In [39]:
y_train = torch.Tensor(y_train)
y_val = torch.Tensor(y_val)
y_test = torch.Tensor(y_test)

In [42]:
class SST2Dataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    
    def __len__(self):
        return self.y.shape[0]

    def __getitem__(self, idx):
        x = {
            'input_ids': self.x['input_ids'][0, :].unsqueeze(0),
            'attention_mask': self.x['attention_mask'][0, :].unsqueeze(0)
        }
        y = self.y[idx]

        return x, y

In [8]:
y_train.dtype

torch.int64

In [None]:
X_train.keys()

dict_keys(['input_ids', 'attention_mask'])

In [None]:
X_train['input_ids'].shape

torch.Size([67349, 67])

In [10]:
batch_size = 32
eval_iters = 200

In [None]:
def get_batch(split):
    data = X_train if split == 'train' else X_val
    label = y_train if split == 'train' else y_val
    idx = torch.randint(low=0, high=data['input_ids'].shape[0], size=(batch_size,))
    xb = {k: v[idx] for k, v in data.items()}
    yb = torch.tensor(label)[idx].to(device)
    return xb, yb


In [12]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        accuracies = torch.clone(losses)

        for k in range(eval_iters):
            xb, yb = get_batch(split)
            logits = model(**xb)['logits']
            losses[k] = F.cross_entropy(logits, yb)

            prob = F.softmax(logits, dim=1)
            label = torch.argmax(prob, dim=1)
            accuracy = torch.sum(label==yb) / len(yb)
            accuracies[k] = accuracy
        out[split] = losses.mean()
        out[f'{split} accuracy'] = accuracies.mean()
    model.train()
    return out

In [90]:
estimate_loss()

{'train': tensor(0.7785),
 'train accuracy': tensor(0.4484),
 'val': tensor(0.7504),
 'val accuracy': tensor(0.4906)}

In [31]:
xb, yb = get_batch('train')

In [33]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [34]:
del model.classifier

In [None]:
model.roberta.encoder.

RobertaEncoder(
  (layer): ModuleList(
    (0-11): 12 x RobertaLayer(
      (attention): RobertaAttention(
        (self): RobertaSdpaSelfAttention(
          (query): Linear(in_features=768, out_features=768, bias=True)
          (key): Linear(in_features=768, out_features=768, bias=True)
          (value): Linear(in_features=768, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (output): RobertaSelfOutput(
          (dense): Linear(in_features=768, out_features=768, bias=True)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (intermediate): RobertaIntermediate(
        (dense): Linear(in_features=768, out_features=3072, bias=True)
        (intermediate_act_fn): GELUActivation()
      )
      (output): RobertaOutput(
        (dense): Linear(in_features=3072, out_features=768, bias=True)
        (LayerNorm): LayerNorm((768,), eps=1e-05,

In [None]:
kktorch.zeros(eval_iters).copy()

AttributeError: 'Tensor' object has no attribute 'copy'