## Alireza Heidari


# Imports

In [1]:
from transformers import RobertaForSequenceClassification
import torch
from transformers import AdamW
from transformers import get_scheduler

from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


# Load The Model

In [2]:
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

model

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.encoder.layer.0.attention.ou

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [3]:
for param in model.parameters():
    param.requires_grad = False
    
for name, param in model.named_parameters():
    if "my_adapter" in name:
        param.requires_grad = True

for param in model.classifier.parameters():
    param.requires_grad = True

In [4]:
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name)

roberta.encoder.layer.0.attention.output.my_adapter_self_out.fc1.weight
roberta.encoder.layer.0.attention.output.my_adapter_self_out.fc1.bias
roberta.encoder.layer.0.attention.output.my_adapter_self_out.fc2.weight
roberta.encoder.layer.0.attention.output.my_adapter_self_out.fc2.bias
roberta.encoder.layer.0.output.my_adapter_out.fc1.weight
roberta.encoder.layer.0.output.my_adapter_out.fc1.bias
roberta.encoder.layer.0.output.my_adapter_out.fc2.weight
roberta.encoder.layer.0.output.my_adapter_out.fc2.bias
roberta.encoder.layer.1.attention.output.my_adapter_self_out.fc1.weight
roberta.encoder.layer.1.attention.output.my_adapter_self_out.fc1.bias
roberta.encoder.layer.1.attention.output.my_adapter_self_out.fc2.weight
roberta.encoder.layer.1.attention.output.my_adapter_self_out.fc2.bias
roberta.encoder.layer.1.output.my_adapter_out.fc1.weight
roberta.encoder.layer.1.output.my_adapter_out.fc1.bias
roberta.encoder.layer.1.output.my_adapter_out.fc2.weight
roberta.encoder.layer.1.output.my_adapt

# Prepare Dataset

In [17]:
class IMDBDataset(Dataset):
    def __init__(self, dataset, tokenizer):
        self.dataset = dataset
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        text = self.dataset[idx]['text']
        label = self.dataset[idx]['label']
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=256, return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

imdb_dataset = load_dataset("imdb")

train_dataset = IMDBDataset(imdb_dataset['train'], tokenizer)
val_dataset = IMDBDataset(imdb_dataset['test'], tokenizer)

train_loader = DataLoader(train_dataset, batch_size=12, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128)

Found cached dataset imdb (/home/alireza/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 473.54it/s]


In [6]:
for batch in train_loader:
    print(batch)
    break

{'input_ids': tensor([[    0,   100,   802,  ...,     1,     1,     1],
        [    0,   113, 42362,  ...,     1,     1,     1],
        [    0,   100,   206,  ...,     1,     1,     1],
        ...,
        [    0,   113,   243,  ...,     1,     1,     1],
        [    0,   108, 34673,  ...,  4889,   149,     2],
        [    0,  7516,   272,  ...,   734,  2497,     2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0])}


# Set Device

In [2]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

device(type='cuda')

In [8]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Wed May  3 19:03:32 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.108.03   Driver Version: 510.108.03   CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0 Off |                  N/A |
| N/A   54C    P0     8W /  N/A |   1017MiB /  4096MiB |     22%      Default |
|                               |   

# Set Optimizers

In [9]:
optimizer = AdamW(model.parameters(), lr=3e-4)

num_epochs = 10
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

20840




# Training Loop

In [10]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    epoch_loss = 0
    i = 1
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        epoch_loss += loss.item()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        if i % 10 == 0:
            progress_bar.update(10)
        i += 1 
        
    print(f'loss = {epoch_loss / i}, epoch = {epoch}')

 10%|███▌                                | 2080/20840 [14:28<2:10:24,  2.40it/s]

loss = 0.22611150621476736, epoch = 0


 20%|███████▏                            | 4160/20840 [28:59<1:56:17,  2.39it/s]

loss = 0.16857459795498234, epoch = 1


 30%|██████████▊                         | 6240/20840 [43:30<1:41:38,  2.39it/s]

loss = 0.13587871901953383, epoch = 2


 40%|██████████████▎                     | 8320/20840 [58:03<1:27:28,  2.39it/s]

loss = 0.09862852213051107, epoch = 3


 50%|████████████████▍                | 10400/20840 [1:12:38<1:13:04,  2.38it/s]

loss = 0.07395832287454732, epoch = 4


 60%|████████████████████▉              | 12480/20840 [1:27:14<58:29,  2.38it/s]

loss = 0.04813371616686757, epoch = 5


 70%|████████████████████████▍          | 14560/20840 [1:41:47<43:46,  2.39it/s]

loss = 0.03535903479497181, epoch = 6


 80%|███████████████████████████▉       | 16640/20840 [1:56:19<29:16,  2.39it/s]

loss = 0.02554674058588952, epoch = 7


 90%|███████████████████████████████▍   | 18720/20840 [2:10:50<14:48,  2.38it/s]

loss = 0.017479574276099197, epoch = 8


100%|██████████████████████████████████▉| 20800/20840 [2:25:22<00:16,  2.39it/s]

loss = 0.011762766072365212, epoch = 9


# Evaluation

In [18]:
import evaluate

metric = evaluate.combine(["accuracy", "f1", "precision", "recall"])
model.eval()

for batch in val_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.94156,
 'f1': 0.9417812313209802,
 'precision': 0.9382294561333863,
 'recall': 0.94536}