In [None]:
!pip install einops

In [None]:
!pip install transformers datasets

In [5]:
# Mount Google Drive at /content/drive so files stored there are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# NOTE(review): this cell repeats the previous one verbatim; its recorded output
# ("Drive already mounted") suggests the second call does nothing new, so the cell
# can probably be removed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# `transformer_encoder.py` (imported further down) lives in this Drive folder.
# The `%cd` into it had been commented out, so on a fresh runtime the import
# would presumably fail unless the file is otherwise on the import path.
# Adding the folder to sys.path makes the import resolvable without changing
# the working directory, and re-running this cell is harmless.
import sys

PROJECT_DIR = '/content/drive/MyDrive/VCCorp/Báo cáo công việc/code'
if PROJECT_DIR not in sys.path:
    sys.path.append(PROJECT_DIR)

/content/drive/MyDrive/VCCorp/Báo cáo công việc/code


In [8]:
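# !ls  # debug helper (disabled): list the working directory to check that transformer_encoder.py is present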

train_transformer.ipynb  transformer_encoder.py


In [9]:
from einops import rearrange
import torch.nn as nn
import torch
import numpy as np
import math
import torch.nn.functional as F

from torch.utils.data import DataLoader
from datetime import datetime
from tqdm import tqdm
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformer_encoder import TransformerEncoder

# Seed the global RNGs up front so that parameter initialisation, DataLoader
# shuffling and dropout draw the same random numbers on every fresh run
# (some GPU kernels can still be nondeterministic).
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


## Dataset

In [10]:
# Load the tokenizer that ships with the pretrained DistilBERT (cased) checkpoint.
# In the cells shown here the checkpoint is used for tokenization only; the classifier
# is the custom TransformerEncoder built further down.
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [11]:
from datasets import load_dataset

In [12]:
# Fetch the 'sst2' configuration of the GLUE benchmark (downloaded once, then served from the local cache).
raw_datasets = load_dataset('glue', 'sst2')

Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Downloading and preparing dataset glue/sst2 to /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data:   0%|          | 0.00/7.44M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [13]:
# Show the available splits with their columns and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [14]:
def tokenize_df(batch):
    """Tokenize the 'sentence' entries of `batch` with the notebook-level tokenizer.

    Truncation is enabled; padding is not requested here. Returns whatever the
    tokenizer call returns for the given sentences.
    """
    sentences = batch['sentence']
    encoded = tokenizer(sentences, truncation=True)
    return encoded

In [15]:
# Collator that pads the tokenized examples when they are assembled into a batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Run the tokenizer over every split, handing it examples in batches.
tokenized_datasets = raw_datasets.map(tokenize_df, batched=True)

  0%|          | 0/68 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [16]:
# Check that tokenization added 'input_ids' and 'attention_mask' to every split.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [17]:
# Inspect the collator's settings (padding option, returned tensor type).
data_collator

DataCollatorWithPadding(tokenizer=PreTrainedTokenizerFast(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [18]:
# Keep only what the training loop consumes: drop the raw text and index
# columns, and rename 'label' to 'labels' (the key read from each batch).
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(['sentence', 'idx'])
    .rename_column('label', 'labels')
)

In [19]:
# Verify the remaining columns: 'labels', 'input_ids' and 'attention_mask'.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [20]:
# Padding happens here: `data_collator` pads each batch as it is assembled.
loader_kwargs = dict(batch_size=32, collate_fn=data_collator)

# Only the training split is shuffled.
train_loader = DataLoader(tokenized_datasets['train'], shuffle=True, **loader_kwargs)
valid_loader = DataLoader(tokenized_datasets['validation'], **loader_kwargs)

In [21]:
# Maximum input length registered for this checkpoint; reused below as the model's `max_len`.
tokenizer.max_model_input_sizes[checkpoint]

512

## Model

In [24]:
# Encoder hyperparameters. Vocabulary size and maximum sequence length are
# taken from the tokenizer so the model matches the token ids it will receive.
encoder_config = {
    'vocab_size': tokenizer.vocab_size,
    'max_len': tokenizer.max_model_input_sizes[checkpoint],
    'd_k': 16,
    'd_model': 64,
    'n_heads': 4,
    'n_layers': 4,
    'n_classes': 2,
    'dropout_prob': 0.1,
}
model = TransformerEncoder(**encoder_config)

# Move the model to the selected device; as the cell's last expression this
# also displays the module tree.
model.to(device)

TransformerEncoder(
  (embedding): Embedding(28996, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mha): MultiHeadAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
      )
      (ann): Sequential(
        (0): Linear(in_features=64, out_features=256, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=256, out_features=64, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (drop_out): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (ln1): LayerNorm((64,),

## Training

In [25]:
def _run_epoch(model, criterion, loader, device, optimizer=None):
    """Make one pass over `loader` and return the sample-weighted mean loss.

    When `optimizer` is given, gradients are computed and one optimizer step
    is taken per batch; otherwise the pass runs with gradient tracking off.
    Returns NaN when `loader` yields no samples.
    """
    is_training = optimizer is not None
    total_loss = 0.0
    n_samples = 0

    with torch.set_grad_enabled(is_training):
        for batch in tqdm(loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            if is_training:
                optimizer.zero_grad()

            outputs = model(batch['input_ids'], batch['attention_mask'])
            loss = criterion(outputs, batch['labels'])

            if is_training:
                loss.backward()
                optimizer.step()

            # Assumes `criterion` returns the batch mean (the default reduction
            # of nn.CrossEntropyLoss): weight by batch size so the epoch
            # average stays exact when the last batch is smaller.
            batch_size = batch['input_ids'].size(0)
            total_loss += loss.item() * batch_size
            n_samples += batch_size

    return total_loss / n_samples if n_samples else float('nan')


def train(model, criterion, optimizer, train_loader, valid_loader, epochs, device=None):
    """Train `model` for `epochs` epochs, evaluating on `valid_loader` after each.

    Args:
        model: module called as ``model(input_ids, attention_mask)``.
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: optimizer over the model's parameters.
        train_loader: iterable of dict batches with the keys 'input_ids',
            'attention_mask' and 'labels'; every value must support ``.to(device)``.
        valid_loader: iterable of batches in the same format, used for evaluation only.
        epochs: number of passes over `train_loader`.
        device: where batches are moved before the forward pass. Defaults to
            the device of the model's first parameter (CPU if the model has
            none), so the function does not depend on a notebook-level global.

    Returns:
        ``(train_losses, test_losses)``: two arrays of length `epochs` holding
        the per-epoch mean training and validation loss. An epoch whose loader
        yields no samples is recorded as NaN.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    train_losses = np.zeros(epochs)
    test_losses = np.zeros(epochs)

    for epoch in range(epochs):
        t0 = datetime.now()

        # training
        model.train()
        train_loss = _run_epoch(model, criterion, train_loader, device, optimizer)

        # evaluating
        model.eval()
        test_loss = _run_epoch(model, criterion, valid_loader, device)

        # save loss
        train_losses[epoch] = train_loss
        test_losses[epoch] = test_loss

        dt = datetime.now() - t0
        print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}, Duration: {dt}')

    return train_losses, test_losses

In [27]:
# Cross-entropy loss on the model outputs, optimized with Adam at its library-default settings.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

In [28]:
# Fit for 10 epochs; the returned arrays hold the per-epoch mean training and
# validation loss.
train_losses, test_losses = train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    train_loader=train_loader,
    valid_loader=valid_loader,
    epochs=10,
)

  0%|          | 0/2105 [00:00<?, ?it/s]You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 2105/2105 [00:51<00:00, 40.62it/s]
100%|██████████| 28/28 [00:00<00:00, 111.33it/s]


Epoch 1/10, Train Loss: 0.5257, Test Loss: 0.4773, Duration: 0:00:52.094287


100%|██████████| 2105/2105 [00:47<00:00, 44.20it/s]
100%|██████████| 28/28 [00:00<00:00, 120.84it/s]


Epoch 2/10, Train Loss: 0.3518, Test Loss: 0.4625, Duration: 0:00:47.873967


100%|██████████| 2105/2105 [00:50<00:00, 42.03it/s]
100%|██████████| 28/28 [00:00<00:00, 123.53it/s]


Epoch 3/10, Train Loss: 0.2886, Test Loss: 0.4499, Duration: 0:00:50.327271


100%|██████████| 2105/2105 [00:46<00:00, 45.30it/s]
100%|██████████| 28/28 [00:00<00:00, 118.91it/s]


Epoch 4/10, Train Loss: 0.2513, Test Loss: 0.4629, Duration: 0:00:46.720209


100%|██████████| 2105/2105 [00:45<00:00, 45.99it/s]
100%|██████████| 28/28 [00:00<00:00, 119.87it/s]


Epoch 5/10, Train Loss: 0.2244, Test Loss: 0.5114, Duration: 0:00:46.021782


100%|██████████| 2105/2105 [00:46<00:00, 45.73it/s]
100%|██████████| 28/28 [00:00<00:00, 122.19it/s]


Epoch 6/10, Train Loss: 0.2043, Test Loss: 0.5441, Duration: 0:00:46.273956


100%|██████████| 2105/2105 [00:46<00:00, 45.51it/s]
100%|██████████| 28/28 [00:00<00:00, 120.27it/s]


Epoch 7/10, Train Loss: 0.1892, Test Loss: 0.5249, Duration: 0:00:46.505028


100%|██████████| 2105/2105 [00:45<00:00, 45.92it/s]
100%|██████████| 28/28 [00:00<00:00, 120.39it/s]


Epoch 8/10, Train Loss: 0.1754, Test Loss: 0.5149, Duration: 0:00:46.086642


100%|██████████| 2105/2105 [00:45<00:00, 45.99it/s]
100%|██████████| 28/28 [00:00<00:00, 109.31it/s]


Epoch 9/10, Train Loss: 0.1642, Test Loss: 0.5594, Duration: 0:00:46.051293


100%|██████████| 2105/2105 [00:48<00:00, 43.04it/s]
100%|██████████| 28/28 [00:00<00:00, 123.23it/s]

Epoch 10/10, Train Loss: 0.1559, Test Loss: 0.5698, Duration: 0:00:49.151043





In [29]:
# Accuracy

def compute_accuracy(model, loader, device):
    """Return the fraction of samples in `loader` that `model` classifies correctly.

    The model is switched to eval mode and the pass runs under
    ``torch.no_grad()`` so no autograd graph is built during inference.
    Batches are dicts with the keys 'input_ids', 'attention_mask' and
    'labels'; the predicted class is the index of the largest output along
    the last dimension. Returns NaN when `loader` yields no samples.
    """
    model.eval()
    n_correct = 0
    n_total = 0
    with torch.no_grad():
        for batch in tqdm(loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(batch['input_ids'], batch['attention_mask'])

            _, predictions = torch.max(outputs, -1)
            n_correct += (predictions == batch['labels']).sum().item()
            n_total += batch['labels'].size(0)

    return n_correct / n_total if n_total else float('nan')


train_accuracy = compute_accuracy(model, train_loader, device)
test_accuracy = compute_accuracy(model, valid_loader, device)
print(f'Train acc: {train_accuracy:.4f}, Test acc: {test_accuracy:.4f}')

100%|██████████| 2105/2105 [00:20<00:00, 100.51it/s]
100%|██████████| 28/28 [00:00<00:00, 113.87it/s]

Train acc: 0.9658, Test acc: 0.7959



