## **0. Install Dependencies**

In [1]:
!pip install tiktoken



## **1. Import Libraries & Device Configuration**

In [2]:
import os
import math
import tiktoken
import pandas as pd
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [3]:
# Device Configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## **2. Dataset & Dataloader Preparation**

In [4]:
# Download dataset
!gdown 1WTjIveEsM7XpN28xm6F1qgX57QxaoFI_
!gdown 1WbyeG8f-V7VmpKdQam-0tNg4x6XDWoML

Downloading...
From: https://drive.google.com/uc?id=1WTjIveEsM7XpN28xm6F1qgX57QxaoFI_
To: /content/train_set.csv
100% 524k/524k [00:00<00:00, 37.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=1WbyeG8f-V7VmpKdQam-0tNg4x6XDWoML
To: /content/validation_set.csv
100% 112k/112k [00:00<00:00, 60.9MB/s]


In [5]:
# Read dataset
TRAIN_PATH = '/content/train_set.csv'
VAL_PATH = '/content/validation_set.csv'
train_df = pd.read_csv(TRAIN_PATH)
val_df = pd.read_csv(VAL_PATH)

In [6]:
print("Training Set:")
train_df.head()

Training Set:


Unnamed: 0,_id,label,title,label_numeric
0,66b5aabf8a38820e82e0b6ce,Xu hướng,"100+ STT Né thính, Cap né thính hài hước, NÉT ...",7
1,66b5a9838a38820e82e0b64d,Xu hướng,"Top 111+ stt cuộc sống an nhiên, bình dị tự tạ...",7
2,66b5cb358a38820e82e0c408,Xu hướng,"Top hạt giống hoa dễ trồng, nở quanh năm cho n...",7
3,66b5c7548a38820e82e0c271,Dinh dưỡng,Chi tiết 3 cách nấu rau bò khai đơn giản mà th...,1
4,66b5c7a78a38820e82e0c294,Nhà,Top 10 quạt cây hơi nước được ưa chuộng nhất h...,4


In [7]:
print("Validation Set:")
val_df.head()

Validation Set:


Unnamed: 0,_id,label,title,label_numeric
0,66b5a0f18a38820e82e0b2cb,Công Nghệ,Tổng hợp Code Hiền Nhân Thuật mới nhất 08/2024,0
1,66b5cdc28a38820e82e0c518,Làm Đẹp,"Cách chăm sóc da nhờn ở nam giới, top sản phẩm...",3
2,66b5a0ad8a38820e82e0b2ae,Làm Đẹp,Review Top 4 kem chống nắng Goodal bán chạy nh...,3
3,66b5db8b8a38820e82e0caad,Công Nghệ,Review điện thoại Xiaomi Redmi K40 thực tế sau...,0
4,66b5b77b8a38820e82e0bbfb,Khuyến mãi,Giao hàng hỏa tốc Lazada là gì? Cách chọn giao...,2


In [8]:
# Define custom dataset class to tokenize titles and return (input_id, label) pairs
class TextDataset(Dataset):
  def __init__(self, dataframe, tokenizer):
    self.titles = dataframe['title'].str.lower().values
    self.labels = dataframe['label_numeric'].values
    self.tokenizer = tokenizer

  def __len__(self):
    return len(self.titles)

  def __getitem__(self, idx):
    title = self.titles[idx]
    label = self.labels[idx]
    encoding = self.tokenizer.encode(title)
    input_id = torch.tensor(encoding, dtype=torch.long)
    return input_id, label

In [9]:
# Define collate function to pad sequences to a uniform length for batching
def collate_fn(batch):
  input_ids = [item[0] for item in batch]
  labels = [item[1] for item in batch]
  max_length = max(len(input_id) for input_id in input_ids)
  input_ids = torch.stack([torch.cat([input_id, torch.zeros(max_length - len(input_id), dtype=torch.long)]) for input_id in input_ids])
  labels = torch.tensor(labels, dtype=torch.long)
  return input_ids, labels

In [10]:
# Set up tokenizer
tokenizer = tiktoken.get_encoding('gpt2')

In [11]:
# Create datasets
train_dataset = TextDataset(
    dataframe=train_df,
    tokenizer=tokenizer
)
val_dataset = TextDataset(
    dataframe=val_df,
    tokenizer=tokenizer
)

In [12]:
len(train_dataset)

4543

In [13]:
len(val_dataset)

974

In [14]:
train_dataset[0]

(tensor([ 3064,    10,   336,    83,   299,  2634,   294, 39588,    71,    11,
          1451,   299,  2634,   294, 39588,    71,   289, 24247,    72,   289,
           130,   108,   157,   119,   249,    66,    11,   299, 25125,   269,
           128,   225,   782,    11,   269,   157,   119,   109,    66, 23370,
           157,   118,   100,    84]),
 np.int64(7))

In [15]:
val_dataset[0]

(tensor([   83,   157,   119,   243,   782,   289,   157,   119,    96,    79,
          2438, 23105,   157,   119,   223,    77,   299,    71, 22940,    77,
           294,    84,   157,   118,   255,    83,   285,   157,   119,   249,
            72,   299,    71,   157,   118,    98,    83,  8487,    14,  1238,
          1731]),
 np.int64(0))

In [16]:
# Create dataloaders
train_dataloader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn
)
val_dataloader = DataLoader(
    val_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=collate_fn
)

## **3. Build Model**

In [17]:
class PositionalEncoding(nn.Module):
  def __init__(self, d_model, max_len=5000):
    super(PositionalEncoding, self).__init__()

    # Create position indices as a column vector
    positions = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1) # (max_len, 1)

    div_term = 1 / (10000.0 ** (torch.arange(0, d_model, 2).float() / d_model)) # (d_model/2,)
    # # Alternative method: avoid explicitly raising a base to a power by using exp for better numerical stability
    # div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

    # Initialize positional encoding tensor filled with 0
    pe_matrix = torch.zeros(max_len, d_model)  # (max_len, d_model)

    # Apply sine to even-indexed columns and cosine to odd-indexed columns using broadcasting between position and div_term
    pe_matrix[:, 0::2] = torch.sin(positions * div_term)
    pe_matrix[:, 1::2] = torch.cos(positions * div_term)

    # Add a batch dimension to match the input tensor for broadcasting
    pe_matrix = pe_matrix.unsqueeze(0)  # (1, max_len, d_model)

    # Register positional encoding as a buffer so it's not a learnable parameter
    self.register_buffer('pe_matrix', pe_matrix)


  def forward(self, x):
    # Add positional encoding matching input's sequence length (batch_size, seq_len, d_model) for broadcasting
    x = x + self.pe_matrix[:, :x.size(1), :] # (batch_size, seq_len, d_model)
    return x

In [18]:
class TransformerEncoderLayer(nn.Module):
  def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
    super(TransformerEncoderLayer, self).__init__()
    self.d_model = d_model
    self.num_heads = num_heads
    self.depth = d_model // num_heads

    # Linear layers for Q, K, V matrices
    self.wq = nn.Linear(d_model, d_model) # (batch_size, seq_len, d_model) -> (batch_size, seq_len, d_model)
    self.wk = nn.Linear(d_model, d_model) # (batch_size, seq_len, d_model) -> (batch_size, seq_len, d_model)
    self.wv = nn.Linear(d_model, d_model) # (batch_size, seq_len, d_model) -> (batch_size, seq_len, d_model)

    # Output linear transformation
    self.dense = nn.Linear(d_model, d_model)  # (batch_size, seq_len, d_model) -> (batch_size, seq_len, d_model)

    # Feed-forward network
    self.feed_forward = nn.Sequential(
        nn.Linear(d_model, d_ff), # (batch_size, seq_len, d_model) -> (batch_size, seq_len, d_ff)
        nn.ReLU(),
        nn.Linear(d_ff, d_model)  # (batch_size, seq_len, d_ff) -> (batch_size, seq_len, d_model)
    )

    # Layer normalization and dropout
    self.layernorm1 = nn.LayerNorm(d_model) # (batch_size, seq_len, d_model) -> (batch_size, seq_len, d_model)
    self.layernorm2 = nn.LayerNorm(d_model) # (batch_size, seq_len, d_model) -> (batch_size, seq_len, d_model)
    self.dropout = nn.Dropout(dropout)


  def split_heads(self, x, batch_size):
    # x: (batch_size, seq_len, d_model)

    # Reshape x to (batch_size, seq_len, num_heads, depth)
    x = x.view(batch_size, -1, self.num_heads, self.depth)

    # Transpose x to (batch_size, num_heads, seq_len, depth)
    return x.transpose(1, 2)


  def scaled_dot_product_attention(self, q, k, v, mask=None):
    # q: (batch_size, num_heads, seq_len_q, depth)
    # k: (batch_size, num_heads, seq_len_k, depth)
    # v: (batch_size, num_heads, seq_len_v, depth) (where seq_len_v = seq_len_k)
    # d_model = num_heads x depth

    k_T = k.transpose(-1, -2) # (batch_size, num_heads, depth, seq_len_k)
    scores = torch.matmul(q, k_T) # (batch_size, num_heads, seq_len_q, seq_len_k)

    dk = k.size(-1)
    scale = torch.sqrt(torch.tensor(dk, dtype=torch.float32))

    scaled_scores = scores / scale  # (batch_size, num_heads, seq_len_q, seq_len_k)

    if mask is not None:
      scaled_scores = scaled_scores.masked_fill(mask == 0, -1e9)

    attention_weights = torch.nn.functional.softmax(scaled_scores, dim=-1)  # (batch_size, num_heads, seq_len_q, seq_len_k)

    output = torch.matmul(attention_weights, v) # (batch_size, num_heads, seq_len_q, depth)

    return output, attention_weights


  def forward(self, x, mask=None):
    # x: (batch_size, seq_len, d_model)

    batch_size = x.size(0)

    # Apply linear layers and split into heads
    q = self.split_heads(self.wq(x), batch_size)  # (batch_size, num_heads, seq_len, depth)
    k = self.split_heads(self.wk(x), batch_size)  # (batch_size, num_heads, seq_len, depth)
    v = self.split_heads(self.wv(x), batch_size)  # (batch_size, num_heads, seq_len, depth)

    # Apply the custom scaled dot-product attention
    scaled_attention, _ = self.scaled_dot_product_attention(q, k, v, mask)  # (batch_size, num_heads, seq_len, depth)

    # Transpose and reshape back
    scaled_attention = scaled_attention.transpose(1, 2).contiguous()  # (batch_size, seq_len, num_heads, depth)
    concat_attention = scaled_attention.view(batch_size, -1, self.d_model)  # (batch_size, seq_len, d_model)

    # Apply the final linear layer to combine the heads
    attn_output = self.dense(concat_attention)  # (batch_size, seq_len, d_model)

    # Add & Norm
    x = self.layernorm1(x + self.dropout(attn_output))  # (batch_size, seq_len, d_model)

    # Feed-forward
    ff_output = self.feed_forward(x)  # (batch_size, seq_len, d_model)

    # Add & Norm
    x = self.layernorm2(x + self.dropout(ff_output))  # (batch_size, seq_len, d_model)

    return x

In [19]:
class TransformerModel(nn.Module):
  def __init__(self, vocab_size, embed_size, d_model, num_heads, d_ff, output_size, num_layers, dropout=0.1):
    super(TransformerModel, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embed_size)
    self.positional_encoding = PositionalEncoding(d_model)

    self.encoder_layers = nn.ModuleList([
        TransformerEncoderLayer(d_model, num_heads, d_ff, dropout)
        for _ in range(num_layers)
    ])

    self.fc = nn.Linear(d_model, output_size)
    self.dropout = nn.Dropout(dropout)


  def forward(self, x, mask=None):
    x = self.embedding(x) # (batch_size, seq_len, embed_size)
    x = self.positional_encoding(x) # (batch_size, seq_len, d_model)

    for layer in self.encoder_layers:
      x = layer(x, mask)  # (batch_size, seq_len, d_model)

    x = x.mean(dim=1)  # (batch_size, d_model)
    x = self.fc(self.dropout(x))  # (batch_size, output_size)
    return x

In [20]:
# Initialize the Transformer model
vocab_size = tokenizer.n_vocab
embed_size = 256
d_model = 256
num_heads = 8
d_ff = 512
output_size = len(train_df['label_numeric'].unique())
num_layers = 4
dropout = 0.1

model = TransformerModel(vocab_size, embed_size, d_model, num_heads, d_ff, output_size, num_layers, dropout)

In [21]:
model = model.to(device)

## **4. Train & Evaluate Model**

In [22]:
model_dir = 'models'
os.makedirs(model_dir, exist_ok=True)

In [23]:
criterion = nn.CrossEntropyLoss()

In [24]:
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [25]:
num_epochs = 20
best_accuracy = 0.0


for epoch in range(1, num_epochs + 1):
  # Train model
  model.train()

  for input_ids, labels in tqdm(train_dataloader, total=len(train_dataloader)):
    input_ids = input_ids.to(device)
    labels = labels.to(device)

    outputs = model(input_ids)
    loss = criterion(outputs, labels)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  print(f"Epoch {epoch}/{num_epochs}, Loss: {loss.item()}")


  # Evaluate model
  model.eval()
  correct = 0
  total = 0
  with torch.no_grad():
    for input_ids, labels in tqdm(val_dataloader, total=len(val_dataloader)):
      input_ids = input_ids.to(device)
      labels = labels.to(device)

      outputs = model(input_ids)
      predicted = torch.argmax(outputs, dim=1)

      total += labels.size(0)
      correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total

    # Save the model if it achieves a new best validation accuracy
    if accuracy > best_accuracy:
      best_accuracy = accuracy
      torch.save(model.state_dict(), os.path.join('models', 'best_model.pth'))
      print(f"Saved best model at epoch {epoch} - Validation Accuracy: {accuracy:.2f}%")

100%|██████████| 142/142 [02:06<00:00,  1.12it/s]


Epoch 1/20, Loss: 1.5751787424087524


100%|██████████| 31/31 [00:07<00:00,  4.26it/s]


Saved best model at epoch 1 - Validation Accuracy: 64.58%


100%|██████████| 142/142 [02:06<00:00,  1.13it/s]


Epoch 2/20, Loss: 1.4378870725631714


100%|██████████| 31/31 [00:06<00:00,  4.82it/s]


Saved best model at epoch 2 - Validation Accuracy: 65.81%


100%|██████████| 142/142 [02:08<00:00,  1.11it/s]


Epoch 3/20, Loss: 1.192897081375122


100%|██████████| 31/31 [00:07<00:00,  4.19it/s]


Saved best model at epoch 3 - Validation Accuracy: 71.97%


100%|██████████| 142/142 [02:19<00:00,  1.02it/s]


Epoch 4/20, Loss: 0.8467778563499451


100%|██████████| 31/31 [00:07<00:00,  4.09it/s]


Saved best model at epoch 4 - Validation Accuracy: 73.31%


100%|██████████| 142/142 [02:21<00:00,  1.00it/s]


Epoch 5/20, Loss: 0.9631362557411194


100%|██████████| 31/31 [00:06<00:00,  4.53it/s]
100%|██████████| 142/142 [02:26<00:00,  1.03s/it]


Epoch 6/20, Loss: 0.6691944599151611


100%|██████████| 31/31 [00:07<00:00,  4.29it/s]


Saved best model at epoch 6 - Validation Accuracy: 76.08%


100%|██████████| 142/142 [02:34<00:00,  1.09s/it]


Epoch 7/20, Loss: 0.989790678024292


100%|██████████| 31/31 [00:07<00:00,  4.22it/s]
100%|██████████| 142/142 [03:03<00:00,  1.29s/it]


Epoch 8/20, Loss: 0.9719287753105164


100%|██████████| 31/31 [00:08<00:00,  3.69it/s]
100%|██████████| 142/142 [03:11<00:00,  1.35s/it]


Epoch 9/20, Loss: 0.8582125306129456


100%|██████████| 31/31 [00:08<00:00,  3.56it/s]
100%|██████████| 142/142 [03:34<00:00,  1.51s/it]


Epoch 10/20, Loss: 0.591472327709198


100%|██████████| 31/31 [00:07<00:00,  3.93it/s]
100%|██████████| 142/142 [03:51<00:00,  1.63s/it]


Epoch 11/20, Loss: 0.8468060493469238


100%|██████████| 31/31 [00:08<00:00,  3.73it/s]
100%|██████████| 142/142 [03:49<00:00,  1.62s/it]


Epoch 12/20, Loss: 0.7094195485115051


100%|██████████| 31/31 [00:08<00:00,  3.77it/s]
100%|██████████| 142/142 [03:50<00:00,  1.63s/it]


Epoch 13/20, Loss: 0.7934747934341431


100%|██████████| 31/31 [00:08<00:00,  3.52it/s]
100%|██████████| 142/142 [04:03<00:00,  1.72s/it]


Epoch 14/20, Loss: 1.0133702754974365


100%|██████████| 31/31 [00:09<00:00,  3.44it/s]
100%|██████████| 142/142 [04:13<00:00,  1.78s/it]


Epoch 15/20, Loss: 0.5131596922874451


100%|██████████| 31/31 [00:07<00:00,  3.93it/s]
100%|██████████| 142/142 [04:16<00:00,  1.80s/it]


Epoch 16/20, Loss: 0.7966384291648865


100%|██████████| 31/31 [00:08<00:00,  3.51it/s]
100%|██████████| 142/142 [04:25<00:00,  1.87s/it]


Epoch 17/20, Loss: 0.6404994130134583


100%|██████████| 31/31 [00:08<00:00,  3.77it/s]
100%|██████████| 142/142 [04:07<00:00,  1.74s/it]


Epoch 18/20, Loss: 0.7462441325187683


100%|██████████| 31/31 [00:08<00:00,  3.53it/s]
100%|██████████| 142/142 [04:04<00:00,  1.72s/it]


Epoch 19/20, Loss: 0.6306554675102234


100%|██████████| 31/31 [00:08<00:00,  3.57it/s]
100%|██████████| 142/142 [04:03<00:00,  1.72s/it]


Epoch 20/20, Loss: 0.45637303590774536


100%|██████████| 31/31 [00:07<00:00,  3.96it/s]
