In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import math
from collections import Counter

# ==========================================
# 1. Cấu hình Hyperparameters
# ==========================================
Config = {
    'vocab_size': 10000,
    'd_model': 128,
    'n_head': 4,
    'n_layers': 3,
    'd_ff': 512,
    'dropout': 0.1,
    'batch_size': 32,
    'learning_rate': 0.001,
    'epochs': 10,
    'max_len': 100
}

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# ==========================================
# 2. Xử lý dữ liệu (Dataset & Tokenizer)
# ==========================================

class TextTokenizer:
    def __init__(self, vocab_size=10000):
        self.vocab_size = vocab_size
        self.word2idx = {"<PAD>": 0, "<UNK>": 1}
        self.idx2word = {0: "<PAD>", 1: "<UNK>"}

    def fit(self, texts):
        # Tách từ đơn giản bằng khoảng trắng
        all_words = []
        for text in texts:
            all_words.extend(str(text).lower().split())

        counts = Counter(all_words)
        # Lấy top từ phổ biến nhất
        common_words = counts.most_common(self.vocab_size - 2)

        for word, _ in common_words:
            if word not in self.word2idx:
                idx = len(self.word2idx)
                self.word2idx[word] = idx
                self.idx2word[idx] = word

    def encode(self, text, max_len=100):
        words = str(text).lower().split()
        indices = [self.word2idx.get(w, 1) for w in words] # 1 là <UNK>

        # Padding hoặc cắt ngắn (Truncate)
        if len(indices) < max_len:
            indices += [0] * (max_len - len(indices)) # 0 là <PAD>
        else:
            indices = indices[:max_len]
        return indices

import json
import pandas as pd
import torch
from torch.utils.data import Dataset

class ViOCDDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_len=100, is_train=False):
        # 1. Đọc file JSON
        with open(file_path, 'r', encoding='utf-8') as f:
            raw_data = json.load(f)

        # Chuyển đổi từ dict {"0": {...}, "1": {...}} sang list [{...}, {...}]
        data_list = list(raw_data.values())
        self.df = pd.DataFrame(data_list)

        self.text_col = 'review'
        self.label_col = 'domain'

        self.texts = self.df[self.text_col].values
        self.labels = self.df[self.label_col].values

        if is_train:
            tokenizer.fit(self.texts)

        self.tokenizer = tokenizer
        self.max_len = max_len

        unique_labels = sorted(list(set(self.labels)))
        self.label_map = {lbl: i for i, lbl in enumerate(unique_labels)}
        self.num_classes = len(unique_labels)

        print(f"Loaded {len(self.texts)} samples from {file_path}")
        print(f"Domains found: {self.label_map}")

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label_str = self.labels[idx]

        # Encode text thành các con số
        encoded_text = self.tokenizer.encode(text, self.max_len)

        # Chuyển label từ chữ (ví dụ 'mobile') sang số (ví dụ 0)
        label_idx = self.label_map[label_str]

        return torch.tensor(encoded_text, dtype=torch.long), torch.tensor(label_idx, dtype=torch.long)

# ==========================================
# 3. Xây dựng Mô hình (Transformer Encoder)
# ==========================================

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        # Tạo ma trận positional encoding
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(0) # [1, max_len, d_model]
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: [batch_size, seq_len, d_model]
        x = x + self.pe[:, :x.size(1), :]
        return x

class TransformerClassifier(nn.Module):
    def __init__(self, config, num_classes):
        super(TransformerClassifier, self).__init__()

        # 1. Embedding Layer
        self.embedding = nn.Embedding(config['vocab_size'], config['d_model'])

        # 2. Positional Encoding
        self.pos_encoder = PositionalEncoding(config['d_model'], config['max_len'])

        # 3. Transformer Encoder Layers (Stacked)
        # encoder_layer định nghĩa 1 block (MultiHeadAttn + FeedForward)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=config['d_model'],
            nhead=config['n_head'],
            dim_feedforward=config['d_ff'],
            dropout=config['dropout'],
            batch_first=True # Quan trọng: input shape là [batch, seq, feature]
        )

        # Stack 3 lớp encoder
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=config['n_layers'])

        # 4. Classification Head
        self.fc = nn.Linear(config['d_model'], num_classes)
        self.dropout = nn.Dropout(config['dropout'])

    def forward(self, x):
        # x shape: [batch_size, max_len]

        # Mask padding (để attention không chú ý vào token 0 <PAD>)
        # src_key_padding_mask: True ở vị trí padding
        src_key_padding_mask = (x == 0)

        # Embedding & Positional Encoding
        x = self.embedding(x) * math.sqrt(Config['d_model'])
        x = self.pos_encoder(x)

        # Qua Transformer Encoder
        # Output: [batch_size, max_len, d_model]
        x = self.transformer_encoder(x, src_key_padding_mask=src_key_padding_mask)

        # Pooling: Lấy vector trung bình của các từ (Global Average Pooling)
        x = x.mean(dim=1)

        x = self.dropout(x)
        logits = self.fc(x)
        return logits

# ==========================================
# 4. Huấn luyện và Đánh giá
# ==========================================

def train_model():
    # Load data
    tokenizer = TextTokenizer(vocab_size=Config['vocab_size'])

    print("Loading Train set...")
    train_dataset = ViOCDDataset('train.json', tokenizer, Config['max_len'], is_train=True)
    print("Loading Dev set...")
    dev_dataset = ViOCDDataset('dev.json', tokenizer, Config['max_len'])
    print("Loading Test set...")
    test_dataset = ViOCDDataset('test.json', tokenizer, Config['max_len'])

    train_loader = DataLoader(train_dataset, batch_size=Config['batch_size'], shuffle=True)
    dev_loader = DataLoader(dev_dataset, batch_size=Config['batch_size'])
    test_loader = DataLoader(test_dataset, batch_size=Config['batch_size'])

    print(f"Number of classes: {train_dataset.num_classes}")

    # Init model
    model = TransformerClassifier(Config, num_classes=train_dataset.num_classes).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=Config['learning_rate'])

    # Training Loop
    for epoch in range(Config['epochs']):
        model.train()
        total_loss = 0
        for texts, labels in train_loader:
            texts, labels = texts.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(texts)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Validation
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for texts, labels in dev_loader:
                texts, labels = texts.to(device), labels.to(device)
                outputs = model(texts)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        print(f"Epoch {epoch+1}/{Config['epochs']}, Loss: {total_loss/len(train_loader):.4f}, Dev Acc: {100 * correct / total:.2f}%")

    # Testing
    print("\nEvaluating on Test set...")
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for texts, labels in test_loader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print(f"Test Accuracy: {100 * correct / total:.2f}%")

if __name__ == "__main__":
    print("Go!")
    train_model()

Using device: cpu
Go!
Loading Train set...
Loaded 4387 samples from train.json
Domains found: {'app': 0, 'cosmetic': 1, 'fashion': 2, 'mobile': 3}
Loading Dev set...
Loaded 548 samples from dev.json
Domains found: {'app': 0, 'cosmetic': 1, 'fashion': 2, 'mobile': 3}
Loading Test set...
Loaded 549 samples from test.json
Domains found: {'app': 0, 'cosmetic': 1, 'fashion': 2, 'mobile': 3}
Number of classes: 4


  output = torch._nested_tensor_from_mask(


Epoch 1/10, Loss: 0.6890, Dev Acc: 70.26%
Epoch 2/10, Loss: 0.3770, Dev Acc: 81.57%
Epoch 3/10, Loss: 0.2957, Dev Acc: 79.74%
Epoch 4/10, Loss: 0.2611, Dev Acc: 80.47%
Epoch 5/10, Loss: 0.2338, Dev Acc: 80.47%
Epoch 6/10, Loss: 0.2537, Dev Acc: 83.21%
Epoch 7/10, Loss: 0.1838, Dev Acc: 82.66%
Epoch 8/10, Loss: 0.1792, Dev Acc: 81.02%
Epoch 9/10, Loss: 0.1627, Dev Acc: 85.58%
Epoch 10/10, Loss: 0.1440, Dev Acc: 86.31%

Evaluating on Test set...
Test Accuracy: 86.34%


In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import math
import json
from collections import Counter

# ==========================================
# 1. Cấu hình Hyperparameters
# ==========================================
Config = {
    'vocab_size': 10000,
    'd_model': 128,
    'n_head': 4,
    'n_layers': 3,
    'd_ff': 512,
    'dropout': 0.1,
    'batch_size': 32,
    'learning_rate': 0.001,
    'epochs': 10,
    'max_len': 100,
    'pad_idx': 0,
    'unk_idx': 1,
    'ignore_index': -100
}

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# ==========================================
# 2. Xử lý dữ liệu (Vocab & Dataset)
# ==========================================

class Vocab:
    def __init__(self, vocab_size=10000):
        self.vocab_size = vocab_size
        self.word2idx = {"<PAD>": Config['pad_idx'], "<UNK>": Config['unk_idx']}
        self.idx2word = {Config['pad_idx']: "<PAD>", Config['unk_idx']: "<UNK>"}
        self.tag2idx = {}
        self.idx2tag = {}

    def build_vocab(self, words_list, tags_list):
        # 1. Build Word Vocab
        all_words = [w.lower() for sent in words_list for w in sent]
        counts = Counter(all_words)
        common_words = counts.most_common(self.vocab_size - 2)

        for word, _ in common_words:
            if word not in self.word2idx:
                idx = len(self.word2idx)
                self.word2idx[word] = idx
                self.idx2word[idx] = word

        # 2. Build Tag Vocab
        # Thêm padding tag để map đồng bộ, dù loss function sẽ ignore nó
        all_tags = sorted(list(set([t for sent in tags_list for t in sent])))
        self.tag2idx = {tag: i for i, tag in enumerate(all_tags)}
        self.idx2tag = {i: tag for tag, i in self.tag2idx.items()}

        print(f"Vocab size: {len(self.word2idx)}")
        print(f"Num tags: {len(self.tag2idx)}")
        print(f"Tags map: {self.tag2idx}")

    def encode_text(self, sent_list, max_len=100):
        # Input là list các từ: ["Tôi", "đi", "học"]
        indices = [self.word2idx.get(w.lower(), Config['unk_idx']) for w in sent_list]

        if len(indices) < max_len:
            indices += [Config['pad_idx']] * (max_len - len(indices))
        else:
            indices = indices[:max_len]
        return indices

    def encode_tags(self, tag_list, max_len=100):
        # Input là list các nhãn: ["O", "O", "O"]
        indices = [self.tag2idx.get(t, 0) for t in tag_list]

        # Padding cho nhãn: Dùng ignore_index (-100) để model không tính loss vào đây
        if len(indices) < max_len:
            indices += [Config['ignore_index']] * (max_len - len(indices))
        else:
            indices = indices[:max_len]
        return indices

class PhoNERDataset(Dataset):
    def __init__(self, file_path, vocab, max_len=100, is_train=False):
        self.sentences = []
        self.tags = []

        print(f"Processing {file_path}...")

        try:
            # Cấu trúc: {"words": [[s1], [s2]], "tags": [[t1], [t2]]}
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
                self.sentences = data['words']
                self.tags = data['tags']

        except json.JSONDecodeError:
            # Xử lý trường hợp file là dạng JSON Lines hoặc bị nối đuôi nhau
            print(f"-> Detected JSON Lines format or multiple objects in {file_path}. Switching parse mode.")
            with open(file_path, 'r', encoding='utf-8') as f:
                # Đọc từng dòng
                for line in f:
                    line = line.strip()
                    if not line: continue
                    try:
                        obj = json.loads(line)
                        # Kiểm tra xem dòng này chứa 1 list các câu hay chỉ 1 câu
                        if 'words' in obj and 'tags' in obj:
                            # Nếu words là list lồng nhau [[...]], dùng extend
                            if len(obj['words']) > 0 and isinstance(obj['words'][0], list):
                                self.sentences.extend(obj['words'])
                                self.tags.extend(obj['tags'])
                            else:
                                # Nếu words là list đơn [...], dùng append
                                self.sentences.append(obj['words'])
                                self.tags.append(obj['tags'])
                    except json.JSONDecodeError:
                        continue

        # Validation: Kiểm tra xem dữ liệu có khớp không
        assert len(self.sentences) == len(self.tags), "Mismatch between sentences and tags length!"

        self.vocab = vocab
        self.max_len = max_len

        # Nếu là train set thì build vocab
        if is_train:
            self.vocab.build_vocab(self.sentences, self.tags)

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sent = self.sentences[idx]
        tag = self.tags[idx]

        encoded_sent = self.vocab.encode_text(sent, self.max_len)
        encoded_tag = self.vocab.encode_tags(tag, self.max_len)

        return torch.tensor(encoded_sent, dtype=torch.long), torch.tensor(encoded_tag, dtype=torch.long)

# ==========================================
# 3. Xây dựng Mô hình (Token Classification)
# ==========================================

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1), :]
        return x

class TransformerTagger(nn.Module):
    def __init__(self, config, num_tags):
        super(TransformerTagger, self).__init__()

        self.embedding = nn.Embedding(len(config['vocab'].word2idx), config['d_model'], padding_idx=config['pad_idx'])
        self.pos_encoder = PositionalEncoding(config['d_model'], config['max_len'])

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=config['d_model'],
            nhead=config['n_head'],
            dim_feedforward=config['d_ff'],
            dropout=config['dropout'],
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=config['n_layers'])

        # Output layer: Project từ d_model ra số lượng tags
        # Shape output sẽ là [batch, seq_len, num_tags]
        self.fc = nn.Linear(config['d_model'], num_tags)
        self.dropout = nn.Dropout(config['dropout'])

    def forward(self, x):
        # x: [batch, seq_len]

        # Tạo mask để Transformer không attention vào padding
        src_key_padding_mask = (x == Config['pad_idx'])

        x = self.embedding(x) * math.sqrt(Config['d_model'])
        x = self.pos_encoder(x)

        # Output Transformer: [batch, seq_len, d_model]
        x = self.transformer_encoder(x, src_key_padding_mask=src_key_padding_mask)

        x = self.dropout(x)

        # Ta cần dự đoán cho từng vị trí token.
        logits = self.fc(x) # [batch, seq_len, num_tags]

        return logits

# ==========================================
# 4. Huấn luyện và Đánh giá
# ==========================================

def calculate_accuracy(preds, y, ignore_index):
    # Flatten: [batch * seq_len]
    preds = preds.view(-1)
    y = y.view(-1)

    # Chỉ tính accuracy trên các token thật (bỏ qua padding -100)
    mask = (y != ignore_index)
    correct = (preds[mask] == y[mask]).sum()
    total = mask.sum()

    return correct.item(), total.item()

def train_model():
    vocab = Vocab(vocab_size=Config['vocab_size'])
    Config['vocab'] = vocab # Lưu vocab vào config để model dùng embedding size chuẩn

    print("--- LOADING DATA ---")
    train_dataset = PhoNERDataset('train_word.json', vocab, Config['max_len'], is_train=True)

    # Dev/Test dùng chung vocab của Train
    dev_dataset = PhoNERDataset('dev_word.json', vocab, Config['max_len'])
    test_dataset = PhoNERDataset('test_word.json', vocab, Config['max_len'])

    train_loader = DataLoader(train_dataset, batch_size=Config['batch_size'], shuffle=True)
    dev_loader = DataLoader(dev_dataset, batch_size=Config['batch_size'])
    test_loader = DataLoader(test_dataset, batch_size=Config['batch_size'])

    num_tags = len(vocab.tag2idx)

    # Init Model
    model = TransformerTagger(Config, num_tags=num_tags).to(device)

    # Loss function: ignore_index=-100 để bỏ qua padding khi tính loss
    criterion = nn.CrossEntropyLoss(ignore_index=Config['ignore_index'])
    optimizer = optim.Adam(model.parameters(), lr=Config['learning_rate'])

    print("--- START TRAINING ---")
    for epoch in range(Config['epochs']):
        model.train()
        total_loss = 0
        total_correct = 0
        total_tokens = 0

        for texts, tags in train_loader:
            texts, tags = texts.to(device), tags.to(device)

            optimizer.zero_grad()
            outputs = model(texts) # [batch, seq_len, num_tags]

            # Reshape để tính loss
            # outputs: [batch * seq_len, num_tags]
            # tags: [batch * seq_len]
            loss = criterion(outputs.view(-1, num_tags), tags.view(-1))

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            # Calc Accuracy
            _, predicted = torch.max(outputs, 2)
            c, t = calculate_accuracy(predicted, tags, Config['ignore_index'])
            total_correct += c
            total_tokens += t

        train_acc = 100 * total_correct / total_tokens

        # Validation
        model.eval()
        val_correct = 0
        val_tokens = 0
        with torch.no_grad():
            for texts, tags in dev_loader:
                texts, tags = texts.to(device), tags.to(device)
                outputs = model(texts)
                _, predicted = torch.max(outputs, 2)

                c, t = calculate_accuracy(predicted, tags, Config['ignore_index'])
                val_correct += c
                val_tokens += t

        val_acc = 100 * val_correct / val_tokens
        print(f"Epoch {epoch+1}/{Config['epochs']} | Loss: {total_loss/len(train_loader):.4f} | Train Acc: {train_acc:.2f}% | Dev Acc: {val_acc:.2f}%")

    # Testing
    print("\n--- EVALUATING ON TEST SET ---")
    model.eval()
    test_correct = 0
    test_tokens = 0
    with torch.no_grad():
        for texts, tags in test_loader:
            texts, tags = texts.to(device), tags.to(device)
            outputs = model(texts)
            _, predicted = torch.max(outputs, 2)
            c, t = calculate_accuracy(predicted, tags, Config['ignore_index'])
            test_correct += c
            test_tokens += t

    print(f"Test Accuracy: {100 * test_correct / test_tokens:.2f}%")

if __name__ == "__main__":
    train_model()

Using device: cpu
--- LOADING DATA ---
Processing train_word.json...
-> Detected JSON Lines format or multiple objects in train_word.json. Switching parse mode.
Vocab size: 4741
Num tags: 20
Tags map: {'B-AGE': 0, 'B-DATE': 1, 'B-GENDER': 2, 'B-JOB': 3, 'B-LOCATION': 4, 'B-NAME': 5, 'B-ORGANIZATION': 6, 'B-PATIENT_ID': 7, 'B-SYMPTOM_AND_DISEASE': 8, 'B-TRANSPORTATION': 9, 'I-AGE': 10, 'I-DATE': 11, 'I-JOB': 12, 'I-LOCATION': 13, 'I-NAME': 14, 'I-ORGANIZATION': 15, 'I-PATIENT_ID': 16, 'I-SYMPTOM_AND_DISEASE': 17, 'I-TRANSPORTATION': 18, 'O': 19}
Processing dev_word.json...
-> Detected JSON Lines format or multiple objects in dev_word.json. Switching parse mode.
Processing test_word.json...
-> Detected JSON Lines format or multiple objects in test_word.json. Switching parse mode.
--- START TRAINING ---
Epoch 1/10 | Loss: 0.6772 | Train Acc: 83.10% | Dev Acc: 84.92%
Epoch 2/10 | Loss: 0.3235 | Train Acc: 89.91% | Dev Acc: 88.00%
Epoch 3/10 | Loss: 0.2415 | Train Acc: 92.28% | Dev Acc: 89.