In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from datasets import load_dataset
import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hyperparameters and run-wide settings (everything a reader may want to tune lives here)
SEED = 42
BATCH_SIZE = 32
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
LEARNING_RATE = 0.001
EPOCHS = 3
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
PAD_TOKEN = "<PAD>"
UNK_TOKEN = "<UNK>"
TAG_PAD_IDX = -1  # Padding value for tag sequences; used as ignore_index by the loss function

# Seed PyTorch's global RNG so weight initialisation and DataLoader shuffling
# start from the same state on every Restart & Run All.
torch.manual_seed(SEED)

In [3]:
# 1. Load the data
# NOTE(review): trust_remote_code=True lets the dataset's loading script run
# locally - confirm the source is trusted before re-running.
dataset = load_dataset("conll2003", trust_remote_code=True)

label_names = dataset["train"].features["ner_tags"].feature.names
ner_feature = dataset["train"].features["ner_tags"]
print(f"Available tags: {label_names}")

# Build the vocabularies; the two special tokens take indices 0 and 1
word_to_ix = {PAD_TOKEN: 0, UNK_TOKEN: 1}

# Every distinct training token receives the next free index, in order of first appearance.
# NOTE(review): because all training tokens are in the vocabulary, the <UNK> index
# is never produced for the training split - its embedding is only hit at
# validation/inference time.
for sentence_tokens in dataset["train"]["tokens"]:
    for token in sentence_tokens:
        word_to_ix.setdefault(token, len(word_to_ix))

# Tag name <-> index lookups follow the order of label_names
tag_to_ix = {tag_name: position for position, tag_name in enumerate(label_names)}
ix_to_tag = {position: tag_name for tag_name, position in tag_to_ix.items()}

print(f"Vocab size: {len(word_to_ix)}")
print(f"Number of tags: {len(tag_to_ix)}")

Available tags: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
Vocab size: 23625
Number of tags: 9


In [4]:
class NERDataset(Dataset):
    """Map-style dataset that yields (word-index tensor, tag-index tensor) pairs.

    Each item corresponds to one sentence of the wrapped split.
    """

    def __init__(self, dataset_split, word_to_ix, tag_to_ix, label_names):
        """Store the split's columns together with the lookup tables.

        Args:
            dataset_split: Object indexable by "tokens" and "ner_tags".
            word_to_ix: Mapping from word to integer index.
            tag_to_ix: Mapping from tag name to integer index.
            label_names: Sequence translating a raw tag id into its tag name.
        """
        self.word_to_ix = word_to_ix
        self.tag_to_ix = tag_to_ix
        self.label_names = label_names
        self.sentences = dataset_split["tokens"]
        # The source data already stores the tags as integer ids
        self.ner_tags_ids = dataset_split["ner_tags"]

    def __len__(self):
        """Return the number of sentences in the split."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return the idx-th sentence as a pair of 1-D long tensors (words, tags)."""
        words = self.sentences[idx]
        raw_tag_ids = self.ner_tags_ids[idx]

        # Word -> index, falling back to the <UNK> index for unseen words
        word_ids = []
        for word in words:
            word_ids.append(self.word_to_ix.get(word, self.word_to_ix[UNK_TOKEN]))

        # Raw tag id -> tag name -> index in tag_to_ix
        tag_ids = []
        for raw_id in raw_tag_ids:
            tag_ids.append(self.tag_to_ix[self.label_names[raw_id]])

        word_tensor = torch.tensor(word_ids, dtype=torch.long)
        tag_tensor = torch.tensor(tag_ids, dtype=torch.long)
        return word_tensor, tag_tensor


def collate_fn(batch):
    """Combine (sentence, tags) tensor pairs into two padded batch tensors.

    Sentences are padded with the index of <PAD> and tag sequences with
    TAG_PAD_IDX; both results are laid out batch-first.
    """
    word_seqs, tag_seqs = map(list, zip(*batch))

    return (
        # Sentences: pad with the <PAD> word index
        pad_sequence(word_seqs, batch_first=True, padding_value=word_to_ix[PAD_TOKEN]),
        # Tags: pad with TAG_PAD_IDX
        pad_sequence(tag_seqs, batch_first=True, padding_value=TAG_PAD_IDX),
    )


# Wrap each split with the shared vocabulary / tag mapping (the test split is optional)
train_dataset, val_dataset, test_dataset = (
    NERDataset(dataset[split_name], word_to_ix, tag_to_ix, label_names)
    for split_name in ("train", "validation", "test")
)

# Only the training batches are shuffled; validation keeps the dataset order
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)

In [5]:
class SimpleRNNForTokenClassification(nn.Module):
    """Embedding -> single-layer LSTM -> linear layer producing per-token tag scores."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, padding_idx=None):
        """Build the three layers.

        Args:
            vocab_size: Number of rows in the embedding table.
            embedding_dim: Size of each word embedding.
            hidden_dim: Size of the LSTM hidden state.
            output_dim: Number of tag classes scored for every token.
            padding_idx: Word index treated as padding by the embedding layer.
                When omitted, the index of PAD_TOKEN in the notebook-level
                word_to_ix is used, which is what the original code always did.
        """
        super().__init__()

        # Only touch the notebook globals when the caller did not supply an index,
        # so the class can also be built against a different vocabulary.
        if padding_idx is None:
            padding_idx = word_to_ix[PAD_TOKEN]

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return tag scores of shape [batch_size, seq_len, output_dim] for index tensor x."""
        embedded = self.embedding(x)  # -> [batch_size, seq_len, embed_dim]
        outputs, _ = self.lstm(embedded)  # final hidden/cell states are not needed
        predictions = self.fc(outputs)  # -> [batch_size, seq_len, output_dim]
        return predictions


# Size the network from the vocabulary and the tag set, then move it to DEVICE
model = SimpleRNNForTokenClassification(len(word_to_ix), EMBEDDING_DIM, HIDDEN_DIM, len(tag_to_ix))
model = model.to(DEVICE)

print(model)

SimpleRNNForTokenClassification(
  (embedding): Embedding(23625, 100, padding_idx=0)
  (lstm): LSTM(100, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=9, bias=True)
)


In [6]:
def evaluate(model, dataloader, criterion):
    """Score ``model`` on ``dataloader`` without updating its weights.

    Args:
        model: Module mapping a batch of word indices to per-token tag scores
            whose last dimension is the number of tags.
        dataloader: Sized iterable yielding (sentences, tags) pairs of padded tensors.
        criterion: Loss applied to the flattened scores and tags. Its
            ``ignore_index`` (when present) also defines which tag positions
            count as padding for the accuracy.

    Returns:
        Tuple ``(val_loss, accuracy, val_time)``: the mean of the per-batch
        losses, the fraction of non-padding tokens predicted correctly, and the
        elapsed wall-clock seconds. Loss and accuracy are 0 when there is
        nothing to evaluate.
    """
    model.eval()

    # Follow the model's own device and the criterion's own padding index so the
    # function cannot drift out of sync with them; the notebook-level globals
    # are only a fallback.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else DEVICE
    pad_idx = criterion.ignore_index if hasattr(criterion, "ignore_index") else TAG_PAD_IDX

    correct_tokens = 0
    total_tokens = 0
    total_loss = 0.0

    start_time = time.perf_counter()
    with torch.no_grad():
        for sentences, tags in dataloader:
            sentences, tags = sentences.to(device), tags.to(device)

            outputs = model(sentences)
            # Flatten to [batch * seq_len, num_tags] vs [batch * seq_len]
            loss = criterion(outputs.reshape(-1, outputs.size(-1)), tags.reshape(-1))
            predicted_tags = torch.argmax(outputs, dim=-1)
            mask = (tags != pad_idx)

            # Count hits only where mask is True (real tokens, not padding)
            correct = (predicted_tags == tags) & mask

            correct_tokens += correct.sum().item()
            total_tokens += mask.sum().item()
            total_loss += loss.item()

    val_time = time.perf_counter() - start_time

    # An empty dataloader used to raise ZeroDivisionError here, while the
    # accuracy below was already guarded; both now degrade to 0.
    num_batches = len(dataloader)
    val_loss = total_loss / num_batches if num_batches > 0 else 0.0
    accuracy = correct_tokens / total_tokens if total_tokens > 0 else 0
    return val_loss, accuracy, val_time


# ignore_index=TAG_PAD_IDX (-1) makes the loss skip padded tag positions
criterion = nn.CrossEntropyLoss(ignore_index=TAG_PAD_IDX)
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)


def train_epoch(model, dataloader, optimizer, criterion):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: Module mapping a batch of word indices to per-token tag scores
            whose last dimension is the number of tags.
        dataloader: Sized iterable yielding (sentences, tags) pairs of padded tensors.
        optimizer: Optimizer stepping the model's parameters.
        criterion: Loss applied to the flattened scores and tags. Its
            ``ignore_index`` (when present) also defines which tag positions
            count as padding for the accuracy.

    Returns:
        Tuple ``(avg_loss, train_acc, train_time)``: the mean of the per-batch
        losses, the fraction of non-padding tokens predicted correctly during
        the pass, and the elapsed wall-clock seconds. Loss and accuracy are 0
        when the dataloader is empty.
    """
    model.train()

    # Follow the model's own device and the criterion's own padding index; the
    # notebook-level globals are only a fallback.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else DEVICE
    pad_idx = criterion.ignore_index if hasattr(criterion, "ignore_index") else TAG_PAD_IDX

    total_loss = 0.0
    correct_tokens = 0
    total_tokens = 0

    start_time = time.perf_counter()
    for sentences, tags in dataloader:
        sentences, tags = sentences.to(device), tags.to(device)
        optimizer.zero_grad()
        predictions = model(sentences)  # [batch, seq_len, num_tags]
        # Flatten to [batch * seq_len, num_tags] vs [batch * seq_len]
        loss = criterion(predictions.reshape(-1, predictions.size(-1)), tags.reshape(-1))

        loss.backward()
        optimizer.step()

        # Accuracy uses the scores computed before this step's weight update
        predicted_tags = torch.argmax(predictions, dim=-1)
        mask = (tags != pad_idx)
        correct = (predicted_tags == tags) & mask
        correct_tokens += correct.sum().item()

        total_tokens += mask.sum().item()
        total_loss += loss.item()

    train_time = time.perf_counter() - start_time

    # Guard both divisions the same way evaluate() guards its accuracy, so an
    # empty dataloader no longer raises ZeroDivisionError.
    num_batches = len(dataloader)
    avg_loss = total_loss / num_batches if num_batches > 0 else 0.0
    train_acc = correct_tokens / total_tokens if total_tokens > 0 else 0.0

    return avg_loss, train_acc, train_time


# Training loop: each epoch trains on the training set, then scores the validation set
for epoch in range(EPOCHS):
    start_time = time.perf_counter()
    avg_loss, train_acc, train_time = train_epoch(model, train_loader, optimizer, criterion)
    val_loss, avg_dev_acc, val_time = evaluate(model, val_loader, criterion)
    end_time = time.perf_counter()
    total_time = end_time - start_time

    # One report line per epoch; every field, including the last, is followed by ' | '
    report_fields = (
        f'Epoch: {epoch + 1:02}',
        f'Train Loss: {avg_loss:.3f}',
        f'Val Loss: {val_loss:.3f}',
        f'Train Acc: {train_acc:.3f}',
        f'Val Acc: {avg_dev_acc:.3f}',
        f'Training time: {train_time:.3f}',
        f'Validation time: {val_time:.3f}',
        f'Total time: {total_time:.3f}',
    )
    print(' | '.join(report_fields) + ' | ')

Epoch: 01 | Train Loss: 0.620 | Val Loss: 0.445 | Train Acc: 0.841 | Val Acc: 0.877 | Training time: 2.207 | Validation time: 0.151 | Total time: 2.358 | 
Epoch: 02 | Train Loss: 0.324 | Val Loss: 0.291 | Train Acc: 0.903 | Val Acc: 0.916 | Training time: 2.040 | Validation time: 0.144 | Total time: 2.184 | 
Epoch: 03 | Train Loss: 0.201 | Val Loss: 0.233 | Train Acc: 0.939 | Val Acc: 0.931 | Training time: 1.983 | Validation time: 0.142 | Total time: 2.125 | 


In [7]:
def predict_sentence(sentence_str):
    """Tag a whitespace-tokenised sentence with the trained model and print the result.

    The sentence is split on whitespace, each token is mapped to its vocabulary
    index (unknown tokens use the <UNK> index), and the highest-scoring tag per
    token is printed as a two-column table. Nothing is returned.

    Args:
        sentence_str: Raw sentence text. An empty or whitespace-only string
            prints just the table header.
    """
    model.eval()

    tokens = sentence_str.split()

    results = []
    # Skip the forward pass when there are no tokens: that would hand the
    # recurrent layer a zero-length sequence.
    if tokens:
        indices = [word_to_ix.get(w, word_to_ix[UNK_TOKEN]) for w in tokens]
        tensor_input = torch.tensor([indices], dtype=torch.long).to(DEVICE)  # Batch size = 1

        with torch.no_grad():
            output = model(tensor_input)

        # .tolist() yields plain Python ints, which index ix_to_tag directly
        # without a round-trip through NumPy.
        predicted_indices = torch.argmax(output, dim=-1).squeeze(0).tolist()
        results = [(token, ix_to_tag[idx]) for token, idx in zip(tokens, predicted_indices)]

    print(f"\nSentence: {sentence_str}")
    print(f"{'Token':<15} {'Predicted Label'}")
    print("-" * 30)
    for token, label in results:
        print(f"{token:<15} {label}")


# Smoke test on two hand-written sentences
sample_sentence = "VNU University of Science is located in Hanoi"
sample_sentence_2 = "WHO is an organization based in Geneva"

for demo_sentence in (sample_sentence, sample_sentence_2):
    predict_sentence(demo_sentence)


Sentence: VNU University of Science is located in Hanoi
Token           Predicted Label
------------------------------
VNU             B-ORG
University      I-ORG
of              I-ORG
Science         I-ORG
is              O
located         O
in              O
Hanoi           O

Sentence: WHO is an organization based in Geneva
Token           Predicted Label
------------------------------
WHO             O
is              O
an              O
organization    O
based           O
in              O
Geneva          B-LOC
