# Train PhoBERT for Vietnamese COVID-19 NER on Google Colab

This notebook is designed to run the training process for the Vietnamese COVID-19 NER task on Google Colab. It combines the configuration, dataset, and training scripts into a single file.

## 1. Setup Environment

First, install the required Python libraries.

In [None]:
!pip install transformers seqeval pandas torch

## 2. Mount Google Drive and Prepare Data

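Mount your Google Drive to access the dataset. You will need to upload the dataset folder to your Google Drive first; with the default configuration below, the files are expected under `MyDrive/raw/PhoNER_COVID19/` (`train_word.json`, `dev_word.json`, `test_word.json`).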

In [None]:
# Mount Google Drive at /content/drive so that the /content/drive/MyDrive/...
# paths used by the configuration cell resolve. Only available on Colab.
from google.colab import drive
drive.mount('/content/drive')

## 3. Configuration

This cell contains all the configurations and hyperparameters for the project.

In [None]:
import os

# --- 1. Paths ---
# IMPORTANT: Update this path to point to your project folder on Google Drive
BASE_PROJECT_DIR = '/content/drive/MyDrive/'
DATA_DIR = os.path.join(BASE_PROJECT_DIR, 'raw/PhoNER_COVID19/')

# One JSON-lines file per split (read with `lines=True` by NerDataset).
TRAIN_FILE = os.path.join(DATA_DIR, 'train_word.json')
DEV_FILE = os.path.join(DATA_DIR, 'dev_word.json')
TEST_FILE = os.path.join(DATA_DIR, 'test_word.json')

# Directory where trained models are saved
MODEL_OUTPUT_DIR = os.path.join(BASE_PROJECT_DIR, 'models/phobert-ner-covid')


# --- 2. Model Configuration ---
PRE_TRAINED_MODEL_NAME = "vinai/phobert-base"


# --- 3. Training Hyperparameters ---
MAX_LEN = 256            # total sequence length, including the CLS/SEP tokens
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 5
LEARNING_RATE = 3e-5
RANDOM_SEED = 42


# --- 4. Tag Configuration ---
# The position of a tag in this list is its integer class id, so the order
# must stay stable for any checkpoint trained with it.
# NOTE(review): GENDER only has a B- tag here; confirm the data never contains
# 'I-GENDER', because NerDataset maps unknown tags to 'O' without warning.
UNIQUE_TAGS = [
    'O',
    'B-AGE', 'I-AGE',
    'B-DATE', 'I-DATE',
    'B-GENDER',
    'B-JOB', 'I-JOB',
    'B-LOCATION', 'I-LOCATION',
    'B-NAME', 'I-NAME',
    'B-ORGANIZATION', 'I-ORGANIZATION',
    'B-PATIENT_ID', 'I-PATIENT_ID',
    'B-SYMPTOM_AND_DISEASE', 'I-SYMPTOM_AND_DISEASE',
    'B-TRANSPORTATION', 'I-TRANSPORTATION',
]

# Forward (tag -> id) and reverse (id -> tag) lookups derived from the list above.
TAGS_TO_IDS = dict(zip(UNIQUE_TAGS, range(len(UNIQUE_TAGS))))
IDS_TO_TAGS = dict(enumerate(UNIQUE_TAGS))

# Label id for positions that carry no tag of their own. -100 is the default
# `ignore_index` of torch.nn.CrossEntropyLoss, and evaluate() skips positions
# labelled with it.
SUBWORD_TAG_ID = -100

## 4. Dataset Class

This cell defines the `NerDataset` class, which is responsible for loading, preprocessing, and preparing the data for training and evaluation.

In [None]:
import torch
import pandas as pd

class NerDataset(torch.utils.data.Dataset):
    """Token-classification dataset built from a JSON-lines file.

    Each record of the file must provide a ``words`` list and a parallel
    ``tags`` list. Every word is tokenized on its own; the first sub-token of
    a word carries the word's tag id, while every other position (remaining
    sub-tokens, CLS/SEP, padding) is labelled ``-100``.

    Args:
        file_path: Path of the JSON-lines file to load.
        tokenizer: Object exposing ``tokenize``, ``convert_tokens_to_ids`` and
            the ``cls_token_id`` / ``sep_token_id`` / ``pad_token_id`` attributes.
        max_len: Fixed length of every returned sequence (special tokens included).
        tags_to_ids: Mapping from tag string to class id; must contain ``'O'``,
            which is used as the fallback for tags missing from the mapping.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """

    def __init__(self, file_path, tokenizer, max_len, tags_to_ids):
        self.file_path = file_path
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.tags_to_ids = tags_to_ids
        self.subword_tag_id = -100
        self.sentences, self.tags = self._read_data()

    def _read_data(self):
        """Load the file and return ``(sentences, tags)`` as two parallel lists."""
        if os.path.exists(self.file_path):
            frame = pd.read_json(self.file_path, lines=True, encoding='utf-8')
            return frame['words'].tolist(), frame['tags'].tolist()
        raise FileNotFoundError(f"File not found at: {self.file_path}")

    def __len__(self):
        """Number of sentences in the file."""
        return len(self.sentences)

    def __getitem__(self, index):
        """Encode sentence ``index`` into fixed-length tensors.

        Returns:
            dict with ``input_ids``, ``attention_mask`` and ``labels``, each a
            ``torch.long`` tensor of length ``max_len``.
        """
        ignore_id = self.subword_tag_id
        sentence_tags = self.tags[index]

        piece_ids = []
        piece_labels = []
        for position, word in enumerate(self.sentences[index]):
            pieces = self.tokenizer.tokenize(word)
            if not pieces:
                # A word that yields no sub-tokens is dropped together with its tag.
                continue
            piece_ids += self.tokenizer.convert_tokens_to_ids(pieces)
            head_label = self.tags_to_ids.get(sentence_tags[position], self.tags_to_ids['O'])
            # Only the first sub-token is labelled; the rest are ignored.
            piece_labels += [head_label] + [ignore_id] * (len(pieces) - 1)

        # Leave room for CLS and SEP. Slicing is a no-op when already short enough.
        room = self.max_len - 2
        piece_ids = piece_ids[:room]
        piece_labels = piece_labels[:room]

        token_ids = [self.tokenizer.cls_token_id, *piece_ids, self.tokenizer.sep_token_id]
        label_ids = [ignore_id, *piece_labels, ignore_id]
        real_length = len(token_ids)

        # Right-pad everything to max_len; padded positions are masked out and ignored.
        pad_count = self.max_len - real_length
        token_ids = token_ids + [self.tokenizer.pad_token_id] * pad_count
        mask = [1] * real_length + [0] * pad_count
        label_ids = label_ids + [ignore_id] * pad_count

        return {
            "input_ids": torch.tensor(token_ids, dtype=torch.long),
            "attention_mask": torch.tensor(mask, dtype=torch.long),
            "labels": torch.tensor(label_ids, dtype=torch.long),
        }

## 5. Training and Evaluation Functions

This cell contains the core logic for training and evaluating the NER model.

In [None]:
import numpy as np
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForTokenClassification, get_linear_schedule_with_warmup
from tqdm import tqdm
from seqeval.metrics import f1_score, precision_score, recall_score

def set_seed(seed_value):
    """Seed every random number generator used by the notebook.

    Seeds Python's built-in ``random`` module, NumPy's global generator and
    PyTorch (CPU, plus all CUDA devices when CUDA is available).

    Args:
        seed_value: Integer seed applied to all generators.
    """
    # Imported locally so the function does not depend on the cell's import list.
    import random

    # The built-in generator was previously left unseeded, so any code drawing
    # from `random` would differ between runs.
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed_value)

def train_one_epoch(model, dataloader, optimizer, scheduler, device):
    """Run one full pass over ``dataloader`` and update ``model`` in place.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``labels`` keyword arguments; its output must expose ``.loss``.
        dataloader: Sized iterable of batches (dicts holding those three tensors).
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        Mean of the per-batch loss values over the epoch.
    """
    model.train()
    running_loss = 0
    batch_keys = ('input_ids', 'attention_mask', 'labels')

    for batch in tqdm(dataloader, desc="Training"):
        inputs = {key: batch[key].to(device) for key in batch_keys}

        optimizer.zero_grad()
        batch_loss = model(**inputs).loss
        running_loss += batch_loss.item()

        batch_loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    return running_loss / len(dataloader)

def evaluate(model, dataloader, device, ids_to_tags):
    """Score ``model`` on ``dataloader`` without updating it.

    Positions whose gold label equals ``SUBWORD_TAG_ID`` are dropped from both
    the predictions and the gold labels before the metrics are computed.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss`` and ``.logits``.
        dataloader: Sized iterable of batches (dicts holding those three tensors).
        device: Device the batch tensors are moved to.
        ids_to_tags: Lookup from class id to tag string.

    Returns:
        Tuple ``(avg_loss, f1, precision, recall)``.
    """
    model.eval()
    loss_sum = 0
    predicted_sequences = []
    gold_sequences = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            gold = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=gold
            )
            loss_sum += outputs.loss.item()

            # Highest-scoring class per position, as NumPy arrays for filtering.
            pred_matrix = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
            gold_matrix = gold.cpu().numpy()

            for pred_row, gold_row in zip(pred_matrix, gold_matrix):
                # Keep only positions that carry a real tag.
                keep = gold_row != SUBWORD_TAG_ID
                predicted_sequences.append([ids_to_tags[p] for p in pred_row[keep]])
                gold_sequences.append([ids_to_tags[g] for g in gold_row[keep]])

    avg_loss = loss_sum / len(dataloader)
    f1 = f1_score(gold_sequences, predicted_sequences)
    precision = precision_score(gold_sequences, predicted_sequences)
    recall = recall_score(gold_sequences, predicted_sequences)

    return avg_loss, f1, precision, recall

## 6. Run Training

This is the main function to run the entire training process.

In [None]:
def run_training():
    """Fine-tune the pre-trained model and keep the best checkpoint.

    Steps:
      1. Seed the RNGs and pick the device (CUDA when available, else CPU).
      2. Load the tokenizer and a token-classification model sized to UNIQUE_TAGS.
      3. Build the train/dev datasets and dataloaders.
      4. Train for EPOCHS epochs with AdamW and a linear schedule without
         warmup, evaluating on the dev set after every epoch.
      5. Save model and tokenizer to MODEL_OUTPUT_DIR whenever the dev F1
         improves on the best value seen so far.

    All settings come from the module-level configuration constants.
    Returns nothing; progress is reported via print.
    """
    set_seed(RANDOM_SEED)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    os.makedirs(MODEL_OUTPUT_DIR, exist_ok=True)

    tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME, use_fast=True)
    model = AutoModelForTokenClassification.from_pretrained(
        PRE_TRAINED_MODEL_NAME,
        num_labels=len(UNIQUE_TAGS),
        id2label=IDS_TO_TAGS,
        label2id=TAGS_TO_IDS
    )
    model.to(device)

    train_dataset = NerDataset(
        file_path=TRAIN_FILE,
        tokenizer=tokenizer,
        max_len=MAX_LEN,
        tags_to_ids=TAGS_TO_IDS
    )
    dev_dataset = NerDataset(
        file_path=DEV_FILE,
        tokenizer=tokenizer,
        max_len=MAX_LEN,
        tags_to_ids=TAGS_TO_IDS
    )

    train_dataloader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
    dev_dataloader = DataLoader(dev_dataset, batch_size=VALID_BATCH_SIZE)

    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    # The scheduler is stepped once per batch, so the schedule spans every batch of every epoch.
    num_training_steps = len(train_dataloader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps
    )

    best_f1 = 0
    # Tracks whether any checkpoint was written, so the final summary does not
    # claim a model was saved when validation F1 never rose above 0.
    checkpoint_saved = False
    for epoch in range(EPOCHS):
        print(f"\n--- Epoch {epoch + 1}/{EPOCHS} ---")
        train_loss = train_one_epoch(model, train_dataloader, optimizer, scheduler, device)
        print(f"Train Loss: {train_loss:.4f}")
        val_loss, val_f1, val_precision, val_recall = evaluate(model, dev_dataloader, device, IDS_TO_TAGS)
        print(f"Validation Loss: {val_loss:.4f}")
        print(f"Validation F1: {val_f1:.4f} | Precision: {val_precision:.4f} | Recall: {val_recall:.4f}")

        if val_f1 > best_f1:
            best_f1 = val_f1
            print(f"New best F1 score: {best_f1:.4f}. Saving model...")
            model.save_pretrained(MODEL_OUTPUT_DIR)
            tokenizer.save_pretrained(MODEL_OUTPUT_DIR)
            checkpoint_saved = True

    print("\nTraining finished!")
    print(f"Best F1 score on validation set: {best_f1:.4f}")
    if checkpoint_saved:
        print(f"Model saved to {MODEL_OUTPUT_DIR}")
    else:
        print("No model was saved: validation F1 never exceeded 0.")

# Start training with the configuration constants defined earlier in the notebook.
run_training()