In [None]:
!pip install transformers datasets seqeval accelerate torch


In [2]:
import os
import torch
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
import json

In [3]:
class Config:
    # Model configuration
    MODEL_NAME = "demdecuong/vihealthbert-base-word"  # VihealthBERT model

    # Data paths
    TRAIN_FILE = "/content/drive/MyDrive/data_ner/train.txt"
    DEV_FILE = "/content/drive/MyDrive/data_ner/dev.txt"
    TEST_FILE = "/content/drive/MyDrive/data_ner/test.txt"

    # Training parameters
    OUTPUT_DIR = "./vihealth_ner_model"
    LEARNING_RATE = 2e-5
    BATCH_SIZE = 12
    NUM_EPOCHS = 5
    WEIGHT_DECAY = 0.01
    WARMUP_RATIO = 0.1
    MAX_LENGTH = 256

    # Other settings
    SEED = 42
    SAVE_STEPS = 500
    EVAL_STEPS = 500
    LOGGING_STEPS = 100

config = Config()

# Set random seed for reproducibility
torch.manual_seed(config.SEED)
np.random.seed(config.SEED)

In [4]:
def read_conll_file(file_path):
    """
    Read CoNLL format file and return sentences and labels
    Expected format:
    Token Label
    Token Label
    (blank line separates sentences)
    """
    sentences = []
    labels = []

    current_tokens = []
    current_labels = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()

            if line == "" or line.startswith("-DOCSTART-"):
                if current_tokens:
                    sentences.append(current_tokens)
                    labels.append(current_labels)
                    current_tokens = []
                    current_labels = []
            else:
                parts = line.split()
                if len(parts) >= 2:
                    token = parts[0]
                    label = parts[-1]  # Last column is label
                    current_tokens.append(token)
                    current_labels.append(label)

        # Add last sentence if exists
        if current_tokens:
            sentences.append(current_tokens)
            labels.append(current_labels)

    return sentences, labels

print("Loading CoNLL datasets...")
train_tokens, train_labels = read_conll_file(config.TRAIN_FILE)
dev_tokens, dev_labels = read_conll_file(config.DEV_FILE)
test_tokens, test_labels = read_conll_file(config.TEST_FILE)

print(f"Train samples: {len(train_tokens)}")
print(f"Dev samples: {len(dev_tokens)}")
print(f"Test samples: {len(test_tokens)}")

Loading CoNLL datasets...
Train samples: 6199
Dev samples: 774
Test samples: 776


In [5]:
all_labels = set()
for labels in train_labels + dev_labels + test_labels:
    all_labels.update(labels)

label_list = sorted(list(all_labels))
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

num_labels = len(label_list)
print(f"\nNumber of labels: {num_labels}")
print(f"Labels: {label_list}")


Number of labels: 11
Labels: ['B-bien_phap_chan_doan', 'B-bien_phap_dieu_tri', 'B-nguyen_nhan_benh', 'B-ten_benh', 'B-trieu_chung_benh', 'I-bien_phap_chan_doan', 'I-bien_phap_dieu_tri', 'I-nguyen_nhan_benh', 'I-ten_benh', 'I-trieu_chung_benh', 'O']


In [6]:
def create_dataset(tokens, labels):
    """Convert tokens and labels to dataset format"""
    return Dataset.from_dict({
        'tokens': tokens,
        'ner_tags': [[label2id[label] for label in sent_labels] for sent_labels in labels]
    })

train_dataset = create_dataset(train_tokens, train_labels)
dev_dataset = create_dataset(dev_tokens, dev_labels)
test_dataset = create_dataset(test_tokens, test_labels)

dataset = DatasetDict({
    'train': train_dataset,
    'validation': dev_dataset,
    'test': test_dataset
})

print("\nDataset structure:")
print(dataset)
print("\nSample from training set:")
print(dataset['train'][0])



Dataset structure:
DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 6199
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 774
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 776
    })
})

Sample from training set:
{'tokens': ['các', 'biện', 'pháp', 'điều', 'trị', 'bệnh', 'ung', 'thư', 'đại', 'tràng', 'phương', 'pháp', 'điều', 'trị', 'được', 'quyết', 'định', 'dựa', 'trên', 'giai', 'đoạn', 'của', 'ung', 'thư', 'đại', 'tràng', 'giai', 'đoạn', 'i', 'đến', 'iiia', ':', 'thông', 'thường', 'có', 'thể', 'được', 'điều', 'trị', 'bằng', 'phẫu', 'thuật', 'cắt', 'bỏ', 'khối', 'u', 'giai', 'đoạn', 'iiib', 'hoặc', 'iiic', ':', 'hóa', 'trị', 'kèm', 'theo', 'phẫu', 'thuật', 'để', 'ngăn', 'ngừa', 'các', 'tế', 'bào', 'ung', 'thư', 'tấn', 'công', 'các', 'cơ', 'quan', 'khác', 'của', 'cơ', 'thể', 'giai', 'đoạn', 'iv', ':', 'hóa', 'trị', 'là', 'phương', 'pháp', 'hiệu', 'quả', 'để

In [7]:
print("\nLoading tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(
    config.MODEL_NAME,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)



Loading tokenizer and model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/817 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

bpe.codes: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at demdecuong/vihealthbert-base-word and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def tokenize_and_align_labels(examples):
    """
    Tokenize inputs and align labels with tokens
    Compatible with non-fast tokenizers (VihealthBERT)
    """
    labels = []
    input_ids_list = []
    attention_mask_list = []

    for tokens, ner_tags in zip(examples['tokens'], examples['ner_tags']):
        # Tokenize each word individually
        tokenized_words = []
        label_ids = []

        # Add CLS token
        tokenized_words.append(tokenizer.cls_token_id)
        label_ids.append(-100)

        # Process each word
        for word, label in zip(tokens, ner_tags):
            # Tokenize the word
            word_tokens = tokenizer.encode(word, add_special_tokens=False)

            # Truncate if needed
            if len(tokenized_words) + len(word_tokens) + 1 > config.MAX_LENGTH:
                break

            # Add tokens
            tokenized_words.extend(word_tokens)

            # Assign label to first subword, -100 to others
            label_ids.append(label)
            label_ids.extend([-100] * (len(word_tokens) - 1))

        # Add SEP token
        tokenized_words.append(tokenizer.sep_token_id)
        label_ids.append(-100)

        # Create attention mask
        attention_mask = [1] * len(tokenized_words)

        input_ids_list.append(tokenized_words)
        attention_mask_list.append(attention_mask)
        labels.append(label_ids)

    return {
        'input_ids': input_ids_list,
        'attention_mask': attention_mask_list,
        'labels': labels
    }

print("\nTokenizing datasets...")
tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset['train'].column_names
)


Tokenizing datasets...


model.safetensors:   0%|          | 0.00/540M [00:00<?, ?B/s]

Map:   0%|          | 0/6199 [00:00<?, ? examples/s]

Map:   0%|          | 0/774 [00:00<?, ? examples/s]

Map:   0%|          | 0/776 [00:00<?, ? examples/s]

In [9]:
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    padding=True
)

In [10]:
def compute_metrics(eval_pred):
    """Compute precision, recall, and F1 score using seqeval"""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=2)

    true_labels = []
    true_predictions = []

    for prediction, label in zip(predictions, labels):
        true_label = []
        true_prediction = []

        for pred_id, label_id in zip(prediction, label):
            if label_id != -100:
                true_label.append(id2label[label_id])
                true_prediction.append(id2label[pred_id])

        true_labels.append(true_label)
        true_predictions.append(true_prediction)

    precision = precision_score(true_labels, true_predictions)
    recall = recall_score(true_labels, true_predictions)
    f1 = f1_score(true_labels, true_predictions)

    return {
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [11]:
import transformers
print(transformers.__version__)

4.57.1


In [12]:
training_args = TrainingArguments(
    output_dir=config.OUTPUT_DIR,
    eval_strategy="steps",  # Changed from evaluation_strategy
    eval_steps=config.EVAL_STEPS,
    save_strategy="steps",
    save_steps=config.SAVE_STEPS,
    learning_rate=config.LEARNING_RATE,
    per_device_train_batch_size=config.BATCH_SIZE,
    per_device_eval_batch_size=config.BATCH_SIZE,
    num_train_epochs=config.NUM_EPOCHS,
    weight_decay=config.WEIGHT_DECAY,
    warmup_ratio=config.WARMUP_RATIO,
    logging_steps=config.LOGGING_STEPS,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    push_to_hub=False,
    seed=config.SEED,
    save_total_limit=2,
)


In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [15]:
print("\n" + "="*50)
print("STARTING TRAINING")
print("="*50 + "\n")

train_result = trainer.train()

print("\n" + "="*50)
print("TRAINING COMPLETED")
print("="*50 + "\n")

# Save training metrics
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)


STARTING TRAINING



Step,Training Loss,Validation Loss,Precision,Recall,F1
500,0.2398,0.369955,0.551813,0.726608,0.627261
1000,0.2051,0.352535,0.604368,0.701267,0.649222
1500,0.1779,0.355268,0.61022,0.704191,0.653846
2000,0.1508,0.368844,0.590381,0.717836,0.6479
2500,0.1242,0.381566,0.620704,0.712963,0.663643



TRAINING COMPLETED

***** train metrics *****
  epoch                    =        5.0
  total_flos               =  1433136GF
  train_loss               =     0.1843
  train_runtime            = 0:12:23.92
  train_samples_per_second =     41.664
  train_steps_per_second   =      3.475


In [16]:
print("\n" + "="*50)
print("EVALUATING ON VALIDATION SET")
print("="*50 + "\n")

val_metrics = trainer.evaluate(eval_dataset=tokenized_datasets['validation'])
print("Validation Metrics:")
for key, value in val_metrics.items():
    print(f"{key}: {value:.4f}")

trainer.log_metrics("eval", val_metrics)
trainer.save_metrics("eval", val_metrics)


EVALUATING ON VALIDATION SET



Validation Metrics:
eval_loss: 0.3816
eval_precision: 0.6207
eval_recall: 0.7130
eval_f1: 0.6636
eval_runtime: 4.7639
eval_samples_per_second: 162.4710
eval_steps_per_second: 13.6440
epoch: 5.0000
***** eval metrics *****
  epoch                   =        5.0
  eval_f1                 =     0.6636
  eval_loss               =     0.3816
  eval_precision          =     0.6207
  eval_recall             =      0.713
  eval_runtime            = 0:00:04.76
  eval_samples_per_second =    162.471
  eval_steps_per_second   =     13.644


In [17]:
print("\n" + "="*50)
print("EVALUATING ON TEST SET")
print("="*50 + "\n")

test_metrics = trainer.evaluate(eval_dataset=tokenized_datasets['test'])
print("Test Metrics:")
for key, value in test_metrics.items():
    print(f"{key}: {value:.4f}")

trainer.log_metrics("test", test_metrics)
trainer.save_metrics("test", test_metrics)



EVALUATING ON TEST SET



Test Metrics:
eval_loss: 0.4361
eval_precision: 0.6333
eval_recall: 0.6880
eval_f1: 0.6595
eval_runtime: 4.3210
eval_samples_per_second: 179.5890
eval_steps_per_second: 15.0430
epoch: 5.0000
***** test metrics *****
  epoch                   =        5.0
  eval_f1                 =     0.6595
  eval_loss               =     0.4361
  eval_precision          =     0.6333
  eval_recall             =      0.688
  eval_runtime            = 0:00:04.32
  eval_samples_per_second =    179.589
  eval_steps_per_second   =     15.043


In [18]:
print("\n" + "="*50)
print("DETAILED CLASSIFICATION REPORT (TEST SET)")
print("="*50 + "\n")

predictions = trainer.predict(tokenized_datasets['test'])
preds = np.argmax(predictions.predictions, axis=2)

true_labels = []
true_predictions = []

for prediction, label in zip(preds, predictions.label_ids):
    true_label = []
    true_prediction = []

    for pred_id, label_id in zip(prediction, label):
        if label_id != -100:
            true_label.append(id2label[label_id])
            true_prediction.append(id2label[pred_id])

    true_labels.append(true_label)
    true_predictions.append(true_prediction)

print(classification_report(true_labels, true_predictions))


DETAILED CLASSIFICATION REPORT (TEST SET)



                     precision    recall  f1-score   support

bien_phap_chan_doan       0.54      0.64      0.59       130
 bien_phap_dieu_tri       0.50      0.54      0.52       239
   nguyen_nhan_benh       0.27      0.31      0.29       157
           ten_benh       0.78      0.83      0.80       922
   trieu_chung_benh       0.56      0.62      0.59       382

          micro avg       0.63      0.69      0.66      1830
          macro avg       0.53      0.59      0.56      1830
       weighted avg       0.64      0.69      0.66      1830



In [19]:
print("\n" + "="*50)
print("SAVING MODEL")
print("="*50 + "\n")

trainer.save_model(config.OUTPUT_DIR)
tokenizer.save_pretrained(config.OUTPUT_DIR)

# Save label mapping
with open(os.path.join(config.OUTPUT_DIR, 'label_mapping.json'), 'w') as f:
    json.dump({'label2id': label2id, 'id2label': id2label}, f, ensure_ascii=False, indent=2)

print(f"Model saved to: {config.OUTPUT_DIR}")


SAVING MODEL

Model saved to: ./vihealth_ner_model


In [22]:
print("\n" + "="*50)
print("INFERENCE EXAMPLE")
print("="*50 + "\n")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

def predict_ner(text):
    """Predict NER tags for input text"""
    # Tokenize và đưa input lên cùng device
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=config.MAX_LENGTH,
        is_split_into_words=False
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}  # 🔥 dòng quan trọng

    # Dự đoán
    with torch.no_grad():
        outputs = model(**inputs)

    predictions = torch.argmax(outputs.logits, dim=2)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    results = []
    for token, pred in zip(tokens, predictions[0]):
        if token not in ['[CLS]', '[SEP]', '[PAD]']:
            results.append((token, id2label[pred.item()]))

    return results

# Test
test_text = "Bệnh nhân bị đau đầu và sốt cao"
predictions = predict_ner(test_text)

print(f"Input: {test_text}")
print("\nPredictions:")
for token, label in predictions:
    print(f"{token:20} -> {label}")

print("\n" + "="*50)
print("TRAINING PIPELINE COMPLETED!")
print("="*50)



INFERENCE EXAMPLE

Input: Bệnh nhân bị đau đầu và sốt cao

Predictions:
<s>                  -> O
Bệnh                 -> O
nhân                 -> O
bị                   -> O
đau                  -> B-trieu_chung_benh
đầu                  -> I-trieu_chung_benh
và                   -> O
sốt                  -> B-trieu_chung_benh
cao                  -> I-trieu_chung_benh
</s>                 -> O

TRAINING PIPELINE COMPLETED!
