## Library & Label

In [1]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=9b6bd13fd1aa7f4b67ac8f0625b3cd5c64eaa944be678b19cf9261472770909f
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [2]:
import ast
import copy
import json
import logging
import os

import pandas as pd
import torch
import wandb
from datasets import Dataset
from huggingface_hub import login
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
from torch.utils.data import Dataset as TorchDataset
from transformers import (
    EvalPrediction,
    RobertaForTokenClassification,
    RobertaTokenizerFast,
    Trainer,
    TrainingArguments,
)

# Emit INFO-level (and above) log records so the progress messages logged by
# the helpers below are visible in the notebook output.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# BIO tag inventory for the address entities. A tag's position in this list is
# the integer class id used when encoding labels and decoding predictions.
label_list = [
    'B-DISTRICT', 'B-PROVINCE', 'B-STREET', 'B-WARD',
    'I-DISTRICT', 'I-PROVINCE', 'I-STREET', 'I-WARD',
    'O',
]
label_map = dict(zip(label_list, range(len(label_list))))

2025-04-25 07:15:57.584988: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745565357.780521      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745565357.836864      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Data Preprocessing

In [3]:
def prepare_dataset(df):
    """Turn the stringified ``tokens`` / ``labels`` columns into space-joined strings.

    Every cell of both columns must contain the text of a Python list literal
    of strings (for example ``"['Hà', 'Nội']"``). The cell is parsed with
    ``ast.literal_eval`` -- which only accepts literals and therefore cannot
    execute code embedded in the CSV -- and its items are joined with a
    single space.

    Args:
        df: DataFrame with string columns ``tokens`` and ``labels``.

    Returns:
        A new DataFrame with the two columns replaced by the joined strings.
        The input frame is not modified, so re-running the calling cell on the
        same frame is safe.

    Raises:
        ValueError, SyntaxError: if a cell is not a valid Python literal.
        TypeError: if a parsed cell is not an iterable of strings.
    """
    def _join_literal(cell):
        # literal_eval instead of eval: the CSV content is data, not code.
        return ' '.join(ast.literal_eval(cell))

    return df.assign(
        tokens=df['tokens'].apply(_join_literal),
        labels=df['labels'].apply(_join_literal),
    )

def process_string_to_array(dataset):
    """Split the space-joined ``tokens`` and ``labels`` strings back into lists.

    Args:
        dataset: Mapping-like object (dict or DataFrame) whose ``tokens`` and
            ``labels`` entries iterate over whitespace-separated strings.

    Returns:
        dict with keys ``tokens`` and ``labels``; each value is a list holding
        one list of substrings per input string.
    """
    split_columns = {}
    for column in ('tokens', 'labels'):
        split_columns[column] = [joined.split() for joined in dataset[column]]
    return split_columns

def load_and_process_data(train_path, dev_path, test_path):
    """Read the three CSV splits and return them as Hugging Face datasets.

    Each file is read as UTF-8, passed through ``prepare_dataset`` and then
    ``process_string_to_array``, and the resulting column dict is wrapped
    with ``Dataset.from_dict``.

    Args:
        train_path: Path of the training CSV.
        dev_path: Path of the validation CSV.
        test_path: Path of the test CSV.

    Returns:
        Tuple ``(train, dev, test)`` of ``datasets.Dataset`` objects.
    """
    csv_paths = (train_path, dev_path, test_path)

    # Run each stage over all three splits before moving on to the next stage.
    raw_frames = [pd.read_csv(csv_path, encoding='utf-8') for csv_path in csv_paths]
    joined_frames = [prepare_dataset(frame) for frame in raw_frames]
    column_dicts = [process_string_to_array(frame) for frame in joined_frames]

    return tuple(Dataset.from_dict(columns) for columns in column_dicts)

# Example class for dataset

In [4]:
class Example:
    """A single labelled sentence.

    Attributes are stored exactly as passed in:
        words: the sentence's words.
        slot_labels: the labels paired with ``words``.
        guid: optional identifier for the example (defaults to ``None``).
    """

    def __init__(self, words, slot_labels, guid=None):
        self.words, self.slot_labels, self.guid = words, slot_labels, guid

def convert_to_examples(dataset):
    """Build one ``Example`` per row of ``dataset``.

    Args:
        dataset: Mapping-like object whose ``tokens`` and ``labels`` entries
            are parallel sequences (one item per sentence).

    Returns:
        list of ``Example`` objects; ``guid`` is the row's 0-based position.
    """
    examples = []
    sentence_pairs = zip(dataset['tokens'], dataset['labels'])
    for position, (words, slot_labels) in enumerate(sentence_pairs):
        examples.append(Example(words=words, slot_labels=slot_labels, guid=position))
    return examples

# Feature conversion

In [5]:
class InputFeatures:
    """Container for the four per-example sequences fed to the model.

    Holds ``input_ids``, ``attention_mask``, ``token_type_ids`` and
    ``slot_labels_ids`` exactly as given to the constructor.
    """

    def __init__(self, input_ids, attention_mask, token_type_ids, slot_labels_ids):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.token_type_ids = token_type_ids
        self.slot_labels_ids = slot_labels_ids

    def to_dict(self):
        """Return a deep copy of the instance attributes as a dict."""
        return copy.deepcopy(vars(self))

    def to_json_string(self):
        """Return the attributes as indented, key-sorted JSON ending in a newline."""
        serialized = json.dumps(self.to_dict(), sort_keys=True, indent=2)
        return serialized + "\n"

# Convert Example objects into fixed-length model features
def convert_examples_to_features(
    examples, max_seq_len, tokenizer, pad_label_id=-100,
    cls_token_segment_id=0, pad_token_segment_id=0, sequence_segment_id=0, mask_padding_with_zero=True
):
    """Tokenize examples into fixed-length ``InputFeatures``.

    Each word is split with ``tokenizer.tokenize``; a word that yields no
    sub-tokens is represented by the tokenizer's unknown token. Only the first
    sub-token of a word carries the word's label id (looked up in the
    module-level ``label_map``); the remaining sub-tokens, the CLS/SEP
    positions and the padding all receive ``pad_label_id``.

    Args:
        examples: Sequence of objects exposing ``words`` and ``slot_labels``.
        max_seq_len: Target length of every output sequence, including the
            two special tokens. Longer inputs are truncated.
        tokenizer: Tokenizer providing ``cls_token``, ``sep_token``,
            ``unk_token``, ``pad_token_id``, ``tokenize`` and
            ``convert_tokens_to_ids``.
        pad_label_id: Label id used for positions that carry no label.
        cls_token_segment_id: Segment id of the CLS position.
        pad_token_segment_id: Segment id of padding positions.
        sequence_segment_id: Segment id of the sentence tokens and SEP.
        mask_padding_with_zero: When true, real tokens get mask value 1 and
            padding gets 0; when false the two values are swapped.

    Returns:
        list of ``InputFeatures``, one per example, in input order.

    Raises:
        KeyError: if a label is not present in ``label_map``.
    """
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    unk_token = tokenizer.unk_token
    pad_token_id = tokenizer.pad_token_id

    # Mask values for real tokens and for padding, respectively.
    real_mask_value, pad_mask_value = (1, 0) if mask_padding_with_zero else (0, 1)
    # Room left for sentence sub-tokens once CLS and SEP are accounted for.
    body_budget = max_seq_len - 2

    features = []
    for example_index, example in enumerate(examples):
        if example_index % 400 == 0:
            logger.info(f"Đang xử lý example {example_index}/{len(examples)}")

        sub_tokens, sub_label_ids = [], []
        for word, label in zip(example.words, example.slot_labels):
            pieces = tokenizer.tokenize(word) or [unk_token]
            sub_tokens += pieces
            # The first piece carries the label; continuation pieces are ignored.
            sub_label_ids.append(label_map[label])
            sub_label_ids += [pad_label_id] * (len(pieces) - 1)

        if len(sub_tokens) > body_budget:
            del sub_tokens[body_budget:]
            del sub_label_ids[body_budget:]

        # Wrap the sentence as: CLS + sub-tokens + SEP.
        tokens = [cls_token, *sub_tokens, sep_token]
        label_ids = [pad_label_id, *sub_label_ids, pad_label_id]
        token_type_ids = [cls_token_segment_id] + [sequence_segment_id] * (len(sub_tokens) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [real_mask_value] * len(input_ids)

        # Right-pad all four sequences up to max_seq_len.
        padding_length = max_seq_len - len(input_ids)
        for sequence, filler in (
            (input_ids, pad_token_id),
            (attention_mask, pad_mask_value),
            (token_type_ids, pad_token_segment_id),
            (label_ids, pad_label_id),
        ):
            sequence.extend([filler] * padding_length)

        features.append(
            InputFeatures(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                slot_labels_ids=label_ids,
            )
        )

    return features

# NER Dataset

In [6]:
class NERDataset(TorchDataset):
    """Torch dataset that serves pre-computed features as ``long`` tensors."""

    # (output key, attribute read from the feature object) in output order.
    _FIELDS = (
        ('input_ids', 'input_ids'),
        ('attention_mask', 'attention_mask'),
        ('token_type_ids', 'token_type_ids'),
        ('labels', 'slot_labels_ids'),
    )

    def __init__(self, features):
        self.features = features

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        """Return the feature at ``idx`` as a dict of ``torch.long`` tensors."""
        feature = self.features[idx]
        return {
            key: torch.tensor(getattr(feature, attribute), dtype=torch.long)
            for key, attribute in self._FIELDS
        }

# Metrics

In [7]:
def compute_metrics(p: EvalPrediction):
    """Compute seqeval precision, recall and F1 for a batch of predictions.

    Predicted class ids are the argmax of ``p.predictions`` over axis 2.
    Positions whose gold id is -100 are skipped. A position where either id
    is not below ``len(label_list)`` is logged as a warning and left out of
    both sequences. The seqeval classification report is logged at INFO level.

    Args:
        p: Object with ``predictions`` (scores; argmax is taken over axis 2)
            and ``label_ids`` (gold ids aligned with the predictions).

    Returns:
        dict with keys ``precision``, ``recall`` and ``f1``.
    """
    predicted_ids = p.predictions.argmax(axis=2)
    gold_ids = p.label_ids
    num_labels = len(label_list)

    pred_labels, true_labels = [], []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        pred_tags, gold_tags = [], []
        for pred_idx, true_idx in zip(pred_row, gold_row):
            if true_idx == -100:
                continue
            if pred_idx >= num_labels or true_idx >= num_labels:
                logger.warning(f"Chỉ số ngoài phạm vi: pred_idx={pred_idx}, true_idx={true_idx}")
                continue
            pred_tags.append(label_list[pred_idx])
            gold_tags.append(label_list[true_idx])

        pred_labels.append(pred_tags)
        true_labels.append(gold_tags)

    scores = {
        "precision": precision_score(true_labels, pred_labels),
        "recall": recall_score(true_labels, pred_labels),
        "f1": f1_score(true_labels, pred_labels),
    }

    logger.info("Báo cáo phân loại:")
    logger.info(classification_report(true_labels, pred_labels))

    return scores

# Train model

In [8]:
# Fine-tune roberta-base for address NER
def train_model(
    train_dataset, dev_dataset, test_dataset, output_dir='./ner-results-3',
    resume_from_checkpoint=None, huggingface_token=None, max_seq_len=64, num_epochs=2
):
    """Fine-tune ``roberta-base`` for token classification and evaluate it.

    The three datasets are converted to ``Example`` objects, tokenized into
    fixed-length features and wrapped in ``NERDataset``. A ``Trainer`` then
    trains on the train split, evaluates on the dev split after every epoch,
    and finally evaluates on the test split. The model and tokenizer are saved
    to ``output_dir`` and, when a token is supplied, pushed to the Hub.

    Args:
        train_dataset: Training split exposing ``tokens`` and ``labels``.
        dev_dataset: Validation split with the same layout.
        test_dataset: Test split with the same layout.
        output_dir: Directory for checkpoints and the final model.
        resume_from_checkpoint: Checkpoint to resume from; a falsy value
            starts training from scratch.
        huggingface_token: Hub token. When falsy, nothing is pushed to the Hub.
        max_seq_len: Fixed sequence length used when building features.
        num_epochs: Number of training epochs.

    Returns:
        Tuple ``(trainer, model, tokenizer)``.
    """
    # add_prefix_space=True is required to tokenize input word by word.
    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', add_prefix_space=True)

    # Datasets -> examples -> fixed-length features -> torch datasets.
    train_examples = convert_to_examples(train_dataset)
    dev_examples = convert_to_examples(dev_dataset)
    test_examples = convert_to_examples(test_dataset)

    train_features = convert_examples_to_features(train_examples, max_seq_len, tokenizer)
    dev_features = convert_examples_to_features(dev_examples, max_seq_len, tokenizer)
    test_features = convert_examples_to_features(test_examples, max_seq_len, tokenizer)

    train_dataset = NERDataset(train_features)
    dev_dataset = NERDataset(dev_features)
    test_dataset = NERDataset(test_features)

    # Store the tag names in the model config so the saved / pushed checkpoint
    # decodes to real tags instead of the generic LABEL_<n> names.
    id2label = dict(enumerate(label_list))
    label2id = {label: idx for idx, label in id2label.items()}
    model = RobertaForTokenClassification.from_pretrained(
        'roberta-base',
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id,
    )

    push_to_hub = bool(huggingface_token)
    training_args = TrainingArguments(
        output_dir=output_dir,
        eval_strategy="epoch",        # evaluate after every epoch
        save_strategy="epoch",        # must match eval_strategy for load_best_model_at_end
        per_device_train_batch_size=32,
        per_device_eval_batch_size=16,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        save_total_limit=2,           # keep at most two checkpoints on disk
        load_best_model_at_end=True,
        push_to_hub=push_to_hub,
        hub_model_id="datmieu2k4/ner-results-3" if push_to_hub else None,
        hub_token=huggingface_token,
        save_safetensors=True         # write weights in safetensors format
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=dev_dataset,
        compute_metrics=compute_metrics
    )

    # Train from scratch or resume from the given checkpoint.
    if resume_from_checkpoint:
        logger.info(f"Tiếp tục huấn luyện từ checkpoint: {resume_from_checkpoint}")
        trainer.train(resume_from_checkpoint=resume_from_checkpoint)
    else:
        logger.info("Huấn luyện từ đầu")
        trainer.train()

    # Use a distinct metric prefix so test metrics are not reported under the
    # same "eval_" keys as the per-epoch dev metrics.
    eval_results = trainer.evaluate(test_dataset, metric_key_prefix="test")
    logger.info(f"Kết quả đánh giá trên tập test: {eval_results}")

    # Persist the final model together with its tokenizer.
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    if push_to_hub:
        trainer.push_to_hub(commit_message="Hoàn tất huấn luyện", tags="bert-ner-address")

    return trainer, model, tokenizer

# Load and process data

In [9]:
# Kaggle input locations of the train / validation / test CSV splits.
train_path, dev_path, test_path = (
    "/kaggle/input/ner-dataset-location/train_df.csv",
    "/kaggle/input/ner-dataset-location/val_df.csv",
    "/kaggle/input/ner-dataset-location/test_df.csv",
)

# Read and convert all three splits in one call.
train_dataset, dev_dataset, test_dataset = load_and_process_data(
    train_path=train_path,
    dev_path=dev_path,
    test_path=test_path,
)

# Training

In [None]:
# Credentials are read from the environment so that no secret is ever stored
# in the notebook. Set HF_TOKEN / WANDB_API_KEY before running this cell.
huggingface_token = os.environ.get("HF_TOKEN", "")
wandb_api_key = os.environ.get("WANDB_API_KEY", "")

# Log in to Hugging Face only when a token is available; without one
# train_model() does not push anything to the Hub.
if huggingface_token:
    login(token=huggingface_token)
else:
    logger.warning("HF_TOKEN is not set: skipping Hugging Face login and Hub upload")

# Log in to Weights & Biases only when a key is available.
if wandb_api_key:
    wandb.login(key=wandb_api_key)
else:
    logger.warning("WANDB_API_KEY is not set: skipping explicit wandb login")

# Train the model. Set checkpoint_path to an existing checkpoint directory to
# resume; an empty or missing path starts training from scratch.
checkpoint_path = ""
trainer, model, tokenizer = train_model(
    train_dataset,
    dev_dataset,
    test_dataset,
    output_dir='./ner-results-3',
    resume_from_checkpoint=checkpoint_path if checkpoint_path and os.path.exists(checkpoint_path) else None,
    huggingface_token=huggingface_token,
    max_seq_len=64,
    num_epochs=2
)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdatt2505myethuy5[0m ([33mdatt2505myethuy5-university-of-engineering-and-technolog[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Tracking run with wandb version 0.19.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250425_074903-o2tus31g[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33m./ner-results-3[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/datt2505myethuy5-university-of-engineering-and-technolog/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/datt2505myethuy5-university-of-engineering-and-technolog/huggingface/runs/o2tus31g[0m


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.0176,0.014808,0.99323,0.99534,0.994284
2,0.008,0.010802,0.995048,0.995841,0.995445


# Inference example

In [11]:
# Inference helper
def predict_ner(text, model, tokenizer, label_list, max_seq_len=64):
    """Tag every whitespace-separated word of ``text`` with a label.

    The text is split on whitespace and tokenized word by word. The model is
    put in eval mode and run without gradient tracking. The label predicted
    for the first sub-token of each word is used as that word's label.

    Args:
        text: Input sentence.
        model: Token-classification model exposing ``device``, ``eval()`` and
            a call returning an object with ``logits``.
        tokenizer: Fast tokenizer whose output provides ``word_ids()``.
        label_list: Label names indexed by class id.
        max_seq_len: Length the input is padded / truncated to.

    Returns:
        list of ``(word, label)`` tuples in input order. Words cut off by
        truncation at ``max_seq_len`` are not included. Empty or
        whitespace-only text yields ``[]``.

    Raises:
        ValueError: if the model predicts a class id with no entry in
            ``label_list`` for one of the words.
    """
    tokens = text.split()
    if not tokens:
        # Nothing to tag; avoid sending an empty batch to the tokenizer.
        return []

    encoding = tokenizer(
        tokens,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        padding='max_length',
        max_length=max_seq_len,
    )
    input_ids = encoding['input_ids'].to(model.device)
    attention_mask = encoding['attention_mask'].to(model.device)

    model.eval()
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits
    # One predicted class id per sub-token position of the single sentence.
    predicted_ids = logits.argmax(dim=2)[0].tolist()

    word_labels = []
    previous_word_id = None
    for position, word_id in enumerate(encoding.word_ids()):
        # Keep only the first sub-token of every word; skip special tokens
        # and padding (word_id is None) and continuation pieces.
        if word_id is not None and word_id != previous_word_id:
            class_id = predicted_ids[position]
            # Look the label up by position. Dropping out-of-range ids from a
            # pre-built list would shift every later label by one.
            if class_id >= len(label_list):
                raise ValueError(
                    f"Predicted class id {class_id} has no entry in label_list "
                    f"(size {len(label_list)})"
                )
            word_labels.append(label_list[class_id])
        previous_word_id = word_id

    # zip stops at the shorter sequence, so truncated words are dropped.
    return list(zip(tokens, word_labels))

In [12]:
# Sample Vietnamese address, with punctuation already separated by spaces.
test_case = "Số 7 đường đê Tả đáy , phường Đồng Mai , Quận Hà Đông , TP Hà Nội"
predictions = predict_ner(test_case, model, tokenizer, label_list)

# Print one "word: label" line per input word.
print("Inference results:")
for pair in predictions:
    token, label = pair
    print(f"{token}: {label}")

Inference results:
Số: B-STREET
7: I-STREET
đường: I-STREET
đê: I-STREET
Tả: I-STREET
đáy: I-STREET
,: O
phường: B-WARD
Đồng: I-WARD
Mai: I-WARD
,: O
Quận: B-DISTRICT
Hà: I-DISTRICT
Đông: I-DISTRICT
,: O
TP: B-PROVINCE
Hà: I-PROVINCE
Nội: I-PROVINCE
