In [2]:
"""
    Convert Label Studio JSON to Hugging Face Dataset for NER Training
"""
import json
import os

import numpy as np
import evaluate
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
    EarlyStoppingCallback
)
from datasets import Dataset, DatasetDict

# 1. Configure Model & Tokenizer
model_checkpoint = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# BIO label inventory: "O" plus B-/I- tags for the KEY span and the VALUE span
# of every field (41 labels in total, including "O").
# The position of a tag in this list is its integer class id.
label_list = [
    "O",
    # transaction date
    "B-KEY_TRANS_DATE", "I-KEY_TRANS_DATE",
    "B-VALUE_TRANS_DATE", "I-VALUE_TRANS_DATE",
    # reference number
    "B-KEY_REF_NUM", "I-KEY_REF_NUM",
    "B-VALUE_REF_NUM", "I-VALUE_REF_NUM",
    # debit amount
    "B-KEY_DEBT_NUM", "I-KEY_DEBT_NUM",
    "B-VALUE_DEBT_NUM", "I-VALUE_DEBT_NUM",
    # credit amount
    "B-KEY_CRED_NUM", "I-KEY_CRED_NUM",
    "B-VALUE_CRED_NUM", "I-VALUE_CRED_NUM",
    # account name
    "B-KEY_ACC_NAME", "I-KEY_ACC_NAME",
    "B-VALUE_ACC_NAME", "I-VALUE_ACC_NAME",
    # account number
    "B-KEY_ACC_NUM", "I-KEY_ACC_NUM",
    "B-VALUE_ACC_NUM", "I-VALUE_ACC_NUM",
    # CIF
    "B-KEY_CIFS", "I-KEY_CIFS",
    "B-VALUE_CIFS", "I-VALUE_CIFS",
    # currency type
    "B-KEY_CURRENCY_TYPE", "I-KEY_CURRENCY_TYPE",
    "B-VALUE_CURRENCY_TYPE", "I-VALUE_CURRENCY_TYPE",
    # account address
    "B-KEY_ACC_ADDR", "I-KEY_ACC_ADDR",
    "B-VALUE_ACC_ADDR", "I-VALUE_ACC_ADDR",
    # transaction description
    "B-KEY_TRANS_DESC", "I-KEY_TRANS_DESC",
    "B-VALUE_TRANS_DESC", "I-VALUE_TRANS_DESC",
]

# Two-way lookup between tag names and class ids
id_to_label = dict(enumerate(label_list))
label_to_id = {name: idx for idx, name in id_to_label.items()}


def label_studio_to_hf(json_path, tok=None, label_map=None):
    """Convert a Label Studio JSON export into token-classification examples.

    Each task's text is tokenized and every annotated character span is
    projected onto the tokens that lie fully inside it, using BIO tags: the
    first free token of a span gets ``B-<label>``, the following ones
    ``I-<label>``. Longer spans are applied first and tokens that already
    carry a tag are never overwritten.

    Args:
        json_path: Path to the exported JSON file (a list of tasks, each with
            ``data.text`` and ``annotations[0].result``).
        tok: Optional tokenizer callable. It is called as
            ``tok(text, truncation=True, return_offsets_mapping=True, padding=False)``
            and must return ``input_ids``, ``attention_mask`` and
            ``offset_mapping``. Defaults to the notebook-level ``tokenizer``.
        label_map: Optional ``{tag_name: class_id}`` dict. Defaults to the
            notebook-level ``label_to_id``.

    Returns:
        A list of dicts with keys ``input_ids``, ``attention_mask`` and
        ``labels``. Tokens with a zero-width offset (special tokens) are
        labeled ``-100``, untagged tokens ``0``. Returns ``[]`` when the file
        does not exist.
    """
    if not os.path.exists(json_path):
        print(f"❌ Dataset file not found: {json_path}")
        return []

    # Resolve the notebook-level defaults lazily so callers can inject their own.
    tok = tokenizer if tok is None else tok
    label_map = label_to_id if label_map is None else label_map

    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    processed_data = []
    skipped_tasks = 0       # tasks without any annotation object
    unknown_labels = set()  # annotation labels missing from the label map

    for item in data:
        annotations = item.get('annotations') or []
        if not annotations:
            # An unannotated task is not the same as "no entities": skip it
            # instead of crashing or training on an all-"O" example.
            skipped_tasks += 1
            continue

        text = item['data']['text']
        # Keep only labeled spans (relations etc. have no 'labels' entry).
        results = [
            ann for ann in (annotations[0].get('result') or [])
            if 'value' in ann and ann['value'].get('labels')
        ]

        # Longest spans first, so an enclosing span claims its tokens before
        # any span nested inside it.
        results.sort(key=lambda x: x['value']['end'] - x['value']['start'], reverse=True)

        tokenized_input = tok(text, truncation=True, return_offsets_mapping=True, padding=False)
        input_ids = tokenized_input["input_ids"]
        offsets = tokenized_input["offset_mapping"]

        # Mask zero-width (special) tokens up front so they are ignored even
        # when the task has no labeled span at all.
        labels = [-100 if o_start == o_end else 0 for (o_start, o_end) in offsets]

        for ann in results:
            start, end = ann['value']['start'], ann['value']['end']
            label_name = ann['value']['labels'][0]

            b_id = label_map.get(f"B-{label_name}")
            i_id = label_map.get(f"I-{label_name}")
            if b_id is None or i_id is None:
                unknown_labels.add(label_name)
                continue

            first_token = True
            for i, (o_start, o_end) in enumerate(offsets):
                # Skip special tokens (-100) and tokens already tagged.
                if labels[i] != 0:
                    continue

                # Only tokens lying completely inside the annotated span.
                if o_start >= start and o_end <= end:
                    labels[i] = b_id if first_token else i_id
                    first_token = False

        processed_data.append({
            "input_ids": input_ids,
            "attention_mask": tokenized_input["attention_mask"],
            "labels": labels
        })

    if skipped_tasks:
        print(f"⚠ Skipped {skipped_tasks} task(s) without annotations in {json_path}")
    if unknown_labels:
        print(f"⚠ Ignored annotation label(s) not in the label map: {sorted(unknown_labels)}")

    return processed_data

# Process training data
# NOTE(review): this path is relative to the current working directory, while the
# validation/test cell reads from "../training/..." — confirm both point at the same folder.
train_data = label_studio_to_hf("training/train.json")
if train_data:
    # NOTE(review): "BIS" in the message presumably refers to the BIO tagging scheme.
    print(f"✅ Convert labels to BIS for  {len(train_data)} tasks successful.")

✅ Convert labels to BIS for  40 tasks successful.


In [3]:
"""
    Check sample data after labeling
"""
# Pick the first training example that carries at least one entity tag
# (class ids > 0; id 0 is "O" and -100 marks ignored tokens).
sample = next(
    (data for data in train_data if any(l > 0 for l in data['labels'])),
    None,
)

if sample:
    print("✅ Sample was labeled:")
    for token_id, label_id in zip(sample['input_ids'], sample['labels']):
        # Plain "O" tokens are not shown.
        if label_id == 0:
            continue
        token = tokenizer.decode([token_id])
        label_name = "IGNORE" if label_id == -100 else label_list[label_id]
        print(f"Token: {token:15} | Label: {label_name}")
else:
    print("❌ No sample was labeled !")

✅ Sample was labeled:
Token: <s>             | Label: IGNORE
Token: Account         | Label: B-KEY_ACC_NUM
Token: No              | Label: I-KEY_ACC_NUM
Token: 7               | Label: B-VALUE_ACC_NUM
Token: 100             | Label: I-VALUE_ACC_NUM
Token: 440             | Label: I-VALUE_ACC_NUM
Token: 375             | Label: I-VALUE_ACC_NUM
Token: 1               | Label: I-VALUE_ACC_NUM
Token: C               | Label: B-KEY_CIFS
Token: IF              | Label: I-KEY_CIFS
Token: 10              | Label: B-VALUE_CIFS
Token: 13              | Label: I-VALUE_CIFS
Token: 46              | Label: I-VALUE_CIFS
Token: Tiền            | Label: B-KEY_CURRENCY_TYPE
Token: tệ              | Label: I-KEY_CURRENCY_TYPE
Token: V               | Label: B-VALUE_CURRENCY_TYPE
Token: ND              | Label: I-VALUE_CURRENCY_TYPE
Token: N               | Label: B-KEY_ACC_ADDR
Token: ơ               | Label: I-KEY_ACC_ADDR
Token: i               | Label: I-KEY_ACC_ADDR
Token: ở               | Label: I

In [5]:
"""
    Combine the training, validation and test splits into a single dataset for the training step
"""
from datasets import Dataset, DatasetDict

# 1. Convert the validation and test exports to Hugging Face examples
# NOTE(review): these paths start with "../" while the training split is read from
# "training/train.json" — confirm the working directory makes both resolve.
val_data = label_studio_to_hf("../training/val.json")
test_data = label_studio_to_hf("../training/test.json")

# 2. Fail fast: label_studio_to_hf returns [] when a file is missing, and an empty
#    train/validation split would only surface later as an obscure Trainer error.
for split_name, rows in (("train", train_data), ("validation", val_data)):
    if not rows:
        raise ValueError(
            f"The '{split_name}' split is empty; check the JSON path passed to label_studio_to_hf."
        )

# 3. Create DatasetDict
raw_datasets = DatasetDict({
    "train": Dataset.from_list(train_data),
    "validation": Dataset.from_list(val_data),
    "test": Dataset.from_list(test_data)
})

print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 40
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5
    })
})


In [None]:
"""
    Start Training

    Training setup:
        - Model: xlm-roberta-base, a multilingual checkpoint chosen because it supports both the Vietnamese and the English tokens found in the finance data.
        - learning_rate = 2e-5: a small step size for each update while searching for a minimum of the loss function.
        - W & b: training searches for the weights W and bias b of f(x) = Wx + b that minimize the loss function.
        - Gradient descent is performed with the AdamW optimizer from Transformers.
        - Weight decay penalizes large weights W during backpropagation to reduce overfitting.
"""

import numpy as np
import evaluate
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification

# Load the seqeval metric used to score the NER predictions (precision / recall / F1)
metric = evaluate.load("seqeval")

def compute_metrics(p):
    """Convert raw evaluation output into seqeval precision/recall/F1/accuracy.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-class
            scores on axis 2 (presumably ``(batch, seq_len, num_labels)`` —
            TODO confirm); ``labels`` holds the gold class ids, with ``-100``
            on positions that must be ignored.

    Returns:
        dict with the keys ``precision``, ``recall``, ``f1`` and ``accuracy``,
        read from the metric's ``overall_*`` results.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Drop tokens labeled -100 (special tokens) before scoring
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# 1. Load the pretrained checkpoint with a token-classification head sized to label_list
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)

# 2. Set Data Collator (batches the variable-length examples produced with padding=False)
data_collator = DataCollatorForTokenClassification(tokenizer)

# 3. Configure Parameters
training_args = TrainingArguments(
    output_dir="../model/checkpoints",
    eval_strategy="epoch",          # Evaluate on the validation split after every epoch
    save_strategy="epoch",          # Save weights after every epoch
    learning_rate=2e-5,             # Small step size while searching for a minimum of the loss function
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=30,            # Fixed budget of 30 passes over the training split
    weight_decay=0.01,              # Penalizes large weights to limit overfitting
    logging_steps=5,
    load_best_model_at_end=True,    # Reload the best checkpoint once training is done
    metric_for_best_model="f1",     # "Best" is judged by the F1 score from compute_metrics
    save_total_limit=2,             # Keep 2 model checkpoints only
    # Mixed precision needs a CUDA GPU; fall back to full precision on CPU / Apple MPS
    # instead of failing when the training arguments are built.
    fp16=torch.cuda.is_available(),
)

# 4. Init Trainer
# NOTE(review): recent transformers releases rename the `tokenizer` argument to
# `processing_class` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=raw_datasets["train"],
    eval_dataset=raw_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Start Training - fit the weights W & biases b
trainer.train()

# --- Save Model ---
# NOTE(review): the early-stopping cell saves to this same path, so whichever
# training cell runs last overwrites the other's model.
final_model_path = "../model/banking_reconciliation_model"
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)

print(f"Finish Training & save model in: {final_model_path}")

In [None]:
"""
    Train until the validation loss has not improved for 5 consecutive evaluations (one evaluation every 20 steps)

    Training setup:
        - Model: xlm-roberta-base, a multilingual checkpoint chosen because it supports both the Vietnamese and the English tokens found in the finance data.
        - learning_rate = 2e-5: a small step size for each update while searching for a minimum of the loss function.
        - W & b: training searches for the weights W and bias b of f(x) = Wx + b that minimize the loss function.
        - Gradient descent is performed with the AdamW optimizer from Transformers.
        - Weight decay penalizes large weights W during backpropagation to reduce overfitting.
"""

import numpy as np
import evaluate
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification, EarlyStoppingCallback

# Load the seqeval metric used to score the NER predictions (precision / recall / F1)
metric = evaluate.load("seqeval")

def compute_metrics(p):
    """Convert raw evaluation output into seqeval precision/recall/F1/accuracy.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-class
            scores on axis 2 (presumably ``(batch, seq_len, num_labels)`` —
            TODO confirm); ``labels`` holds the gold class ids, with ``-100``
            on positions that must be ignored.

    Returns:
        dict with the keys ``precision``, ``recall``, ``f1`` and ``accuracy``,
        read from the metric's ``overall_*`` results.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Drop tokens labeled -100 (special tokens) before scoring
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# 1. Load the pretrained checkpoint with a token-classification head sized to label_list
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)

# 2. Set Data Collator (batches the variable-length examples produced with padding=False)
data_collator = DataCollatorForTokenClassification(tokenizer)

# 3. Configure Parameters with Early Stopping Callback
training_args = TrainingArguments(
    output_dir="../model/checkpoints",
    eval_strategy="steps",          # Evaluate every eval_steps steps
    eval_steps=20,                  # Check the validation loss every 20 steps
    save_strategy="steps",          # Save a checkpoint on the same schedule as evaluation
    save_steps=20,
    learning_rate=2e-5,             # Small step size while searching for a minimum of the loss function
    per_device_train_batch_size=8,  # Optimized for GPU server
    per_device_eval_batch_size=8,
    num_train_epochs=100,           # Upper bound only; early stopping normally ends training sooner
    weight_decay=0.01,              # Penalizes large weights to limit overfitting
    # Mixed precision needs a CUDA GPU; fall back to full precision on CPU / Apple MPS
    # instead of failing when the training arguments are built.
    fp16=torch.cuda.is_available(),
    logging_steps=5,
    save_total_limit=2,
    load_best_model_at_end=True,        # Reload the best checkpoint when training ends
    metric_for_best_model="eval_loss",  # "Best" is judged by the loss on the validation set
    greater_is_better=False,            # A lower loss is better
)

# Init Trainer with Callback
# Training stops once eval_loss has failed to improve for 5 evaluations in a row.
# NOTE(review): recent transformers releases rename the `tokenizer` argument to
# `processing_class` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=raw_datasets["train"],
    eval_dataset=raw_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

# Start Training - fit the weights W & biases b
trainer.train()

# --- Save Model ---
# NOTE(review): the fixed-epoch training cell saves to this same path, so whichever
# training cell runs last overwrites the other's model.
final_model_path = "../model/banking_reconciliation_model"
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)

print(f"Finish Training & save model in: {final_model_path}")

In [None]:
"""
    Check all logs of Loss Function

"""

# Everything the Trainer logged during the last run
history = trainer.state.log_history

# Only entries that carry a 'loss' key are printed
loss_entries = (entry for entry in history if 'loss' in entry)
for log in loss_entries:
    print(f"Step {log['step']}: Loss = {log['loss']:.4f}")

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# 1. Load the fine-tuned model and its tokenizer from disk
# NOTE(review): the training cells save to "../model/banking_reconciliation_model";
# confirm this relative path resolves to the same folder from the current working directory.
# NOTE(review): this rebinds the notebook-level `tokenizer` and `model` names.
model_path = "model/banking_reconciliation_model"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

# 2. Create Pipeline ("simple" aggregation merges tokens into entity groups)
nlp = pipeline(
    task="ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

# 3. Data to Inference (one statement header line followed by one transaction line)
text = "Chủ tài khoản: CT TNHH TM& DV SIEU THI BIG C AN LAC | Số tài khoản: 71004403751 | CIF: 101346 | Loại tiền: VND | Địa chỉ: 1231 QLO 1A,KP5,F.BINHTRI DONG B,BINHTAN\nNgày giao dịch: 30/04/2024 | Số tham chiếu: 9925 - 30554 | Số tiền ghi nợ: N/A | Số tiền ghi có: 1323706 | Mô tả: T/t T/ung the MASTER:EBA TRA VINH; MerchNo: 7400000217 Gross Amt: Not On-Us=1,345,230.00 VND; VAT Amt:1,345,230.00*1.600%/11 = 1,957.00 VND(VAT code:0301472278); Code:1005; SLGD: Not On-Us=1; Ngay 30/04/2024."

# 4. Run Inference
print(f"\n--- Extract data ---")
results = nlp(text)

# Entities scoring below this confidence are not reported
THRESHOLD = 0.4
confident_entities = [entity for entity in results if entity['score'] >= THRESHOLD]
found = len(confident_entities) > 0

for res in confident_entities:
    clean_word = res['word'].replace(' ', ' ').strip()
    print(f"object: {clean_word:30} | label: {res['entity_group']:20} | reliable: {res['score']:.2%}")

if not found:
    print("⚠ Search not found.")

In [None]:
"""
    Convert results to structured JSON
"""
def results_to_structured_json(results, threshold=0.4):
    """Collapse NER pipeline entities into a flat ``{label: text}`` dict.

    Args:
        results: Iterable of entity dicts with the keys ``score``,
            ``entity_group`` and ``word``. The optional character offsets
            ``start`` / ``end`` are used, when present, to decide how
            fragments of the same value are joined.
        threshold: Entities with a score below this value are ignored.

    Returns:
        dict mapping each label to its text. For labels starting with
        ``VALUE`` repeated fragments are merged in order: fragments that
        touch in the source text (or carry no offsets) are concatenated
        directly, fragments separated by a gap are joined with one space.
        For every other label the last occurrence wins.
    """
    structured_data = {}
    last_end = {}  # label -> end offset of the most recent fragment kept for it

    for res in results:
        if res['score'] < threshold:
            continue

        label = res['entity_group']
        # Clean the text: map the SentencePiece word-boundary marker to a space
        # and trim. (The previous replace(' ', ' ') replaced a space with a
        # space, i.e. did nothing.)
        value = res['word'].replace('\u2581', ' ').strip()
        start = res.get('start')

        if label in structured_data and label.startswith("VALUE"):
            prev_end = last_end.get(label)
            # Restore the whitespace that separated the fragments in the text;
            # gluing them together turned "9925" + "- 30554" into "9925- 30554".
            has_gap = start is not None and prev_end is not None and start > prev_end
            separator = " " if (has_gap and value) else ""
            structured_data[label] += separator + value
        else:
            # First occurrence, or a non-VALUE label where the latest one wins
            structured_data[label] = value

        last_end[label] = res.get('end')

    return structured_data

# Flatten the entities returned by the inference cell (`results`) and show the mapping
final_json = results_to_structured_json(results)
print(final_json)