In [None]:
"""
    Convert excel to markdown
"""
import os
from pathlib import Path
from docling.document_converter import DocumentConverter

# Excel bank statement to convert; a relative path, resolved against the current working directory.
file_path = '../data/Banking Statement.xlsx'

def get_docling_result(path):
    """Run Docling's ``DocumentConverter`` on the file at ``path``.

    Args:
        path: Location of the document to convert.

    Returns:
        Whatever ``DocumentConverter().convert(path)`` returns, or ``None``
        when the file does not exist or the conversion raises. In both
        failure cases a message is printed instead of propagating the error.
    """
    if os.path.exists(path):
        try:
            return DocumentConverter().convert(path)
        except Exception as err:
            # Best-effort: report the problem and let the caller see None.
            print(f"Error while converting: {err}")
            return None

    print(f"Error: File does not exist {path}")
    return None

# Convert the statement, then export the parsed document to Markdown.
# `result` is only bound when the conversion succeeded.
markdown_data = get_docling_result(file_path)
if not markdown_data:
    print("Parse markdown was getting error")
else:
    result = markdown_data.document.export_to_markdown()
    print("Convert to markdown successfully")
    print(result)



In [14]:
"""
    Transform markdown to clean text for NER labeling
"""
import re

def transform_to_markdown(markdown_raw):
    """Flatten a Markdown table export into plain text lines.

    For every input line: table pipes and non-breaking spaces become spaces,
    the literal ``None`` is removed, and whitespace runs are collapsed. Lines
    that end up empty, or that contain one of the excluded keywords
    (case-insensitive), are dropped. Tokens that repeat back-to-back within a
    line are reduced to a single occurrence.

    Args:
        markdown_raw: Markdown text, lines separated by ``\\n``.

    Returns:
        The surviving cleaned lines joined with ``\\n``.
    """
    exclude_keywords = ['tổng số', '---', 'ngày thực hiện', 'sao kê tài khoản']
    cleaned_rows = []

    for raw_line in markdown_raw.split('\n'):
        without_markup = raw_line.replace('|', ' ').replace('None', '').replace('\xa0', ' ')
        normalized = re.sub(r'\s+', ' ', without_markup).strip()
        if not normalized:
            continue

        lowered = normalized.lower()
        if any(keyword in lowered for keyword in exclude_keywords):
            continue

        tokens = normalized.split()
        # Keep a token only when it differs from the token right before it.
        deduped = [
            token for position, token in enumerate(tokens)
            if position == 0 or token != tokens[position - 1]
        ]
        cleaned_rows.append(" ".join(deduped))

    return "\n".join(cleaned_rows)

# Clean the Markdown exported by the conversion cell and show the result.
clean_markdown = transform_to_markdown(result)

print("--- Markdown Text ---\n", clean_markdown, sep="\n")

--- Markdown Text ---

Số dư cuối kỳ 26415781065 26.415.781.065
Số tiền ghi nợ Số tiền ghi có Mô tả
Chủ tài khoản: CT TNHH TM& DV SIEU THI BIG C AN LAC CT TNHH TM& DV SIEU THI BIG C AN LAC CT TNHH TM& DV SIEU THI BIG C AN LAC CT TNHH TM& DV SIEU THI BIG C AN LAC
Số tài khoản: 71004403751
Địa chỉ: 1231 QLO 1A,KP5,F.BINHTRI DONG B,BINHTAN 1231 QLO 1A,KP5,F.BINHTRI DONG B,BINHTAN 1231 QLO 1A,KP5,F.BINHTRI DONG B,BINHTAN 1231 QLO 1A,KP5,F.BINHTRI DONG B,BINHTAN
CIF: 101346
Loại tiền: VND
Từ: 30/04/2024 Đến: 30/04/2024 Từ: 30/04/2024 Đến: 30/04/2024 Từ: 30/04/2024 Đến: 30/04/2024 Từ: 30/04/2024 Đến: 30/04/2024 Từ: 30/04/2024 Đến: 30/04/2024
Số dư đầu kỳ 25035392349 Số dư cuối kỳ 26415781065 26.415.781.065
Ngày giao dịch Số tham chiếu Số tiền ghi nợ Số tiền ghi có Mô tả
2024-04-30 00:00:00 5219 - 65354 156000 TKP#P021071000143#333684.300424.230435.nop tien pmh cty 125tr go ba ria
2024-04-30 00:00:00 9925 - 34743 9798337 T/t T/ung the VISA:SIEU THI BIG C AN LAC BA RIA; MerchNo:11500000457 Gro

In [13]:
import re
import json
import os

# One transaction row of the cleaned statement:
#   "<YYYY-MM-DD> <HH:MM:SS> <reference: 10-20 digits/spaces/dashes> <amount> <description>"
_TRANSACTION_RE = re.compile(
    r'(\d{4}-\d{2}-\d{2})\s\d{2}:\d{2}:\d{2}\s+([\d\s\-]{10,20})\s+(\d+)\s+(.*)'
)

# Lower-cased header markers and the metadata field each one fills.
# The order mirrors the original if/elif chain: the first marker found wins.
_METADATA_MARKERS = (
    ("chủ tài khoản:", "chu_tk"),
    ("số tài khoản:", "so_tk"),
    ("địa chỉ:", "dia_chi"),
    ("cif:", "cif"),
    ("loại tiền:", "loai_tien"),
)


def _collapse_repeats(value):
    """Return the shortest token sequence that ``value`` is a whole repetition of.

    ``"A B A B A B A B"`` becomes ``"A B"``. A value that is not a pure
    repetition (or has fewer than two tokens) is returned unchanged.
    """
    tokens = value.split()
    count = len(tokens)
    for period in range(1, count // 2 + 1):
        if count % period == 0 and tokens == tokens[:period] * (count // period):
            return " ".join(tokens[:period])
    return value


def transform_label_data(clean_text):
    """Turn the cleaned statement text into one labeling task per transaction.

    Header lines (account holder, account number, address, CIF, currency) are
    collected once and prefixed to every transaction, so each task is a
    self-contained, pipe-separated sentence.

    Args:
        clean_text: Newline-separated text of the statement.

    Returns:
        A list of ``{"data": {"text": ...}}`` dicts, one per line matching the
        transaction pattern, in input order. Empty when nothing matches.

    Notes:
        * Header values that a merged cell repeated any number of times are
          reduced to a single occurrence.
        * Transaction rows are never parsed as header lines, so a description
          that happens to contain a marker such as "cif:" cannot overwrite
          the header metadata.
        * The single amount captured from a row is reported as the credit
          amount and the debit is always "N/A"; the pattern captures only one
          amount, so the two columns are not told apart here.
    """
    metadata = {"chu_tk": "", "so_tk": "", "dia_chi": "", "cif": "", "loai_tien": ""}
    transaction_tasks = []

    for line in clean_text.strip().split('\n'):
        match = _TRANSACTION_RE.match(line)
        if match:
            year, month, day = match.group(1).split('-')
            transaction_tasks.append({
                "ngay": f"{day}/{month}/{year}",
                "ref": match.group(2).strip(),
                "debt": "N/A",
                "credit": match.group(3).strip(),
                "mota": match.group(4).strip(),
            })
            continue

        lower_line = line.lower()
        for marker, field in _METADATA_MARKERS:
            if marker in lower_line:
                # The value is everything after the first colon of the line.
                metadata[field] = _collapse_repeats(line.split(":", 1)[1].strip())
                break

    header_info = f"Chủ tài khoản: {metadata['chu_tk']} | Số tài khoản: {metadata['so_tk']} | CIF: {metadata['cif']} | Loại tiền: {metadata['loai_tien']} | Địa chỉ: {metadata['dia_chi']}"

    tasks = []
    for item in transaction_tasks:
        text = (f"{header_info} | "
                f"Ngày giao dịch: {item['ngay']} | Số tham chiếu: {item['ref']} | "
                f"Số tiền ghi nợ: {item['debt']} | Số tiền ghi có: {item['credit']} | "
                f"Mô tả: {item['mota']}")
        tasks.append({"data": {"text": text}})

    return tasks

# Destination of the base labeling tasks; its parent directory is created if missing.
output_path = '../dataset/base/base_dataset_label.json'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# Build one task per transaction from the cleaned statement text.
individual_tasks = transform_label_data(clean_markdown)

# ensure_ascii=False keeps the Vietnamese text readable in the JSON file.
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(individual_tasks, f, ensure_ascii=False, indent=4)

print(f"✅ Base dataset has created for {len(individual_tasks)} tasks!")

✅ Base dataset has created for 42 tasks!


In [None]:
import json
import random

def augment_and_shuffle_tasks(input_path, output_path, sample_size=10, seed=None):
    """Sample tasks at random and vary their field labels for manual labeling.

    The tasks in ``input_path`` are shuffled, the first ``sample_size`` are
    kept, and in each kept task every known field label (e.g. "Số tài khoản")
    is replaced by one randomly chosen synonym. The result is written to
    ``output_path`` as JSON.

    Args:
        input_path: JSON file holding a list of ``{"data": {"text": ...}}`` tasks.
        output_path: JSON file to write; missing parent directories are created.
        sample_size: Maximum number of tasks to keep.
        seed: Optional seed for a private random generator, making the
            selection and the synonym choices reproducible. ``None`` keeps the
            previous behavior of using the global ``random`` state.

    Returns:
        The number of tasks written.
    """
    import os  # local import: this cell itself only imports json and random

    with open(input_path, 'r', encoding='utf-8') as f:
        tasks = json.load(f)

    # A private generator when seeded; otherwise the module-level functions.
    rng = random.Random(seed) if seed is not None else random

    # Field label -> alternative spellings used for augmentation.
    synonyms = {
        "Chủ tài khoản": ["Chủ TK", "Tên KH", "Tên tài khoản", "Khách hàng"],
        "Số tài khoản": ["Số TK", "STK", "Account No", "Số hiệu TK"],
        "Địa chỉ": ["Đ/c", "Address", "Nơi ở", "DiaChi"],
        "Mô tả": ["Chi tiết GD", "Nội dung", "Chi tiết giao dịch", "Diễn giải"],
        "Loại tiền": ["Currency", "Đơn vị", "Tiền tệ"],
        "Ngày giao dịch": ["Ngày GD", "Ngay GD", "Ngay G/D"],
        # "Số nợ" used to be listed twice, which doubled its selection odds.
        "Số tiền ghi nợ": ["Ghi nợ", "Số nợ", "Debt"],
        "Số tiền ghi có": ["Amount", "Thành tiền", "Số tiền phải trả"],
        "Số tham chiếu": ["Mã tham chiếu", "ref", "Mã GD", "Số GD", "Số giao dịch"]
    }

    rng.shuffle(tasks)
    selected_tasks = tasks[:sample_size]

    augmented_tasks = []
    for task in selected_tasks:
        text = task['data']['text']

        # One synonym is drawn per label per task and applied to every occurrence.
        for key, variations in synonyms.items():
            if key in text:
                text = text.replace(key, rng.choice(variations))

        augmented_tasks.append({"data": {"text": text}})

    # Create the target directory first so a fresh checkout does not fail on open().
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(augmented_tasks, f, ensure_ascii=False, indent=4)

    return len(augmented_tasks)

# Input: the base task file written by the base-dataset cell.
# Output: the shuffled, label-augmented subset that goes to manual labeling.
input_p = '../dataset/base/base_dataset_label.json'
output_p = '../dataset/label/dataset_for_label.json'

# Called without sample_size, so the function's default applies.
num_created = augment_and_shuffle_tasks(input_p, output_p)
print(f"✅ Create Dataset for label was shuttled {num_created} tasks in: {output_p}")

In [6]:
"""
    Data Augmentation for Training set
"""
import json
import random
import os
from datetime import datetime, timedelta

def generate_random_date():
    """Return a random date as ``dd/mm/YYYY``, 0 to 730 days after 2023-01-01 (inclusive)."""
    offset_days = random.randint(0, 730)
    picked = datetime(2023, 1, 1) + timedelta(days=offset_days)
    return picked.strftime("%d/%m/%Y")

def generate_random_acc():
    """Return a string of 11 random decimal digits (leading zeros possible)."""
    digits = []
    for _ in range(11):
        digits.append(str(random.randint(0, 9)))
    return "".join(digits)

def generate_random_amount():
    """Return a random integer between 100000 and 50000000 (inclusive) as a string."""
    amount = random.randint(100000, 50000000)
    return f"{amount}"

def augment_sample(sample):
    """Return a copy of a labeled task with selected entity values randomized.

    Spans whose first label contains one of the known value tags get a freshly
    generated value (date, account/CIF number, amount, reference number); all
    other labeled spans keep their annotated text. The task text is rebuilt
    and every span's ``start``/``end`` is recomputed, so each span still
    points at its own text even when replacements change the text length.

    Args:
        sample: A task dict with ``data.text`` and ``annotations[0].result``,
            where labeled spans carry ``value.start``, ``value.end``,
            ``value.text`` and ``value.labels``. The input is not modified.

    Returns:
        The augmented copy. Result entries without ``value.labels`` are
        dropped, the remaining spans are ordered by descending start offset,
        and a top-level ``id`` key is removed when present.

    Notes:
        Spans are assumed not to overlap; overlapping spans cannot be replaced
        independently and would yield inconsistent text.
    """
    # A JSON round-trip gives an independent deep copy of the task.
    new_sample = json.loads(json.dumps(sample))
    text = new_sample['data']['text']

    results = [
        ann for ann in new_sample['annotations'][0]['result']
        if 'value' in ann and 'labels' in ann['value']
    ]

    # Output order is unchanged from before: descending start offset.
    results.sort(key=lambda x: x['value']['start'], reverse=True)

    pieces = []   # alternating untouched text and replacement values
    cursor = 0    # position in the ORIGINAL text already copied to `pieces`
    shift = 0     # accumulated length change caused by earlier replacements

    # Walk left to right so every span can be moved by the length change of
    # all replacements that precede it.
    for ann in reversed(results):
        value = ann['value']
        label = value['labels'][0]
        start = value['start']
        end = value['end']

        new_val = value['text']

        if "VALUE_TRANS_DATE" in label:
            new_val = generate_random_date()
        elif "VALUE_ACC_NUM" in label or "VALUE_CIFS" in label:
            new_val = generate_random_acc()
        elif any(x in label for x in ["VALUE_DEBT_NUM", "VALUE_CRED_NUM", "VALUE_AMOUNT"]):
            new_val = generate_random_amount()
        elif "VALUE_REF_NUM" in label:
            new_val = f"{random.randint(1000, 9999)} - {random.randint(10000, 99999)}"

        pieces.append(text[cursor:start])
        pieces.append(new_val)
        cursor = end

        new_start = start + shift
        value['text'] = new_val
        value['start'] = new_start
        value['end'] = new_start + len(new_val)
        shift += len(new_val) - (end - start)

    pieces.append(text[cursor:])

    new_sample['annotations'][0]['result'] = results
    new_sample['data']['text'] = "".join(pieces)

    # Drop the id so the copy is not mistaken for the original task.
    new_sample.pop('id', None)
    return new_sample

# Labeled export to read, and the augmented dataset to write.
input_path = "../dataset/label/dataset_already_labeled.json"
output_path = "../dataset/label/dataset_sample_for_training.json"

if not os.path.exists(input_path):
    print(f"❌ data labeled is not found : {input_path}")
else:
    with open(input_path, 'r', encoding='utf-8') as f:
        original_data = json.load(f)

    # Every labeled task is kept as-is and followed by 30 randomized copies.
    augmented_data = []
    for task in original_data:
        augmented_data.append(task)
        augmented_data.extend([augment_sample(task) for _ in range(30)])

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(augmented_data, f, ensure_ascii=False, indent=2)

    print(f"✅ Generate {len(augmented_data)} tasks in {output_path}")

✅ Generate 310 tasks in ../dataset/label/dataset_sample_for_training.json


In [None]:
"""
    Split the augmented, labeled dataset into training, validation and test files
    training: 80%
    validation: 10%
    testing: 10%
"""
import json
import random
import os

# Augmented dataset written by the training-augmentation cell.
input_path = "../dataset/label/dataset_sample_for_training.json"

if os.path.exists(input_path):
    with open(input_path, "r", encoding="utf-8") as f:
        dataset_to_split = json.load(f)

    # Fixed seed so the shuffle, and therefore the split, is repeatable.
    # NOTE(review): the input mixes each labeled task with its augmented
    # copies, so a purely random split can place variants of one source task
    # in different splits - confirm this is acceptable for evaluation.
    random.seed(42)
    random.shuffle(dataset_to_split)

    # Cut points for the 80% / 10% / 10% split.
    total = len(dataset_to_split)
    train_end, val_end = int(total * 0.8), int(total * 0.9)

    train_set, val_set, test_set = (
        dataset_to_split[:train_end],
        dataset_to_split[train_end:val_end],
        dataset_to_split[val_end:],
    )

    output_dir = "../training/"
    os.makedirs(output_dir, exist_ok=True)

    # Target file name -> the slice stored in it.
    sets = {"train.json": train_set, "val.json": val_set, "test.json": test_set}

    for filename, data in sets.items():
        path = os.path.join(output_dir, filename)
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        print(f"✅ Saved {len(data)} tasks to: {path}")
else:
    print(f"❌ File not found: {input_path}")

    # https://repository.centralretail.com.vn/platform/automation/ml.git
