In [3]:
# !pip install spacy datasets
# !python -m spacy download en_core_web_sm

In [5]:
import spacy
from spacy.training.example import Example
from datasets import load_dataset
import json

# ---------- Ekstraksi ----------
def extract_from_cord(example):
    tokens, labels = [], []
    parsed = json.loads(example['ground_truth'])
    for item in parsed.get('gt_parse', {}).get('menu', []):
        if isinstance(item, dict):
            # Handle item_name as list or str
            item_name_raw = item.get('nm', '')
            if isinstance(item_name_raw, list):
                item_name = " ".join(item_name_raw).strip()
            elif isinstance(item_name_raw, str):
                item_name = item_name_raw.strip()
            else:
                item_name = ''

            quantity = item.get('cnt', 0)
            price = item.get('price', 0)

            if item_name and quantity and price:
                tokens += [item_name, str(quantity), str(price)]
                labels += ['B-ITEM', 'B-QUANTITY', 'B-PRICE']
    return tokens, labels

def extract_from_donut(example):
    tokens, labels = [], []
    parsed = json.loads(example['ground_truth'])
    for item in parsed.get('gt_parse', {}).get('items', []):
        item_name = item.get('item_desc', '').strip()
        quantity = item.get('item_qty', 0)
        price = item.get('item_gross_worth') or item.get('item_net_price') or 0
        if item_name and quantity and price:
            tokens += [item_name, str(quantity), str(price)]
            labels += ['B-ITEM', 'B-QUANTITY', 'B-PRICE']
    return tokens, labels

def convert_tokens_to_spacy_format(tokens, labels):
    text = " ".join(tokens)
    entities = []
    start = 0
    for token, label in zip(tokens, labels):
        start = text.find(token, start)
        if start == -1:
            continue
        end = start + len(token)
        entities.append((start, end, label.replace("B-", "")))
        start = end
    return (text, {"entities": entities})

# ---------- Load dataset ----------
cord = load_dataset("naver-clova-ix/cord-v2")
donut = load_dataset("katanaml-org/invoices-donut-data-v1")

def make_examples(dataset, extractor_fn):
    examples = []
    for ex in dataset:
        tokens, labels = extractor_fn(ex)
        if tokens:
            examples.append(convert_tokens_to_spacy_format(tokens, labels))
    return examples

train_examples = make_examples(cord["train"], extract_from_cord) + make_examples(donut["train"], extract_from_donut)
val_examples = make_examples(cord["validation"], extract_from_cord) + make_examples(donut["validation"], extract_from_donut)
test_examples = make_examples(cord["test"], extract_from_cord) + make_examples(donut["test"], extract_from_donut)

print(f"Train: {len(train_examples)}, Val: {len(val_examples)}, Test: {len(test_examples)}")

# ---------- Training spaCy ----------
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")

# Tambah label
for _, ann in train_examples:
    for _, _, label in ann["entities"]:
        ner.add_label(label)

# Training
optimizer = nlp.begin_training()
for i in range(20):
    losses = {}
    for text, annotations in train_examples:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        nlp.update([example], drop=0.35, losses=losses)
    print(f"Epoch {i+1}: Loss = {losses['ner']:.4f}")

# Simpan model
nlp.to_disk("ner_receipt_model")

# ---------- Evaluasi ----------
def evaluate_ner(nlp, examples):
    tp = {"ITEM": 0, "QUANTITY": 0, "PRICE": 0}
    fp = {"ITEM": 0, "QUANTITY": 0, "PRICE": 0}
    fn = {"ITEM": 0, "QUANTITY": 0, "PRICE": 0}

    for text, ann in examples:
        pred = nlp(text)
        pred_ents = {(ent.start_char, ent.end_char, ent.label_) for ent in pred.ents}
        true_ents = {(start, end, label) for start, end, label in ann["entities"]}

        for ent in pred_ents:
            if ent in true_ents:
                tp[ent[2]] += 1
            else:
                fp[ent[2]] += 1
        for ent in true_ents:
            if ent not in pred_ents:
                fn[ent[2]] += 1

    print("\n--- Evaluation Report ---")
    for label in tp:
        precision = tp[label] / (tp[label] + fp[label] + 1e-6)
        recall = tp[label] / (tp[label] + fn[label] + 1e-6)
        f1 = 2 * precision * recall / (precision + recall + 1e-6)
        print(f"{label}: P={precision:.2f} R={recall:.2f} F1={f1:.2f}")

print("\n✅ Validation Evaluation:")
evaluate_ner(nlp, val_examples)

print("\n✅ Test Evaluation:")
evaluate_ner(nlp, test_examples)

Train: 827, Val: 100, Test: 81


[2025-05-07 20:10:25,080] [INFO] Created vocabulary
[2025-05-07 20:10:25,081] [INFO] Finished initializing nlp object
  d_xhat = N * dY - sum_dy - dist * var ** (-1.0) * sum_dy_dist


Epoch 1: Loss = 1660.3038
Epoch 2: Loss = 516.5396
Epoch 3: Loss = 329.6005
Epoch 4: Loss = 288.5896
Epoch 5: Loss = 253.7034
Epoch 6: Loss = 230.5863
Epoch 7: Loss = 217.8163
Epoch 8: Loss = 152.0029
Epoch 9: Loss = 140.4869
Epoch 10: Loss = 98.9411
Epoch 11: Loss = 117.5162
Epoch 12: Loss = 57.2309
Epoch 13: Loss = 87.6488
Epoch 14: Loss = 117.0263
Epoch 15: Loss = 130.9066
Epoch 16: Loss = 63.1573
Epoch 17: Loss = 40.5748
Epoch 18: Loss = 69.2594
Epoch 19: Loss = 87.4858
Epoch 20: Loss = 88.2810

✅ Validation Evaluation:

--- Evaluation Report ---
ITEM: P=0.99 R=0.98 F1=0.99
QUANTITY: P=1.00 R=0.99 F1=0.99
PRICE: P=1.00 R=0.99 F1=0.99

✅ Test Evaluation:

--- Evaluation Report ---
ITEM: P=0.99 R=0.99 F1=0.99
QUANTITY: P=1.00 R=0.99 F1=1.00
PRICE: P=1.00 R=1.00 F1=1.00


In [6]:
import spacy

nlp = spacy.load("ner_receipt_model")
text = "Nasi Goreng 2 25000"

doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_)

Nasi Goreng ITEM
2 QUANTITY
25000 PRICE


In [7]:
import shutil

# Menyimpan model ke dalam format zip
shutil.make_archive("ner_receipt_model", 'zip', "ner_receipt_model")

'/kaggle/working/ner_receipt_model.zip'