In [1]:
import json
import math

In [2]:
# Load training data
list_data = []
with open("./resources/annotations.json", 'r') as f:
    list_data = json.load(f)

# Ambil hanya annotation
annotations = list_data['annotations']

# dapatkan total jumlah annotation
len_annotations = len(annotations)
print(f"Jumlah annotations adalah {len_annotations}")

# split annotation menjadi 8:2
len_training_annotations = math.floor(len_annotations * .8)
training_annotations = annotations[:len_training_annotations]
test_annotations = annotations[len_training_annotations:]
print(f"Jumlah training annotations adalah {len_training_annotations}")
print(f"Jumlah test annotations adalah {len_annotations-len_training_annotations}")

Jumlah annotations adalah 254
Jumlah training annotations adalah 203
Jumlah test annotations adalah 51


In [3]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

# Load spacy model baru
nlp = spacy.blank("id")
db = DocBin()


# referensi: https://turbolab.in/build-a-custom-ner-model-using-spacy-3-0/
# referensi: https://agateteam.org/spacynerannotate/
# Proses load data annotation menjadi format spacy
for text, annot in annotations:
    doc = nlp(text)
    ents = []
    for (start, end, label) in annot["entities"]:
        span = doc.char_span(start, end, label=label)
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    try:
        doc.ents = ents
        db.add(doc)
    except:
        print(text, annot)

# simpan data dengan format spacy
db.to_disk("./output/train.spacy")

In [4]:
import subprocess

# Membuat config dengan menggunakan base_config.cfg
# python -m spacy init fill-config base_config.cfg config.cfg
process = subprocess.run(['python', '-m', 'spacy', 'init', 'fill-config',
                         './resources/base_config.cfg', './output/config.cfg'], capture_output=True)
output = process.stdout.decode()

print(output)

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
output/config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy



In [5]:
import subprocess
import os

def func_training_model(located, force = False):
    # Training data
    # python -m spacy train config.cfg --output ./output
    if os.path.exists(located) == False or force:
        process = subprocess.run(['python', '-m', 'spacy', 'train',
                                './output/config.cfg', '--output', located], capture_output=True)
        output = process.stdout.decode()
        # Butuh waktu sekitar 18 menit an
        print(output)
    else:
        print('Training already success')

func_training_model("./models", False)

Training already success


In [6]:
# Testing data

# Proses load data annotation menjadi format spacy
for text, annot in test_annotations:
    doc = nlp(text)
    ents = []
    for (start, end, label) in annot["entities"]:
        span = doc.char_span(start, end, label=label)
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    try:
        doc.ents = ents
        db.add(doc)
    except:
        print(text, annot)

# simpan data dengan format spacy
db.to_disk("./output/test.spacy")

In [14]:
import spacy
from spacy.scorer import Scorer
from spacy.training import Example


nlp = spacy.load('./models/model-last/')
examples = []


for text, annotations in test_annotations:
    doc_pred=nlp(text)
    example=Example.from_dict(doc_pred, annotations)
    examples.append(example)


scorer = Scorer(nlp)
scores = scorer.score(examples)

print(examples)
print((scores))
print(f"Precision = {scores['ents_p']}")
print(f"Recall = {scores['ents_r']}")
print(f"F1-Score = {scores['ents_f']}")

{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'ents_p': 0.9952153110047847, 'ents_r': 0.9904761904761905, 'ents_f': 0.9928400954653939, 'ents_per_type': {'CRIME': {'p': 0.9878048780487805, 'r': 1.0, 'f': 0.9938650306748467}, 'PERSON': {'p': 1.0, 'r': 0.9375, 'f': 0.967741935483871}, 'ORGANIZATION': {'p': 1.0, 'r': 1.0, 'f': 1.0}, 'DATE': {'p': 1.0, 'r': 1.0, 'f': 1.0}, 'LOCATION': {'p': 1.0, 'r': 1.0, 'f': 1.0}}}
Precision = 0.9952153110047847
Recall = 0.9904761904761905
F1-Score = 0.9928400954653939


In [29]:
def get_entities(doc):
    return [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]


def evaluate_ner(model, test_data):
    correct_entities = 0
    total_entities = 0

    for text, annotations in test_data:
        true_entities = annotations['entities']
        total_entities += len(true_entities)

        doc = model(text)
        pred_entities = get_entities(doc)

        for entity in pred_entities:
            ent = []
            ent.append(entity[1])
            ent.append(entity[2])
            ent.append(entity[3])
            if ent in true_entities:
                correct_entities += 1

    entity_accuracy = correct_entities / total_entities if total_entities > 0 else 0
    return entity_accuracy


# Evaluate the model
entity_accuracy = evaluate_ner(nlp, test_data=test_annotations)
print(f"Entity-level Accuracy: {entity_accuracy:.2f}")

Entity-level Accuracy: 0.99


In [8]:
import pandas as pd


df = pd.read_csv("./resources/FIX.csv")
new_df = df[['full_text']]

new_df.to_csv("./resources/upload.csv", index=False)