In [1]:
import spacy
from spacy.tokens import DocBin
from spacy.training import Example
from pathlib import Path
import json

In [6]:
TRAIN_JSONL = Path("../work/train.jsonl")
DEV_JSONL   = Path("../work/dev.jsonl")
PATTERNS    = Path("./entity_ruler_patterns/patterns.jsonl")
OUTDIR      = Path("../work/model_minimal")

In [3]:
def jsonl_to_examples(jsonl_path, nlp):
    examples = []
    with open(jsonl_path, "r") as f:
        for line in f:
            item = json.loads(line)
            text = item["text"]
            ents = item.get("entities", [])
            doc = nlp.make_doc(text)
            spans = []
            for start, end, label in ents:
                span = doc.char_span(start, end, label=label, alignment_mode="contract")
                if span is not None:
                    spans.append(span)
            doc.ents = spans
            examples.append(Example.from_dict(doc, {"entities": [(e.start_char, e.end_char, e.label_) for e in spans]}))
    return examples

In [8]:
nlp = spacy.blank("en")

In [9]:
ruler = nlp.add_pipe("entity_ruler", first=True)
ruler.from_disk(PATTERNS)
print(f"Loaded {len(ruler.patterns)} patterns")

Loaded 63 patterns


In [10]:
ner = nlp.add_pipe("ner")

In [15]:
examples = jsonl_to_examples(TRAIN_JSONL, nlp)
for eg in examples:
    for span in eg.get_aligned_ner():
        ner.add_label(span)
print("NER labels:", list(ner.labels))

NER labels: ['B-LOC', 'B-PERSON', 'I-LOC', 'I-PERSON', 'L-LOC', 'L-PERSON', 'O', 'U-GPE', 'U-NORP', 'U-PERSON']


In [16]:
optimizer = nlp.initialize(lambda: examples)
for epoch in range(10):
    losses = {}
    for example in examples:
        nlp.update([example], sgd=optimizer, losses=losses)
    print(f"Epoch {epoch} Losses: {losses}")

Epoch 0 Losses: {'ner': np.float32(1188.2955)}
Epoch 1 Losses: {'ner': np.float32(70.04648)}
Epoch 2 Losses: {'ner': np.float32(28.500517)}
Epoch 3 Losses: {'ner': np.float32(15.905543)}
Epoch 4 Losses: {'ner': np.float32(15.147819)}
Epoch 5 Losses: {'ner': np.float32(7.4423537)}
Epoch 6 Losses: {'ner': np.float32(7.312463)}
Epoch 7 Losses: {'ner': np.float32(3.2810335)}
Epoch 8 Losses: {'ner': np.float32(1.7211083)}
Epoch 9 Losses: {'ner': np.float32(2.0357819)}


In [17]:
dev_examples = jsonl_to_examples(DEV_JSONL, nlp)
scorer = nlp.evaluate(dev_examples)
print("Dev F-score:", scorer["ents_f"])

Dev F-score: 0.9736842105263158




In [18]:
print(scorer)

{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'ents_p': 0.9487179487179487, 'ents_r': 1.0, 'ents_f': 0.9736842105263158, 'ents_per_type': {'PERSON': {'p': 0.9259259259259259, 'r': 1.0, 'f': 0.9615384615384615}, 'NORP': {'p': 1.0, 'r': 1.0, 'f': 1.0}, 'GPE': {'p': 1.0, 'r': 1.0, 'f': 1.0}, 'LOC': {'p': 1.0, 'r': 1.0, 'f': 1.0}}, 'speed': 21694.75997998748}


In [9]:
data = {"text": "When this had been said, the Venerable Ananda spoke to the Blessed One, saying: 'Let it not be, Lord, that the Blessed One should pass away in this mean place, this uncivilized township in the midst of the jungle, a mere outpost of the province. There are great cities, Lord, such as Campa , Rajagaha, Savatthi , Saketa , Kosambi , and Benares â€” let the Blessed One have his final passing away in one of those. For in those cities dwell many wealthy nobles and brahmans and householders who are devotees of the Tathagata, and they will render due honor to the remains of the Tathagata.'\n", "spans": [{"start": 29, "end": 45, "label": "PERSON"}, {"start": 59, "end": 70, "label": "PERSON"}, {"start": 111, "end": 122, "label": "PERSON"}, {"start": 284, "end": 289, "label": "GPE"}, {"start": 292, "end": 300, "label": "GPE"}, {"start": 302, "end": 310, "label": "GPE"}, {"start": 313, "end": 319, "label": "GPE"}, {"start": 322, "end": 329, "label": "GPE"}, {"start": 336, "end": 343, "label": "GPE"}, {"start": 354, "end": 365, "label": "PERSON"}, {"start": 511, "end": 520, "label": "PERSON"}, {"start": 575, "end": 584, "label": "PERSON"}, {"start": 461, "end": 469, "label": "NORP"}]}


In [10]:
data.keys()


dict_keys(['text', 'spans'])

In [12]:
for s in data["spans"]:
    print(s.get('start'), s.get('end'), s.get('label'))

29 45 PERSON
59 70 PERSON
111 122 PERSON
284 289 GPE
292 300 GPE
302 310 GPE
313 319 GPE
322 329 GPE
336 343 GPE
354 365 PERSON
511 520 PERSON
575 584 PERSON
461 469 NORP
