In [1]:
# This script was derived from parse_data.py but made more generic as a template for various REL parsing needs

import json
import random
import typer
from pathlib import Path

from spacy.tokens import DocBin, Doc
from spacy.vocab import Vocab
from wasabi import Printer

msg = Printer()

# TODO: define your labels used for annotation either as "symmetrical" or "directed"
SYMM_LABELS = []
DIRECTED_LABELS = ["KNOWS", "HAS"]

# TODO: define splits for train/dev/test. What is not in test or dev, will be used as train.
# test_portion = 0.1
# dev_portion = 0.1

# TODO: set this bool to False if you didn't annotate all relations in all sentences.
# If it's true, entities that were not annotated as related will be used as negative examples.
is_complete = True

In [2]:
train_count = 138
test_count = 1300
dev_count = 301

In [2]:
"""Creating the corpus from the Prodigy annotations."""
Doc.set_extension("rel", default={})
vocab = Vocab()
docs = {"train": [], "dev": [], "test": []}
count_all = {"train": 0, "dev": 0, "test": 0}
count_pos = {"train": 0, "dev": 0, "test": 0}

In [3]:
json_loc = Path('../assets/golden_skill_annotations2.jsonl')
# json_loc = Path('assets/all_annotations_6nov.jsonl')

In [5]:
import spacy
from spacy import displacy

# nlp = spacy.load("en_core_web_sm")
# doc = nlp("This is a sentence.")
# displacy.serve(doc, style="dep")

In [6]:
from spacy.util import filter_spans
from spacy.tokens import Span

colors = {"SKILL": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
          "OCC": "linear-gradient(90deg, #ff9c00, #aa9c55)"}

options = {"colors": colors}

train_count = 100
test_count = 30
dev_count = 8

"""Creating the corpus from the Prodigy annotations."""
# Doc.set_extension("rel", default={})
vocab = Vocab()

docs = {"train": [], "dev": [], "test": []}
count_all = {"train": 0, "dev": 0, "test": 0}
count_pos = {"train": 0, "dev": 0, "test": 0}

with json_loc.open("r", encoding="utf8") as jsonfile:
    for line in jsonfile:
        example = json.loads(line)
        span_starts = set()
        if example["answer"] == "accept":
            neg = 0
            pos = 0
            # Parse the tokens
            words = [t["text"] for t in example["tokens"]]
            spaces = [t["ws"] for t in example["tokens"]]
            doc = Doc(vocab, words=words, spaces=spaces)
            
            # Parse the entities
            spans = example["spans"]
            entities = []
            span_end_to_start = {}
            for span in spans:
                
                # Discard really long entities
                # if (span["token_end"] - span["token_start"] > 3):
                #     continue

                entity = doc.char_span(
                    span["start"], span["end"], label=span["label"]
                )
                
                span_end_to_start[span["token_end"]] = span["token_start"]
                
                entities.append(entity)
                
                span_starts.add(span["token_start"])

            if not entities:
                msg.warn("Could not parse any entities from the JSON file.")
            

            filtered = filter_spans(entities)
            doc.ents = filtered

            doc.user_data["title"] = "Occupation - Skills"
            displacy.serve(doc, style="ent", options=options)
            # displacy.render(doc, style="ent", options=options)

            # Parse the relations
            rels = {}
            for x1 in span_starts:
                for x2 in span_starts:
                    rels[(x1, x2)] = {}
            relations = example["relations"]
            for relation in relations:
                # Ignoring relations that are not between spans (they are annotated on the token level)
                # if not relation["head_span"]["token_start"] in span_end_to_start or not relation["child_span"]["token_end"] in span_end_to_start:
                if not relation["head"] in span_end_to_start or not relation["child"] in span_end_to_start:
                    msg.warn(f"This script only supports relationships between annotated entities.")
                    break
                # the 'head' and 'child' annotations refer to the end token in the span
                # but we want the first token
                start = span_end_to_start[relation["head"]]
                end = span_end_to_start[relation["child"]]
                label = relation["label"]
                if label not in SYMM_LABELS + DIRECTED_LABELS:
                    msg.warn(f"Found label '{label}' not defined in SYMM_LABELS or DIRECTED_LABELS - skipping")
                    break
                if label not in rels[(start, end)]:
                    rels[(start, end)][label] = 1.0
                    pos += 1
                if label in SYMM_LABELS:
                    if label not in rels[(end, start)]:
                        rels[(end, start)][label] = 1.0
                        pos += 1
            # If the annotation is complete, fill in zero's where the data is missing
            if is_complete:
                for x1 in span_starts:
                    for x2 in span_starts:
                        for label in SYMM_LABELS + DIRECTED_LABELS:
                            if label not in rels[(x1, x2)]:
                                neg += 1
                                rels[(x1, x2)][label] = 0.0
            doc._.rel = rels

            if pos > 0:
                if train_count > 0:
                    docs["train"].append(doc)
                    count_pos["train"] += pos
                    count_all["train"] += pos + neg
                    train_count -= 1
                elif test_count > 0:
                    docs["test"].append(doc)
                    count_pos["test"] += pos
                    count_all["test"] += pos + neg
                    test_count -= 1
                elif dev_count > 0:
                    docs["dev"].append(doc)
                    count_pos["dev"] += pos
                    count_all["dev"] += pos + neg
                    dev_count -= 1




Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.
[38;5;3m⚠ This script only supports relationships between annotated
entities.[0m


In [36]:
docs['train'][0]

The Security Operations Manager plans and oversees monitoring and maintenance of security operations and provides direction and leadership to internal resources. He/She provides expertise on security technologies and innovative security concepts and works toward enhancing the resilience of security operations. He coordinates ongoing reviews of existing security programs, protocols and planned upgrades. He establishes escalation processes for security incidents and develops contingency plans and disaster recovery procedures. He focuses on policy implementation and control. He is familiar with cyber security standards, protocols and frameworks, and ensures the organisations compliance with the Cyber Security Act 2018. He is knowledgeable in using various cyber security monitoring and testing tools and techniques. The Security Operations Manager is diligent and watchful in monitoring security operations, systems and activities. He is also a confident leader who develops plans and solution

In [None]:
displacy.serve(docs['train'][0], style="ent")


Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [None]:
displacy.serve(docs['train'][0], style="span")


Using the 'span' visualizer
Serving on http://0.0.0.0:5000 ...



127.0.0.1 - - [07/Nov/2023 13:47:35] "GET / HTTP/1.1" 200 1561
127.0.0.1 - - [07/Nov/2023 13:47:35] "GET /favicon.ico HTTP/1.1" 200 1561


Shutting down server on port 5000.


In [25]:
for idx, i in enumerate(docs['train']):
     print(len(i.ents_))

AttributeError: 'spacy.tokens.doc.Doc' object has no attribute 'ents_'

In [38]:
examples[0].keys()

dict_keys(['text', '_input_hash', '_task_hash', '_is_binary', 'spans', 'tokens', '_view_id', 'relations', 'answer', '_timestamp', '_annotator_id', '_session_id'])

In [35]:
span_starts = set()
if examples[0]["answer"] == "accept":
    neg = 0
    pos = 0
    # Parse the tokens
    words = [t["text"] for t in example["tokens"]]
    # print(len(words)) # 192
    spaces = [t["ws"] for t in example["tokens"]]
    # print(len(spaces)) # 192
    doc = Doc(vocab, words=words, spaces=spaces)
    # Parse the entities
    spans = example["spans"]
    # print(len(spans)) # 30
    entities = []
    span_end_to_start = {}
    for span in spans:
        # Agregué esta linea para evitar overlapping de "two labels on the same token"
        if any(e.label_ == span["label"] for e in entities):
            continue
        
        entity = doc.char_span(
            span["start"], span["end"], label=span["label"]
        )
        print(entity)
        span_end_to_start[span["token_end"]] = span["token_start"]
        entities.append(entity)
        span_starts.add(span["token_start"])
    if not entities:
        msg.warn("Could not parse any entities from the JSON file.")
    doc.ents = entities

30
Senior Product Manager
product portfolio roadmap


In [41]:
len(doc.ents)

2

In [None]:
docbin = DocBin(docs=docs["train"], store_user_data=True)
docbin.to_disk(train_file)
msg.info(
    f"{len(docs['train'])} training sentences, "
    f"{count_pos['train']}/{count_all['train']} pos instances."
)
docbin = DocBin(docs=docs["dev"], store_user_data=True)
docbin.to_disk(dev_file)
msg.info(
    f"{len(docs['dev'])} dev sentences, "
    f"{count_pos['dev']}/{count_all['dev']} pos instances."
)
docbin = DocBin(docs=docs["test"], store_user_data=True)
docbin.to_disk(test_file)
msg.info(
    f"{len(docs['test'])} test sentences, "
    f"{count_pos['test']}/{count_all['test']} pos instances."
)


if __name__ == "__main__":
    typer.run(main)


In [2]:
import spacy
model = spacy.load('../training/v4/model-best/')

ValueError: [E002] Can't find factory for 'relation_extractor' for language English (en). This usually happens when spaCy calls `nlp.create_pipe` with a custom component name that's not registered on the current language class. If you're using a Transformer, make sure to install 'spacy-transformers'. If you're using a custom component, make sure you've added the decorator `@Language.component` (for function components) or `@Language.factory` (for class components).

Available factories: attribute_ruler, tok2vec, merge_noun_chunks, merge_entities, merge_subtokens, token_splitter, doc_cleaner, parser, beam_parser, lemmatizer, trainable_lemmatizer, entity_linker, entity_ruler, tagger, morphologizer, ner, beam_ner, senter, sentencizer, spancat, spancat_singlelabel, span_finder, future_entity_ruler, span_ruler, textcat, textcat_multilabel, en.lemmatizer