In [18]:
import typer
import spacy
import srsly
import pandas as pd
from spacy.tokens import DocBin, Doc, Span
from spacy.training.example import Example
import random
import tqdm

In [2]:
# make the factory work
from rel_pipe import make_relation_extractor, score_relations

# make the config work
from rel_model import create_relation_model, create_classification_layer, create_instances, create_tensors

In [41]:
# Load the pre-trained model
# The path is relative to the notebook's working directory. The pipeline listing
# printed further down shows a `transformer` and a `relation_extractor` component.
nlp = spacy.load("../training/model-best")

In [42]:
# List each pipeline component next to the name it is registered under
for name, proc in nlp.pipeline:
    print(f"name:  {name} \tproc:  {proc}")

name:  transformer 	proc:  <spacy_transformers.pipeline_component.Transformer object at 0x2c75c0290>
name:  relation_extractor 	proc:  <rel_pipe.RelationExtractor object at 0x2c549c710>


In [40]:
# Look up individual pipeline components by their registered names
transformer = nlp.get_pipe("transformer")
relation_extractor = nlp.get_pipe("relation_extractor")
# Show both components, one per line (relation extractor first)
print(relation_extractor, transformer, sep="\n")

<rel_pipe.RelationExtractor object at 0x2c50df9b0>
<spacy_transformers.pipeline_component.Transformer object at 0x2c50de9f0>


In [43]:
# Example script to retrieve a function from a pipeline procedure
# The relation extractor's model keeps its candidate-pair generator in `attrs`
# under the key "get_instances"; the printed repr shows it is a lambda created
# inside `create_instances`.
get_instances = relation_extractor.model.attrs["get_instances"]
print(get_instances)

<function create_instances.<locals>.<lambda> at 0x2c50be5c0>


In [44]:
# Predict on a document without modifying it
# Run the full pipeline on two raw job-description texts. No entities have been
# set on these docs yet; the "Could not determine any instances" messages in the
# output presumably come from the relation extractor finding no candidate pairs
# (TODO confirm against rel_pipe).
doc1 = nlp("The Associate Security Analyst supports security systems, operations administration, monitoring and maintenance of cyber security systems and applications. He/She monitors security alerts and events. He collects and documents information based on established practices and supports the preparation and publishing of security advisories. He assists with the analysis of security-related information and events, escalation of incidents for validation and remediation. He is required to be on standby with on-call availability with varied shifts including nights, weekends and holidays. He is familiar with cyber security standards, protocols and frameworks, and is required to act in accordance with the Cyber Security Act 2018. He is knowledgeable in using various cyber security tools and techniques to monitor and resolve incidents. The Associate Security Analyst is alert and vigilant in performing monitoring activities and is able to analyse and resolve security-related issues critically. He communicates clearly in his interactions with others and coordinates effectively with his team to perform security operations.")
doc2 = nlp("The Chief Information Security Officer develops and drives the vision for the information security function. He/She acts as the authority for the development and enforcement of organisation security strategy, standards and policies, and has ultimate responsibility for ensuring the protection of corporate information. He guides the design and continuous improvement of the IT security architecture and Cyber Risk Maturity Model that balances business needs with security risks. He advises the board and top executives on all security matters and sets directions for complying with regulatory inquiries, legal and compliance regulations, inspections and audits. He is an expert in cyber security compliance standards, protocols and frameworks, as well as the Cyber Security Act 2018. He is keeps abreast of cyber-related applications and hardware technologies and services, and is constantly on the look-out for new technologies that may be leveraged on to enhance work processes, or which may pose as potential threats. The Chief Information Security Officer is an inspirational and influential leader, who displays sound judgement and decisiveness in ensuring that corporate information is well protected and secured. He is strategic in his approach toward resource management and capability development among his teams.")

[38;5;4mℹ Could not determine any instances in doc - returning doc as is.[0m
[38;5;4mℹ Could not determine any instances in doc - returning doc as is.[0m


In [46]:
# Inspect the custom `rel` extension on doc1; it is an empty dict at this point
# because no relation instances were found for the entity-less doc.
doc1._.rel

{}

In [64]:
# Let's go a step back and add manual instances to doc1 & doc2
# Spans are given as token offsets (start inclusive, end exclusive) and must be
# built on the doc they are assigned to.
doc1.ents = [
    Span(doc1, 1, 4, label="OCCUPATION"),
    Span(doc1, 5, 7, label="SKILL"),
    Span(doc1, 8, 10, label="SKILL"),
    Span(doc1, 15, 17, label="SKILL"),
]

# These offsets index into doc2's tokens, so the spans are created on doc2
# rather than doc1.
doc2.ents = [
    Span(doc2, 1, 5, label="OCCUPATION"),
    Span(doc2, 12, 14, label="SKILL"),
    # Span(doc2, 28, 30, label="SKILL"),
    # Span(doc2, 43, 45, label="SKILL"),
]

In [68]:
# Call the relation extractor's predict step directly on doc1 (now carrying the
# manually assigned entities) and keep the returned scores for inspection.
scores = relation_extractor.predict([doc1])

In [69]:
# Display the raw scores. The output is a float32 array with 12 rows and 2 columns;
# presumably one row per candidate entity pair and one column per relation label
# (TODO confirm against rel_model).
scores

array([[9.98137116e-01, 1.13621921e-08],
       [9.97541308e-01, 9.77544889e-09],
       [9.98295128e-01, 1.47525085e-08],
       [4.52181703e-05, 3.71496078e-08],
       [2.50027166e-03, 2.90826185e-09],
       [3.60449357e-03, 4.38897052e-09],
       [3.86377360e-05, 4.11782395e-08],
       [2.82057608e-03, 3.74691034e-09],
       [3.08154104e-03, 4.86491647e-09],
       [3.50350128e-05, 4.80279638e-08],
       [2.55823880e-03, 4.37018377e-09],
       [1.93827704e-03, 3.75987552e-09]], dtype=float32)

In [76]:
# Scratch code kept for reference (currently disabled): list the start offsets of
# every candidate pair, then write `scores` back onto the doc.
# get_instances = relation_extractor.model.attrs["get_instances"]

# for doc in docs:
# for (e1, e2) in get_instances(doc1):
#      print(e1.start, ", ", e2.start)
# relation_extractor.set_annotations([doc1], scores)

# NOTE(review): the output below shows an entry for (1, 5), so annotations were
# presumably written in an earlier run of this cell with set_annotations enabled.
# On a fresh kernel with the line above commented out the result may differ - confirm.
doc1._.rel

{(1, 5): {'KNOWS': 0.9981371, 'HAS': 1.1362192e-08}}

In [53]:
# NOTE(review): "OCCUPATION" is used as an *entity* label earlier in this notebook,
# while the relation labels seen in the output are 'KNOWS' and 'HAS'. Registering it
# as a relation label here looks unintended and mutates the loaded component - confirm.
relation_extractor.add_label("OCCUPATION")

1

In [4]:
# Read the annotated records and keep only their raw text for prediction
data = srsly.read_jsonl("./golden.jsonl")
data_instances = [record["text"] for record in data]
# Preview the first five texts
print(data_instances[:5])

['The Associate Security Analyst supports security systems, operations administration, monitoring and maintenance of cyber security systems and applications. He/She monitors security alerts and events. He collects and documents information based on established practices and supports the preparation and publishing of security advisories. He assists with the analysis of security-related information and events, escalation of incidents for validation and remediation. He is required to be on standby with on-call availability with varied shifts including nights, weekends and holidays. He is familiar with cyber security standards, protocols and frameworks, and is required to act in accordance with the Cyber Security Act 2018. He is knowledgeable in using various cyber security tools and techniques to monitor and resolve incidents. The Associate Security Analyst is alert and vigilant in performing monitoring activities and is able to analyse and resolve security-related issues critically. He c

In [9]:
# Tokenize every text (tokenizer only, no pipeline components) and collect the
# resulting docs in a DocBin
db = DocBin()
for text in data_instances:
    doc = nlp.make_doc(text)
    db.add(doc)

print(f"Added  {len(db)}  documents")

# Serialize the collection so it can be reloaded later
db.to_disk("predict.spacy")

Added  138  documents


In [19]:
# Load data into memory
# Read the serialized DocBin back from disk and iterate *that* object, so this
# cell does not depend on the in-memory `db` from the cell that wrote the file.
doc_bin = DocBin(store_user_data=True).from_disk('./predict.spacy')
# `DocBin.get_docs` yields docs lazily. Materialize them into a list so `docs`
# can be iterated by several later cells instead of being exhausted by the first.
docs = list(doc_bin.get_docs(nlp.vocab))

In [20]:
# Show the entities of every loaded doc. The docs were created with the tokenizer
# only, so each one prints an empty tuple.
# NOTE(review): if `docs` is a one-shot generator, this loop consumes it and later
# cells iterating `docs` will see nothing - confirm.
for doc in docs:
     print(doc.ents)

()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()


In [17]:
def _score_and_format(examples, thresholds):
    """Print the relation scores for `examples` at each given threshold.

    For every value in `thresholds`, `score_relations(examples, threshold)` is
    called and each returned metric is rendered as a percentage string with two
    decimals. One line is printed per threshold; nothing is returned.
    """
    for threshold in thresholds:
        metrics = score_relations(examples, threshold)
        results = {}
        for metric_name, value in metrics.items():
            results[metric_name] = f"{value * 100:.2f}"
        print(f"threshold {threshold:.2f} \t {results}")

In [None]:
# Make predictions for entities
# The relation extractor and its candidate-pair generator are the same for every
# document, so they are looked up once before the loop.
relation_extractor = nlp.get_pipe("relation_extractor")
get_instances = relation_extractor.model.attrs["get_instances"]


def _unannotated_copy(gold):
    """Return a new Doc with gold's tokens and entities and nothing else."""
    copy = Doc(
        nlp.vocab,
        words=[t.text for t in gold],
        spaces=[t.whitespace_ for t in gold],
    )
    copy.ents = gold.ents
    return copy


examples = []         # (model prediction, gold) pairs
random_examples = []  # (random baseline, gold) pairs
# NOTE: `docs` must still be iterable here; an already-consumed generator would
# leave both lists empty.
for gold in docs:
    # Model predictions: run every pipeline component on a clean copy.
    pred = _unannotated_copy(gold)
    for name, proc in nlp.pipeline:
        pred = proc(pred)
    examples.append(Example(pred, gold))

    # Random baseline: use a *separate* Doc. Writing random scores into `pred`
    # would overwrite the model's predictions, because the Example stored in
    # `examples` holds a reference to that same Doc.
    random_pred = _unannotated_copy(gold)
    for (e1, e2) in get_instances(random_pred):
        offset = (e1.start, e2.start)
        if offset not in random_pred._.rel:
            random_pred._.rel[offset] = {}
        for label in relation_extractor.labels:
            random_pred._.rel[offset][label] = random.uniform(0, 1)
    random_examples.append(Example(random_pred, gold))