In [1]:
import typer
import spacy
import srsly
import pandas as pd
from spacy.tokens import DocBin, Doc, Span
from spacy.training.example import Example
import random
import tqdm

In [4]:
# make the factory work
from rel_pipe import make_relation_extractor, score_relations

# make the config work
from rel_model import create_relation_model, create_classification_layer, create_instances, create_tensors

In [43]:
# Load the pre-trained model
nlp = spacy.load("../training/model-best")

In [58]:
# Load the jsonl data to predict
data = srsly.read_jsonl("./golden.jsonl")
data_instances = [eg["text"] for eg in data]

# Represent data via DocBin [list of docs]
db = DocBin()
for text in data_instances:
     db.add(nlp(text))
     # doc = nlp.make_doc(text)
     # db.add(doc)

db.to_disk("../data/predict.spacy")

[38;5;4mℹ Could not determine any instances in doc - returning doc as is.[0m
[38;5;4mℹ Could not determine any instances in doc - returning doc as is.[0m
[38;5;4mℹ Could not determine any instances in doc - returning doc as is.[0m
[38;5;4mℹ Could not determine any instances in doc - returning doc as is.[0m
[38;5;4mℹ Could not determine any instances in doc - returning doc as is.[0m
[38;5;4mℹ Could not determine any instances in doc - returning doc as is.[0m
[38;5;4mℹ Could not determine any instances in doc - returning doc as is.[0m
[38;5;4mℹ Could not determine any instances in doc - returning doc as is.[0m
[38;5;4mℹ Could not determine any instances in doc - returning doc as is.[0m
[38;5;4mℹ Could not determine any instances in doc - returning doc as is.[0m
[38;5;4mℹ Could not determine any instances in doc - returning doc as is.[0m
[38;5;4mℹ Could not determine any instances in doc - returning doc as is.[0m


In [59]:
# Load data into memory
doc_bin = DocBin(store_user_data=False).from_disk('../data/predict.spacy')

In [60]:
docs = doc_bin.get_docs(nlp.vocab)

In [63]:
examples = []
for gold in docs:
   pred = Doc(
       nlp.vocab,
       words=[t.text for t in gold],
       spaces=[t.whitespace_ for t in gold],
   )
   pred.ents = gold.ents
   for name, proc in nlp.pipeline:
       pred = proc(pred)
   examples.append(Example(pred, gold))
   print()
   print(f"Text: {gold.text}")
   print(f"spans: {[(e.start, e.text, e.label_) for e in pred.ents]}")
   for value, rel_dict in pred._.rel.items():
       gold_labels = [k for (k, v) in gold._.rel[value].items() if v == 1.0]
       if gold_labels:
           print(
               f" pair: {value} --> gold labels: {gold_labels} --> predicted values: {rel_dict}"
           )
   print()

In [62]:
len(examples)

138

In [8]:
# Print the available pipeline procedures
for name, proc in nlp.pipeline:
     print("name: ", name, "\tproc: ", proc)
     #    pred = proc(pred)

name:  tok2vec 	proc:  <spacy.pipeline.tok2vec.Tok2Vec object at 0x2cb8efd70>
name:  ner 	proc:  <spacy.pipeline.ner.EntityRecognizer object at 0x2cb8e9070>
name:  relation_extractor 	proc:  <rel_pipe.RelationExtractor object at 0x2cb8469f0>


In [6]:
# Example script to retrieve a specific pipeline procedure
tok2vec = nlp.get_pipe("tok2vec")
# transformer = nlp.get_pipe("transformer")
ner = nlp.get_pipe("ner")
relation_extractor = nlp.get_pipe("relation_extractor")
print(tok2vec)
print(ner)
print(relation_extractor)
# print(transformer)

<spacy.pipeline.tok2vec.Tok2Vec object at 0x2aadde6f0>
<spacy.pipeline.ner.EntityRecognizer object at 0x2aadbdf50>
<rel_pipe.RelationExtractor object at 0x2aaddfa70>


In [11]:
# Example script to retrieve a function from a pipeline procedure
get_instances = relation_extractor.model.attrs["get_instances"]
print(get_instances)

<function create_instances.<locals>.<lambda> at 0x2aadb3c40>


In [29]:
# Predict on a document without modifying it
doc1 = nlp("The Associate Security Analyst supports security systems, operations administration, monitoring and maintenance of cyber security systems and applications. He/She monitors security alerts and events. He collects and documents information based on established practices and supports the preparation and publishing of security advisories. He assists with the analysis of security-related information and events, escalation of incidents for validation and remediation. He is required to be on standby with on-call availability with varied shifts including nights, weekends and holidays. He is familiar with cyber security standards, protocols and frameworks, and is required to act in accordance with the Cyber Security Act 2018. He is knowledgeable in using various cyber security tools and techniques to monitor and resolve incidents. The Associate Security Analyst is alert and vigilant in performing monitoring activities and is able to analyse and resolve security-related issues critically. He communicates clearly in his interactions with others and coordinates effectively with his team to perform security operations.")
doc2 = nlp("The Chief Information Security Officer develops and drives the vision for the information security function. He/She acts as the authority for the development and enforcement of organisation security strategy, standards and policies, and has ultimate responsibility for ensuring the protection of corporate information. He guides the design and continuous improvement of the IT security architecture and Cyber Risk Maturity Model that balances business needs with security risks. He advises the board and top executives on all security matters and sets directions for complying with regulatory inquiries, legal and compliance regulations, inspections and audits. He is an expert in cyber security compliance standards, protocols and frameworks, as well as the Cyber Security Act 2018. He is keeps abreast of cyber-related applications and hardware technologies and services, and is constantly on the look-out for new technologies that may be leveraged on to enhance work processes, or which may pose as potential threats. The Chief Information Security Officer is an inspirational and influential leader, who displays sound judgement and decisiveness in ensuring that corporate information is well protected and secured. He is strategic in his approach toward resource management and capability development among his teams.")
doc3 = nlp("The .NET Framework is a software framework developed by Microsoft that runs primarily on Microsoft Windows. It includes a large class library called Framework Class Library (FCL) and provides language interoperability across several programming languages. Programs written for .NET Framework execute in a software environment named the Common Language Runtime (CLR). The CLR is an application virtual machine that provides services such as security, memory management, and exception handling. As such, computer code written using .NET Framework is called \"managed code\". FCL and CLR together constitute the .NET Framework.")
doc4 = nlp("\nDefined by Microsoft for use in recent versions of Windows, an assembly in the Common Language Infrastructure (CLI) is a compiled code library used for deployment, versioning, and security. There are two types: process assemblies (EXE) and library assemblies (DLL). A process assembly represents a process that will use classes defined in library assemblies. CLI assemblies contain code in CIL, which is usually generated from a CLI language, and then compiled into machine language at run time by the just-in-time compiler. In the .NET Framework implementation, this compiler is part of the Common Language Runtime (CLR).")
doc5 = nlp("10 Gigabit Ethernet is a group of computer networking technologies for transmitting Ethernet frames at a rate of 10 gigabits per second. It was first defined by the IEEE 802.3ae-2002 standard. Unlike previous Ethernet standards, 10 Gigabit Ethernet defines only full-duplex point-to-point links which are generally connected by network switches; shared-medium CSMA/CD operation has not been carried over from the previous generations Ethernet standards so half-duplex operation and repeater hubs do not exist in 10GbE.")

In [30]:
print(f"spans: {[(e.start, e.text, e.label_) for e in doc5.ents]}")

spans: [(1, 'Gigabit Ethernet is', 'SKILL'), (7, 'computer networking technologies', 'SKILL'), (12, 'Ethernet frames', 'SKILL'), (35, 'Ethernet standards', 'SKILL'), (39, 'Gigabit Ethernet defines', 'SKILL'), (57, 'network switches', 'SKILL'), (76, 'Ethernet standards', 'SKILL')]


In [33]:
for e in doc5.ents:
     print(e, e.label_)

Gigabit Ethernet is SKILL
computer networking technologies SKILL
Ethernet frames SKILL
Ethernet standards SKILL
Gigabit Ethernet defines SKILL
network switches SKILL
Ethernet standards SKILL


In [34]:
for e in doc5._.rel.items():
     if e[1]['KNOWS'] > 0.5:
          print(e)

In [25]:
print(f"spans: {[(e.start, e.text, e.label_) for e in doc4.ents]}")

spans: [(10, 'Windows,', 'SKILL'), (17, 'Language Infrastructure', 'OCC'), (34, 'security. There are', 'SKILL'), (73, 'CIL,', 'OCC'), (82, 'language,', 'SKILL'), (88, 'machine language', 'SKILL'), (115, 'Language Runtime (CLR).', 'OCC')]


In [29]:
from spacy.training.example import Example

pred = Doc(
    nlp.vocab,
    words=[t.text for t in doc4],
    spaces=[t.whitespace_ for t in doc4],
)
pred.ents = doc4.ents
for name, proc in nlp.pipeline:
    pred = proc(pred)

example = Example(pred, doc4)

In [48]:
# print(doc3.ents)
# doc4._.rel

# for value, rel_dict in doc4._.rel.items():
#      print(rel_dict)

for (k, v) in doc4._.rel[(10, 17)].items():
     print(v)

0.031369336
0.011399968


In [64]:
# Let's go a step back an add manual instances to doc1 & doc2
# doc1.ents = [
#     Span(doc1, 1, 4, label="OCCUPATION"),
#     Span(doc1, 5, 7, label="SKILL"),
#     Span(doc1, 8, 10, label="SKILL"),
#     Span(doc1, 15, 17, label="SKILL"),
# ]

# doc2.ents = [
#     Span(doc1, 1, 5, label="OCCUPATION"),
#     Span(doc1, 12, 14, label="SKILL"),
#     # Span(doc1, 28, 30, label="SKILL"),
#     # Span(doc1, 43, 45, label="SKILL"),
# ]

In [68]:
scores = relation_extractor.predict([doc1])

In [69]:
scores

array([[9.98137116e-01, 1.13621921e-08],
       [9.97541308e-01, 9.77544889e-09],
       [9.98295128e-01, 1.47525085e-08],
       [4.52181703e-05, 3.71496078e-08],
       [2.50027166e-03, 2.90826185e-09],
       [3.60449357e-03, 4.38897052e-09],
       [3.86377360e-05, 4.11782395e-08],
       [2.82057608e-03, 3.74691034e-09],
       [3.08154104e-03, 4.86491647e-09],
       [3.50350128e-05, 4.80279638e-08],
       [2.55823880e-03, 4.37018377e-09],
       [1.93827704e-03, 3.75987552e-09]], dtype=float32)

In [76]:
# get_instances = relation_extractor.model.attrs["get_instances"]

# for doc in docs:
# for (e1, e2) in get_instances(doc1):
#      print(e1.start, ", ", e2.start)
# relation_extractor.set_annotations([doc1], scores)
doc1._.rel

{(1, 5): {'KNOWS': 0.9981371, 'HAS': 1.1362192e-08}}

In [53]:
relation_extractor.add_label("OCCUPATION")

1

In [4]:
# Load the jsonl data to predict
data = srsly.read_jsonl("./golden.jsonl")
data_instances = [eg["text"] for eg in data]
print(data_instances[:5])

['The Associate Security Analyst supports security systems, operations administration, monitoring and maintenance of cyber security systems and applications. He/She monitors security alerts and events. He collects and documents information based on established practices and supports the preparation and publishing of security advisories. He assists with the analysis of security-related information and events, escalation of incidents for validation and remediation. He is required to be on standby with on-call availability with varied shifts including nights, weekends and holidays. He is familiar with cyber security standards, protocols and frameworks, and is required to act in accordance with the Cyber Security Act 2018. He is knowledgeable in using various cyber security tools and techniques to monitor and resolve incidents. The Associate Security Analyst is alert and vigilant in performing monitoring activities and is able to analyse and resolve security-related issues critically. He c

In [9]:
# Represent data via DocBin [list of docs]
db = DocBin()
for text in data_instances:
     doc = nlp.make_doc(text)
     db.add(doc)

print("Added ", len(db), " documents")

db.to_disk("predict.spacy")

Added  138  documents


In [19]:
# Load data into memory
doc_bin = DocBin(store_user_data=True).from_disk('./predict.spacy')
# docs = doc_bin.get_docs(nlp.vocab)
docs = db.get_docs(nlp.vocab)

In [20]:
for doc in docs:
     print(doc.ents)

()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()


In [17]:
def _score_and_format(examples, thresholds):
    for threshold in thresholds:
        r = score_relations(examples, threshold)
        results = {k: "{:.2f}".format(v * 100) for k, v in r.items()}
        print(f"threshold {'{:.2f}'.format(threshold)} \t {results}")

In [None]:
# Make predictions for entities
examples = []
random_examples = []
for gold in docs:
    pred = Doc(
        nlp.vocab,
        words=[t.text for t in gold],
        spaces=[t.whitespace_ for t in gold],
    )
    pred.ents = gold.ents

    for name, proc in nlp.pipeline:
        pred = proc(pred)
    examples.append(Example(pred, gold))

    relation_extractor = nlp.get_pipe("relation_extractor")
    get_instances = relation_extractor.model.attrs["get_instances"]
    for (e1, e2) in get_instances(pred):
        offset = (e1.start, e2.start)
        if offset not in pred._.rel:
            pred._.rel[offset] = {}
        for label in relation_extractor.labels:
            pred._.rel[offset][label] = random.uniform(0, 1)
    random_examples.append(Example(pred, gold))

In [3]:
import spacy
from spacy.scorer import Scorer
from spacy.tokens import Doc
from spacy.training.example import Example

examples = [
    ('Who is Shaka Khan?',
     {(7, 17, 'PERSON')}),
    ('I like London and Berlin.',
     {(7, 13, 'LOC'), (18, 24, 'LOC')})
]

def evaluate(ner_model, examples):
    scorer = Scorer()
    example = []
    for input_, annot in examples:
        pred = ner_model(input_)
        print(pred,annot)
        temp = Example.from_dict(pred, dict.fromkeys(annot))
        example.append(temp)
    scores = scorer.score(example)
    return scores

ner_model = spacy.load('en_core_web_md') # for spaCy's pretrained use 'en_core_web_sm'
results = evaluate(ner_model, examples)
print(results)

Who is Shaka Khan? {(7, 17, 'PERSON')}
I like London and Berlin. {(18, 24, 'LOC'), (7, 13, 'LOC')}
{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'sents_p': None, 'sents_r': None, 'sents_f': None, 'tag_acc': None, 'pos_acc': None, 'morph_acc': None, 'morph_micro_p': None, 'morph_micro_r': None, 'morph_micro_f': None, 'morph_per_feat': None, 'dep_uas': None, 'dep_las': None, 'dep_las_per_type': None, 'ents_p': None, 'ents_r': None, 'ents_f': None, 'ents_per_type': None, 'cats_score': 0.0, 'cats_score_desc': 'macro F', 'cats_micro_p': 0.0, 'cats_micro_r': 0.0, 'cats_micro_f': 0.0, 'cats_macro_p': 0.0, 'cats_macro_r': 0.0, 'cats_macro_f': 0.0, 'cats_macro_auc': 0.0, 'cats_f_per_type': {}, 'cats_auc_per_type': {}}
