# Triple Extraction using Stanford's OpenIE

In [17]:
from openie import StanfordOpenIE # first get torch from pytorch.org/get-started/locally

import os

import networkx as nx
import matplotlib.pyplot as plt
from IPython.display import Image
from IPython.display import SVG
print('imports done')

imports done


In [2]:
# Short S/P/O labels substituted for the triple keys by change_triple_keys.
new_keys = {'subject': 'S', 'relation': 'P', 'object': 'O'}


def change_triple_keys(triple):
    """Return a new dict with every key of ``triple`` replaced via ``new_keys``.

    Values and key order are kept as they are. A key that has no entry in
    ``new_keys`` raises ``KeyError``.
    """
    return {new_keys[field]: value for field, value in triple.items()}

In [39]:
def generate_text_graph(triples: list = None, svg_filename: str = 'out/graph.svg'):
    """Build a directed multigraph from triples and write it to an SVG file.

    Every triple becomes one edge from its 'subject' to its 'object', labelled
    with its 'relation'. About every tenth of the way through the input, the
    current index and triple are printed as a progress indicator.

    Parameters
    ----------
    triples : list of dict, optional
        Triples with the keys 'subject', 'relation' and 'object'. ``None``
        (the default) is treated as an empty list.
    svg_filename : str
        Path of the SVG file to write. A missing parent directory is created.

    Returns
    -------
    str
        ``svg_filename``, unchanged.
    """
    if triples is None:
        triples = []

    graph = nx.MultiDiGraph()
    # round(n / 10) is 0 for n <= 5 (round(0.5) == 0), which would make the
    # modulo below raise ZeroDivisionError; clamp the step to at least 1.
    progress_step = max(1, round(len(triples) / 10))
    for idx, sor in enumerate(triples):
        if idx % progress_step == 0:
            print(idx, " ", sor)
        graph.add_edge(sor['subject'], sor['object'], label=sor['relation'])

    print(graph)

    # Make sure the target directory exists before writing the file.
    out_dir = os.path.dirname(svg_filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    p = nx.drawing.nx_pydot.to_pydot(graph)
    p.write_svg(svg_filename)
    return svg_filename

In [4]:
# Three short claims used to try out the extractor.
sample_sentences = [
    "The corona vaccine contains dangerous chemicals",
    "The corona vaccine contains only safe ingredients and is harmless",
    "The corona vaccine contains a compound called Formaldehyde",
]

with StanfordOpenIE() as openie_client:
    for claim in sample_sentences:
        print('\n__ %s.' % claim)
        # map() is lazy, so each triple is relabelled right before it is printed.
        for relabelled in map(change_triple_keys, openie_client.annotate(claim)):
            print('|-', relabelled)


__ The corona vaccine contains dangerous chemicals.
Starting server with command: java -Xmx8G -cp C:\Users\emiel\stanfordnlp_resources\stanford-corenlp-full-2018-10-05/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-518c64b3d6e0404e.props -preload openie
|- {'S': 'corona vaccine', 'P': 'contains', 'O': 'chemicals'}
|- {'S': 'corona vaccine', 'P': 'contains', 'O': 'dangerous chemicals'}

__ The corona vaccine contains only safe ingredients and is harmless.
|- {'S': 'corona vaccine', 'P': 'contains', 'O': 'safe ingredients'}
|- {'S': 'corona vaccine', 'P': 'contains', 'O': 'ingredients'}
|- {'S': 'corona vaccine', 'P': 'contains', 'O': 'only ingredients'}
|- {'S': 'corona vaccine', 'P': 'contains', 'O': 'only safe ingredients'}

__ The corona vaccine contains a compound called Formaldehyde.
|- {'S': 'compound', 'P': 'called', 'O': 'Formaldehyde'}


In [5]:
# Read the article, flatten it to a single line and annotate the whole text in one call.
txt_path = os.path.join('sources', 'article_corona_vaccine.txt')
extracted_triples = []
with StanfordOpenIE() as client, open(txt_path, 'r', encoding='utf-8') as article_file:
    # Newlines become spaces; any remaining carriage returns are dropped.
    text = article_file.read().replace('\n', ' ').replace('\r', '')
    extracted_triples = client.annotate(text, simple_format=True)
    preview = text[:500]
    print('\n', preview, '\n')  # to check the formatting
    print('Found %s triples in the corpus.' % len(extracted_triples))

Starting server with command: java -Xmx8G -cp C:\Users\emiel\stanfordnlp_resources\stanford-corenlp-full-2018-10-05/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-3c94889c679d4596.props -preload openie

 Developing a vaccine is a long-term process  It is difficult to say when there will be an effective vaccine against the novel coronavirus.  It takes a long time to develop a vaccine for a new infectious disease, usually as long as 5 to 10 years. Every effort is being made to accelerate development of a vaccine for COVID-19. More than 100 vaccines are in development  A total of more than 100 different vaccines for SARS-CoV-2 are under development. A small number of them have reached the stage of  

Found 429 triples in the corpus.


In [41]:
# Jupyter only renders the value of a cell's last expression. Build the file
# first, report completion, and end the cell with the SVG object so the graph
# is actually displayed instead of being discarded.
svg_path = generate_text_graph(extracted_triples)
print('done')
SVG(filename=svg_path)

0   {'subject': 'It', 'relation': 'be', 'object': 'when vaccine against novel coronavirus'}
43   {'subject': 'phases', 'relation': 'have', 'object': 'Only when have successfully completed'}
86   {'subject': 'we', 'relation': 'are protected After', 'object': 'this'}
129   {'subject': 'S protein', 'relation': 'is attached to', 'object': 'exterior of virus'}
172   {'subject': 'RNA vaccines', 'relation': 'add', 'object': 'piece to immune cells in our body'}
215   {'subject': 'RNA vaccines', 'relation': 'add', 'object': 'piece of material to specific immune cells in our body'}
258   {'subject': 'immune cells', 'relation': 'are', 'object': 'why also referred to as cells'}
301   {'subject': 'lymphocytes', 'relation': 'neutralise', 'object': 'bacteria'}
344   {'subject': 'category', 'relation': 'are', 'object': 'adenoviruses'}
387   {'subject': 'Royal Netherlands Society', 'relation': 'keep track on', 'object': 'basis of progress on development in trials'}

done
