# Triple Extraction using stanford's OpenIE

In [1]:
from openie import StanfordOpenIE # first get torch from pytorch.org/get-started/locally
import graphviz

import os
import tempfile
from subprocess import Popen
from sys import stderr

In [2]:
new_keys = {'subject':'S', 'relation':'P', 'object':'O'}
def change_triple_keys(triple):
    return dict((new_keys[key], value) for (key, value) in triple.items())

In [3]:
def generate_text_graph(triples: dict = None, png_filename: str = './out/graph.png'):
    
    graph = list()
    graph.append('digraph {')
    for er in triples:
        graph.append('"{}" -> "{}" [ label="{}" ];'.format(er['subject'].lower(), er['object'].lower(), er['relation'].lower()))
    graph.append('}')
    
    output_dir = os.path.join('.', os.path.dirname(png_filename))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    out_dot = os.path.join(tempfile.gettempdir(), 'graph.dot')
    with open(out_dot, 'w') as output_file:
        output_file.writelines(graph)

    command = 'dot -Tpng {} -o {}'.format(out_dot, png_filename)
    dot_process = Popen(command, stdout=stderr, shell=True)
    dot_process.wait()
    assert not dot_process.returncode, 'ERROR: Call to dot exited with a non-zero code status.'
    

In [4]:
sample_sentences = ["The corona vaccine contains dangerous chemicals",
                    "The corona vaccine contains only safe ingredients and is harmless",
                    "The corona vaccine contains a compound called Formaldehyde"]

with StanfordOpenIE() as client:
    for sentence in sample_sentences:
        print('\n__ %s.' % sentence)
        for triple in client.annotate(sentence):
            print('|-', change_triple_keys(triple))


__ The corona vaccine contains dangerous chemicals.
Starting server with command: java -Xmx8G -cp C:\Users\emiel\stanfordnlp_resources\stanford-corenlp-full-2018-10-05/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-b20656fe8d24439c.props -preload openie
|- {'S': 'corona vaccine', 'P': 'contains', 'O': 'chemicals'}
|- {'S': 'corona vaccine', 'P': 'contains', 'O': 'dangerous chemicals'}

__ The corona vaccine contains only safe ingredients and is harmless.
|- {'S': 'corona vaccine', 'P': 'contains', 'O': 'safe ingredients'}
|- {'S': 'corona vaccine', 'P': 'contains', 'O': 'ingredients'}
|- {'S': 'corona vaccine', 'P': 'contains', 'O': 'only ingredients'}
|- {'S': 'corona vaccine', 'P': 'contains', 'O': 'only safe ingredients'}

__ The corona vaccine contains a compound called Formaldehyde.
|- {'S': 'compound', 'P': 'called', 'O': 'Formaldehyde'}


In [5]:
txt_path = os.path.join('sources', 'article_corona_vaccine.txt')
extracted_triples = list()
with StanfordOpenIE() as client:
    with open(txt_path, 'r', encoding='utf-8') as r:
        text = r.read().replace('\n', ' ').replace('\r', '')
        extracted_triples = client.annotate(text, simple_format=True)
        print('\n',text[:500:],'\n', ) # to check the formatting
        print('Found %s triples in the corpus.' % len(extracted_triples))

Starting server with command: java -Xmx8G -cp C:\Users\emiel\stanfordnlp_resources\stanford-corenlp-full-2018-10-05/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-3e4ea2dade22457d.props -preload openie

 Developing a vaccine is a long-term process  It is difficult to say when there will be an effective vaccine against the novel coronavirus.  It takes a long time to develop a vaccine for a new infectious disease, usually as long as 5 to 10 years. Every effort is being made to accelerate development of a vaccine for COVID-19. More than 100 vaccines are in development  A total of more than 100 different vaccines for SARS-CoV-2 are under development. A small number of them have reached the stage of  

Found 429 triples in the corpus.


In [6]:
graph_image = 'graph.png'
#generate_text_graph(extracted_triples, graph_image)

In [7]:
with StanfordOpenIE() as client:
    text = "Python is a programming language designed for readability"
    graph_image = 'graph.png'
    #client.generate_graphviz_graph(text, graph_image)

In [8]:
dot = graphviz.Digraph(comment = 'testgraph')
dot.edge('A','B')
dot.view()

ExecutableNotFound: failed to execute ['dot', '-Tpdf', '-O', 'Digraph.gv'], make sure the Graphviz executables are on your systems' PATH

### TODO Properly install graphviz please