# Author: ddukic

In [1]:
import os

os.environ["CORENLP_HOME"] = "/home/ddukic/oee/notebooks/.stanfordnlp_resources"

import json

# Import client module
from stanza.server import CoreNLPClient

# Shared CoreNLP client for the whole notebook. Constructing it does not launch
# the Java server; that happens when client.start() is called later on.
client = CoreNLPClient(
    # where the server listens
    endpoint="http://localhost:4242",
    # default pipeline; individual requests may pass their own annotator list
    annotators=["openie"],
    # forwarded to the server as its -timeout flag
    # (presumably milliseconds -- confirm against the stanza client docs)
    timeout=150000000,
    # silence the server's own console output
    be_quiet=True,
)

2023-05-11 12:50:21 INFO: Writing properties to tmp file: corenlp_server-bf4d33c3a71b4d33.props


In [2]:
# Words that appear in an OpenIE extraction but not in the sentence itself.
# Filled as a side effect of extract_sro and printed at the end of the notebook.
implicit_realations = set()


def _first_index(tokens, word, start):
    """Return the first 0-based index >= start where tokens holds word, else None."""
    try:
        return tokens.index(word, start)
    except ValueError:
        return None


def extract_sro(extraction_dict, role, tokens):
    """Align the words of one triplet role with positions in the sentence.

    Args:
        extraction_dict: a single OpenIE extraction. It must hold the role text
            under ``role`` and the role span under ``role + "Span"``; only the
            first element of the span (the 0-based start) is used.
        role: key of the part to align ("subject", "relation" or "object" at
            the call sites in this notebook).
        tokens: whitespace tokens of the sentence the extraction belongs to.

    Returns:
        A list of ``"word-index"`` strings, at most one per word of the role
        text, with 1-based indices. Words that do not occur in ``tokens`` get
        the index ``-2`` and are added to ``implicit_realations``. A word that
        occurs in ``tokens`` only before the span start is left out.
    """
    span_start = extraction_dict[role + "Span"][0]
    indexed = []
    # 0-based position from which the next in-sentence word is searched. It
    # only moves past the span start once a word has actually been placed
    # there, so a leading implicit word does not hide the token at span_start.
    cursor = span_start

    for position, word in enumerate(extraction_dict[role].split()):
        if word not in tokens:
            # implicit extraction: the word was introduced by OpenIE
            implicit_realations.add(word)
            indexed.append(word + "-" + str(-2))
            continue

        if position == 0:
            # the leading word is taken to sit exactly at the span start
            match = span_start
        else:
            # Prefer the next occurrence after the previously aligned word so
            # repeated tokens map to successive positions; fall back to the
            # first occurrence inside the span if the words are out of order.
            match = _first_index(tokens, word, cursor)
            if match is None:
                match = _first_index(tokens, word, span_start)
            if match is None:
                continue

        # add plus one to the position to be the same as mini (1-based)
        indexed.append(word + "-" + str(match + 1))
        cursor = match + 1

    return indexed


def annotate_sentences_from_file(file_path, out_file_name="ace_train_triplets.json"):
    """Run OpenIE over every line of a file and dump the triplets as JSON.

    Each line of ``file_path`` is treated as one whitespace-tokenized sentence.
    The result maps the 0-based line number (as a string) to a dict holding one
    ``"triplet_<n>"`` entry per extraction (subject / relation / object, each
    aligned with ``extract_sro``) plus a ``"tokens"`` entry with the sentence
    tokens.

    Args:
        file_path: text file with one sentence per line.
        out_file_name: name of the JSON file written below
            ``../data/processed/stanford/``.

    Side effects:
        Starts the module-level ``client`` and always stops it again, even if
        a request fails; writes the output file only when every sentence was
        processed.
    """
    out_path = "../data/processed/stanford/" + out_file_name

    with open(file_path, "r") as f:
        sentences = [line.strip() for line in f]

    triplets = {}

    client.start()
    try:
        for sent_counter, text in enumerate(sentences):
            tokens = text.split()

            if tokens:
                # ensure to be the same as mini
                annotated_text = client.annotate(
                    text,
                    properties={
                        "language": "english",
                        "annotators": "tokenize, ssplit, pos, lemma, ner, depparse, openie",
                        "tokenize.whitespace": "true",
                        "ssplit.eolonly": "true",
                        "outputFormat": "json",
                        "parse.originalDependencies": "true",
                    },
                )
                annotated_sentences = annotated_text["sentences"]
                # guard against a response without any sentence
                extractions = (
                    annotated_sentences[0]["openie"] if annotated_sentences else []
                )
            else:
                # Blank line: nothing to annotate, but keep an entry so the
                # sentence ids stay aligned with the line numbers of the input.
                extractions = []

            triplet_dict = {}
            for t_counter, triplet in enumerate(extractions):
                triplet_dict["triplet_" + str(t_counter)] = {
                    "subject": extract_sro(triplet, "subject", tokens),
                    "relation": extract_sro(triplet, "relation", tokens),
                    "object": extract_sro(triplet, "object", tokens),
                }

            triplet_dict["tokens"] = {"tokens": tokens}

            triplets[str(sent_counter)] = triplet_dict
    finally:
        # never leave the server running behind a failed request
        client.stop()

    with open(out_path, "w") as f:
        json.dump(triplets, f)

In [3]:
# Corpora and the splits to annotate, listed in processing order. Note that
# ace names its validation split "dev" while ednyt and evextra use "valid";
# for maven only train and test are annotated.
corpus_splits = [
    ("ace", ["train", "dev", "test"]),
    ("ednyt", ["train", "valid", "test"]),
    ("evextra", ["train", "valid", "test"]),
    ("maven", ["train", "test"]),
]

for corpus, splits in corpus_splits:
    for split in splits:
        # reads <corpus>_sentences_<split>.txt, writes <corpus>_<split>_triplets.json
        annotate_sentences_from_file(
            "../data/processed/{}_sentences_{}.txt".format(corpus, split),
            out_file_name="{}_{}_triplets.json".format(corpus, split),
        )

2023-05-11 12:50:27 INFO: Starting server with command: java -Xmx5G -cp /home/ddukic/oee/notebooks/.stanfordnlp_resources/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 4242 -timeout 150000000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-bf4d33c3a71b4d33.props -annotators openie -preload -outputFormat serialized
2023-05-11 13:02:49 INFO: Starting server with command: java -Xmx5G -cp /home/ddukic/oee/notebooks/.stanfordnlp_resources/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 4242 -timeout 150000000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-bf4d33c3a71b4d33.props -annotators openie -preload -outputFormat serialized
2023-05-11 13:03:57 INFO: Starting server with command: java -Xmx5G -cp /home/ddukic/oee/notebooks/.stanfordnlp_resources/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 4242 -timeout 150000000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-bf4d33c3

In [4]:
# Stanford OpenIE adds these implicit relation words, which are not tokens of
# the sentence: 'is', 'has', 'of', 'at_time'
print("Implicit relations are:", implicit_realations)

Implicit relations are: {'is', 'has', 'of', 'at_time'}
