# Knowledge Graph Pipeline

##### Preprocess text
- replace Acronym with their expansion
- replace

In [23]:
# to start server : cp <stanford_core_nlp_folder>
# cd stanford-corenlp-4.4.0
# cd PycharmProjects\EKG\stanford-corenlp-4.4.0
# java -mx6g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -timeout 10000
import json
from pycorenlp import *
import re,nltk,json,math
from io import StringIO
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
import string
from collections import Iterable
from string import punctuation
import collections
import pysbd
seg = pysbd.Segmenter(language="en", clean=False)
#sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

class Coref:
    """
    Resolve coreference using Standford NLP
    uri = "http://localhost:9000/"
    """

    def __init__(self,url="http://localhost:9000/"):
        self.nlp = StanfordCoreNLP(url)
        self.properties = {
            'timeout': '50000',
            'annotators': "coref",#'dcoref',
            'outputFormat': 'json',
            'ner.useSUTime': 'false',
            'coref.algorithm': 'neural', 
            'coref.neural.greedyness': '0.51',
        }

    def resolve(self,text:str):
        output = self.nlp.annotate(text,self.properties)
        if type(output) is str or type(output) is unicode:
            output = json.loads(output, strict=False)
        output = self._process(output)
        return self._coref_resolve(output)

    def _process(self, corenlp_output):
        for coref in corenlp_output['corefs']:
            mentions = corenlp_output['corefs'][coref]
            antecedent = mentions[0]
            for j in range(1, len(mentions)):
                mention = mentions[j]
                if mention['type'] == 'PRONOMINAL':
                    target_sentence = mention['sentNum']
                    target_token = mention['startIndex'] - 1
                    corenlp_output['sentences'][target_sentence - 1]['tokens'][target_token]['word'] = antecedent[
                        'text']
        return corenlp_output

    
    def _coref_resolve(self, corenlp_output):
        resolved_text = []
        possessives = ['hers', 'his', 'their', 'theirs']
        for sentence in corenlp_output['sentences']:
            for token in sentence['tokens']:
                resolved_text.append(token['word'])
                if token['lemma'] in possessives or token['pos'] == 'PRP$':
                    resolved_text.append("'s")  # add the possessive morpheme
                resolved_text.append(token['after'])
        return ''.join(resolved_text)

def extract_snippets(text:str,n:int=2):
        """ 
        Extracts snippets from text with a sliding window 
        n : sentences per snippet
        returns list of sentences
        """
        #sentences = sent_tokenizer.tokenize(text)
        sentences = seg.segment(text)
        snippets = []
        i = 0
        last_index = 0
        while i < len(sentences):
            snippet = ' '.join(sentences[i:i + n])
            if len(snippet.split(' ')) > 4:
                snippets.append(snippet)
            last_index = i + n
            i += int(math.ceil(n / 2))
        if last_index < len(sentences):
            snippet = ' '.join(sentences[last_index:])
            if len(snippet.split(' ')) > 4:
                snippets.append(snippet)
        return snippets

def replace_doctext(s:str,di:dict):
    """
    replace tokens from text with values from dict
    example : "PM and SOW" => "Project Managemnt and Statemnt of Work"
    returns sentence with replaced tokens
    """
    pattern = '|'.join(sorted(re.escape(k) for k in di))
    return re.sub(pattern, lambda m: di.get(m.group(0).upper()), s, flags=re.IGNORECASE)

def flatten(l):
    """flatten list of lists"""
    return [item for sublist in l for item in sublist]

In [2]:
coref = Coref()

In [3]:
sent1 = """
Elon Musk was born on June 28, 1971 in Africa. He is a business magnate and investor. 
He is the founder, CEO, and Chief Engineer at SpaceX, angel investor, CEO, and Product Architect of Tesla, Inc, founder of The Boring Company, and co-founder of Neuralink and OpenAI. With an estimated net worth of around US$221.4 billion as of July 2022, Musk is the wealthiest person in the world according to both the Bloomberg Billionaires Index and Forbes' real-time billionaires list.
Elon Musk's net worth was estimated at $220 billion as of June 2022, making him the wealthiest human on the planet.
Musk has promoted cryptocurrencies and supports them over traditional government-issued fiat currencies.
"""

sent2 = """
Present Value (PV) is the current value of a future sum of money.
Present Value formula is PV=FV/(1+i)n.
Present Value is the value right now of some amount of money in the future.
"""

sent3 = """
Albert Einstein was a German born theoretical physicist. He developed the theory of relativity.
"""

sent4 = """
A Statement of Work consists of Project start dates, Scope of Work, Project Estimate and Resource Deployment plan.
A Statement of Work also includes billing rates, billing cycles for the duration of the project or a specific time period and key deliverables of the project.'
"""

sent5 = """
Service system components refers to all the resources needed to successfully deliver the service e.g. human resources, hardware, software, etc. These may include components owned by Iris, customer or third party
"""

In [27]:
res

'java.lang.RuntimeException: java.lang.ClassNotFoundException: edu.stanford.nlp.hcoref.md.MentionDetectionClassifier'

### Make Snippet and Resolve coreference

In [26]:
res =nlp.annotate(sent3, properties={'annotators': 'coref', 
                                   'coref.algorithm': 'neural', 
                                   'coref.neural.greedyness': '0.51',
                                    "outputFormat": "json",
                                  }
            )
out = json.loads(res)['sentences']

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
out

In [24]:
c = Coref()

In [25]:
c.resolve(sent1)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [15]:
out

[{'index': 0,
  'parse': '(ROOT\r\n  (S\r\n    (NP (NNP Albert) (NNP Einstein))\r\n    (VP (VBD was)\r\n      (NP (DT a)\r\n        (ADJP (JJ German) (VBN born))\r\n        (JJ theoretical) (NN physicist)))\r\n    (. .)))',
  'basicDependencies': [{'dep': 'ROOT',
    'governor': 0,
    'governorGloss': 'ROOT',
    'dependent': 8,
    'dependentGloss': 'physicist'},
   {'dep': 'compound',
    'governor': 2,
    'governorGloss': 'Einstein',
    'dependent': 1,
    'dependentGloss': 'Albert'},
   {'dep': 'nsubj',
    'governor': 8,
    'governorGloss': 'physicist',
    'dependent': 2,
    'dependentGloss': 'Einstein'},
   {'dep': 'cop',
    'governor': 8,
    'governorGloss': 'physicist',
    'dependent': 3,
    'dependentGloss': 'was'},
   {'dep': 'det',
    'governor': 8,
    'governorGloss': 'physicist',
    'dependent': 4,
    'dependentGloss': 'a'},
   {'dep': 'dep',
    'governor': 6,
    'governorGloss': 'born',
    'dependent': 5,
    'dependentGloss': 'German'},
   {'dep': 'amod'

In [4]:
nlp=StanfordCoreNLP("http://localhost:9000/")
properties = {
        "timeout": "50000",
        "annotators":"openie",
        "outputFormat": "json",
        "openie.max_entailments_per_clause":"3", # no of triples from each clause
        "openie.triple.strict":"true",
    #'openie.affinity_probability_cap': 2 / 3
    }


def get_triplets(text):
    res = nlp.annotate(text, properties=properties)
    out = json.loads(res)['sentences']
    try:
        res_parsed = [map(lambda j:(j['subject'],j['relation'],j['object']),i['openie']) for i in out]
        res_list = list(itertools.chain(*res_parsed))
    except:
        res_list = []
    return res_list

In [9]:
resolved_text = coref.resolve(sent3)
sentences =extract_snippets(resolved_text,n=1)
triples = [get_triplets(sent) for sent in sentences]
flatten(triples)

[]

In [10]:
sentences

['Albert Einstein was a German born theoretical physicist. ',
 'Albert Einstein developed the theory of relativity.\n']