# Triplet Extraction

- Preprocess: find acronyms and replace them with their expansions
- Resolve coreferences in the text
- Disambiguate sentence boundaries to split the text into a list of sentences
- Extract triples
- Postprocess the triples

In [1]:
import re,json, pysbd,math,warnings
from pycorenlp import *
import pandas as pd
from allennlp.predictors.predictor import Predictor
# Do not truncate long cell values when DataFrames are displayed
# (triples can contain whole clauses).
pd.set_option('display.max_colwidth', None)
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')
# Shared English sentence segmenter used by the triple-extraction cells below;
# clean=False is passed so pysbd is not asked to clean the text.
seg = pysbd.Segmenter(language="en", clean=False)

from utils import AcronymDefinitionPair,replace_doctext,flatten

def preporcess(text):
    """Rewrite acronym mentions in ``text`` and normalise its whitespace.

    ``AcronymDefinitionPair().extract`` is asked for a mapping for the text.
    When the mapping is non-empty, ``replace_doctext`` is applied twice:
    first with the mapping itself, then with a derived mapping from each
    value wrapped in parentheses back to its key.  In every case runs of
    whitespace (including newlines) are collapsed to single spaces and the
    ends are trimmed.

    NOTE(review): the exact key/value roles (acronym vs. expansion) are
    defined by ``utils.AcronymDefinitionPair`` -- confirm there.
    """
    acronym_map = AcronymDefinitionPair().extract(doc_text=text)
    if acronym_map:
        text = replace_doctext(text, acronym_map)
        parenthesised = {}
        for key, value in acronym_map.items():
            parenthesised['(' + value + ')'] = key
        text = replace_doctext(text, parenthesised)
    return " ".join(text.split())

# postprocessing functions
def postprocess(text):
    """Drop one leading article from ``text`` and normalise its whitespace.

    Args:
        text: A triple argument (subject or object).  Non-string values,
            such as the ``None``/NaN that pandas uses for a missing object,
            are returned unchanged.

    Returns:
        The words of ``text`` joined by single spaces, without the first
        word if it is one of ``a``, ``A``, ``An``, ``an``, ``the``, ``The``.
        Empty or whitespace-only input yields ``''``.
    """
    # Missing cells are not strings; leave them alone instead of failing
    # on ``.split()``.
    if not isinstance(text, str):
        return text

    articles = ('a', 'A', 'An', 'an', 'the', 'The')
    words = text.split()
    # ``words`` is empty for '' or whitespace-only input, so check it before
    # looking at the first word (indexing an empty list raises IndexError).
    if words and words[0] in articles:
        words = words[1:]
    return ' '.join(words)

def text_to_triples(sentences):
    """Run the full extraction pipeline over one text or a list of texts.

    Each text is passed through ``preporcess``, then through
    ``coref_predictor.coref_resolved``, then split with ``seg.segment``;
    every resulting segment is handed to ``tripleExtraction`` together with
    ``openie_predictor``.  The per-segment results are collected in order
    and passed to ``flatten``, whose return value is returned.

    Args:
        sentences: A single string or an iterable of strings.

    Returns:
        Whatever ``flatten`` returns for the list of per-segment triple
        lists (presumably one flat list of triples -- see ``utils.flatten``).
    """
    texts = [sentences] if isinstance(sentences, str) else sentences

    per_segment = []
    for text in texts:
        resolved = coref_predictor.coref_resolved(preporcess(text))
        per_segment.extend(
            tripleExtraction(segment, openie_predictor)
            for segment in seg.segment(resolved)
        )
    return flatten(per_segment)

# AllenNLP helping function
def _tag_indices(tags, label):
    """Return the positions in ``tags`` whose B-/I- tag starts with ``label``."""
    return [i for i, tag in enumerate(tags) if re.match('([BI]-)' + label, tag)]


def tripleExtraction(sentence,predictor):
    """Extract (first argument, verb, second argument) triples from a sentence.

    ``predictor.predict(sentence=...)`` must return a mapping with ``'words'``
    (the tokens) and ``'verbs'``; each verb entry provides ``'verb'`` and a
    ``'tags'`` list aligned with the tokens.

    For every verb, the first of these argument pairs that is fully present
    is used: ARG0 + ARG1, then ARG0 + ARGM-LOC, then ARG1 + ARG2.  The two
    arguments are emitted in the order in which they occur in the sentence,
    so a passive such as "The review owner is identified by the DH" yields
    ``('The review owner', 'identified', 'by the DH')``.

    Args:
        sentence: Text of a single sentence.
        predictor: Object exposing ``predict(sentence=...)`` as described.

    Returns:
        A list of ``(str, str, str)`` tuples; empty when no verb has one of
        the supported argument pairs.
    """
    result = predictor.predict(sentence = sentence)
    words = result['words']
    # Pairs are tried in priority order; only the first complete pair is used.
    argument_pairs = (('ARG0', 'ARG1'), ('ARG0', 'ARGM-LOC'), ('ARG1', 'ARG2'))

    triples = []
    for verb in result['verbs']:
        tags = verb['tags']
        spans = {
            label: _tag_indices(tags, label)
            for label in ('ARG0', 'ARG1', 'ARG2', 'ARGM-LOC')
        }
        for left, right in argument_pairs:
            if spans[left] and spans[right]:
                # Order by token position rather than by how the description
                # string starts: the description check broke as soon as the
                # sentence began with untagged words or another tag.
                first, second = sorted(
                    (spans[left], spans[right]), key=lambda span: span[0])
                triples.append((
                    ' '.join(words[i] for i in first),
                    verb['verb'],
                    ' '.join(words[i] for i in second),
                ))
                break

    return triples

import networkx as nx
import matplotlib.pyplot as plt


def draw_kg(pairs,figsize=(20, 15)):
    """Plot a table of triples as a directed knowledge graph.

    Args:
        pairs: DataFrame with ``subject``, ``object`` and ``relation``
            columns; each row becomes an edge from subject to object.
        figsize: Matplotlib figure size, passed through to ``plt.figure``.

    Nodes are sized in proportion to their degree (500 per incident edge)
    and edges are annotated with the relation text.  Edge labels are keyed
    by ``(subject, object)``, so when several rows share the same pair only
    the relation of the last such row is kept.  The figure is shown
    immediately; nothing is returned.
    """
    graph = nx.from_pandas_edgelist(
        pairs, 'subject', 'object', create_using=nx.MultiDiGraph())
    degrees = nx.degree(graph)
    positions = nx.spring_layout(graph, k=0.15, iterations=20)

    node_sizes = [int(degree) * 500 for _, degree in degrees]
    edge_text = {
        (subj, obj): relation
        for subj, obj, relation in zip(
            pairs.subject, pairs.object, pairs['relation'].tolist())
    }

    plt.figure(num=None, figsize=figsize, dpi=80)
    nx.draw_networkx(
        graph,
        pos=positions,
        node_size=node_sizes,
        node_color='yellow',
        edgecolors='black',
        edge_color='black',
        linewidths=1.5,
        arrowsize=20,
    )
    nx.draw_networkx_edge_labels(
        graph, pos=positions, edge_labels=edge_text, font_color='red')
    plt.axis('off')
    plt.show()
    
#coref_predictor = Predictor.from_path("coref-spanbert-large-2021.03.10.tar.gz")
#openie_predictor = Predictor.from_path("openie-model.2020.03.26.tar.gz")

ModuleNotFoundError: No module named 'utils'

In [None]:
sentences = ["""
The Present Value (PV) is the current value of a future sum of money.
Its formula is PV=FV/(1+i)n.
Present Value is the value right now of some amount of money in the future.
""",
"""
A Statement of Work consists of Project start dates, Scope of Work, Project Estimate and Resource Deployment plan.
A Statement of Work also includes billing rates, billing cycles for the duration of the project or a specific time period and key deliverables of the project.'
""",
"""
Albert Einstein was a German born theoretical physicist. He really developed the theory of relativity.
""",
"""
Program increment planning happens at the beginning of the release but can be revisited later. 
Program increment plan is a high level plan for multiple sprints to implement prioritized software feature. 
A typical increment plan should not exceed more than 3 months however it can vary as per the business needs. 
The last sprint of a given increment plan will be containing activities required to make a production release such as Regression, deployment planning etc.
""",
"""
Agile Practices are engineering activities that ensures quality product development and help the teams in self-organizing, improving productivity & efficiency of software. 
These practices should be followed throughout the Sprints to ensure best throughput
""",
"""Test Driven Development refers to writing Unit test first before developing any code. 
This helps in reducing the defect rate thus improving the quality of the deliverables.
""",
"""
Behavior Driven Development (BDD) framework is a software development process that is an offshoot of 
Test Driven Development (TDD) framework. BDD is an agile testing methodology. 
It is the process of development, based on test-driven development and domain-driven, object-oriented analysis. 
However, it can be organized with traditional testing as well. Test Driven Development refers to writing Unit test first before developing any code. 
This helps in reducing the defect rate thus improving the quality of the deliverables.
""",
"""
Project review shall be conducted during Senior Management Reviews. 
These will be Monthly / Quarterly project review meeting with Director, Delivery Head, DE and other stakeholders as needed (e.g. HR, Facilities, IT Services, ISG, etc.), as per frequency defined in PMW.
""",
"""
Service system components refers to all the resources needed to successfully deliver the service e.g. human resources, hardware, software, etc. These may include components owned by Iris, customer or third party
""",
"""
The requirement analysis process requires eliciting, analyzing, specifying, prioritizing, verifying and approving requirements that the product must deliver and support. The results are captured in a Requirement Document. During this process it is important to have all of the Stakeholders involved. Since this is the process in which all business and processing requirements are determined and agreed to, it is critical that all parties understand the consequences of including or excluding requirements from scope.
""",
"""
Incident management refers to the activities of an organization required to identify, analyze and correct situations that threaten or weaken security e.g. a fire in the office or virus attack on the network.
An Incident Response Team (IRT), specifically designated for this task beforehand or on the spot, would then manage the organization through the incident. The first goal of the incident management process is to restore a normal service operation as quickly as possible and to minimize the impact on business operations, thus ensuring that the best possible levels of service quality and availability are maintained.
""",
"""
Employees should display appropriate personal behavior in social media interactions i.e. the behavior should be in line with the law of the land and they should take accountability for their respective actions. It is imperative that employees understand that in today’s digital age of the internet, the personal and professional spaces are fast conflating and behavior in the personal space may end up having an impact on one’s professional space as well as on the organization as a whole.
""",
"""
Each project within the scope will be reviewed by the panel.
The mandatory reviewers have to be present for the respective reviews.
The panel should be the same for the life of the project. The review owner is identified by the DH. The review owner is responsible for ensuring that the reviews are held as per the plan
"""
]

In [None]:
triples = text_to_triples(sentences)

In [32]:
s = "The review owner is identified by the DH"
text_to_triples(s)

verb description The review owner [V: is] identified by the DH
verb description [ARG1: The review owner] is [V: identified] [ARG0: by the DH]


[('The review owner', 'identified', 'by the DH')]

In [29]:
# Tabulate the OpenIE triples, strip a leading article from both arguments,
# and lower-case the subject column only.
df = pd.DataFrame(triples, columns=['subject', 'relation', 'object'])
df['subject'] = df['subject'].apply(postprocess).str.lower()
df['object'] = df['object'].apply(postprocess)

In [None]:
df.sample(10)

In [None]:
draw_kg(df)
#draw_kg(df[df['subject']==df['subject'].mode()[0]])

In [30]:
triples

[('The Present Value', 'is', 'the current value of a future sum of money'),
 ("The Present Value 's formula", 'is', 'The Present Value = FV/(1+i)n'),
 ('The Present Value', 'is', 'the value right now of some amount of money'),
 ('A Statement of Work', 'consists', 'of Project start dates'),
 ('A Statement of Work',
  'includes',
  'billing rates , billing cycles for the duration of the project or a specific time period and key deliverables of the project'),
 ('Albert Einstein', 'was', 'a German born theoretical physicist'),
 ('Albert Einstein', 'developed', 'the theory of relativity'),
 ('Program increment plan',
  'is',
  'a high level plan for multiple sprints to implement prioritized software feature'),
 ('A typical increment plan',
  'exceed',
  'more than 3 months however A typical increment plan can vary as per the business needs'),
 ('The last sprint of a given increment plan',
  'be',
  'containing activities required to make a production release such as Regression , deployment 

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
# Scratch probe: run a handful of prepositions plus a short phrase through
# the spaCy pipeline to see which coarse part-of-speech tag each token gets.
text = ("""within with at in on Universit of Brown""")
doc = nlp(text)
# Print every token next to its coarse POS tag (token.pos_)
for token in doc:
    print(token, token.pos_)

In [None]:
[t.pos_ for t in doc]

## Custom Triplets

In [15]:
from svo import findSVOs, nlp

In [17]:
# Same preprocessing as text_to_triples (acronym handling, coreference
# resolution, sentence segmentation), but each segment is handed to findSVOs
# from the local svo module instead of the OpenIE predictor.
# NOTE(review): coref_predictor must already exist in the session; its loading
# line in the first cell is commented out.
triples_1 =[]
for sentence in sentences:
    sentence = preporcess(sentence)
    sentence = coref_predictor.coref_resolved(sentence)
    segments = seg.segment(sentence)
    for segment in segments:
        t = findSVOs(segment)
        # One entry per segment; the nested list is flattened in a later cell.
        triples_1.append(t)

In [22]:
triples_1 = flatten(triples_1)

In [24]:
# Tabulate the custom SVO triples and strip a leading article from the
# subject only.  Object clean-up and subject lower-casing are deliberately
# not applied to this table.
df = pd.DataFrame(triples_1, columns=['subject', 'relation', 'object'])
df['subject'] = df['subject'].apply(postprocess)

In [26]:
df

Unnamed: 0,subject,relation,object
0,Present Value,is,the current value of a future sum of money
1,Value formula,is,The Present Value
2,Value formula,is,FV/(1+i)n
3,Present Value,is,the value
4,Statement of,consists,
...,...,...,...
69,panel,review,Each project within the scope
70,mandatory reviewers,have,
71,panel,be,the same for the life of the project
72,DH,identify,The review owner


In [27]:
triples_1

[('The Present Value', 'is', 'the current value of a future sum of money'),
 ('Value formula', 'is', 'The Present Value'),
 ('Value formula', 'is', 'FV/(1+i)n'),
 ('The Present Value', 'is', 'the value'),
 ('A Statement of', 'consists'),
 ('A Statement of', 'includes', 'billing rates , cycles'),
 ('a German physicist', 'born'),
 ('Albert Einstein', 'developed', 'the theory of relativity'),
 ('the beginning of the release', 'happen', 'increment planning'),
 ('a level plan for multiple sprints', 'implement', 'feature'),
 ('feature', 'prioritized'),
 ('A increment plan', '!exceed', '3 months'),
 ('A increment plan', 'vary'),
 ('increment', 'given'),
 ('The last sprint of a increment plan', 'containing', 'activities'),
 ('activities', 'make', 'a production release as Regression , etc'),
 ('activities', 'make', 'a production release as'),
 ('activities', 'engineering'),
 ('activities', 'ensures', 'quality product development'),
 ('activities', 'help', 'the teams'),
 ('activities', 'organizi

## Textacy triplets

In [None]:
import spacy
from textacy.extract import subject_verb_object_triples
nlp=spacy.load('en_core_web_sm')

In [None]:
# Parse the coreference-resolved text once; both the entity pass and the
# textacy SVO extraction below work on this Doc.
doc = nlp(coref_text)
final_svos = []
final_text_svos = []
entity_dict = {}
svo_labels = []

# Map each entity's text to its label, keeping the first label seen for a
# given text.  The dict is keyed by str(ent), so the lookup has to use the
# string as well: testing the Span object against string keys is not expected
# to match, which let later occurrences overwrite earlier ones.
for ent in doc.ents:
    entity_dict.setdefault(str(ent), ent.label_)

# Raw triples as returned by textacy, plus a stripped plain-string version.
svos = list(subject_verb_object_triples(doc))
svos_text = [(str(x[0]).strip(), str(x[1]).strip(), str(x[2]).strip()) for x in svos]
final_svos = final_svos + svos
final_text_svos = final_text_svos + svos_text
final_text_svos

## Spacy triplets

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

# Dependency labels treated as marking the object side of a clause.
OBJECT_DEPS = {"dobj", "dative", "attr", "oprd", "conj"}
# Dependency labels treated as marking the subject side of a clause.
SUBJECT_DEPS = {"nsubj", "nsubjpass", "csubj", "agent", "expl"}
# Fine-grained tags for wh-words; not referenced by extract_svo.
WH_WORDS = {"WP", "WP$", "WRB"}


def extract_svo(doc):
    """Collect subject, verb and object words from a parsed document.

    A token counts as part of the subject (or object) when its own
    dependency label, or the label of its head, is in ``SUBJECT_DEPS``
    (or ``OBJECT_DEPS``).  Every token whose ``pos_`` is ``"VERB"``
    contributes to the verb string.  A token can land in more than one
    group, and no attempt is made to separate clauses.

    Args:
        doc: Iterable of tokens exposing ``text``, ``pos_``, ``dep_`` and
            ``head.dep_``.

    Returns:
        ``(subject, verb, object)`` -- three space-joined, stripped strings
        in document order; any of them may be empty.
    """
    def _has_role(token, labels):
        return token.dep_ in labels or token.head.dep_ in labels

    tokens = list(doc)
    verbs = [tok.text for tok in tokens if tok.pos_ == "VERB"]
    objects = [tok.text for tok in tokens if _has_role(tok, OBJECT_DEPS)]
    subjects = [tok.text for tok in tokens if _has_role(tok, SUBJECT_DEPS)]

    subject_text = " ".join(subjects).strip()
    verb_text = " ".join(verbs).strip()
    object_text = " ".join(objects).strip()
    return subject_text, verb_text, object_text


In [None]:
# Parse the whole resolved text; `doc` itself is not used by the loop below.
doc = nlp(coref_text)
# Crude sentence split on the literal '.' character, then parse each piece
# separately and print its (subject, verb, object) strings.
for s in coref_text.split('.'):
    print(extract_svo(nlp(s)))

In [54]:
def _is_passive(tokens):
    """Tell whether any token carries the ``auxpass`` dependency label.

    Args:
        tokens: Iterable of tokens exposing a ``dep_`` attribute.

    Returns:
        ``True`` if at least one token has ``dep_ == "auxpass"``, otherwise
        ``False`` (including for empty input).  Only the passive auxiliary
        is checked, so a fragment with no auxiliary is reported as not
        passive even if it contains an ``agent`` token.
    """
    return any(tok.dep_ == "auxpass" for tok in tokens)

In [47]:
s = 'written by'
_is_passive(nlp(s))

written ROOT
by agent


False