Triplet Extraction

In [37]:
pip install textacy



In [38]:
#!/usr/bin/env python
from __future__ import unicode_literals
# Load Library files
import en_core_web_sm
import spacy
import textacy
nlp = en_core_web_sm.load()
SUBJ = ["nsubj","nsubjpass"] 
VERB = ["ROOT"] 
OBJ = ["dobj", "pobj", "dobj"] 
text = nlp(u'The cat sat on the mat. The cat jumped and picked up the biscuit. The cat ate biscuit and cookies.')
sub_toks = [tok for tok in text if (tok.dep_ in SUBJ) ]
obj_toks = [tok for tok in text if (tok.dep_ in OBJ) ]
vrb_toks = [tok for tok in text if (tok.dep_ in VERB) ]
text_ext = list(textacy.extract.subject_verb_object_triples(text))
print("Subjects:", sub_toks)
print("VERB :", vrb_toks)
print("OBJECT(s):", obj_toks)
print ("SVO:", text_ext)

Subjects: [cat, cat, cat]
VERB : [sat, jumped, ate]
OBJECT(s): [mat, biscuit, biscuit]
SVO: [(cat, ate, biscuit), (cat, ate, cookies)]


In [39]:
import spacy

import textacy

nlp = spacy.load('en')
text = nlp(u'Startup companies create jobs and support innovation. Hilary supports entrepreneurship.')

text_ext = textacy.extract.subject_verb_object_triples(text)
list(text_ext)

[(companies, create, innovation), (Hilary, supports, entrepreneurship)]

WordNet Task

In [6]:
import nltk
nltk.download('all')
from nltk.corpus import wordnet

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_cat.zip.
[nltk_data]    | Downloading package cess_esp to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_esp.zip.
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    |   Unzipp

In [42]:
# lets use word paint as an exqmple
syns = wordnet.synsets("good")

# An example of a synset:
print(syns[0].name())
print('\n')
# Just the word:
print(syns[0].lemmas()[0].name())
print('\n')

# Definition of that first synset:
print(syns[0].definition())
print('\n')
# Examples of the word in use in sentences:
print(syns[0].examples())
print('\n')




print('Set of hyponyms:\n', syns[0].hyponyms(), '\n' )
print('Set of hypernyms:\n', syns[0].hypernyms(), '\n' )
print('Set of root-hypernyms:\n', wordnet.synset('dog.n.01').root_hypernyms(), '\n' )
print('Set of common-hypernyms:\n', wordnet.synset('dog.n.01').common_hypernyms(wordnet.synset('cat.n.01')) , '\n' )
print('Set of lowest-common-hypernyms:\n', wordnet.synset('dog.n.01').lowest_common_hypernyms(wordnet.synset('cat.n.01')) , '\n' )
print('Set of part-meronyms:\n', wordnet.synset('table.n.2').part_meronyms(), '\n' )
print('Set of member-meronyms:\n', wordnet.synset('faculty.n.2').member_meronyms() , '\n' )
print('Set of member-holonyms:\n', wordnet.synset('kitchen.n.01').part_holonyms() , '\n' )
print('Set of part-holonyms:\n', wordnet.synset('course.n.7').part_holonyms(), '\n' )
print('Set of substance-holonyms:\n', wordnet.synset('gin.n.1').substance_holonyms()  , '\n' )
print('Set of substance-meronyms:\n', wordnet.synset('water.n.1').substance_meronyms()  , '\n' )
print('Entailment of word Snore:\n', wordnet.synset('snore.v.01').entailments(), '\n' )


good.n.01


good


benefit


['for your own good', "what's the good of worrying?"]


Set of hyponyms:
 [Synset('common_good.n.01')] 

Set of hypernyms:
 [Synset('advantage.n.01')] 

Set of root-hypernyms:
 [Synset('entity.n.01')] 

Set of common-hypernyms:
 [Synset('placental.n.01'), Synset('physical_entity.n.01'), Synset('living_thing.n.01'), Synset('vertebrate.n.01'), Synset('animal.n.01'), Synset('carnivore.n.01'), Synset('object.n.01'), Synset('whole.n.02'), Synset('mammal.n.01'), Synset('organism.n.01'), Synset('entity.n.01'), Synset('chordate.n.01')] 

Set of lowest-common-hypernyms:
 [Synset('carnivore.n.01')] 

Set of part-meronyms:
 [Synset('leg.n.03'), Synset('tabletop.n.01'), Synset('tableware.n.01')] 

Set of member-meronyms:
 [Synset('professor.n.01')] 

Set of member-holonyms:
 [Synset('dwelling.n.01')] 

Set of part-holonyms:
 [Synset('meal.n.01')] 

Set of substance-holonyms:
 [Synset('gin_and_it.n.01'), Synset('gin_and_tonic.n.01'), Synset('martini.n.01'), Synset('pink

TRIPLET EXTRACTION WITH EXAMPLES

In [43]:
import re
import pandas as pd
import bs4
import requests
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

from spacy.matcher import Matcher 
from spacy.tokens import Span 

import networkx as nx

import matplotlib.pyplot as plt
from tqdm import tqdm

In [44]:
candidate_sentences = "the drawdown process is governed by astm standard d823"
doc = nlp(candidate_sentences)

In [28]:
for tok in doc:
    print(tok.text, "...", tok.dep_)

the ... det
drawdown ... amod
process ... nsubjpass
is ... auxpass
governed ... ROOT
by ... agent
astm ... compound
standard ... amod
d823 ... pobj


In [45]:
def get_entities(sent):
  ## chunk 1
  ent1 = ""
  ent2 = ""

  prv_tok_dep = ""    # dependency tag of previous token in the sentence
  prv_tok_text = ""   # previous token in the sentence

  prefix = ""
  modifier = ""

  #############################################################
  
  for tok in nlp(sent):
    ## chunk 2
    # if token is a punctuation mark then move on to the next token
    if tok.dep_ != "punct":
      # check: token is a compound word or not
      if tok.dep_ == "compound":
        prefix = tok.text
        # if the previous word was also a 'compound' then add the current word to it
        if prv_tok_dep == "compound":
          prefix = prv_tok_text + " "+ tok.text
      
      # check: token is a modifier or not
      if tok.dep_.endswith("mod") == True:
        modifier = tok.text
        # if the previous word was also a 'compound' then add the current word to it
        if prv_tok_dep == "compound":
          modifier = prv_tok_text + " "+ tok.text
      
      ## chunk 3
      if tok.dep_.find("subj") == True:
        ent1 = modifier +" "+ prefix + " "+ tok.text
        prefix = ""
        modifier = ""
        prv_tok_dep = ""
        prv_tok_text = ""      

      ## chunk 4
      if tok.dep_.find("obj") == True:
        ent2 = modifier +" "+ prefix +" "+ tok.text
        
      ## chunk 5  
      # update variables
      prv_tok_dep = tok.dep_
      prv_tok_text = tok.text
  #############################################################

  return [ent1.strip(), ent2.strip()]

In [46]:
def get_relation(sent):

  doc = nlp(sent)

  # Matcher class object 
  matcher = Matcher(nlp.vocab)

  #define the pattern 
  pattern = [{'DEP':'ROOT'}, 
            {'DEP':'prep','OP':"?"},
            {'DEP':'agent','OP':"?"},  
            {'POS':'ADJ','OP':"?"}] 

  matcher.add("matching_1", None, pattern) 

  matches = matcher(doc)
  k = len(matches) - 1

  span = doc[matches[k][1]:matches[k][2]] 

  return(span.text)

In [47]:
get_relation("John completed the task")

'completed'

In [48]:
text="John completed the task"

In [33]:
ent=get_entities(text)
rel=get_relation(text)

In [49]:
ent

['drawdown  process', 'astm standard']

In [50]:
new_list=[]
if len(ent)==2:
    for i,n in enumerate(ent):
        #print(i,n)
        if i==1:
            new_list.append(rel) 
        else:
            new_list.append(n)
    new_list.append(ent[1])
print(new_list)

['drawdown  process', 'governed by', 'astm standard']


In [51]:
text="the drawdown process is governed by astm standard"
ent=get_entities(text)
rel=get_relation(text)
new_list=[]
if len(ent)==2:
    for i,n in enumerate(ent):
        #print(i,n)
        if i==1:
            new_list.append(rel) 
        else:
            new_list.append(n)
    new_list.append(ent[1])
print(new_list)

['drawdown  process', 'governed by', 'astm standard']
