### Open Information Extraction using Rule-Based methods

In [2]:
import pandas as pd
import numpy as np
import os
import re
import spacy
from spacy.lang.en import English
from spacy import displacy
import networkx as nx
import matplotlib.pyplot as plt

In [3]:
# Load the spaCy pipeline named "en_core_web_lg" once; the resulting `nlp`
# callable is reused by every later cell that parses text.
nlp = spacy.load("en_core_web_lg")

In [4]:
def getRelationTriple(doc):
    """Extract a rule-based (head, relation, tail) triple from a parsed sentence.

    Rules, applied to every token in order:

    * head     - tokens whose dependency is ``nsubj``/``nsubjpass`` and whose
                 part of speech is NOUN, PROPN or PRON. Direct children that
                 are adjectives or ``compound`` modifiers are placed in front
                 of the subject word.
    * relation - lemmas of tokens whose dependency is ``ROOT``, ``agent``,
                 ``attr`` or ``prep`` and whose part of speech is VERB, ADP,
                 NOUN, AUX or SCONJ.
    * tail     - tokens whose dependency is ``pobj``/``dobj`` and whose part
                 of speech is NOUN, PROPN or PRON. Direct children that are
                 adjectives, numbers, ``compound`` or ``nummod`` modifiers are
                 placed in front of the object word.

    When a sentence yields several subjects, predicates or objects, the
    individual phrases are joined with a single space in token order.

    Parameters
    ----------
    doc : iterable of tokens
        Anything that yields objects exposing ``text``, ``lemma_``, ``dep_``,
        ``pos_`` and ``children`` (e.g. a spaCy ``Doc``).

    Returns
    -------
    tuple of (str, str, str)
        ``(head, relation, tail)``; a component is ``''`` when no token
        matched its rule.
    """
    nominal_pos = ('NOUN', 'PROPN', 'PRON')

    head_parts = []
    relation_parts = []
    tail_parts = []

    for token in doc:
        is_nominal = token.pos_ in nominal_pos

        # Subject noun / proper noun / pronoun -> head entity.
        if is_nominal and token.dep_ in ('nsubjpass', 'nsubj'):
            # Modifiers are gathered afresh for every subject so that one
            # subject's modifiers never leak into the next one.
            modifiers = [
                child.text
                for child in token.children
                if child.pos_ in ('ADJ',) or child.dep_ in ('compound',)
            ]
            head_parts.append(' '.join(modifiers + [token.text]))

        # Predicate: the root plus agent / attribute / preposition tokens.
        if (token.dep_ in ('ROOT', 'agent', 'attr', 'prep')
                and token.pos_ in ('VERB', 'ADP', 'NOUN', 'AUX', 'SCONJ')):
            relation_parts.append(token.lemma_)

        # Object noun / proper noun / pronoun -> tail entity.
        if is_nominal and token.dep_ in ('pobj', 'dobj'):
            modifiers = [
                child.text
                for child in token.children
                if child.pos_ in ('ADJ', 'NUM') or child.dep_ in ('compound', 'nummod')
            ]
            tail_parts.append(' '.join(modifiers + [token.text]))

    return (
        ' '.join(head_parts).strip(),
        ' '.join(relation_parts).strip(),
        ' '.join(tail_parts).strip(),
    )

In [6]:
def getConciseTriple(doc):
    """Extract a compact rule-based (head, relation, tail) triple.

    Rules, applied to every token in order:

    * head     - tokens whose dependency is ``nsubj``/``nsubjpass`` and whose
                 part of speech is NOUN, PROPN or PRON, preceded by their
                 direct ``compound`` children only (adjectives are dropped).
    * relation - lemmas of tokens whose dependency is ``ROOT``, ``agent``,
                 ``attr`` or ``prep`` and whose part of speech is VERB, NOUN,
                 AUX or SCONJ (adpositions are not accepted here).
    * tail     - tokens whose dependency is ``pobj``/``dobj`` and whose part
                 of speech is NOUN, PROPN or PRON, preceded by their direct
                 ``compound`` children only.

    When a sentence yields several subjects, predicates or objects, the
    individual phrases are joined with a single space in token order.

    Parameters
    ----------
    doc : iterable of tokens
        Anything that yields objects exposing ``text``, ``lemma_``, ``dep_``,
        ``pos_`` and ``children`` (e.g. a spaCy ``Doc``).

    Returns
    -------
    tuple of (str, str, str)
        ``(head, relation, tail)``; a component is ``''`` when no token
        matched its rule.
    """
    nominal_pos = ('NOUN', 'PROPN', 'PRON')

    def compound_phrase(token):
        # Built from scratch for each token so that compounds of an earlier
        # subject/object are never repeated in a later one.
        compounds = [
            child.text for child in token.children if child.dep_ in ('compound',)
        ]
        return ' '.join(compounds + [token.text])

    head_parts = []
    relation_parts = []
    tail_parts = []

    for token in doc:
        is_nominal = token.pos_ in nominal_pos

        # Subject noun / proper noun / pronoun -> head entity.
        if is_nominal and token.dep_ in ('nsubjpass', 'nsubj'):
            head_parts.append(compound_phrase(token))

        # Predicate: the root plus agent / attribute / preposition-labelled
        # tokens, restricted to the parts of speech listed below.
        if (token.dep_ in ('ROOT', 'agent', 'attr', 'prep')
                and token.pos_ in ('VERB', 'NOUN', 'AUX', 'SCONJ')):
            relation_parts.append(token.lemma_)

        # Object noun / proper noun / pronoun -> tail entity.
        if is_nominal and token.dep_ in ('pobj', 'dobj'):
            tail_parts.append(compound_phrase(token))

    return (
        ' '.join(head_parts).strip(),
        ' '.join(relation_parts).strip(),
        ' '.join(tail_parts).strip(),
    )

In [7]:
# Smoke test: run the concise extractor and spaCy's NER on one sample headline.
# Alternative sample sentences (swap one in for `text` to try it):
#text = "London is the capital and largest city of England and the United Kingdom." 
#text = "Tableau was recently acquired by Salesforce."
#text = "Salesforce recently acquired Tableau." 
#text = "Wall Street is extending tech-led selloff."
#text = "U.S. Supplier aim at Shanghai regulator."
text = "Microsoft buys Israeli digital pen maker N-trig."

# Parse once; `doc` is reused by the token-inspection cell that follows.
doc = nlp(text)

# (head, relation, tail) produced by the rule-based extractor.
print(getConciseTriple(doc))
print('')
# Named entities found in the full sentence, as (text, label) pairs.
print([(e.text, e.label_) for e in doc.ents])
print('')
# Other ways to display the entity texts (disabled):
#print([str(e.text) for e in doc.ents])
#print('')
#print(' '.join([str(e.text) for e in doc.ents]))

('Microsoft', 'buy', 'maker trig')

[('Microsoft', 'ORG'), ('Israeli', 'NORP'), ('N-trig', 'ORG')]



In [26]:
# Print each token of the current `doc` with its dependency label and coarse
# part-of-speech tag - the two attributes the extraction rules test.
# NOTE(review): the saved output under this cell shows a different sentence
# than the `text` parsed above and the execution count is out of sequence;
# re-run from a fresh kernel to refresh it.
for token in doc:
    print(token.text, "---->", token.dep_, "--->", token.pos_)

Gates ----> compound ---> PROPN
Foundation ----> nsubj ---> PROPN
will ----> aux ---> VERB
eradicate ----> ROOT ---> VERB
dideases ----> dobj ---> NOUN
. ----> punct ---> PUNCT


#### Performing additional pre-processing tasks

In [68]:
# Read the headline CSV and index it by the 'Date' column in a single
# expression, so re-running the cell always rebuilds the frame from disk.
msft_unique = (
    pd.read_csv('msft_unique_txt.csv')
    .set_index(['Date'])
)

In [69]:
# Preview the first five rows of the loaded frame.
msft_unique.head(5)

Unnamed: 0_level_0,News
Date,Unnamed: 1_level_1
2015-01-02,Code update has crashed Bing search engine.
2015-01-06,Sony praises employees.
2015-01-07,OSI Group LLC aim at FDA.
2015-01-13,United States Senate are seeking foreign workers.
2015-01-16,Wet Seal Inc. has filed for bankruptcy.


In [70]:
# Text clean-up applied to every headline before parsing.
# Each entry is (compiled pattern, replacement); entries run top to bottom and
# the order matters (e.g. "— " must be handled before the bare "—").
_HEADLINE_SUBSTITUTIONS = [
    # Drop every newline that is followed by a space ...
    (re.compile('\n '), ''),
    # ... and turn any such pair that the first pass re-created (e.g. from
    # "\n\n  ") into a single space. Bare newlines are left untouched.
    (re.compile('\n '), ' '),
    # Remove possessive "'s".
    (re.compile("'s"), ''),
    # Remove hyphens (joins the surrounding characters).
    (re.compile('-'), ''),
    # Remove an em dash together with a following space; replace any
    # remaining em dash with a space.
    (re.compile('— '), ''),
    (re.compile('—'), ' '),
    # Remove double quotation marks.
    (re.compile('"'), ''),
    # Remove references to outside text: anything from an opening '(' or '['
    # up to the nearest closing ')' or ']'. Written as a raw string so the
    # backslashes reach the regex engine without invalid-escape warnings.
    (re.compile(r'[\(\[].*?[\)\]]'), ''),
]


def clean_headline(text):
    """Return `text` (coerced with ``str``) after all headline substitutions."""
    cleaned = str(text)
    for pattern, replacement in _HEADLINE_SUBSTITUTIONS:
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned


msft_unique['News'] = msft_unique['News'].apply(clean_headline)

# Disabled: strip the 'BRIEF-' prefix.
#msft_unique['News'] = msft_unique['News'].apply(lambda x: x.replace('BRIEF-', ''))

In [71]:
# Preview the last five rows after the clean-up step.
msft_unique.tail(5)

Unnamed: 0_level_0,News
Date,Unnamed: 1_level_1
2018-12-03,Washington Supreme Court revives Microsoft law...
2018-12-06,Microsoft web browser is introduced for Macint...
2018-12-07,Microsoft adopts Face Recognition Technology E...
2018-12-23,Microsoft has released Font Makker app.
2018-12-31,Microsoft has closed as the top public company.


#### Perform Open Information Extraction

In [72]:
# Work on a copy so the columns added below do not end up on `msft_unique`.
msft_openie = msft_unique.copy()

In [73]:
# Run the spaCy pipeline on every headline and keep the parsed result in 'doc'
msft_openie['doc'] = msft_openie['News'].apply(nlp)

In [74]:
# Extract triples
# msft_openie['triple'] = msft_openie['doc'].apply(lambda x: getRelationTriple(x))

In [75]:
# Apply the concise rule-based extractor to each parsed headline
msft_openie['Ctriple'] = msft_openie['doc'].apply(getConciseTriple)

In [76]:
# Split each (head, relation, tail) triple into its own column.
# The loop variable is called `triple` so the built-in `tuple` is not shadowed.

# Head entity (first element)
msft_openie['Head'] = [triple[0] for triple in msft_openie['Ctriple']]

# Relation (second element)
msft_openie['Relation'] = [triple[1] for triple in msft_openie['Ctriple']]

# Tail entity (third element)
msft_openie['Tail'] = [triple[2] for triple in msft_openie['Ctriple']]

In [77]:
# Preview the extracted triples next to their source headlines.
msft_openie.head(5)

Unnamed: 0_level_0,News,doc,Ctriple,Head,Relation,Tail
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-01-02,Code update has crashed Bing search engine.,"(Code, update, has, crashed, Bing, search, eng...","(Code update, crash, Bing search engine)",Code update,crash,Bing search engine
2015-01-06,Sony praises employees.,"(Sony, praises, employees, .)","(Sony, praise, employees)",Sony,praise,employees
2015-01-07,OSI Group LLC aim at FDA.,"(OSI, Group, LLC, aim, at, FDA, .)","(OSI Group LLC, aim, FDA)",OSI Group LLC,aim,FDA
2015-01-13,United States Senate are seeking foreign workers.,"(United, States, Senate, are, seeking, foreign...","(States Senate, seek, workers)",States Senate,seek,workers
2015-01-16,Wet Seal Inc. has filed for bankruptcy.,"(Wet, Seal, Inc., has, filed, for, bankruptcy, .)","(Wet Seal Inc., file, bankruptcy)",Wet Seal Inc.,file,bankruptcy


In [78]:
# msft_openie.to_csv('./msft_triple.csv')

#### Named Entity Recognition 

In [79]:
def getText(text):
    """Return the named-entity texts found in `text`, joined by single spaces.

    `text` is parsed with the notebook-level `nlp` pipeline; the result is an
    empty string when no entity is found.
    """
    entities = nlp(text).ents
    return ' '.join(str(entity.text) for entity in entities)
#print(' '.join([str(e.text) for e in doc.ents]))

def getLabel(text):
    """Return the list of entity labels found in `text`.

    `text` is parsed with the notebook-level `nlp` pipeline; labels appear in
    the order `.ents` yields them and the list is empty when nothing is found.
    """
    labels = []
    for entity in nlp(text).ents:
        labels.append(entity.label_)
    return labels

In [80]:
# Entity text recognised inside each head phrase
msft_openie['Head_Text'] = msft_openie['Head'].apply(getText)

In [81]:
# Entity labels recognised inside each head phrase
msft_openie['Head_Label'] = msft_openie['Head'].apply(getLabel)

In [82]:
# Entity text recognised inside each tail phrase
msft_openie['Tail_Text'] = msft_openie['Tail'].apply(getText)

In [83]:
# Entity labels recognised inside each tail phrase
msft_openie['Tail_Label'] = msft_openie['Tail'].apply(getLabel)

In [84]:
# Preview the triples together with the entity text/label columns.
msft_openie.head(5)

Unnamed: 0_level_0,News,doc,Ctriple,Head,Relation,Tail,Head_Text,Head_Label,Tail_Text,Tail_Label
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2015-01-02,Code update has crashed Bing search engine.,"(Code, update, has, crashed, Bing, search, eng...","(Code update, crash, Bing search engine)",Code update,crash,Bing search engine,,[],Bing,[ORG]
2015-01-06,Sony praises employees.,"(Sony, praises, employees, .)","(Sony, praise, employees)",Sony,praise,employees,Sony,[ORG],,[]
2015-01-07,OSI Group LLC aim at FDA.,"(OSI, Group, LLC, aim, at, FDA, .)","(OSI Group LLC, aim, FDA)",OSI Group LLC,aim,FDA,OSI Group LLC,[ORG],FDA,[ORG]
2015-01-13,United States Senate are seeking foreign workers.,"(United, States, Senate, are, seeking, foreign...","(States Senate, seek, workers)",States Senate,seek,workers,Senate,[ORG],,[]
2015-01-16,Wet Seal Inc. has filed for bankruptcy.,"(Wet, Seal, Inc., has, filed, for, bankruptcy, .)","(Wet Seal Inc., file, bankruptcy)",Wet Seal Inc.,file,bankruptcy,Wet Seal Inc.,[ORG],,[]


In [86]:
# Write the frame (headlines, triples and entity columns) to a CSV file in
# the current working directory.
msft_openie.to_csv('./msft_triple.csv')

#### Entity Linking: Getting Wikidata QCode for Entities