## Building a Knowledge Graph

In [47]:
import pandas as pd
from spacy.tokens import Span
from utils import display_ner, reset_pipeline, print_dep_tree, alias_lookup


In [2]:
import nltk
nltk.download('reuters')

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\Azus\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!


True

In [3]:
from nltk.corpus import reuters

# List of documents
documents = reuters.fileids()
print(str(len(documents)) + " documents")
print(str(len(reuters.categories())) + " categories:")
print(reuters.categories()[:10] + ['...'])

print(reuters.readme()[:200])

10788 documents
90 categories:
['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', '...']

      The Reuters-21578 benchmark corpus, ApteMod version

This is a publically available version of the well-known Reuters-21578
"ApteMod" corpus for text categorization.  It has been used in
public


In [4]:
from nltk.corpus import reuters
from tqdm.auto import tqdm

tqdm.pandas()

# create fileid column 
df = pd.DataFrame(reuters.fileids("acq"), columns=['fileid'])
# load raw texts
df['raw'] = df['fileid'].progress_map(lambda f: reuters.raw(f))
# df['raw'] = df['fileid'].progress_apply(lambda f: reuters.raw(f))

# set index to numeric id
df.index = df['fileid'].map(lambda f: int(f.split('/')[1]))
df.index.name = None
df = df.drop(columns=['fileid']).sort_index()

df.sample(3, random_state=12)

  from pandas import Panel
100%|██████████| 2369/2369 [00:00<00:00, 5715.80it/s]


Unnamed: 0,raw
12441,GUINNESS TO SELL RETAIL INTERESTS\n Guinness ...
3061,FIRST BOSTON AFFILIATE TO ACQUIRE ALLEGHENY IN...
17810,ATCOR&lt;ATCO.O> SEEKS BUYERS FOR CONSUMER BUS...


In [5]:
df[['headline', 'raw_text']] = df.progress_apply(lambda row: row['raw'].split('\n', 1), axis='columns', result_type='expand')

100%|██████████| 2369/2369 [00:00<00:00, 14629.10it/s]


#### Cleaningi

In [6]:
import re

def clean(text):
    text = text.replace('&lt;','<') # html escape
    text = re.sub(r'[<>]', '"', text) # quotation marks instead of <>
    text = re.sub(r'[ ]*"[A-Z\.]+"', '', text) # drop stock symbols
    text = re.sub(r'[ ]*\([A-Z\.]+\)', '', text) # drop stock symbols
    text = re.sub(r'\bdlr(s?)\b', r'dollar\1', text, flags=re.I)
    text = re.sub(r'\bmln(s?)\b', r'million\1', text, flags=re.I)
    text = re.sub(r'\bpct\b', r'%', text, flags=re.I)
    # normalize INC to Inc
    text = re.sub(r'\b(Co|Corp|Inc|Plc|Ltd)\b', lambda m: m.expand(r'\1').capitalize(), text, flags=re.I)
    text = re.sub(r'"', r'', text) # quotation marks
    text = re.sub(r'\s+', ' ', text) # multiple whitespace by one
    text = re.sub(r'acquisiton', 'acquisition', text) # typo
    text = re.sub(r'Nippon bLife', 'Nippon Life', text) # typo
    text = re.sub(r'COMSAT.COMSAT', 'COMSAT. COMSAT', text) # missing space at end of sentence
    #text = re.sub(r'Audio/Video', 'Audio-Video', text) # missing space at end of sentence

    return text.strip()

In [7]:
# that's what the substitutions do
texts = [
"""Trafalgar House Plc &lt;TRAF.L> said it has\n  acquired the entire share capital 
of &lt;Capital Homes Inc> of the\n  U.S. For 20 mln dlrs in cash.""",
"""Equiticorp Holdings Ltd &lt;EQUW.WE> now owns\n  or has received acceptances 
representing 59.93 pct of the\n  issued ordinary share capital of 
Guinness Peat Group Plc\n  &lt;GNSP.L>, Equiticorp said in a statement.""",
"""Computer Terminal Systems Inc said it has completed the sale of 200,000 shares 
of its common stock, and warrants to acquire an additional one mln shares, 
to "Sedio N.V." of Lugano, Switzerland for 50,000 dlrs.""",
"""North American Group Ltd said it has a definitive agreement 
to buy 100  pct of Pioneer Business Group Inc of Atlanta.""" 
]

for text in texts:
    print(clean(text), end="\n\n")

Trafalgar House Plc said it has acquired the entire share capital of Capital Homes Inc of the U.S. For 20 million dollars in cash.

Equiticorp Holdings Ltd now owns or has received acceptances representing 59.93 % of the issued ordinary share capital of Guinness Peat Group Plc , Equiticorp said in a statement.

Computer Terminal Systems Inc said it has completed the sale of 200,000 shares of its common stock, and warrants to acquire an additional one million shares, to Sedio N.V. of Lugano, Switzerland for 50,000 dollars.

North American Group Ltd said it has a definitive agreement to buy 100 % of Pioneer Business Group Inc of Atlanta.



#### We apply it to the raw_text and create a new text column:



In [8]:
df['text'] = df['raw_text'].progress_map(clean)
df['headline'] = df['headline'].progress_map(clean)

  0%|          | 0/2369 [00:00<?, ?it/s]

100%|██████████| 2369/2369 [00:00<00:00, 5313.22it/s]
100%|██████████| 2369/2369 [00:00<00:00, 38705.14it/s]


In [9]:
df

Unnamed: 0,raw,headline,raw_text,text
10,COMPUTER TERMINAL SYSTEMS &lt;CPML> COMPLETES ...,COMPUTER TERMINAL SYSTEMS COMPLETES SALE,Computer Terminal Systems Inc said\n it has...,Computer Terminal Systems Inc said it has comp...
12,OHIO MATTRESS &lt;OMT> MAY HAVE LOWER 1ST QTR ...,OHIO MATTRESS MAY HAVE LOWER 1ST QTR NET,"Ohio Mattress Co said its first\n quarter, ...","Ohio Mattress Co said its first quarter, endin..."
44,MCLEAN'S &lt;MII> U.S. LINES SETS ASSET TRANSF...,MCLEAN'S U.S. LINES SETS ASSET TRANSFER,McLean Industries Inc's United\n States Lin...,McLean Industries Inc's United States Lines In...
45,CHEMLAWN &lt;CHEM> RISES ON HOPES FOR HIGHER B...,CHEMLAWN RISES ON HOPES FOR HIGHER BIDS,ChemLawn Corp &lt;CHEM> could attract a\n h...,ChemLawn Corp could attract a higher bid than ...
68,&lt;COFAB INC> BUYS GULFEX FOR UNDISCLOSED AMO...,COFAB Inc BUYS GULFEX FOR UNDISCLOSED AMOUNT,"CoFAB Inc said it acquired &lt;Gulfex Inc>,\...","CoFAB Inc said it acquired Gulfex Inc, a Houst..."
...,...,...,...,...
21503,TRAFALGAR HOUSE BUYS U.S. BUILDER FOR 20 MLN D...,TRAFALGAR HOUSE BUYS U.S. BUILDER FOR 20 milli...,Trafalgar House Plc &lt;TRAF.L> said it has\...,Trafalgar House Plc said it has acquired the e...
21538,EQUITICORP HOLDING IN GUINNESS REACHES 59.93 P...,EQUITICORP HOLDING IN GUINNESS REACHES 59.93 %,Equiticorp Holdings Ltd &lt;EQUW.WE> now own...,Equiticorp Holdings Ltd now owns or has receiv...
21550,CABLE AND WIRELESS DETAILS MERGER OF H.K. UNIT...,CABLE AND WIRELESS DETAILS MERGER OF H.K. UNITS,Cable and Wireless Plc &lt;CAWL.L> said it\n...,Cable and Wireless Plc said it will merge its ...
21555,CABLE AND WIRELESS TO MERGE TWO H.K. UNITS INT...,CABLE AND WIRELESS TO MERGE TWO H.K. UNITS INT...,\n CABLE AND WIRELESS TO MERGE TWO H.K. UNITS...,CABLE AND WIRELESS TO MERGE TWO H.K. UNITS INT...


There are numerous articles like the one below, that have ALL CAPITAL letters, we need to drop them since they will impact out NER solution

In [10]:
# we will drop these articles with only capital letters
df[df['raw_text'].map(lambda t: t.isupper())][['headline', 'raw_text']].head(3)

Unnamed: 0,headline,raw_text
298,SHV SAYS IT MAKING TENDER OFFER FOR UP TO 33 m...,\n SHV SAYS IT MAKING TENDER OFFER FOR UP TO ...
383,"VIACOM SAID IT HAS NEW NATIONAL AMUSEMENTS, MC...",\n VIACOM SAID IT HAS NEW NATIONAL AMUSEMENTS...
398,PITTSTON AGREES TO ACQUIRE WTC INTERNATIONAL I...,\n PITTSTON AGREES TO ACQUIRE WTC INTERNATION...


In [11]:
# drop articles with only capital letters
df = df[df['raw_text'].map(lambda t: not t.isupper())]

### Named-Entity Recognition

In [30]:
import spacy 

nlp = spacy.load('en_core_web_sm') 
print(*nlp.pipeline, sep='\n')

('tagger', <spacy.pipeline.pipes.Tagger object at 0x0000020A097229D0>)
('parser', <spacy.pipeline.pipes.DependencyParser object at 0x0000020A095E28E0>)
('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x0000020A095E2C40>)


In [31]:
text = """Hughes Tool Co Chairman W.A. Kistler said its merger with 
Baker International Corp was still under consideration.
We hope to come soon to a mutual agreement, Kistler said.
The directors of Baker filed a law suit in Texas to force Hughes 
to complete the merger."""

text = re.sub(r'\s+', ' ', text).strip() ###
doc = nlp(text)

print(*[(e.text, e.label_) for e in doc.ents], sep=' ')

('Hughes Tool Co', 'ORG') ('W.A. Kistler', 'PERSON') ('Baker International Corp', 'ORG') ('Kistler', 'ORG') ('Baker', 'PERSON') ('Texas', 'GPE') ('Hughes', 'ORG')


In [32]:
from spacy import displacy
displacy.render(doc, style='ent')

### Rule-based Named-Entity Recognition

In [49]:
reset_pipeline(nlp, pipes=[])

Model: core_web_sm, Language: en
('tagger', <spacy.pipeline.pipes.Tagger object at 0x0000020A097229D0>)
('parser', <spacy.pipeline.pipes.DependencyParser object at 0x0000020A095E28E0>)
('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x0000020A095E2C40>)


In [48]:
from spacy.pipeline import EntityRuler

departments = ['Justice', 'Transportation']
patterns = [{"label": "GOV", 
             "pattern": [{"TEXT": "U.S.", "OP": "?"},
                         {"TEXT": "Department"}, {"TEXT": "of"}, 
                         {"TEXT": {"IN": departments}, "ENT_TYPE": "ORG"}]},
             {"label": "GOV", 
              "pattern": [{"TEXT": "U.S.", "OP": "?"},
                          {"TEXT": {"IN": departments}, "ENT_TYPE": "ORG"},
                          {"TEXT": "Department"}]},
             {"label": "GOV",
              "pattern": [{"TEXT": "Securities"}, {"TEXT": "and"},
                          {"TEXT": "Exchange"}, {"TEXT": "Commission"}]}]

In [50]:
# not in book, but useful if you modify the rules
if nlp.has_pipe('entity_ruler'):
    nlp.remove_pipe('entity_ruler')

In [55]:
from spacy.pipeline import EntityRuler
entity_ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
# nlp.add_pipe('entity_ruler')
nlp.add_pipe(entity_ruler, name='entity_ruler', before='ner')

Now let's test it out

In [57]:
text = """Justice Department is an alias for the U.S. Department of Justice.
Department of Transportation and the Securities and Exchange Commission
are government organisations, but the Sales Department is not."""
#text = re.sub(r'\s+', ' ', text).strip() ###

doc = nlp(text)
# print(*[([t.text for t in e], e.label_) for e in doc.ents], sep='\n') ###
displacy.render(doc, style='ent', jupyter=True)

### Coreference Resolution

#### Using spaCy's Token Extensions

In [15]:
# not in book, but usefule if you modify the extension
from spacy.tokens import Token

if Token.has_extension('ref_n'):
    _ = Token.remove_extension('ref_n') 
if Token.has_extension('ref_t'):
    _ = Token.remove_extension('ref_t') 
if Token.has_extension('ref_t_'):
    _ = Token.remove_extension('ref_t_')

In [16]:
from spacy.tokens import Token
Token.set_extension('ref_n', default='')
Token.set_extension('ref_t', default='')

In [17]:
def init_coref(doc):
    for e in doc.ents:
        if e.label_ in ['ORG', 'GOV', 'PERSON']:
            e[0]._.ref_n, e[0]._.ref_t = e.text, e.label_
    return doc

#### Alias Resolution

In [33]:

for token in ['Transportation Department', 'DOT', 'SEC', 'TWA']:
    print(token, ':', alias_lookup[token])

Transportation Department : ('U.S. Department of Transportation', 'GOV')
DOT : ('U.S. Department of Transportation', 'GOV')
SEC : ('Securities and Exchange Commission', 'GOV')
TWA : ('Trans World Airlines Inc', 'ORG')


In [22]:
# reset_pipeline(nlp, [entity_ruler, norm_entities, merge_entities, init_coref])

In [34]:
text = """
Hughes Tool Co Chairman W.A. Kistler said its merger with 
Baker International Corp. was still under consideration.
We hope to come to a mutual agreement, Kistler said.
Baker will force Hughes to complete the merger.
"""
text = re.sub(r'\s+', ' ', text).strip() ### 

doc = nlp(text) 
displacy.render(doc, style='ent', jupyter=True)

In [42]:
def propagate_ent_type(doc):
    """propagate entity type stored in ref_t"""
    ents = []
    for e in doc.ents:
        if e[0]._.ref_n != '': # if e is a coreference
            e = Span(doc, e.start, e.end, label=e[0]._.ref_t)
        ents.append(e)
    doc.ents = tuple(ents)
    return doc

In [43]:
def name_match(m1, m2):
    m2 = re.sub(r'[()\.]', '', m2) # ignore parentheses and dots
    m2 = r'\b' + m2 + r'\b' # \b marks word boundary
    m2 = re.sub(r'\s+', r'\\b.*\\b', m2)
    return re.search(m2, m1, flags=re.I) is not None

In [44]:
def name_resolver(doc):
    """create name-based reference to e1 as primary mention of e2"""
    ents = [e for e in doc.ents if e.label_ in ['ORG', 'PERSON']]
    for i, e1 in enumerate(ents):
        for e2 in ents[i+1:]:
            if name_match(e1[0]._.ref_n, e2[0].text): 
                e2[0]._.ref_n = e1[0]._.ref_n
                e2[0]._.ref_t = e1[0]._.ref_t
    return propagate_ent_type(doc)

In [45]:
nlp.add_pipe(name_resolver)

doc = nlp(text)
displacy.render(doc, style='ent', jupyter=True)

ValueError: [E007] 'name_resolver' already exists in pipeline. Existing names: ['tagger', 'parser', 'ner', 'name_resolver']

In [46]:
display_ner(doc).query("ref_n != ''")[['text', 'ent_type', 'ref_n', 'ref_t']]

Unnamed: 0,text,ent_type,ref_n,ref_t
