<a id='top'></a><a name='top'></a>
# Chapter 11: Information extraction (named entity extraction and question answering)

* [Introduction](#introduction)
* [11.0 Imports and Setup](#11.0)
* [11.1 Code](#11.1)

---
<a name='introduction'></a><a id='introduction'></a>
# Introduction
<a href="#top">[back to top]</a>

### Datasets
* No datasets


### Explore

### Key points


---
<a name='11.0'></a><a id='11.0'></a>
# 11.0 Imports and Setup
<a href="#top">[back to top]</a>

In [1]:
import os
if not os.path.exists('setup'):
    os.mkdir('setup')

In [2]:
req_file = "setup/requirements_11.txt"

In [3]:
import sys
IS_COLAB = 'google.colab' in sys.modules

if IS_COLAB:
    print("Installing packages")
    !pip install --upgrade --quiet -r {req_file}
else:
    print("Running locally.")

Running locally.


In [4]:
%%writefile setup/chp11_imports.py
import locale
import os
import pprint
import random
import re
import warnings
from collections import OrderedDict
from datetime import date

import numpy as np
import pandas as pd
import seaborn as sns
import spacy
from spacy.displacy import render
from spacy.matcher import Matcher
from tqdm.auto import tqdm
from watermark import watermark

Overwriting setup/chp11_imports.py


In [5]:
!isort setup/chp11_imports.py --sl
!cat setup/chp11_imports.py

import locale
import os
import pprint
import random
import re
from collections import OrderedDict
from datetime import date

import numpy as np
import pandas as pd
import seaborn as sns
import spacy
from spacy.displacy import render
from spacy.matcher import Matcher
from tqdm.auto import tqdm
from watermark import watermark


In [6]:
import locale
import os
import pprint
import random
import re
import warnings
from collections import OrderedDict
from datetime import date

import numpy as np
import pandas as pd
import seaborn as sns
import spacy
from spacy.displacy import render
from spacy.matcher import Matcher
from tqdm.auto import tqdm
from watermark import watermark

In [7]:
def HR():
    print("-"*40)
    
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"

locale.getpreferredencoding = getpreferredencoding
warnings.filterwarnings('ignore')
sns.set_style("darkgrid")
tqdm.pandas(desc="progress-bar")
pp = pprint.PrettyPrinter(indent=4)
random.seed(42)
np.random.seed(42)

print(watermark(iversions=True,globals_=globals(),python=True,machine=True))

Python implementation: CPython
Python version       : 3.8.12
IPython version      : 7.34.0

Compiler    : Clang 13.0.0 (clang-1300.0.29.3)
OS          : Darwin
Release     : 21.6.0
Machine     : x86_64
Processor   : i386
CPU cores   : 4
Architecture: 64bit

numpy  : 1.23.5
pandas : 1.5.3
spacy  : 3.5.1
sys    : 3.8.12 (default, Dec 13 2021, 20:17:08) 
[Clang 13.0.0 (clang-1300.0.29.3)]
seaborn: 0.12.1
re     : 2.2.1



---
<a name='11.1'></a><a id='11.1'></a>
# 11.1 Named entities and relations
<a href="#top">[back to top]</a>

<a name='11.1.1'></a><a id='11.1.1'></a>
## 11.1.1 A knowledge base
<a href="#top">[back to top]</a>

<a name='11.1.2'></a><a id='11.1.2'></a>
## 11.1.2 Information extraction
<a href="#top">[back to top]</a>

---
<a name='11.2'></a><a id='11.2'></a>
# 11.2 Regular patterns
<a href="#top">[back to top]</a>

In [8]:
# Listing 11.1 Pattern hardcoded in Python
""" A naive way to build a Finit State Machine to extract piece of information """

def find_greeting(s):
    """ Return the the greeting string Hi, Hello, or Yo if it occurs at the beginning of a string"""

    if s[0] == 'H':
        if s[:3] in ['Hi', 'Hi ', 'Hi,', 'Hi!']:
            return s[:2]
        elif s[:6] in ['Hello', 'Hello ', 'Hello,', 'Hello!']:
            return s[:5]
    elif s[0] == 'Y':
        if s[1] == 'o' and s[:3] in ['Yo', 'Yo,', 'Yo ', 'Yo!']:
            return s[:2]
    return None

In [9]:
# Listing 11.2 Brittle pattern-matching example
find_greeting('Hi Mr. Turing!')

'Hi'

In [10]:
find_greeting('Hello, Rosa.')

'Hello'

In [11]:
find_greeting("Yo, what's up?")

'Yo'

In [12]:
find_greeting("Hello")

'Hello'

In [13]:
print(find_greeting("hello"))

None


In [14]:
print(find_greeting("HelloWorld"))

None


<a name='11.2.1'></a><a id='11.2.1'></a>
## 11.2.1 Regular expressions
<a href="#top">[back to top]</a>

<a name='11.2.2'></a><a id='11.2.2'></a>
## 11.2.2 Information extraction as ML feature extraction
<a href="#top">[back to top]</a>

---
<a name='11.3'></a><a id='11.3'></a>
# 11.3 Information worth extracting
<a href="#top">[back to top]</a>

<a name='11.3.1'></a><a id='11.3.1'></a>
## 11.3.1 Extracting GPS locations
<a href="#top">[back to top]</a>

In [15]:
# Listing 11.3 Regular expression for GPS coordinates
lat = r'([-]?[0-9]?[0-9][.][0-9]{2,10})'
lon = r'([-]?1?[0-9]?[0-9][.][0-9]{2,10})'
sep = r'[,/ ]{1,3}'
re_gps = re.compile(lat + sep + lon)

In [16]:
re_gps.findall('http://...maps/@34.0551066,-118.2496763...')
# [(34.0551066, -118.2496763)]

[('34.0551066', '-118.2496763')]

In [17]:
re_gps.findall("https://www.openstreetmap.org/#map=10/5.9666/116.0566")
# [('5.9666', '116.0566')]

[('5.9666', '116.0566')]

In [18]:
groups = re_gps.findall("Zig Zag Cafe is at 45.344, -121.9431 on my GPS.")
# [('45.3440', '-121.9431')]

<a name='11.3.2'></a><a id='11.3.2'></a>
## 11.3.2 Extracting dates
<a href="#top">[back to top]</a>

In [19]:
# Listing 11.4 Regular expression for US dates
us = r'((([01]?\d)[-/]([0123]?\d))([-/]([0123]\d)\d\d)?)'
mdy = re.findall(us, 'Santa came 12/25/2017. An elf appeared 12/12.')
mdy

[('12/25/2017', '12/25', '12', '25', '/2017', '20'),
 ('12/12', '12/12', '12', '12', '', '')]

In [20]:
# Listing 11.5 Structuring extracted dates
dates = [{'mdy': x[0], 'my': x[1], 'm': int(x[2]), 'd': int(x[3]),
    'y': int(x[4].lstrip('/') or 0), 'c': int(x[5] or 0)} for x in mdy]

dates

[{'mdy': '12/25/2017', 'my': '12/25', 'm': 12, 'd': 25, 'y': 2017, 'c': 20},
 {'mdy': '12/12', 'my': '12/12', 'm': 12, 'd': 12, 'y': 0, 'c': 0}]

In [21]:
# Listing 11.6 Basic context maintenance
for i, d in enumerate(dates):
    for k, v in d.items():
        if not v:
            d[k] = dates[max(i - 1, 0)][k]
    
print(dates)

HR()


datetimes = [date(d['y'], d['m'], d['d']) for d in dates]
datetimes

[{'mdy': '12/25/2017', 'my': '12/25', 'm': 12, 'd': 25, 'y': 2017, 'c': 20}, {'mdy': '12/12', 'my': '12/12', 'm': 12, 'd': 12, 'y': 2017, 'c': 20}]
----------------------------------------


[datetime.date(2017, 12, 25), datetime.date(2017, 12, 12)]

In [22]:
# FIXME: remove unicode characters in regex or use regexes that can handle them
# deg,min,sec: 34°02'47.5"  # the degree unicode character will CRASH ipython!
deg_sym = r'[ ]?(°|d|deg|degree|degrees)[ ]?'
min_sym = r"[ ]?('|m|min|minute|minutes)[ ]?"
sec_sym = r'[ ]?("|s|sec|second|seconds)[ ]?'
dms = re.compile(r'([-]?[0-9]?[0-9]' + deg_sym +
                 r'[0-6]?[0-9]' + min_sym +
                 r'[0-6]?[0-9][.]?[0-9]{0,9}' + sec_sym +
                 r')[ ]?,[ ]?' +
                 r'([-]?1?[0-9]?[0-9]' + deg_sym +
                 r'[0-6]?[0-9]' + min_sym +
                 r'[0-6]?[0-9][.]?[0-9]{0,9}' + sec_sym +
                 r')')
dms.findall('34°02\'47.5"')
# []
print('34°02\'47.5"')
# 34°02'47.5"
dms.findall('34d02m47.5"')
# []
dms.findall('34d02m47.5s')
# []

34°02'47.5"


[]

In [23]:
# Listing 11.7 Regular expression for European dates
def extract_latlon(s):
    matches = dms.findall(s)
    if len(matches):
        return float(matches[-1][0]), float(matches[-1][-1])
    else:
        return None, s


us = r'(([01]?\d)[-/]([0123]?\d)([-/]([012]\d)?\d\d)?)'
re.findall(us, 'Santa came on 12/25/2017 and a star appeared 12/12')
# [('12/25/2017', '12', '25', '/2017', '20'), ('12/12', '12', '12', '', '')]

eu = r'(([0123]?\d)[-/]([01]?\d)([-/]([012]\d)?\d\d)?)'
re.findall(eu, 'Alan Mathison Turing OBE FRS (23/6/1912-7/6/1954) was an English computer scientist.')
[('23/6/1912', '23', '6', '/1912', '19'),
 ('7/6/1954', '7', '6', '/1954', '19')]


[('23/6/1912', '23', '6', '/1912', '19'),
 ('7/6/1954', '7', '6', '/1954', '19')]

In [24]:
# Listing 11.8 Recognizing years

# Deal with 2-digit an d4-digit and even 1-digit years from Year 0  to 3999 AD
# And lets name the parts of our year so we can easily coerce it into a datetime object
yr_19xx = (
    r'\b(?P<yr_19xx>' +
    '|'.join('{}'.format(i) for i in range(30, 100)) +
    r')\b'
    )
yr_20xx = (
    r'\b(?P<yr_20xx>' +
    '|'.join('{:02d}'.format(i) for i in range(10)) + '|' +
    '|'.join('{}'.format(i) for i in range(10, 30)) +
    r')\b'
    )
yr_cent = r'\b(?P<yr_cent>' + '|'.join('{}'.format(i) for i in range(1, 40)) + r')\b'
yr_ccxx = r'\b(?P<yr_ccxx>' + '|'.join('{:02d}'.format(i) for i in range(0, 100)) + r')\b'

yr = (
    r'\b(?P<yr>' +
    yr_19xx + '|' + yr_20xx + '|(?P<yr_xxxx>(' + yr_cent + ')(' + yr_ccxx + '))' +
    r')\b'
    )
re.findall(yr, "0, 2000, 01, '08, 99, 1984, 2030/1970 85 47 `66")

day = r'|'.join('{:02d}|{}'.format(i, i) for i in range(1, 32))




In [25]:
# Listing 11.9 Recognizing month words with regular expressions

mon_words = 'January February March April May June July ' \
    'August September October November December'
# mon = '|'.join('{}|{}|{}'.format(m, m[:4], m[:3]) for m in months.split())
mon = '|'.join('{}|{}|{}|{}|{:02d}'.format(
    m, m[:4], m[:3], i + 1, i + 1) for i, m in enumerate(mon_words.split()))

eu = r'\b((' + day + r')\b[-,/ ]{0,2}\b(' + mon + r')\b[-,/ ]{0,2}\b(' + yr + r'))\b'

re.findall(eu, '31 Oct, 1970 25/12/2017')
# [('31 Oct, 1970', '31', 'Oct', '1970', '19', '70'),
#  ('25/12/2017', '25', '12', '2017', '20', '17')]

# [('0', '', '0'), ('2000', '20', '00'), ('01', '', '01'), ('99', '9', '9'), ('1984', '19', '84'), ('2030', '20', '30'), ('1970', '19', '70')]
# re.findall(yr'0, 2000, 01, 99, 1984, 2030/1970 ')

eu = r'(([0123]?\d)[-/ ]([01]?\d|' + mon + r')((\,[ ]|[-/ ])([012]\d)?\d\d)?)'
re.findall(eu, 'Barack Hussein Obama II (born August 4, 1961) is an American politician...')
# <1> this catches year zero ("0")for the astronomical calendar
# <2> this catches year integers 0 through 3999


[]

---
<a name='11.4'></a><a id='11.4'></a>
# 11.4 Extracting relationships (relations)
<a href="#top">[back to top]</a>

<a name='11.4.1'></a><a id='11.4.1'></a>
## 11.4.1 Part-of-speec (POS) tagging
<a href="#top">[back to top]</a>

In [26]:
# Listing 11.12 POS tagging with spaCy

!python -m spacy download en_core_web_md  -q

print("Done")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
Done


In [27]:
# load spaCy model
en_model = spacy.load('en_core_web_md')

sentence = """In 1541 Desoto wrote in his journal that the Pascagoula people ranged as far north as the confluence of the Leaf and Chickasawhay r
ivers at 30.4, -88.5."""

# create a spaCy object 
parsed_sent = en_model(sentence)
parsed_sent.ents

(1541, Desoto, Pascagoula, Chickasawhay, 30.4)

In [28]:
# Listing 11.13 Visualize a dependency tree

sentence = "In 1541 Desoto wrote in his journal about the Pascagoula."
parsed_sent = en_model(sentence)

In [29]:
# Listing 11.14 Helper function for spaCy tagged strings

def token_dict(token):
    return OrderedDict(
        ORTH=token.orth_, 
        LEMMA=token.lemma_,
        POS=token.pos_, 
        TAG=token.tag_, 
        DEP=token.dep_
    )

def doc_dataframe(doc):
    return pd.DataFrame([token_dict(tok) for tok in parsed_sent])

In [30]:
test = doc_dataframe(en_model("In 1541 Desoto met the Pascagoula."))

test

Unnamed: 0,ORTH,LEMMA,POS,TAG,DEP
0,In,in,ADP,IN,prep
1,1541,1541,NUM,CD,pobj
2,Desoto,desoto,NOUN,NN,nsubj
3,wrote,write,VERB,VBD,ROOT
4,in,in,ADP,IN,prep
5,his,his,PRON,PRP$,poss
6,journal,journal,NOUN,NN,pobj
7,about,about,ADP,IN,prep
8,the,the,DET,DT,det
9,Pascagoula,Pascagoula,PROPN,NNP,pobj


In [31]:
# Listing 11.15 Example spaCy POS pattern
# define the pattern
pattern = [{'TAG': 'NNP'}, {'LEMMA': 'meet'}, {'IS_ALPHA': True, 'OP': '*'}, {'TAG': 'NNP'}]
pattern

[{'TAG': 'NNP'},
 {'LEMMA': 'meet'},
 {'IS_ALPHA': True, 'OP': '*'},
 {'TAG': 'NNP'}]

In [32]:
# Listing 11.16 Creating a POS pattern matcher with spaCy
# Matcher class object 

doc = en_model("In 1541 Desoto met the Pascagoula.")
print(doc)

matcher = Matcher(en_model.vocab)
print(matcher)

# The new version of spacy needs square brackets around pattern. 
matcher.add('meeting', [pattern])
m = matcher(doc)
m

In 1541 Desoto met the Pascagoula.
<spacy.matcher.matcher.Matcher object at 0x136575790>


[]

In [33]:
# Listing 11.17 Using a POS pattern matcher
doc = en_model("October 24: Lewis and Clark met their first Mandan Chief, Big White.")
m = matcher(doc)[0]
m

(14798207169164081740, 5, 10)

In [34]:
# Listing 11.18 Combining multiple patterns for a more robust pattern matcher
doc = en_model("On 11 October 1986, Gorbachev and Reagan met at a house")

pattern = [
    {'TAG': 'NNP', 'OP': '+'}, 
    {'LEMMA': 'and'}, 
    {'TAG': 'NNP', 'OP': '+'}, 
    {'IS_ALPHA': True, 'OP': '*'}, 
    {'LEMMA': 'meet'}
]

matcher.add('met', [pattern])
m = matcher(doc)
m

[(14332210279624491740, 5, 9)]

In [35]:
doc[m[-1][1]:m[-1][2]]

Gorbachev and Reagan met

<a name='11.4.2'></a><a id='11.4.2'></a>
## 11.4.2 Entity name normalization
<a href="#top">[back to top]</a>

<a name='11.4.3'></a><a id='11.4.3'></a>
## 11.4.3 Relation normalization and extraction
<a href="#top">[back to top]</a>

<a name='11.4.4'></a><a id='11.4.4'></a>
## 11.4.4 Word patterns
<a href="#top">[back to top]</a>

<a name='11.4.5'></a><a id='11.4.5'></a>
## 11.4.5 Segmentation
<a href="#top">[back to top]</a>

<a name='11.4.6'></a><a id='11.4.6'></a>
## 11.4.6 Why won't split('.!?') work?
<a href="#top">[back to top]</a>

<a name='11.4.7'></a><a id='11.4.7'></a>
## 11.4.7 Sentence segmentation with regular expressions
<a href="#top">[back to top]</a>

---
<a name='11.5'></a><a id='11.5'></a>
# 11.5 In the real world
<a href="#top">[back to top]</a>