# 11. Information extraction (named entity extraction and question answering)

###  11.2 Regular patterns

In [1]:
def find_greeting(s):
    """ Return the the greeting string Hi, Hello, or Yo if it occurs at the beginning of a string """
    if s[0] == 'H':
        if s[:3] in ['Hi', 'Hi ', 'Hi,', 'Hi!']:
            return s[:2]
        elif s[:6] in ['Hello', 'Hello ', 'Hello,', 'Hello!']:
            return s[:5]
    elif s[0] == 'Y':
        if s[1] == 'o' and s[:3] in ['Yo', 'Yo,', 'Yo ', 'Yo!']:
            return s[:2]
    return None

In [2]:
find_greeting('Hi Mr. Turing!')

'Hi'

In [3]:
find_greeting('Hello, Rosa.')

'Hello'

In [4]:
find_greeting("Yo, what's up?")

'Yo'

In [5]:
find_greeting("Hello")

'Hello'

In [6]:
print(find_greeting("hello"))

None


In [7]:
print(find_greeting("HelloWorld"))

None


### 11.3.1 Extracting GPS locations

In [8]:
import re

lat = r'([-]?[0-9]?[0-9][.][0-9]{2,10})'
lon = r'([-]?1?[0-9]?[0-9][.][0-9]{2,10})'
sep = r'[,/ ]{1,3}'
re_gps = re.compile(lat + sep + lon)

In [9]:
re_gps.findall('http://...maps/@34.0551066,-118.2496763...')
# [(34.0551066, -118.2496763)]

[('34.0551066', '-118.2496763')]

In [10]:
re_gps.findall("https://www.openstreetmap.org/#map=10/5.9666/116.0566")
# [('5.9666', '116.0566')]

[('5.9666', '116.0566')]

In [11]:
re_gps.findall("Zig Zag Cafe is at 45.344, -121.9431 on my GPS.")
# [('45.3440', '-121.9431')]

[('45.344', '-121.9431')]

### 11.3.2 Extracting dates

In [22]:
us = r'((([01]?\d)[-/]([0123]?\d))([-/]([012]\d)\d\d)?)'
mdy = re.findall(us, 'Santa came 12/25/2017. An elf appeared 12/12')
mdy
# [('12/25/2017', '12', '25', '/2017', '20'), ('12/12', '12', '12', '', '')]

[('12/25/2017', '12/25', '12', '25', '/2017', '20'),
 ('12/12', '12/12', '12', '12', '', '')]

In [23]:
dates = [{'mdy': x[0], 'my': x[1], 'm': int(x[2]), 'd': int(x[3]),
        'y': int(x[4].lstrip('/') or 0), 'c': int(x[5] or 0)} for x in mdy]
dates

[{'mdy': '12/25/2017', 'my': '12/25', 'm': 12, 'd': 25, 'y': 2017, 'c': 20},
 {'mdy': '12/12', 'my': '12/12', 'm': 12, 'd': 12, 'y': 0, 'c': 0}]

In [24]:
for i, d in enumerate(dates):
    for k, v in d.items():
        if not v:
            d[k] = dates[max(i - 1, 0)][k]

dates

[{'mdy': '12/25/2017', 'my': '12/25', 'm': 12, 'd': 25, 'y': 2017, 'c': 20},
 {'mdy': '12/12', 'my': '12/12', 'm': 12, 'd': 12, 'y': 2017, 'c': 20}]

In [25]:
from datetime import date

datetimes = [date(d['y'], d['m'], d['d']) for d in dates]
datetimes

[datetime.date(2017, 12, 25), datetime.date(2017, 12, 12)]

In [27]:
eu = r'((([0123]?\d)[-/]([01]?\d))([-/]([012]\d)?\d\d)?)'
dmy = re.findall(eu, 'Alan Mathison Turing OBE FRS (23/6/1912-7/6/1954) was an English computer scientist.')
dmy
# [('23/6/1912', '23', '6', '/1912', '19'),
#  ('7/6/1954', '7', '6', '/1954', '19')]

[('23/6/1912', '23/6', '23', '6', '/1912', '19'),
 ('7/6/1954', '7/6', '7', '6', '/1954', '19')]

In [28]:
dmy = re.findall(eu, 'Alan Mathison Turing OBE FRS (23/6/12-7/6/54) was an English computer scientist.')
dmy

[('23/6/12', '23/6', '23', '6', '/12', ''),
 ('7/6/54', '7/6', '7', '6', '/54', '')]

In [48]:
# Deal with 2-digit and 4-digit and even 1-digit years from Year 0  to 3999 AD
# And lets name the parts of our year so we can easily coerce it into a datetime object
yr_19xx = (
    r'\b(?P<yr_19xx>' +
    '|'.join('{}'.format(i) for i in range(30, 100)) +
    r')\b'
    )
# print(f"yr_19xx: {yr_19xx}")
yr_20xx = (
    r'\b(?P<yr_20xx>' +
    '|'.join('{:02d}'.format(i) for i in range(10)) + '|' +
    '|'.join('{}'.format(i) for i in range(10, 30)) +
    r')\b'
    )
# print(f"yr_20xx: {yr_20xx}")
yr_cent = r'\b(?P<yr_cent>' + '|'.join('{}'.format(i) for i in range(1, 40)) + r')'
# print(f"yr_cent: {yr_cent}")
yr_ccxx = r'(?P<yr_ccxx>' + '|'.join('{:02d}'.format(i) for i in range(0, 100)) + r')\b'
# print(f"yr_ccxx: {yr_ccxx}")
yr_xxxx = r'(?P<yr_xxxx>(' + yr_cent + ')(' + yr_ccxx + '))'
yr = (
    r'\b(?P<yr>' +
    yr_19xx + '|' + yr_20xx + '|' + yr_xxxx +
    r')\b'
    )

groups = list(re.finditer(
    yr, "0, 2000, 01, '08, 99, 1984, 2030/1970 85 47 `66"
))
full_years = [g['yr'] for g in groups]
full_years

['2000', '01', '08', '99', '1984', '2030', '1970', '85', '47', '66']

In [63]:
mon_words = 'January February March April May June July ' \
    'August September October November December'

mon = (r"\b(?P<mon>" + '|'.join('{}|{}|{}|{}|{:02d}'.format(
    m, m[:4], m[:3], i + 1, i + 1) for i, m in enumerate(mon_words.split())) + 
       r")\b")

re.findall(mon, 'January has 31 days, February the 2nd month of 12, has 28, except in a Leap Year.')

['January', 'February', '12']

In [157]:
day = (r"\b(?P<day>" + r'|'.join('{:02d}|{}'.format(i, i) for i in range(1, 32)) + 
       r")\b")

# eu = r'\b((' + day + r')\b[-,/ ]{0,2}\b(' + mon + r')\b[-,/ ]{0,2}\b(' + yr + r'))\b'
# eu = r'(([0123]?\d)[-/ ]([01]?\d|' + mon + r')((\,[ ]|[-/ ])([012]\d)?\d\d)?)'

eu = (r"\b(" + day.replace("<day", "<eu_day") + r")\b[-,/ ]{0,2}\b(" + mon.replace("<mon", "<eu_mon") + r")\b[-,/ ]{0,2}\b(" + yr.replace("<yr", "<eu_yr") +r")\b")

us = (r"\b(" + mon.replace("<mon", "<us_mon") + r")\b[-,/ ]{0,2}\b(" + day.replace("<day", "<us_day") + r")\b[-,/ ]{0,2}\b(" + yr.replace("<yr", "<us_yr") +r")\b")

date_pattern = r"\b(" + eu + "|" + us + r")\b"

groups = list(re.finditer(date_pattern, "31 October, 1970 25/12/2017"))
groups

[<re.Match object; span=(0, 16), match='31 October, 1970'>,
 <re.Match object; span=(17, 27), match='25/12/2017'>]

In [159]:
import datetime

dates = []

for g in groups:
    month_num = (g["us_mon"] or g["eu_mon"]).strip()
    try:
        month_num = int(month_num)
    except ValueError:
        month_num = [w[:len(month_num)] for w in mon_words.split()].index(month_num) + 1

    date = datetime.date(
        int(g["us_yr"] or g["eu_yr"]),
        month_num,
        int(g["us_day"] or g["eu_day"])
    )
    dates.append(date)

dates

[datetime.date(1970, 10, 31), datetime.date(2017, 12, 25)]

### 11.4.1 Part-of-speech (POS) tagging

In [72]:
!python -m spacy download en_core_web_md

2022-12-22 16:44:59.533116: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-22 16:44:59.740282: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-22 16:44:59.740378: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-12-22 16:45:00.537452: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-

In [73]:
import spacy

en_model = spacy.load("en_core_web_md")

In [74]:
sentence = ("In 1541 Desoto wrote in his journal that the Pascagoula people " +
    "ranged as far north as the confluence of the Leaf and Chickasawhay rivers at 30.4, -88.5.")

parsed_sent = en_model(sentence)
parsed_sent.ents

(1541, Desoto, Pascagoula, Chickasawhay, 30.4)

In [76]:
" ".join(["{}_{}".format(tok, tok.tag_) for tok in parsed_sent])

'In_IN 1541_CD Desoto_NN wrote_VBD in_IN his_PRP$ journal_NN that_IN the_DT Pascagoula_NNP people_NNS ranged_VBD as_RB far_RB north_RB as_IN the_DT confluence_NN of_IN the_DT Leaf_NNP and_CC Chickasawhay_NNP rivers_NNS at_IN 30.4_CD ,_, -88.5_NN ._.'

In [154]:
from spacy.displacy import render

sentence = "In 1541 Desoto wrote in his journal about the Pascagoula."

parsed_sent = en_model(sentence)

with open("pascagoula.html", "w", encoding="utf-8") as f:
    f.write(str(render(docs=parsed_sent, page=True, jupyter=False, options=dict(compact=True))))

render(docs=parsed_sent, page=True, options=dict(compact=True))

In [85]:
import pandas as pd
from collections import OrderedDict

def token_dict(token):
    return dict(
        ORTH=token.orth_,
        POS=token.pos_,
        TAG=token.tag_,
        DEP=token.dep_,
        LEMMA=token.lemma_)


def doc_dataframe(doc):
    return pd.DataFrame([token_dict(tok) for tok in parsed_sent])


doc_dataframe(en_model("In 1541 Desoto met the Pascagoula."))

Unnamed: 0,ORTH,POS,TAG,DEP,LEMMA
0,In,ADP,IN,prep,in
1,1541,NUM,CD,pobj,1541
2,Desoto,NOUN,NN,nsubj,desoto
3,wrote,VERB,VBD,ROOT,write
4,in,ADP,IN,prep,in
5,his,PRON,PRP$,poss,his
6,journal,NOUN,NN,pobj,journal
7,about,ADP,IN,prep,about
8,the,DET,DT,det,the
9,Pascagoula,PROPN,NNP,pobj,Pascagoula


In [105]:
# pattern = [{'TAG': 'NNP'}, {'LEMMA': 'meet'}, {'IS_ALPHA': True, 'OP': '*'}, {'TAG': 'NNP'}]

pattern = [[{'TAG': 'NNP', 'OP': '+'}],
    [{'IS_ALPHA': True, 'OP': '*'}],
    [{'LEMMA': 'meet'}],
    [{'IS_ALPHA': True, 'OP': '*'}],
    [{'TAG': 'NNP', 'OP': '+'}]]

pattern

[[{'TAG': 'NNP', 'OP': '+'}],
 [{'IS_ALPHA': True, 'OP': '*'}],
 [{'LEMMA': 'meet'}],
 [{'IS_ALPHA': True, 'OP': '*'}],
 [{'TAG': 'NNP', 'OP': '+'}]]

In [106]:
from spacy.matcher import Matcher

doc = en_model("In 1541 Desoto met the Pascagoula.")

matcher = Matcher(en_model.vocab)
matcher.add('meeting', patterns=pattern)

m = matcher(doc)
m

[(14798207169164081740, 0, 1),
 (14798207169164081740, 2, 3),
 (14798207169164081740, 3, 4),
 (14798207169164081740, 2, 4),
 (14798207169164081740, 2, 5),
 (14798207169164081740, 3, 5),
 (14798207169164081740, 4, 5),
 (14798207169164081740, 2, 6),
 (14798207169164081740, 3, 6),
 (14798207169164081740, 4, 6),
 (14798207169164081740, 5, 6)]

In [107]:
doc[m[0][1]:m[0][2]]

In

In [108]:
doc = en_model("October 24: Lewis and CLark met their first Mandan Chief, Big White.")
m = matcher(doc)[0]
m

(14798207169164081740, 0, 1)

In [109]:
doc[m[1]:m[2]]

October

In [110]:
doc = en_model("On 11 October 1986, Gorbachev and Reagan met at a house")
matcher(doc)

[(14798207169164081740, 0, 1),
 (14798207169164081740, 2, 3),
 (14798207169164081740, 5, 6),
 (14798207169164081740, 5, 7),
 (14798207169164081740, 6, 7),
 (14798207169164081740, 5, 8),
 (14798207169164081740, 6, 8),
 (14798207169164081740, 7, 8),
 (14798207169164081740, 8, 9),
 (14798207169164081740, 5, 9),
 (14798207169164081740, 6, 9),
 (14798207169164081740, 7, 9),
 (14798207169164081740, 5, 10),
 (14798207169164081740, 6, 10),
 (14798207169164081740, 7, 10),
 (14798207169164081740, 8, 10),
 (14798207169164081740, 9, 10),
 (14798207169164081740, 5, 11),
 (14798207169164081740, 6, 11),
 (14798207169164081740, 7, 11),
 (14798207169164081740, 8, 11),
 (14798207169164081740, 9, 11),
 (14798207169164081740, 10, 11),
 (14798207169164081740, 5, 12),
 (14798207169164081740, 6, 12),
 (14798207169164081740, 7, 12),
 (14798207169164081740, 8, 12),
 (14798207169164081740, 9, 12),
 (14798207169164081740, 10, 12),
 (14798207169164081740, 11, 12)]

In [112]:
doc = en_model("On 11 October 1986, Gorbachev and Reagan met at a house")

pattern = [[{'TAG': 'NNP', 'OP': '+'}],
    [{'LEMMA': 'and'}],
    [{'TAG': 'NNP', 'OP': '+'}],
    [{'IS_ALPHA': True, 'OP': '*'}],
    [{'LEMMA': 'meet'}]]

matcher.add('met', patterns=pattern)

m = matcher(doc)
m

[(14798207169164081740, 0, 1),
 (14332210279624491740, 0, 1),
 (14798207169164081740, 2, 3),
 (14332210279624491740, 2, 3),
 (14798207169164081740, 5, 6),
 (14332210279624491740, 5, 6),
 (14332210279624491740, 6, 7),
 (14798207169164081740, 5, 7),
 (14332210279624491740, 5, 7),
 (14798207169164081740, 6, 7),
 (14798207169164081740, 5, 8),
 (14332210279624491740, 5, 8),
 (14798207169164081740, 6, 8),
 (14332210279624491740, 6, 8),
 (14798207169164081740, 7, 8),
 (14332210279624491740, 7, 8),
 (14798207169164081740, 8, 9),
 (14332210279624491740, 8, 9),
 (14798207169164081740, 5, 9),
 (14332210279624491740, 5, 9),
 (14798207169164081740, 6, 9),
 (14332210279624491740, 6, 9),
 (14798207169164081740, 7, 9),
 (14332210279624491740, 7, 9),
 (14798207169164081740, 5, 10),
 (14332210279624491740, 5, 10),
 (14798207169164081740, 6, 10),
 (14332210279624491740, 6, 10),
 (14798207169164081740, 7, 10),
 (14332210279624491740, 7, 10),
 (14798207169164081740, 8, 10),
 (14332210279624491740, 8, 10),


In [113]:
doc[m[-1][1]:m[-1][2]]

house

### 11.4.7 Sentence segmentation with regular expressions

In [114]:
re.split(r"[!.?]+[ $]", "Hello World.... Are you there?!?! I'm going to Mars!")

['Hello World', 'Are you there', "I'm going to Mars!"]

In [115]:
re.split(r"[!.?]+ ", "The author wrote \"'I don't think it's conscious.' Turing said.\"")

['The author wrote "\'I don\'t think it\'s conscious.\' Turing said."']

In [116]:
re.split(r"[!.?]+ ", "The author wrote \"'I don't think it's conscious.' Turing said.\" But I stopped reading.")

['The author wrote "\'I don\'t think it\'s conscious.\' Turing said." But I stopped reading.']

In [118]:
re.split(r"(?<!\d)\.|\.(?!\d)", "I went to GT.You?")

['I went to GT', 'You?']

In [128]:
# from nlpia.data.loaders import get_data
# examples = get_data("sentences-tm-town")
import pandas as pd
examples = pd.read_json("../../data/sentences-tm-town.json")

examples.head()

Unnamed: 0,0,1,2
0,NLPIA Ch 11: Multiple period-delimitted abbrev...,I live in the U.S. but I commute to work in Me...,[I live in the U.S. but I commute to work in M...
1,NLPIA Ch 11: G.T. as sentence boundary,I went to G.T. You?,"[I'm went to G.T., You?]"
2,NLPIA Ch 11: Abbreviation period without a spa...,I went to G.T.You?,"[I'm went to G.T., You?]"
3,NLPIA Ch 11: Quote within a sentence followed ...,"She yelled ""It's right here!"" but I kept looki...","[She yelled ""It's right here!"" but I kept look..."
4,NLPIA Ch 11: Multiple quotes,"I stared dumbfounded as things like ""How did I...","[I stared dumbfounded as things like ""How did ..."


In [149]:
regex = re.compile(r"((?<!\d)\.|\.(?!\d))|([!.?]+)[ $]+")
wrong = []

# for i, (challenge, text, sents) in enumerate(examples):
for i, row in examples.iterrows():
    challenge, text, sents = row[0], row[1], row[2]
    if tuple(regex.split(text)) != tuple(sents):
        print("wrong {}: {}{}".format(i, text[:50], '...' if len(text) > 50 else ''))
        wrong += [i]

len(wrong), len(examples)

wrong 0: I live in the U.S. but I commute to work in Mexico...
wrong 1: I went to G.T. You?
wrong 2: I went to G.T.You?
wrong 3: She yelled "It's right here!" but I kept looking f...
wrong 4: I stared dumbfounded as things like "How did I get...
wrong 5: She continued with her story, "'I don't think it's...
wrong 6: Hello French Fry. My name is Katsup.
wrong 7: What is your name? My name is Heinz 57.
wrong 8: There it is! I found it.
wrong 9: My name is Chuck E. Cheese.
wrong 10: Please turn to p. 42.
wrong 11: Please turn to ch. five.
wrong 12: Please turn to ch 5.
wrong 13: Hurry! Turn to ch 5!
wrong 14: Ready? Turn to ch 5.
wrong 15: Were Jane and co. at the party?
wrong 16: They closed the deal with Pitt, Briggs & Co. at no...
wrong 17: Let's ask Jane and co. They should know.
wrong 18: They closed the deal with Pitt, Briggs & Co. It cl...
wrong 19: I can see Mt. Fuji from here.
wrong 20: St. Michael's Church is on 5th st. near the light.
wrong 21: That is JFK Jr.'s book.
wrong 22:

(61, 61)