In [1]:
import spacy, re, dateutil.parser
import numpy as np

In [2]:
# Load the `en_core_web_md` spaCy pipeline once; later cells call `nlp(...)` and use `nlp.vocab`.
nlp = spacy.load("en_core_web_md")

In [3]:
# Captures (id, body) for every `<P ID=n> ... </P>` block; DOTALL lets a body span lines.
tagCap = re.compile(r'<P ID=(\d+)>\s+(.+?)\s?</P>', re.DOTALL)

def readFile(filePath):
    """Read a tagged obituary file and split it into IDs and texts.

    Args:
        filePath: path to a UTF-8 text file made of `<P ID=n> ... </P>` blocks.

    Returns:
        (ids, txts): `ids` is a list of ints and `txts` a tuple of strings,
        in file order. A file with no matching blocks yields `([], ())`.
    """
    with open(filePath, 'r', encoding='utf-8') as fh:
        matches = tagCap.findall(fh.read())
    if not matches:
        # zip(*[]) produces nothing, so the two-name unpacking below would raise ValueError
        return [], ()
    ids, txts = zip(*matches)
    return [int(s) for s in ids], txts

In [4]:
# Parse the train and test files into parallel (ids, texts) sequences.
# Paths are relative to the notebook's working directory.
trIDs, trTxts = readFile('data/obits.train.txt')
testIDs, testTxts = readFile('data/obits.test.txt')

In [5]:
# Run the pipeline over all training texts; nlp.pipe batches the work instead of
# invoking the full pipeline once per text, and yields docs in input order.
trDocs = list(nlp.pipe(trTxts))

In [25]:
################################################################################
def extractName(doc):
    """Guess the deceased's name from the opening of an obituary.

    The candidate is the run of leading tokens of the first sentence, up to
    (not including) the first token that is a digit, starts with a lowercase
    letter, or is punctuation other than opening/closing punctuation. If no
    token of the candidate is tagged PERSON, the first PERSON entity of `doc`
    is used instead.

    Args:
        doc: a parsed document exposing `sents`, `ents`, token attributes and
            slicing (a spaCy `Doc`).

    Returns:
        str: the name text. Empty when the doc has no sentences, or when the
        candidate is empty and `doc` contains no PERSON entity.
    """
    sent = next(doc.sents, None)  # first sentence, or None for an empty doc
    if sent is None:
        return ''

    for n, tk in enumerate(sent):  # first sentence starts at doc index 0
        if (tk.is_digit or tk.shape_[0] == 'x' or
                tk.is_punct and not (tk.is_left_punct or tk.is_right_punct)):
            break
    else:
        n = len(sent)  # no stop token found: keep the whole first sentence

    # Drop a trailing punctuation token; guard n > 0 so doc[-1] is never consulted.
    if n > 0 and doc[n-1].is_punct:
        n -= 1
    nameSpan = doc[:n]

    # No PERSON token in the candidate: fall back to this doc's first PERSON entity.
    if not any(tk.ent_type_ == 'PERSON' for tk in nameSpan):
        for e in doc.ents:
            if e.label_ == 'PERSON':
                nameSpan = e
                break
    return nameSpan.text

# Spot-check the extractor on a single training document.
extractName(trDocs[3])

'Helen S. Aigen'

In [26]:
################################################################################
def isMan(tokens):
    """Guess gender from pronoun counts.

    Counts lowercased tokens equal to 'he'/'his' against 'she'/'her'.

    Args:
        tokens: iterable of objects exposing a `lower_` string attribute.

    Returns:
        bool: True when the male count is at least the female count, so a
        tie (including no pronouns at all) yields True.
    """
    words = [tk.lower_ for tk in tokens]
    maleCount = 0
    femaleCount = 0
    for word in words:
        if word == 'his' or word == 'he':
            maleCount += 1
        elif word == 'her' or word == 'she':
            femaleCount += 1
    return maleCount >= femaleCount

In [29]:
# Spot-check the pronoun heuristic on a single training document.
isMan(trDocs[0])

True

In [28]:
# Token matcher for age candidates: a token whose whole text is two digits,
# or three digits starting with 1.
ageMatch = spacy.matcher.Matcher(nlp.vocab)
# Raw string: "\d" inside a plain literal is an invalid escape sequence.
ageMatch.add("age", None, [{"TEXT": {"REGEX": r"^(1?\d\d)$"}}])
def getAgeDoc(doc):
    """Pick the largest age-like number among the tokens matched by `ageMatch`.

    Args:
        doc: parsed document accepted by `ageMatch`.

    Returns:
        (number, tokenIndex) for the largest matched number (first one wins on
        ties), or None when the matcher finds nothing.
    """
    numPos = [(int(doc[start:end].text), start) for _, start, end in ageMatch(doc)]
    # default=None: max() on an empty list would otherwise raise ValueError
    return max(numPos, key=lambda x: x[0], default=None)

# Spot-check: the result pairs the matched number with its token index.
getAgeDoc(trDocs[1])

(87, 7)

In [234]:
# ageRe = re.compile(r'\b(1?\d\d)(?:th|st|er|rd)?\b') # match digits for age
ageRe = re.compile(r'\b(1?\d\d)\b') # match digits for age
def getAgeRegex(doc):
    """Pick the largest age-like number found in the raw text of `doc`.

    A candidate is a standalone run of two digits, or of three digits starting
    with 1 (word boundaries on both sides, so years like 1930 are skipped).

    Args:
        doc: object exposing the document text as `doc.text`.

    Returns:
        (number, charOffset) for the largest candidate (first one wins on
        ties), or None when the text contains no candidate.
    """
    numPos = [(int(m.group(1)), m.start()) for m in ageRe.finditer(doc.text)]
    # default=None: max() on an empty list would otherwise raise ValueError
    return max(numPos, key=lambda x: x[0], default=None)

# Spot-check: the result pairs the matched number with its character offset in the text.
getAgeRegex(trDocs[12])

(80, 483)

In [376]:
# Spouse cues look like: "survivors including his wife ...", "greatly missed by her
# husband ...", "predeceased by ...".
# Many obituaries also describe the deceased as a great husband/wife, so the gender of
# the deceased (which is easy to determine) is used to tell a description of the
# deceased apart from a mention of the spouse.

def findSpouseName(doc, male=True):
    """Return the first PERSON entity that follows a spouse cue word.

    The cue is searched case-insensitively in the raw text: wife/spous/partner
    when `male` is True, husband/spous/partner otherwise. Only the first cue
    occurrence is considered.

    Args:
        doc: parsed document exposing `text`, token iteration (`idx`, `i`),
            slicing and `ents` (a spaCy `Doc`).
        male: whether the deceased is taken to be male.

    Returns:
        str or None: text of the first PERSON entity after the cue, or None
        when there is no cue, no token after it, or no PERSON entity after it.
    """
    if male:
        spouseRe = re.compile(r'(?:wife|spous|partner)', re.I)
    else:
        spouseRe = re.compile(r'(?:husband|spous|partner)', re.I)

    mt = spouseRe.search(doc.text)  # assume the first match is good
    if not mt:  # no cue word: no spouse can be found
        return None

    cueEnd = mt.end()
    for tk in doc:  # first token starting at or after the end of the regex match
        if tk.idx >= cueEnd:
            # Slice by token index; a char-offset span can come back as None
            # when the end offset is not on a token boundary.
            rest = doc[tk.i:]
            break
    else:
        return None  # cue was the last thing in the doc, nothing follows it

    for et in rest.ents:
        if et.label_ == 'PERSON':
            return et.text
    return None

# Spot-check: the pronoun-based gender guess is passed in as the `male` flag.
findSpouseName(trDocs[13], isMan(trDocs[13]))

'Robert W. Rudd'

In [270]:
# NOTE(review): placeholder — the pattern is empty and `bdayRe` is not referenced
# in any other visible cell; fill it in or remove it.
bdayRe = re.compile(r'')

409

In [57]:
################################################################################
def findDateAfterMatch(doc, matcher, startSpanAtMatch=False, extraSents=2):
    """Return the first DATE entity near the first hit of `matcher`.

    The search window runs from the start of the matched sentence (or from the
    matched token when `startSpanAtMatch` is True) to the end of the sentence
    `extraSents` sentences after it, clipped at the end of the document. Only
    DATE entities containing at least three consecutive digits are accepted.

    Args:
        doc: parsed document (a spaCy `Doc`).
        matcher: callable returning a list of (id, start, end) token matches.
        startSpanAtMatch: start the window at the match instead of at the
            beginning of its sentence.
        extraSents: how many following sentences to include in the window.

    Returns:
        str or None: text of the first accepted DATE entity, or None when the
        matcher finds nothing or the window holds no such entity.
    """
    matched = matcher(doc)
    if not matched:  # no match, so nothing comes after it
        return None

    matchStart = matched[0][1]  # assume the first match is good
    matchedSent = doc[matchStart].sent
    spanBeg = matchStart if startSpanAtMatch else matchedSent.start
    spanEnd = matchedSent.end
    for _ in range(extraSents):
        if spanEnd >= len(doc):  # already at the last sentence; doc[spanEnd] would raise
            break
        spanEnd = doc[spanEnd].sent.end

    for et in doc[spanBeg:spanEnd].ents:
        if et.label_ == 'DATE' and re.search(r'\d\d\d', et.text):
            return et.text
    return None

'Saturday, April 27th, 2019'

In [60]:
# Birth cues: a token whose lowercase form is "born" or "birth".
bdayMatch = spacy.matcher.Matcher(nlp.vocab)
# One single-token pattern per cue word. Putting both dicts in one list describes
# the two-token sequence "born birth", which never occurs.
bdayMatch.add("bday", None, [{"LOWER":"born"}], [{"LOWER":"birth"}])
findDateAfterMatch(trDocs[13], bdayMatch)

In [59]:
# Death cues: tokens whose lowercase form starts with one of the stems below.
deathMatch = spacy.matcher.Matcher(nlp.vocab)
# The REGEX operator searches anywhere in the token, so anchor at the token start;
# unanchored, "die" also hits "soldier" and "pass" hits "compass". "asleep" is
# listed explicitly because the unanchored "sleep" used to cover it.
deathSyns = r'^(?:die|pass|sleep|asleep|heaven|succumb|perish|decease)'
deathMatch.add("death", None, [{"LOWER":{"REGEX": deathSyns}}])

findDateAfterMatch(trDocs[19], deathMatch, startSpanAtMatch=True)

In [77]:
def findLocations(doc):
    """Collect the GPE entities found in the first half of `doc`.

    Args:
        doc: parsed document supporting `len`, slicing and `ents` (a spaCy `Doc`).

    Returns:
        list: GPE entity spans, in document order, from the first
        `len(doc) // 2` tokens. Repeated mentions are kept.
    """
    # Halve this document's own length, not that of an unrelated global doc.
    docHalf = doc[:len(doc)//2]
    return [e for e in docHalf.ents if e.label_=='GPE']

# Spot-check the location extractor on a single training document.
findLocations(trDocs[2])

[Portsmouth, Ohio, Washington D.C., Fort Myers, Florida, Washington D.C.]

In [78]:
# spacy.displacy.render(trDocs[0], style='ent')

In [8]:
# Sanity check: dateutil turns a long-form date string into a datetime object.
dateutil.parser.parse('September 30, 2018')

datetime.datetime(2018, 9, 30, 0, 0)

In [6]:
# def writeConllInput(outFile, txtLists):
#     with open(outFile, 'w', encoding='utf-8') as fh:
#         for txt in txtLists: # loop over documents
#             doc = nlp(txt) # analyze doc
#             fh.write('-DOCSTART- -X- -X- O\n\n')
#             for st in doc.sents: # loop over sentence
#                 fh.writelines([tk.text+'   \n' if tk.text!='\n' 
#                                else tk.text for tk in st])
#             fh.write('\n')

# writeConllInput('data/obits.train.conll', trTxts)
# writeConllInput('data/obits.test.conll', testTxts)