In [None]:
import pprint, re, dateutil.parser
# Monkey-patch: bind a pass-through `sorted` on the pprint module so that
# pretty-printed dicts keep their insertion order instead of being key-sorted.
# NOTE(review): relies on pprint resolving `sorted` as a module global - confirm
# for the Python version in use (newer versions offer `sort_dicts=False`).
pprint.sorted = lambda x, key=None: x # disable sorting of results

In [None]:
import spacy
# Shared pipeline: every Matcher below is built on `nlp.vocab`, and each
# obituary text is parsed with `nlp(txt)` before extraction.
nlp = spacy.load("en_core_web_md") # load spacy English model

# Lab 07: Information Extraction

Student: John Wu

In [None]:
# Captures (id, body) from every <P ID=n> ... </P> block; DOTALL lets the body
# span several lines.
tagCap = re.compile(r'<P ID=(\d+)>\s+(.+?)\s?</P>', re.DOTALL)
def readFiles(filePath):
    """Read an obituary file and split it into document IDs and texts.

    Args:
        filePath: path of a UTF-8 text file containing `<P ID=n> ... </P>` blocks.

    Returns:
        (ids, txts): `ids` is a list of int document IDs and `txts` a tuple of
        the matching document bodies, in file order. When the file contains no
        `<P>` block the result is `([], ())` instead of a ValueError from
        unpacking an empty `zip`.
    """
    with open(filePath, 'r', encoding='utf-8') as fh:
        matches = tagCap.findall(fh.read())
    if not matches: # zip(*[]) yields nothing to unpack
        return [], ()
    ids, txts = zip(*matches)
    return [int(s) for s in ids], txts
    
# Loaded once when this cell runs; the path is relative to the notebook's
# working directory. `trTxts` is what the ad-hoc inspection lines index into.
trIDs, trTxts = readFiles('data/obits.train.txt') # get training files

def firstTknAfterChar(doc, charOffset):
    """Return the first token that starts at or after a character offset.

    Args:
        doc: parsed document exposing `.text` and iterable tokens with `.idx`.
        charOffset: character position inside `doc.text`.

    Returns:
        The first token whose `idx` is >= `charOffset`, or None when the offset
        lies at/after the end of the text or no token starts that late.
    """
    if len(doc.text) <= charOffset: # offset is beyond the document text
        return None
    laterTokens = (token for token in doc if token.idx >= charOffset)
    return next(laterTokens, None)

## Extraction of Required Relations

__Name of the deceased__

In [None]:
def extractName(doc):
    """Guess the name of the deceased from the start of the obituary.

    The leading tokens of the first sentence are taken as the name, up to the
    first token that is a number, starts with a lowercase letter, or is
    punctuation other than a comma / bracket / quote. If that leading span holds
    no PERSON entity token, the first PERSON entity of the whole document is
    used instead.

    Args:
        doc: parsed spaCy document (needs sentence boundaries and entities).

    Returns:
        The name text; an empty string when nothing usable is found.
    """
    sent = next(doc.sents, None) # first sentence; None for an empty document
    if sent is None:
        return ''

    # Default to the whole sentence: without it, a sentence with no stop token
    # silently lost its last token.
    end = len(sent)
    for n, tk in enumerate(sent):
        if (tk.is_digit or tk.shape_[0] == 'x' or tk.is_punct and
                not (tk.text == ',' or tk.is_left_punct or tk.is_right_punct)):
            end = n
            break

    # Drop one trailing punctuation token. The `end > 0` guard matters: with
    # end == 0, doc[end-1] would wrap to the LAST token of the document and
    # doc[:-1] would return almost the entire text as the "name".
    if end > 0 and doc[end-1].is_punct:
        end -= 1
    nameSpan = doc[:end]

    # fall back to the first PERSON entity when the span contains none
    if not any(tk.ent_type_ == 'PERSON' for tk in nameSpan):
        for e in doc.ents:
            if e.label_ == 'PERSON':
                nameSpan = e
                break
    return nameSpan.text

__Sex of the deceased__

In [None]:
def isMan(tokens):
    """Decide the sex of the deceased from pronoun counts.

    Args:
        tokens: iterable of tokens exposing `.lower_` (e.g. a parsed document).

    Returns:
        True when "he"/"his" occur at least as often as "she"/"her"
        (ties, including no pronouns at all, count as male); False otherwise.
    """
    malePronouns = ('his', 'he')
    femalePronouns = ('her', 'she')
    balance = 0 # (#male pronouns) - (#female pronouns)
    for token in tokens:
        word = token.lower_
        if word in malePronouns:
            balance += 1
        elif word in femalePronouns:
            balance -= 1
    return balance >= 0

__Age at death__

In [None]:
# Matches a token that is exactly a 2-digit number or a 3-digit number starting
# with 1. Raw string: "\d" in a normal literal is an invalid escape sequence and
# triggers a Deprecation/SyntaxWarning on recent Python versions.
ageMatch = spacy.matcher.Matcher(nlp.vocab)
ageMatch.add("age", None, [{"TEXT": {"REGEX": r"^(1?\d\d)$"}}])
def getAgeDoc(doc):
    """Pick the most likely age at death from the numbers in a document.

    Args:
        doc: parsed spaCy document.

    Returns:
        (age, tokenIndex) for the largest number matched by `ageMatch`, where
        `tokenIndex` is the start token of that match; `(None, None)` when the
        document contains no such number.
    """
    matches = ageMatch(doc)
    if not matches:
        return (None, None)
    # each match is (match_id, start, end) in token indices
    numPos = [(int(doc[start:end].text), start) for _, start, end in matches]
    return max(numPos, key=lambda x: x[0])

__Location(s) of residency__

In [None]:
## Note: the first part of an obituary most likely contains the locations where
## the deceased lived. The end is more likely to mention where the death
## happened or where the funeral service is held, which may not be places of
## residence, so it is excluded from the search.
###############################################################################

# tokens such as "survived", "survivors", "predeceased", "pre-deceased"
survivor = spacy.matcher.Matcher(nlp.vocab)
survivor.add("surv", None, [{"LOWER":{"REGEX": r'(surviv|pre-?deceas).*'}}])

def findLocations(doc):
    """Collect location mentions that likely describe where the deceased lived.

    GPE/LOC entities are taken from the first ~80% of the document, skipping
    any that start inside a sentence mentioning survivors / predeceased
    relatives. Each location is extended to the right over directly following
    entity tokens, optionally joined by a comma (e.g. "City, State").

    Args:
        doc: parsed spaCy document.

    Returns:
        A set of location strings; empty when none is found.
    """
    nTokens = len(doc)
    # Multiply before dividing: `len(doc)//10*8` truncated first, so documents
    # shorter than 10 tokens were never searched at all.
    part = doc[:nTokens * 8 // 10] # exclude the end (where funeral info is)
    locs, locIdxs = set(), set()

    # Token indices of sentences about survivors. Span.end is already
    # exclusive, so no +1 (it used to block the next sentence's first token).
    for m in survivor(doc):
        s = doc[m[1]].sent
        locIdxs.update(range(s.start, s.end))

    for loc in part.ents:
        if loc.label_ not in ('GPE', 'LOC'):
            continue
        if loc.start in locIdxs: # in a survivor sentence or already consumed
            continue
        endIdx = loc.end
        # Extend over adjacent entity tokens and ", <entity>" continuations,
        # never indexing past the last token of the document.
        while endIdx < nTokens:
            tk = doc[endIdx]
            if tk.ent_type_:
                endIdx += 1
            elif (tk.text == ',' and endIdx + 1 < nTokens
                    and doc[endIdx+1].ent_type_):
                endIdx += 1
            else:
                break
        locs.add(doc[loc.start:endIdx].text)
        locIdxs.update(range(loc.start, endIdx))
    return locs # empty set when no eligible GPE/LOC entity was found

In [None]:
# spacy.displacy.render(nlp(trTxts[19]), style='ent')

__Spouse(s) of the deceased__

In [None]:
# Ad-hoc probe, disabled: `spouses` is only defined in the following cell, so
# on a fresh kernel (Restart & Run All) this line raised NameError; it also
# raised IndexError for a text without any spouse keyword.
# Run it manually after the Matcher has been created:
# spouses(nlp(trTxts[11]))[0]

In [None]:
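# Target patterns: "survivors including his wife ...", "greatly missed by her
# husband ...", "predeceased by ...".
# Caveat: many obituaries also describe the deceased as a great husband/wife, so
# the keyword alone cannot tell the deceased from the spouse. Gender (which is
# easy to find) could be used to tell them apart; the code below does not do
# this yet and simply takes the first keyword match.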

# any token containing one of the spouse keywords ("wife", "husbands", ...)
spouses = spacy.matcher.Matcher(nlp.vocab)
spouses.add('sp', None, [{"LOWER":{"REGEX":r'(husband|wife|spouse|partner).*'}}])

def findSpouseName(doc):
    """Return the name of the spouse mentioned in an obituary, if any.

    Only the first spouse keyword is considered: the text from that keyword to
    the end of its sentence is searched for a PERSON entity.

    Args:
        doc: parsed spaCy document.

    Returns:
        Text of the first PERSON entity after the keyword, or None when there
        is no keyword or no person in the rest of that sentence.
    """
    hits = spouses(doc)
    if not hits: # no spouse keyword at all
        return None

    keywordIdx = hits[0][1] # start token of the first match
    restOfSentence = doc[keywordIdx: doc[keywordIdx].sent.end]
    people = (ent.text for ent in restOfSentence.ents if ent.label_ == 'PERSON')
    return next(people, None)

## Extraction of Additional Relations

Two additional relations are extracted: the date of birth and the date of death.

In [None]:
# Birth cue: a token whose lowercase form is exactly "born" or "birth"
# (two alternative single-token patterns under one key).
bdayMatch = spacy.matcher.Matcher(nlp.vocab)
bdayMatch.add("bday", None, [{"LOWER":"born"}],[{"LOWER":"birth"}])

# Death cue: a lowercase token starting with one of the stems below; `.?sleep`
# allows one leading character (e.g. "asleep"). The stems are prefixes only, so
# unrelated words sharing a prefix (e.g. "passenger") also match.
# NOTE(review): `add(key, None, pattern)` is the older Matcher.add call form -
# confirm against the installed spaCy version.
deathMatch = spacy.matcher.Matcher(nlp.vocab)
deathSyns = r'^(die|pass|.?sleep|heaven|succumb|perish).*' # synonyms for death
deathMatch.add("death", None, [{"LOWER":{"REGEX": deathSyns}}])

################################################################################
def findDateAfterMatch(doc, matcher, startAtMatch=False, extraSents=2):
    """Find the first DATE entity near the first hit of a matcher.

    The search window starts at the matched token (`startAtMatch=True`) or at
    the beginning of its sentence, and runs to the end of that sentence plus up
    to `extraSents` following sentences.

    Args:
        doc: parsed spaCy document.
        matcher: callable returning a list of (match_id, start, end) tuples.
        startAtMatch: start the window at the match instead of the sentence start.
        extraSents: number of following sentences to include (default 2).

    Returns:
        Text of the first DATE entity in the window that contains at least two
        consecutive digits, or None when there is no match or no such date.
    """
    matched = matcher(doc)
    if not matched: # if no match, then nothing is after
        return None

    matchStart = matched[0][1] # assume 1st match is good
    matchedSent = doc[matchStart].sent
    spanBeg = matchStart if startAtMatch else matchedSent.start
    spanEnd = matchedSent.end
    for _ in range(extraSents):
        # Sentence ends are exclusive: at the end of the document there is no
        # next sentence, and doc[spanEnd] would raise IndexError.
        if spanEnd >= len(doc):
            break
        spanEnd = doc[spanEnd].sent.end

    for et in doc[spanBeg:spanEnd].ents:
        # require two digits in a row to skip vague dates such as "Monday"
        if et.label_ == 'DATE' and re.search(r'\d\d', et.text):
            return et.text
    return None

# findDateAfterMatch(nlp(trTxts[11]), deathMatch, startAtMatch=True)

## Filling Template and Outputting Result

In [None]:
def fillTemplate(doc):
    """Run every extractor on one obituary and collect the results.

    Args:
        doc: parsed spaCy document of a single obituary.

    Returns:
        dict with keys 'name', 'sex', 'age', 'locations', 'spouse',
        'birth date' and 'death date'. 'age' is the largest plausible number in
        the text, replaced by the age computed from the two dates when both
        dates are found, parse, and give a non-negative result.
    """
    info = dict()
    info['name'] = extractName(doc)
    info['sex'] = 'male' if isMan(doc) else 'female'
    info['age'] = getAgeDoc(doc)[0] # get age using document parsing
    info['locations'] = list(findLocations(doc))
    info['spouse'] = findSpouseName(doc)
    info['birth date'] = findDateAfterMatch(doc, bdayMatch)
    info['death date'] = findDateAfterMatch(doc, deathMatch, startAtMatch=True)

    if info['birth date'] and info['death date']: # if neither date is missing
        try:
            bd = dateutil.parser.parse(info['birth date'])
            dd = dateutil.parser.parse(info['death date'])
        except (ValueError, OverflowError):
            # DATE entities are free text ("the age of 85", "last Tuesday");
            # an unparseable one must not abort the run - keep the matcher age.
            pass
        else:
            # NOTE(review): dateutil fills fields missing from the text with
            # defaults, so partial dates can give a misleading age - confirm.
            age = (dd.year-bd.year) - ((dd.month,dd.day)<(bd.month,bd.day))
            if age >= 0: # a negative age means at least one date is wrong
                info['age'] = age

    return info

In [None]:
def parseObitsOutputInfo(obitFiles, outInfoFiles):
    """Extract the template for every obituary in a file and write the results.

    Args:
        obitFiles: path of the input file with `<P ID=n> ... </P>` obituaries.
        outInfoFiles: path of the output file; it is overwritten. One
            pretty-printed dict per obituary, separated by blank lines.
    """
    docIDs,txts = readFiles(obitFiles)

    # The input is read as UTF-8, so write UTF-8 as well: with the platform
    # default encoding, non-ASCII names can fail to encode on output.
    with open(outInfoFiles, 'w', encoding='utf-8') as outFH:
        for docID,txt in zip(docIDs,txts):
            doc = nlp(txt)
            out = {'ID': docID} # ID first so it leads each printed record
            out.update(fillTemplate(doc))
            pprint.pprint(out, outFH)
            outFH.write('\n')

In [None]:
# Run the extraction over the training obituaries; the results file is written
# relative to the current working directory and overwritten on every run.
parseObitsOutputInfo('data/obits.train.txt', 'obits.train.out')

In [None]:
# parseObitsOutputInfo('data/obits.test.txt', 'obits.test.out')

## Evaluation