## Narrative Text Extraction from Autopsies
### Preprocess Autopsy Text for Narrative Extraction
Lowercase the text, collapse newlines and repeated whitespace into single spaces, and remove stray symbols.

In [None]:
import os
import pandas as pd
import numpy as np
import re

### collapse newlines and runs of whitespace into single spaces
def rm_whitespace(doc):
    """Return `doc` with every run of whitespace (newlines included) replaced
    by one space and with leading/trailing whitespace removed."""
    # str.split() with no argument splits on any whitespace, newlines included,
    # and discards empty pieces, so a single split/join covers both steps.
    return ' '.join(doc.split())

### clean text: lowercase, remove whitespace, symbols
symbols = '�¥•°·□§Â®â€™µα_'
def rm_whitespace_sym(doc_text):
    """Lowercase `doc_text`, normalise its whitespace with `rm_whitespace`,
    and delete every character listed in the module-level `symbols` string."""
    normalised = rm_whitespace(doc_text.lower())
    # Deletion-only translation table: each character in `symbols` maps to nothing.
    drop_table = str.maketrans('', '', symbols)
    return normalised.translate(drop_table)

### assign forensic center based on different keywords
def get_forensic_center(doc_text):
    """Return the forensic-center code ('M', 'W', 'NE', 'E' or 'SE') whose
    keyword pattern is found in `doc_text`, or None when no pattern matches.

    Patterns are tried in the order listed below and the first hit wins.
    """
    center_patterns = (
        ('M', 'center for forensic medicine'),
        ('W', '(office of the medical examiner west)|(west tennessee( regional forensic center)?)'),
        ('NE', '(william l jenkins)|(quillen college)|(east tennessee state)'),
        ('E', '(knox county)|(sevier county)|(regional (forensic( center)? )?knox)'),
        ('SE', 'hamilton county( forensic center)?'),
    )

    for center, pattern in center_patterns:
        if re.search(pattern, doc_text):
            return center

    return None

## Extract Narrative Sections Based on Forensic Center
Narrative text from three sections:

#### (1) Initial Narrative (all except SE)

Start = "narrative" or "summary" or for NE "brief history"
End = depends on forensic center:
M/W: jurisdiction accept, toxicology order, autopsy order, yes, approve, electronically sign, signed, reference comment, external exam
E: final anatomic diagnosis/diagnoses i., external exam, reference comment
NE: signature, external exam, autopsy exam, reference comment
SE: no narrative section
#### (2) Interpretation/Summary (M/W only)

Start = conclusion, summary, summary and opinion, summary and interpretation, summary and comment
Search Space = internal/external exam to beginning of toxicology (@nms, reference comment, electronically sign)
#### (3) Summary of Circumstances (all)

Start = "summary of ci[rcumstances]"
End = end of autopsy (text from "signature" or "i hereby declare" onward is dropped, as is text after "postmortem observations")
Search Space = end of narrative to end of autopsy

In [None]:
import regex

### extract initial narrative section using regex that locates beginning and end of each narrative
### note: all formats except SE should have initial narrative section
# regex for start of narrative
narr_start_re = re.compile(r'(?<!analysis )(narrative|summary).*')
narr_start_re_NE = re.compile(r'(?<!analysis )(narrative|summary|brief\s?history).*')

# regex for end of narrative
narr_re_end = dict()
narr_re_end['M'] = narr_re_end['W'] = re.compile(r'''(jurisdiction\s?accept|toxicology\s?order|
autopsy\s?order|yes|approve|electronically\ssign|signed|reference\scomment|external\s?exam)''', re.X)
narr_re_end['E'] = re.compile(r'(final\s?anatomic\s?diagnos[ie]s\si\.|external\sexam|reference\scomment)')
narr_re_end['NE'] = re.compile(r'(signature|(external|autopsy)\s?exam|reference\scomment)')

def get_narr(doc, fc):
    """Extract the initial narrative section of an autopsy report.

    Parameters
    ----------
    doc : cleaned report text.
    fc  : forensic-center code; only codes that are keys of `narr_re_end`
          ('M', 'W', 'E', 'NE') have an initial narrative.

    Returns
    -------
    (narr, narr_end)
        narr     : text from the first start marker up to (not including) the
                   first end marker at or after it, or np.nan when `fc` has no
                   narrative format, no start marker is found, or `doc` is not
                   a string.
        narr_end : index in `doc` where the narrative stops; 0 when no
                   narrative was extracted, len(doc) when no end marker exists.
    """
    # no narratives in SE (or any center without an end-marker regex)
    if fc not in narr_re_end:
        return np.nan, 0

    # locate start of narrative summary
    start_re = narr_start_re_NE if fc == 'NE' else narr_start_re
    try:
        narr_match_start = start_re.search(doc)
    except TypeError:
        # doc is not text (e.g. NaN for a missing report): nothing to extract
        return np.nan, 0
    if narr_match_start is None:
        return np.nan, 0
    narr_start = narr_match_start.start()

    # locate end of narrative summary: first end marker at or after the start.
    # finditer yields matches in increasing position, so the first hit is the minimum.
    # When no marker exists the narrative runs to the end of the document; use
    # len(doc) (not len(doc) - 1) so the final character is not cut off.
    narr_end = next(
        (m.start() for m in narr_re_end[fc].finditer(doc) if m.start() >= narr_start),
        len(doc),
    )

    return doc[narr_start:narr_end], narr_end

### extract summary of circumstances using regex (accounts for extra spaces b/c poorer data quality)
### note: all formats may have summary of circumstances
circ_sum_re = re.compile(r'(s\s?u\s?m\s?m\s?a\s?r\s?y\s?of\s?ci).*')
# everything from a signature / declaration marker onward is dropped from the section
_circ_tail_re = re.compile(r'(signature|i\shereby\sdeclare).*')
def get_circ(doc, narr_end):
    """Extract the "summary of circumstances" section found at or after `narr_end`.

    Parameters
    ----------
    doc      : cleaned report text.
    narr_end : index in `doc` from which to start searching.

    Returns
    -------
    (circ, circ_start)
        circ       : section text with anything from "signature" or
                     "i hereby declare" onward removed, or np.nan if the
                     section heading is not found.
        circ_start : index of the section heading in `doc` (an absolute
                     position, so it can be used directly to slice `doc`);
                     len(doc) when the section is not found.
    """
    # Search with a start position instead of slicing so that match offsets
    # are positions in `doc` itself rather than in doc[narr_end:].
    circ_match = circ_sum_re.search(doc, narr_end)
    if circ_match is None:
        return np.nan, len(doc)

    circ = _circ_tail_re.sub('', circ_match.group())
    return circ, circ_match.start()

### extract interpretation/summary section using regex that locate beginning of interpretation,
### toxicology, internal exam, and external exam sections
### note: only W, M have interpretation section that occurs after internal exam and before toxicology
interp_regex = r'((?:s\s?u\s?m\s?m\s?a\s?r\s?y(?:\s?and\s?(?:opinion|interpretation|comment))?|conclusion\b).*)'
tox_re = re.compile(r'@nms|reference\scomment|electronically\ssign')
int_exam_re = re.compile(r'internal\sexam')
ext_exam_re = re.compile(r'external\sexam')
def get_interp(doc, narr_end, circ_start, fc):
    """Return the interpretation/summary text of an M or W report, else np.nan.

    The search window opens at the first "internal exam" (or, failing that,
    the first "external exam", or `narr_end`) and closes at the first
    toxicology marker at or after that point (or `circ_start` when there is
    none). Within the window, the text starting at the last interpretation
    heading is returned.
    """
    if fc not in ('M', 'W'):
        return np.nan

    # window start: first internal exam heading, else first external exam heading
    exam_hit = int_exam_re.search(doc) or ext_exam_re.search(doc)
    window_start = exam_hit.start() if exam_hit else narr_end

    # window end: first toxicology marker that does not precede the window start
    window_end = next(
        (hit.start() for hit in tox_re.finditer(doc) if hit.start() >= window_start),
        circ_start,
    )

    # overlapped matching yields one candidate per heading position; keep the last
    candidates = regex.findall(interp_regex, doc[window_start:window_end], overlapped = True)
    return candidates[-1] if candidates else np.nan

### extract initial narrative, interpretation/summary, and summary of circumstances based on forensic center
### concatenate narrative sections into full narrative
### add flags to indicate which ones were found in each report
### calculate length of full narrative
def get_narrative(autopsies_df):
    """Build a per-report table of extracted narrative sections.

    Parameters
    ----------
    autopsies_df : DataFrame with columns `DID`, `doc_clean`,
                   `forensic_center` and `year` (the attributes read from
                   each row below).

    Returns
    -------
    DataFrame with the input identifiers plus:
        narr, interp, circ                : extracted sections ('' if absent)
        has_narr, has_interp, has_circ    : 1/0 flags set before blanks are filled
        full_narr                         : narr + interp + circ concatenated
        full_narr_len                     : character length of full_narr
    """
    narrative_matches = []
    for row in autopsies_df.itertuples():
        doc, fc = row.doc_clean, row.forensic_center

        ### extract narrative that comes at beginning of autopsy and before external examination
        narr, narr_end = get_narr(doc, fc)

        ### extract summary of circumstances that sometimes is included, comes at end of report
        circ, circ_start = get_circ(doc, narr_end)

        ### extract narrative that comes after internal examination and before toxicology report
        interp = get_interp(doc, narr_end, circ_start, fc)

        narrative_matches.append([row.DID, fc, row.year, doc, narr, interp, circ])

    narr_df = pd.DataFrame(narrative_matches, columns = ['DID', 'forensic_center', 'year', 'doc_clean',
                                                         'narr', 'interp', 'circ'])

    # generate flags for each type of narrative section
    # (an empty-string narrative counts as "not found")
    narr_df['has_narr'] = (narr_df['narr'].notna() & (narr_df['narr'] != '')).astype(int)
    narr_df['has_interp'] = narr_df['interp'].notna().astype(int)
    narr_df['has_circ'] = narr_df['circ'].notna().astype(int)

    # fill blank narrative sections with ''
    # Assign the result back instead of calling fillna(inplace=True) on a
    # column selection: the in-place form acts on a temporary object and does
    # not update narr_df when pandas Copy-on-Write is active.
    section_cols = ['narr', 'interp', 'circ']
    narr_df[section_cols] = narr_df[section_cols].fillna('')

    # remove text after postmortem observations in summary of circumstances
    narr_df['circ'] = narr_df['circ'].apply(lambda x: re.sub(r'postmortem obs.*', '', x))

    # concatenate narrative sections (initial narrative + interpretation + summary of circumstances)
    narr_df['full_narr'] = narr_df['narr'].astype(str) + narr_df['interp'].astype(str) + narr_df['circ'].astype(str)

    # calculate length of narrative
    narr_df['full_narr_len'] = narr_df['full_narr'].apply(len)

    return narr_df