## Autopsy Reports to Text

#### Goal:
To extract text from autopsy PDFs

#### Input(s):

Autopsy PDFs (set path variable specified below)
#### Output(s):

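Feather file with original autopsy text ({my_directory}/new_autopsies_{batch_date}.feather)
Feather file with extracted narrative sections ({my_directory}/new_narratives_{batch_date}.feather)
CSV file with preprocessed narrative text ({my_directory}/processed_narr_{batch_date}.csv)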
#### To run, set 3 variables:

my_directory = where you want outputs to save (e.g., 'C:/Users/dc20b49/Documents/TDH_DS_Demo/')
path = folder with autopsies (e.g., 'A:/APHA Data Science Grant/2021')
batch_date = date you want to appear in output file names (e.g., '8-4-22')
## Setup

In [10]:
%run autopsy_to_text.ipynb

my_directory = 'C:/Users/DC20B46/Desktop/tndh_ds_demo/'
path = 'Z:/APHA Data Science Grant/2021'
batch_date = '8-10-22'

print(f'''my_directory = {my_directory},
path = {path},
batch_date = {batch_date}''')

my_directory = C:/Users/DC20B46/Desktop/tndh_ds_demo/,
path = Z:/APHA Data Science Grant/2021,
batch_date = 8-10-22


In [None]:
%%time

# extract text from each autopsy in directory (this may take hours)...
file_paths = list_files(path)
print(f'Processing {len(file_paths)} autopsies (this may take hours)...')

autopsies = load_autopsies(file_paths)

print('Extracting DID, year, first and last names from file names...')

autopsies_df = parse_filepath(autopsies)

# save original autopsy text to a file
autopsies_df.to_feather(f'{my_directory}/new_autopsies_{batch_date}.feather')

print(f'Original autopsy text saved to new_autopsies_{batch_date}.feather')

## Extract Narrative Sections from Autopsy Text

In [None]:
### Optional: uncomment and run this cell only if your autopsies were previously saved in separate feather files; otherwise, skip it
# autopsies_sudors = pd.read_feather(f'{my_directory}/autopsies_original_sudors.feather')
# autopsies_nonsudors1 = pd.read_feather(f'{my_directory}/autopsies_original_nonOD.feather')
# autopsies_nonsudors2 = pd.read_feather(f'{my_directory}/autopsies_original_nonOD_2021.feather')
# autopsies_nonsudors = pd.concat([autopsies_nonsudors1, autopsies_nonsudors2], ignore_index=True)
# autopsies_nonsudors = autopsies_nonsudors.loc[~autopsies_nonsudors.DID.isin(autopsies_sudors.DID)]
# autopsies_df=pd.concat([autopsies_sudors, autopsies_nonsudors], ignore_index=True).reset_index(drop=True)

# autopsies_df['DID'] = autopsies_df['File_Path'].str.extract('([0-9]+)_')
# autopsies_df['full_name'] = autopsies_df['File_Path'].str.lower().str.extract('([a-z\-\s\']+(?:\.)[a-z\-\s\']+)')
# autopsies_df['first_name'] = autopsies_df.apply(lambda x: re.sub('\..*$', '', x['full_name']) if pd.notna(x['DID'])
#                           else re.sub('^.*\.', '', x['full_name']), axis = 1)
# autopsies_df['last_name'] = autopsies_df.apply(lambda x: re.sub('\..*$', '', x['full_name']) if pd.isna(x['DID'])
#                           else re.sub('^.*\.', '', x['full_name']), axis = 1)
# autopsies_df['year'] = autopsies_df.DID.str.extract(r'^([0-9]{4})')
# autopsies_df.drop(columns='full_name', inplace = True)
# autopsies_df.head()

In [None]:
%%time

%run extract_narratives.ipynb

# remove autopsies from 2010 or with NA years
if autopsies_df.loc[(autopsies_df.year == '2010') | (autopsies_df.year.isna())].shape[0] > 0:
    print(f'''Autopsies removed because could not extract year from file path, or year appears off:
    {autopsies_df.loc[(autopsies_df.year == '2010') | (autopsies_df.year.isna())].File_Path.unique()}
    ''')
    
autopsies_df = autopsies_df.loc[(autopsies_df.year != '2010') & (autopsies_df.year.notna())]

# remove whitespaces and symbols
print('Cleaning autopsy text...')

autopsies_df['doc_clean'] = autopsies_df['doc'].apply(rm_whitespace_sym)

# assign forensic centers using regex
print('Assigning forensic centers to autopsies...')

autopsies_df['forensic_center'] = autopsies_df['doc_clean'].apply(get_forensic_center)
autopsies_df['forensic_center'].fillna('', inplace = True)

# remove blank autopsies
autopsies_df = autopsies_df.loc[autopsies_df.doc_clean != '']

# extract narrative sections (initial narrative, interpetation/summary, summary of circumstances)
print('Extracting narrative sections from text...')
narr_df = get_narrative(autopsies_df)

# if more than one narrative for a specific DID, keep the longer one
print('Removing duplicate DIDs...')
narr_df = narr_df.sort_values(by=['DID', 'full_narr_len'], ascending = False).drop_duplicates(subset='DID').reset_index()

# save output
narr_df.to_feather(f'{my_directory}/new_narratives_{batch_date}.feather')

print(f'Narrative text saved to new_narratives_{batch_date}.feather')

# print summary by forensic center
narr_df.groupby('forensic_center').agg({'DID': 'nunique',
                                        'has_narr': 'sum',
                                        'has_interp': 'sum',
                                        'has_circ': 'sum',
                                        'full_narr_len': ['min', 'max', 'mean']})

## Preprocess Narrative Text

In [None]:
%%time

%run preprocess_narratives.ipynb

narr_df = narr_df[['DID', 'forensic_center', 'year',
                   'full_narr', 'has_narr', 'has_interp', 'has_circ']]

narr_df['year'] = narr_df['year'].astype(int)
narr_df['full_narr_len'] = narr_df['full_narr'].map(len)

# preprocess text
print('Parsing narrative text (this may take minutes)...')
narr_df['full_narr_nlp'] = narr_df['full_narr'].map(nlp)

# remove numbers, punctuation, stopwords, and lemmatize
print('Removing numbers, punctuation, and stopwords and lemmatizing text...')
narr_df['full_narr_lemma'] = (narr_df['full_narr_nlp']
                              .map(rm_numbers)
                              .map(rm_punct)
                              .map(rm_stopwords)
                              .map(lemmatize)
                             )
narr_df['full_narr_lemma_text'] = narr_df['full_narr_lemma'].apply(lambda x: ' '.join(t for t in x))

# drop autopsies with no words
narr_df = narr_df.loc[narr_df.full_narr_lemma_text.notna()]
narr_df['full_narr_lemma_text_len'] = narr_df['full_narr_lemma_text'].map(len)

# save output
narr_df[['DID', 'forensic_center', 'year', 
          'has_narr', 'has_interp', 'has_circ',
         'full_narr', 'full_narr_lemma',
         'full_narr_lemma_text', 'full_narr_lemma_text_len']].to_csv(f'{my_directory}/processed_narr_{batch_date}.csv',
                                                                     index = False)

print(f'Preprocessed narrative text saved to processed_narr_{batch_date}.csv')