# Annotation sample creation



## Setup

In [None]:
# Standard library and core data-handling packages.
import os, re
import pandas as pd
import numpy as np
import time
import sys
# Show which interpreter the notebook kernel is running.
print("Python executable:", sys.executable)

# NLP packages.
import medspacy, spacy
from medspacy.ner import TargetRule 
from medspacy.visualization import visualize_ent

from matplotlib import pyplot as plt

import psutil 
# Count physical cores only (logical = False); num_cores is later passed to p_map as num_cpus.
num_cores = psutil.cpu_count(logical = False)
print("CPU cores found:", num_cores)
# p_map is used for the parallel sentence segmentation.
from p_tqdm import p_map

# NOTE(review): presumably a project-local module - confirm it is on the import path.
from keywords import find_keywords2

#from clinical_sectionizer import TextSectionizer, Sectionizer

## Import data

### Notes

In [None]:
%%time

# Import the raw notes data, caching it as a feather file so later runs load faster.

cache_file = "data/cache/note-events.feather"
if os.path.exists(cache_file):
    # Takes 5 seconds to load - nice speedup.
    df = pd.read_feather(cache_file)
else:
    # This takes ~24 seconds to load.
    df = pd.read_csv('data-raw/mimic/NOTEEVENTS.csv',
                     usecols = ['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'CATEGORY', 'DESCRIPTION', 'CHARTTIME','CGID', 'TEXT'],
                     low_memory = False)
    # Create the cache directory first; otherwise to_feather() fails on a fresh
    # checkout after the slow CSV load has already been paid for.
    os.makedirs(os.path.dirname(cache_file), exist_ok = True)
    df.to_feather(cache_file)

# Lowercase the column names for easier typing.
df.columns = df.columns.str.lower()

print("Dataframe shape:", df.shape)
print("Dataframe columns:", df.columns)

In [None]:
# Tabulate how many notes fall into each note category.
df['category'].value_counts()

In [None]:
# Drop every note whose category is on the exclusion list.

exclude_categories = ['Echo', 'ECG', 'Case Management ', 'Social Work', 'Pharmacy']
df = df.loc[~df['category'].isin(exclude_categories)]
print(df['category'].value_counts())
# Down to 1.8 MM notes
df.shape

### Patients

In [None]:
# Load the patient table, keeping only the columns needed downstream,
# and lowercase the column names.
pat_df = pd.read_csv(
    'data-raw/mimic/PATIENTS.csv',
    usecols = ['SUBJECT_ID', 'DOB', 'GENDER', 'EXPIRE_FLAG'],
    low_memory = False,
).rename(columns = str.lower)

# Date of birth as plain date objects (time of day dropped).
pat_df = pat_df.assign(birthdate = pd.to_datetime(pat_df['dob']).dt.date)
pat_df.info()

### Admissions

In [None]:
# Load the admissions table, keeping only the columns needed downstream,
# and lowercase the column names.
adm_df = pd.read_csv(
    'data-raw/mimic/ADMISSIONS.csv',
    usecols = ['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME',
               'ADMISSION_TYPE', 'DISCHARGE_LOCATION', 'INSURANCE',
               'MARITAL_STATUS', 'LANGUAGE', 'ETHNICITY', 'DIAGNOSIS'],
    low_memory = False,
).rename(columns = str.lower)

# Admission date as plain date objects (time of day dropped).
adm_df = adm_df.assign(date_of_admission = pd.to_datetime(adm_df['admittime']).dt.date)
adm_df.info()

In [None]:
# Attach admission rows to each patient; the left join keeps every patient row.
adm_df = pat_df.merge(adm_df, how = 'left', on = 'subject_id')

## Clean data

### Calculate age

In [None]:
# Note: https://mimic.mit.edu/docs/iii/tables/patients/#dob
# Patients who are older than 89 years old at any time in the database have had their
# date of birth shifted to obscure their age, so the computed age is capped at 90.
# Age is the whole number of years (rounded) between birth date and admission date.
adm_df['age'] = adm_df.apply(
    lambda row: min(round((row['date_of_admission'] - row['birthdate']).days / 365.25, 0), 90),
    axis = 1,
)
adm_df.describe()
# The raw birth-date columns are no longer needed once age is derived.
adm_df = adm_df.drop(columns = ['dob', 'birthdate'])

### Restrict to patients >= 18

In [None]:
# Attach the notes to the compiled patient/admission data.
# An inner join is used so that admissions without any notes are dropped.
df = adm_df.merge(df, how = 'inner', on = ['subject_id', 'hadm_id'])

# Keep adult patients only (age >= 18).
df = df[df['age'] >= 18]

# Down to 1.25 MM notes.
print(df.shape)

# Confirm that age distribution looks good.
df[['age']].describe()

In [None]:
# Summarize column types and non-null counts.
print(df.info())
# Count missing values per column, esp. in the text field.
df.isna().sum(axis = 0)

### Save preliminary df

In [None]:
# Persist the merged, filtered notes. The existing index is discarded and replaced
# with a fresh default one before writing.
df.reset_index(drop = True).to_feather("data/02-notes-to-sentences-prep.feather")

## Begin NLP

In [None]:
# Build the `nlp` pipeline used by analyze_note() for sentence splitting:
# medspaCy is loaded without rules and with only its tokenizer enabled, then the
# PySBD sentence splitter is added as the first pipeline component.
# The commented-out lines below are earlier configurations that were tried.

#from medspacy.sentence_splitting import PySBDSenteceSplitter

#sentencizer = PySBDSenteceSplitter()

#import spacy
#from pysbd.utils import PySBDFactory

#nlp = medspacy.load(enable=["sectionizer"])
#print(nlp.pipe_names)

# Default nlp:
#nlp = medspacy.load()

# Upgraded:
#nlp = medspacy.load("en_core_sci_scibert",
#nlp = medspacy.load("en_core_sci_lg",
nlp = medspacy.load(load_rules = False,
                    enable = ["medspacy_tokenizer"])
#                    enable = ["tokenizer"],
#                    disable=["target_matcher", 'medspacy_pyrush', 'medspacy_context'])

#tokenizer = medspacy.create_medspacy_tokenizer(nlp)
#nlp.tokenizer = tokenizer

# Explicitly add the sentence splitter to the pipeline
# (recommended - makes it more readable to tell what's going on).
nlp.add_pipe("medspacy_pysbd", first = True)

# Show the resulting tokenizer and pipeline components as a sanity check.
print(nlp.tokenizer)
print(nlp.pipe_names)

## Extract sentences

Create a new dataframe with one row for each sentence in a note. Each sentence should also have its count of words and characters. Don't calculate sentiment yet.

In [None]:
# Optional stanza backend for sentence splitting.
import stanza
print("Stanza version:", stanza.__version__)
import re

# Run this the first time:
# stanza.download('en')

# Make sure to use version 1.3.0+ of stanza, otherwise you will get an md5 error here.
# stanza.download('en', package='mimic')

# NOTE: analyze_note(package = "stanza") reads the global `nlp_stanza`. While the
# lines below stay commented out that name is not defined in the cells shown here,
# so the stanza backend cannot be used until they are re-enabled.
#nlp_stanza = stanza.Pipeline(lang = 'en',
#                             package = 'mimic',
#                             processors = 'tokenize')

# Split one note into sentences and compute per-sentence statistics.
# There are a ton of notes so this function should ideally be run in parallel.
# `note` only needs `text`, `row_id` and `cgid` attributes (e.g. a row from itertuples()).
def analyze_note(note,
                 newlines_to_spaces = True,
                 remove_quotes = True,
                 extra_clean = True,
                 # Other options: "stanza"
                 package = "spacy"):
    """Segment a single note into sentences and return one row per sentence.

    The note text is lightly normalized first (see the flags below) to improve
    sentence segmentation, then split with the global `nlp` pipeline
    (package = "spacy") or the global `nlp_stanza` pipeline (package = "stanza").

    Args:
        note: Object exposing `text`, `row_id` and `cgid` attributes.
        newlines_to_spaces: Replace "\\n" and "\\r" with spaces before segmentation.
        remove_quotes: Strip double-quote characters from the text.
        extra_clean: Rewrite dosing abbreviations ("q.d.", "p.o.", "p.r.n.",
            "q.6h.") without periods and shorten runs of five or more underscores.
        package: Sentence-splitting backend, "spacy" or "stanza".

    Returns:
        A pandas DataFrame with one row per sentence and the columns `row_id`,
        `cgid`, `sent_num`, `text`, `chars` and `words`. If the backend yields
        no sentences the frame is empty and has no columns.

    Raises:
        ValueError: If `package` is not a supported backend.
        AttributeError: If `note.text` is not a string (e.g. a float).
    """
    # Fail fast on an unsupported backend. Previously the sentence iterator was
    # simply never assigned, which surfaced as an obscure UnboundLocalError.
    if package not in ("spacy", "stanza"):
        raise ValueError(f"Unsupported package: {package!r} (expected 'spacy' or 'stanza')")
    use_spacy = package == "spacy"

    # This can raise an AttributeError() when note.text is for some reason a float.
    note_text = note.text.strip()

    # Custom processing to improve sentence segmentation.
    # Do this before we replace newlines to avoid clobbering a lot of text.
    if extra_clean:
        # " q.d.", " q. d", " q.day." and similar -> " qd"
        note_text = re.sub(r' (q\. ?d(ay)?\.?)', r" qd", note_text)

        note_text = note_text.replace("p.o.", "po")
        note_text = note_text.replace("p.r.n.", "prn")

    # This is important, otherwise a newline always seem to lead to a sentence break.
    if newlines_to_spaces:
        note_text = note_text.replace("\n", " ")
        note_text = note_text.replace("\r", " ")

    if extra_clean:
        # "q.6h." / "q. 4-6h." -> "q6h" / "q 4-6h" (drops both periods).
        note_text = re.sub(r'q\.( *\d+(\-\d+)?h?)\.', r'q\1', note_text)
        # Long series of _____ converted to shorter word then newline.
        note_text = re.sub('_{5,}', '\n_LINE_\n', note_text)

    # Only double-quotes for now, since single quotes can be used in contractions.
    if remove_quotes:
        note_text = note_text.replace('"', '')

    # Extract all sentences from this note.
    if use_spacy:
        sentence_iterator = nlp(note_text).sents
    else:
        sentence_iterator = nlp_stanza(note_text).sentences

    doc_sents = []
    # Loop over each sentence in the note based on the sentencizer.
    for i, sent in enumerate(sentence_iterator):
        # Remove leading and trailing whitespace - especially newlines but also spaces.
        sent_clean = sent.text.strip()
        # Replace multiple spaces with a single space.
        sent_clean = re.sub(r' +', ' ', sent_clean)

        # Token count as reported by the backend's sentence object.
        word_count = len(sent) if use_spacy else len(sent.tokens)

        doc_sents.append({
            # Unique id for this note.
            "row_id": note.row_id,
            "cgid": note.cgid,
            "sent_num": i,
            # We need to extract the raw text from the sentence object,
            # otherwise we'll run into a "pickling a span is not supported" spacy error.
            "text": sent_clean,
            "chars": len(sent_clean),
            "words": word_count,
        })
        # Don't calculate sentiment score in this loop - do it after all sentences have been extracted.

    return pd.DataFrame(doc_sents)

# Run analyze_note on each row of a df. Used when dividing a dataframe into smaller
# partitions for parallel analysis.
def analyze_note_df(df,
                    min_chars = None, # 22
                    min_words = None, # 5
                    max_words = None, # 100
                    **kwargs):
    """Segment every note in `df` and return all sentences as one dataframe.

    Args:
        df: Dataframe of notes; each row is passed to analyze_note() as an
            itertuples() row.
        min_chars: If given, keep only sentences with at least this many characters.
        min_words: If given, keep only sentences with at least this many words.
        max_words: If given, keep only sentences with at most this many words.
        **kwargs: Forwarded unchanged to analyze_note().

    Returns:
        A pandas DataFrame with one row per retained sentence and the columns
        `row_id`, `cgid`, `sent_num`, `text`, `chars` and `words`. When `df` is
        empty, or no note produces a sentence, an empty frame with those
        columns is returned instead of raising.
    """
    results = []
    for row in df.itertuples():
        try:
            result = analyze_note(row, **kwargs)
        except AttributeError as e:
            # This exception will only happen if there is no note for this admission.
            print("Note.text type:", type(row.text))
            print("Note.text value:", row.text)
            print(e)
        else:
            # Notes without sentences come back as column-less empty frames;
            # they add nothing to the concatenation, so leave them out.
            if not result.empty:
                results.append(result)

    # pd.concat() raises ValueError on an empty list, which used to abort the
    # whole run for an empty partition or one where every note failed.
    if not results:
        return pd.DataFrame(columns = ["row_id", "cgid", "sent_num", "text", "chars", "words"])

    combined_df = pd.concat(results)
    # Apply whichever length filters were requested.
    if min_words is not None:
        combined_df = combined_df.loc[combined_df.words >= min_words]
    if min_chars is not None:
        combined_df = combined_df.loc[combined_df.chars >= min_chars]
    if max_words is not None:
        combined_df = combined_df.loc[combined_df.words <= max_words]

    return combined_df

## Multi-note example

In [None]:
# Draw a small random sample of notes; the fixed seed keeps it reproducible.
num_samples = 20
df_samp = df.sample(n = num_samples, random_state = 1)
df_samp.shape

In [None]:
# List the columns available on the sampled notes.
df_samp.columns

In [None]:
# Segment the sampled notes and export the sentences to Excel for manual review.
results = {}
# All sentences, without any length filtering.
results['medspacy'] = analyze_note_df(df_samp)
# Sentences restricted by character and word counts.
results['medspacy_label'] = analyze_note_df(df_samp,
                                            min_chars = 22, min_words = 5, max_words = 100)
# Stanza comparison (disabled):
#results['stanza'] = analyze_note_df(df_samp, package = "stanza")
#results['stanza_label'] = analyze_note_df(df_samp, package = "stanza",
#                                          min_chars = 22, min_words = 5, max_words = 100)

# Use the writer as a context manager so the workbook is saved and closed exactly
# once, even if writing a sheet fails. This replaces the explicit save()/close()
# pair; ExcelWriter.save() is not available in recent pandas releases.
# NOTE(review): writing the legacy .xls format needs an engine that newer pandas
# versions may no longer support - confirm, and consider switching to .xlsx.
with pd.ExcelWriter('data/sentences.xls') as writer:
    results['medspacy'].to_excel(writer, sheet_name = 'medspacy-pysbd', index = False)
    results['medspacy_label'].to_excel(writer, sheet_name = 'medspacy-pysbd-label', index = False)
    #results['stanza'].to_excel(writer, sheet_name = 'stanza', index = False)
    #results['stanza_label'].to_excel(writer, sheet_name = 'stanza-label', index = False)

    # Include the raw notes so sentences can be checked against their source text.
    df_samp[['row_id', 'category', 'text']].to_excel(writer, sheet_name = 'notes', index = False)

## Sentence segmentation

Takes 23 minutes with 24 cores.

In [None]:
%%time

# Split the notes into many small partitions and segment them in parallel.
num_partitions = 10000
# This will be a list of dfs.
df_split = np.array_split(df,
                          #df.sample(50000),
                          num_partitions)

# Apply our function to each dataframe chunk in the list.
# analyze_note_df is called with its defaults here, so no length filters are applied.
result = p_map(analyze_note_df, df_split, num_cpus = num_cores)
# Stitch the per-partition sentence dataframes back together.
sent_df = pd.concat(result)

sent_df.info()
# Discard the concatenated index and write with a fresh default one.
sent_df.reset_index(drop = True).to_feather("data/mimic-sentences-pysbd.feather")

In [None]:
# Load the feather file to confirm that it was saved correctly.
sent_df = pd.read_feather("data/mimic-sentences-pysbd.feather")
# 41 MM sentences with no note category exclusions.
# 38 MM sentences with the note category exclusions.
# 29 MM sentences when also excluding age < 18.
print(sent_df.info())

## Old cells

### Sequential version

This will take around 9 hours.

In [None]:
%%time

from tqdm.auto import tqdm

sents = []

# Segment every note one at a time, with a progress bar.
# To analyze a random sample of notes instead:
#sample_size = 1000
#for i, row in tqdm(df.sample(sample_size).iterrows(),
#                   total = sample_size):
for i, row in tqdm(df.iterrows(),
                   total = df.shape[0]):
    try:
        sents.append(analyze_note(row))
    except AttributeError as e:
        # Same handling as analyze_note_df: a note whose text is not a string is
        # reported and skipped instead of aborting the multi-hour loop.
        print("Note.text type:", type(row.text))
        print("Note.text value:", row.text)
        print(e)

sent_df = pd.concat(sents)
sent_df.info()
sent_df.head()
# to_feather() does not take an `index` argument (passing one raised a TypeError
# after the whole run); reset_index(drop = True) already discards the old index.
sent_df.reset_index(drop = True).to_feather("data/mimic-sentences-pysbd.feather")
# Export a random 2,000-sentence sample for manual review.
sent_df.reset_index(drop = True).sample(2000).to_excel("data/mimic-sentences-pysbd-2k.xlsx", index = False)

### Parallel options that did not work

In [None]:
# Disabled multiprocessing.Pool attempt, kept for reference only: the import is
# commented out and the code sits inside a string literal, so nothing here runs.
# NOTE(review): `num_processes` is not defined in the cells shown here.
#import multiprocessing as mp

"""
with mp.Pool(processes = num_processes) as p:
    # apply our function to each chunk in the list
    result = p.imap(analyze_note_df, df_split)
    sent_df = pd.concat(result)
"""

In [None]:
%%time

from dask import delayed, compute
import dask.dataframe as dd
# Convert the pandas dataframe to a dask dataframe, to enable parallelization.
# ddf = dd.from_pandas(df, npartitions = 10000)
# Only analyze a random 1,000 rows (in 100 partitions), for testing purposes.
ddf = dd.from_pandas(df.sample(1000), npartitions = 100)

# Apply analyze_note to every row of each partition. The trailing .compute() means
# the work is executed in this cell rather than deferred to a later one.
# NOTE(review): `result` is assigned in the "Sentence segmentation" cell from p_map();
# if that is a list it has no .head(), so `meta = result.head(0)` would fail - confirm.
sents2 = ddf.map_partitions(lambda part: part.apply(analyze_note,
                  axis = 1), meta = result.head(0)).clear_divisions().compute() #, meta = ddf)

In [None]:
# Combine the per-note results from the dask run into a single dataframe.
sent_df2 = pd.concat(sents2)

In [None]:
#%%time
#import swifter

#sents = df.swifter.apply(lambda row: analyze_note(row), axis = 1) #, meta = ddf)
#sents = df.apply(lambda row: analyze_note(row), axis = 1) #, meta = ddf)

In [None]:
%%time

from tqdm.dask import TqdmCallback
# Takes 23 minutes with 24 cores and 10k partitions.
# Takes 44 minutes with 24 cores and 1k partitions.
# Run the dask computation for `sents` on the process-based scheduler, with a progress bar.
# NOTE: the return value of compute() is not assigned, so the computed result is discarded.
with TqdmCallback(desc = "compute"):
    compute(sents, scheduler='processes')

In [None]:
# Number of results held in `sents`.
# This tends to be very slow for some reason.
len(sents)

In [None]:
# Inspect the container type of `sents` and its first element.
print(type(sents))
sents[0]

In [None]:
# NOTE(review): presumably a check that the kernel is still responsive - confirm or remove.
print("test")

In [None]:
# Put a random sample of 10,000 notes into a dask dataframe with 10,000 partitions.
# (Fixed: the DataFrame method is `sample`; `samples` raised an AttributeError.)
ddf = dd.from_pandas(df.sample(10000), npartitions = 10000)

# Setup the computation that will be needed - the results are computed separately,
# by calling compute() on `sents`.
sents = ddf.apply(analyze_note, axis = 1) #, meta = ddf)

In [None]:
%%time

import mapply

# Initialize mapply before use.
# NOTE(review): n_workers = -1 presumably means "use all available cores" - confirm.
mapply.init(n_workers = -1)

# Apply analyze_note row-wise to a random sample of 1,000 notes.
# Uses tqdm by default.
sents = df.sample(1000).mapply(analyze_note, axis = 1, result_type='expand')

In [None]:
# Combine the per-note results in `sents` into a single dataframe.
sent_df = pd.concat(sents)

In [None]:
# Check what kind of object the parallel apply returned.
type(sents)

In [None]:
# Summary statistics for the numeric columns of the sentence dataframe.
sent_df.describe()