In [None]:
import os, re
import pandas as pd
import numpy as np


# Plotting
from matplotlib import pyplot as plt
import seaborn as sns

# Silence FutureWarning noise for the rest of the session.
import warnings
warnings.filterwarnings(action="ignore", category=FutureWarning)

# Report the environment this notebook is running in.
print(f"Pandas version: {pd.__version__}")
print(f"CPU threads detected: {os.cpu_count()}")

In [None]:
%%time

# Import the raw notes data.
# NOTE: read_csv is called without `nrows`, so the entire file is read.
# Pass nrows=500_000 to restrict to the first 500k rows and speed this part up.
df = pd.read_csv('NOTEEVENTS.csv', low_memory = False)


In [None]:
# Keyword lists: the first column of each CSV file.
neg_keywords = pd.read_csv('negative_keywords.csv').iloc[:, 0]
pos_keywords = pd.read_csv('positive_keywords.csv').iloc[:, 0]

# Whitespace-stripped keywords in their original case: [negative, positive].
all_keywords = [
    [word.strip() for word in neg_keywords],
    [word.strip() for word in pos_keywords],
]

# Flat, lowercased list of every keyword (negative ones first, then positive).
keywords = [word.lower() for group in all_keywords for word in group]

# Stripped + lowercased keywords per polarity, used for scoring.
neg = [word.strip().lower() for word in neg_keywords]
pos = [word.strip().lower() for word in pos_keywords]

In [None]:
# (Disabled) alternative that would work on a 100-row random sample of a frame named df0:
#df=df0.sample(n=100, random_state=0).reset_index(drop=True)

# Lowercase the column names for easier typing.
# This renames the columns of `df` in place; later cells access them by the
# lowercase names (e.g. df.category and .text).
df.columns = df.columns.str.lower()

# Quick look at the size and column names of the loaded frame.
print("Dataframe shape:", df.shape)
print("Dataframe columns:", df.columns)

In [None]:
# Preview the first rows of the notes frame.
df.head()

In [None]:
# Review the distribution of the notes category (number of notes per category).
# Recorded observation: a large percentage are ECG, but nursing is 2nd most frequent.
# NOTE(review): re-check this observation against the current output whenever the
# amount of data loaded changes.
df.category.value_counts()

In [None]:
# Pull the note text for each category of interest.
# Category labels are compared exactly as written here, including the
# trailing space in 'Physician ' and 'Respiratory '.
discharge_summaries = df.loc[df['category'] == 'Discharge summary', 'text']
physician = df.loc[df['category'] == 'Physician ', 'text']
general = df.loc[df['category'] == 'General', 'text']
consult = df.loc[df['category'] == 'Consult', 'text']
nursing = df.loc[df['category'] == 'Nursing', 'text']
respiratory = df.loc[df['category'] == 'Respiratory ', 'text']
rehab = df.loc[df['category'] == 'Rehab Services', 'text']
nutrition = df.loc[df['category'] == 'Nutrition', 'text']

In [None]:
#code from https://stackoverflow.com/questions/4576077/how-can-i-split-a-text-into-sentences

# Regex building blocks for split_into_sentences. They are raw strings so that
# sequences such as \s reach the regex engine untouched instead of relying on
# Python passing through an unrecognised string escape (which emits a warning).
alphabets = r"([A-Za-z])"
digits = r"([0-9])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov)"

def split_into_sentences(text, min_len=5, max_len=500):
    """Split ``text`` into a list of sentences using punctuation heuristics.

    Periods that do not end a sentence (titles, single-letter initials,
    acronyms, company suffixes, website endings, decimal numbers) are
    temporarily rewritten as ``<prd>``; every remaining '.', '?' and '!'
    is then treated as a sentence boundary.

    Parameters
    ----------
    text : str
        Text to split. Newlines are treated as spaces.
    min_len, max_len : int, optional
        A piece is kept only if ``min_len < len(piece) < max_len``, where the
        length is measured before surrounding whitespace is stripped.
        Defaults (5 and 500) match the previously hard-coded bounds.

    Returns
    -------
    list of str
        The sentences, stripped of surrounding whitespace. Text after the
        last terminator is kept as a final sentence (subject to the length
        bounds) rather than being discarded.
    """
    text = " " + text + "  "
    text = text.replace("\n", " ")

    # Protect periods that are part of a token rather than the end of a sentence.
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites, r"<prd>\1", text)
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)
    text = re.sub(digits + "[.]" + digits, r"\1<prd>\2", text)

    # Move a terminator that sits just inside a closing quote to just outside it,
    # so the quote stays attached to its sentence.
    text = text.replace(".”", "”.")
    text = text.replace(".\"", "\".")
    text = text.replace("!\"", "\"!")
    text = text.replace("?\"", "\"?")

    # Mark the real sentence boundaries, then restore the protected periods.
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")

    pieces = text.split("<stop>")
    # The final piece is no longer dropped unconditionally: when the text does
    # not end with a terminator it holds a real sentence. When the text does end
    # with one, the final piece is only padding and the length filter removes it.
    return [piece.strip() for piece in pieces if min_len < len(piece) < max_len]

In [None]:
def find_keywords(text, keywords):
    """Return the keywords that occur in ``text`` as whole words or phrases.

    Parameters
    ----------
    text : object
        Value to search; it is converted with ``str()`` and lowercased.
    keywords : iterable of str
        Candidate keywords, checked in order. They are matched as given
        (not lowercased here), so callers should pass lowercase keywords.

    Returns
    -------
    list of str
        Matching keywords in the order they were checked. A keyword is
        skipped when it already appears as a substring of the
        space-joined keywords found so far (e.g. 'compliant' after
        'non-compliant').
    """
    found_keywords = []
    text = str(text).lower()
    for keyword in keywords:
        # re.escape makes the keyword match literally: without it, characters
        # such as '.', '+' or '(' in a keyword act as regex syntax and either
        # match unrelated text or raise re.error.
        if re.search(r'\b{}\b'.format(re.escape(keyword)), text):
            if keyword not in ' '.join(found_keywords):
                found_keywords.append(keyword)
    return found_keywords

In [None]:
def keyword_score(keyword_list, neg_keywords, pos_keywords):
    """Share of the matched keywords that are negative.

    Counts how many entries of ``keyword_list`` are in ``pos_keywords`` and
    how many are in ``neg_keywords``, then returns
    ``negative / (negative + positive)``.

    Returns ``None`` when no entry belongs to either collection.
    """
    pos_score = sum(1 for word in keyword_list if word in pos_keywords)
    neg_score = sum(1 for word in keyword_list if word in neg_keywords)
    total = neg_score + pos_score
    if total <= 0:
        return None
    return neg_score / total

In [None]:
def _sentence_frame(notes):
    """Join all notes with single spaces, split the result into sentences,
    and return them as a one-column ('text') DataFrame."""
    return pd.DataFrame(split_into_sentences(' '.join(notes)), columns=['text'])

# One sentence-level frame per note category.
dc_sums = _sentence_frame(discharge_summaries)
physician_notes = _sentence_frame(physician)
general_notes = _sentence_frame(general)
consult_notes = _sentence_frame(consult)
nursing_notes = _sentence_frame(nursing)
resp_notes = _sentence_frame(respiratory)
rehab_notes = _sentence_frame(rehab)
nutrition_notes = _sentence_frame(nutrition)

In [None]:
#First 100 sample (to be consistent for medSpacy/Stanza comparison)
# The leading 100 sentences of each note type, taken in their existing order.
dc_samp = pd.Series(dc_sums['text'].head(100))
physician_samp = pd.Series(physician_notes['text'].head(100))
general_samp = pd.Series(general_notes['text'].head(100))
consult_samp = pd.Series(consult_notes['text'].head(100))
nursing_samp = pd.Series(nursing_notes['text'].head(100))
resp_samp = pd.Series(resp_notes['text'].head(100))
rehab_samp = pd.Series(rehab_notes['text'].head(100))
nutrition_samp = pd.Series(nutrition_notes['text'].head(100))

In [None]:
#Sample of 100 sentences per note type (for smaller saved file)
# NOTE(review): the later "All sentences for labelling" cell writes to these
# same file names, so whichever of the two cells runs last determines the
# contents of the files.
for _sample, _path in [
    (dc_samp, 'MIMIC_sentence_discharge_summaries.csv'),
    (physician_samp, 'MIMIC_sentence_physician_notes.csv'),
    (general_samp, 'MIMIC_sentence_general_notes.csv'),
    (consult_samp, 'MIMIC_sentence_consult_notes.csv'),
    (nursing_samp, 'MIMIC_sentence_nursing_notes.csv'),
    (resp_samp, 'MIMIC_sentence_resp_notes.csv'),
    (rehab_samp, 'MIMIC_sentence_rehab_notes.csv'),
    (nutrition_samp, 'MIMIC_sentence_nutrition_notes.csv'),
]:
    _sample.to_csv(_path, index=False)

In [None]:
#Continue here for isolation of keywords/keyword score 

In [None]:
#Random sampling
def _sample_sentences(frame, n=1000, seed=0):
    """Return a reproducible random sample of rows from ``frame``.

    At most ``n`` rows are drawn (fewer when the frame is smaller, instead of
    raising), using a fixed ``seed`` so that re-running the notebook yields
    the same sample. The index of the result is reset to 0..k-1.
    """
    return frame.sample(n=min(n, len(frame)), random_state=seed).reset_index(drop=True)

dc_sum_sample = _sample_sentences(dc_sums)
physician_sample = _sample_sentences(physician_notes)
general_sample = _sample_sentences(general_notes)
consult_sample = _sample_sentences(consult_notes)
nursing_sample = _sample_sentences(nursing_notes)
resp_sample = _sample_sentences(resp_notes)
rehab_sample = _sample_sentences(rehab_notes)
nutrition_sample = _sample_sentences(nutrition_notes)

In [None]:
# Add a 'keywords' column to every sampled frame: the list of keywords
# found in each sentence.
for _frame in (dc_sum_sample, physician_sample, general_sample, consult_sample,
               nursing_sample, resp_sample, rehab_sample, nutrition_sample):
    _frame['keywords'] = _frame['text'].map(lambda sentence: find_keywords(sentence, keywords))



In [None]:
# Add a 'keyword_score' column to every sampled frame, computed from the
# keywords found in each sentence and the negative/positive keyword lists.
for _frame in (dc_sum_sample, physician_sample, general_sample, consult_sample,
               nursing_sample, resp_sample, rehab_sample, nutrition_sample):
    _frame['keyword_score'] = _frame['keywords'].map(lambda found: keyword_score(found, neg, pos))



In [None]:
# Add a boolean 'keyword_presence' column to every sampled frame:
# True when at least one keyword was found in the sentence.
for _frame in (dc_sum_sample, physician_sample, general_sample, consult_sample,
               nursing_sample, resp_sample, rehab_sample, nutrition_sample):
    _frame['keyword_presence'] = _frame['keywords'].map(lambda found: len(found) > 0)



In [None]:
#All sentences for labelling
# NOTE(review): these file names are the same ones the earlier
# "Sample of 100 sentences per note type" cell writes to, so this cell
# replaces those files when it runs afterwards.
for _frame, _path in [
    (dc_sums, 'MIMIC_sentence_discharge_summaries.csv'),
    (physician_notes, 'MIMIC_sentence_physician_notes.csv'),
    (general_notes, 'MIMIC_sentence_general_notes.csv'),
    (consult_notes, 'MIMIC_sentence_consult_notes.csv'),
    (nursing_notes, 'MIMIC_sentence_nursing_notes.csv'),
    (resp_notes, 'MIMIC_sentence_resp_notes.csv'),
    (rehab_notes, 'MIMIC_sentence_rehab_notes.csv'),
    (nutrition_notes, 'MIMIC_sentence_nutrition_notes.csv'),
]:
    _frame.to_csv(_path, index=False)

In [None]:
#Previous code - do not use

In [None]:
# Legacy: collect (note index, fragment) rows for the consult notes.
# NOTE(review): split_sentence is not defined anywhere in the visible notebook;
# unless it is defined elsewhere this cell fails with a NameError - confirm.
# temp_arr starts as a placeholder row of zeros so there is something to stack onto.
temp_arr=np.zeros(2)
for note in range(len(consult)):
    fragments=split_sentence(consult.iloc[note])

    for frag in range(len(fragments)): 
        #row=np.array()
        # Pair the positional index of the note with one of its fragments.
        row=np.hstack((note, fragments[frag]))
        #print(row)
        # Append the row; np.vstack returns a new array on every iteration.
        temp_arr=np.vstack((temp_arr, row))

In [None]:
#Three sentence split

In [None]:
def three_split_sentence(text):
    """Group the sentences of ``text`` into consecutive blocks of three.

    Newlines are replaced by spaces and the text is split on '. '. The
    resulting pieces are re-joined with '. ' in non-overlapping groups of
    three; the last group holds the one or two leftover pieces when the
    number of pieces is not a multiple of three.

    Parameters
    ----------
    text : str
        Text to split.

    Returns
    -------
    list of str
        One string per group, in document order. Every piece of the text
        appears in exactly one group.
    """
    split_text = re.split(r'\. ', text.replace('\n', ' '))
    # Step through the pieces three at a time. The previous version advanced
    # by one and stopped after len // 3 windows, which produced overlapping
    # groups and left most of the text uncovered.
    return ['. '.join(split_text[start:start + 3])
            for start in range(0, len(split_text), 3)]

In [None]:
# Collect (note index, three-sentence block) rows for every note in test_dc.
# NOTE(review): test_dc is not defined anywhere in the visible notebook - confirm
# where it comes from before running this cell.
# temp_arr starts with a placeholder row [0, 0]; the following cell removes it
# again with np.delete(temp_arr,0,0).
temp_arr=[0,0]
for note in range(len(test_dc)):
    fragments=three_split_sentence(test_dc.iloc[note])

    for frag in range(len(fragments)): 
        #row=np.array()
        # Pair the positional index of the note with one of its blocks.
        row=np.hstack((note, fragments[frag]))
        #print(row)
        # Append the row; np.vstack returns a new array on every iteration.
        temp_arr=np.vstack((temp_arr, row))

In [None]:
# Drop the first row of temp_arr (the placeholder it was initialised with) and
# label the two columns.
# NOTE(review): if temp_arr was built by the preceding cell, 'patient_id' holds
# the positional index of the note within test_dc, not an identifier read from
# the data - confirm the column name is what is wanted.
three_sentence_sample=pd.DataFrame(np.delete(temp_arr,0,0), columns=['patient_id', 'three_sentence'])
print(three_sentence_sample.shape)
# This preview is not the last expression of the cell, so the notebook does not display it.
three_sentence_sample.head(5)
# Save the frame as CSV (the path starts with '~', presumably the user's home directory).
three_sentence_sample.to_csv('~/three_sentence_sample.csv')

In [None]:
#Sectional headers

In [None]:
def combine_headers_and_text(text):
    """Pair every header in ``text`` with the text that follows it.

    The text is split with the capturing pattern '(.*:)', so a header is
    everything on a line up to and including its last colon. Each piece
    that ends in ':' is joined to the piece right after it with ' // '.

    Returns a list of 'header // following text' strings, in document order.
    """
    pieces = re.split(r'(.*:)', text)
    return [
        header + ' // ' + following
        for header, following in zip(pieces, pieces[1:])
        if header.endswith(':')
    ]

In [None]:
# Collect (note index, "header // text" block) rows for every note in test_dc.
# NOTE(review): test_dc is not defined anywhere in the visible notebook - confirm
# where it comes from before running this cell.
# temp_arr starts with a placeholder row [0, 0]; the following cell removes it
# again with np.delete(temp_arr,0,0).
temp_arr=[0,0]
for note in range(len(test_dc)):
    fragments=combine_headers_and_text(test_dc.iloc[note])
    #print(len(fragments))
    #print('')
    for frag in range(len(fragments)): 
        #row=np.array()
        # Pair the positional index of the note with one of its blocks.
        row=np.hstack((note, fragments[frag]))
        #print(row)
        # Append the row; np.vstack returns a new array on every iteration.
        temp_arr=np.vstack((temp_arr, row))

In [None]:
# Drop the first row of temp_arr (the placeholder it was initialised with) and
# label the two columns.
# NOTE(review): if temp_arr was built by the preceding cell, 'patient_id' holds
# the positional index of the note within test_dc, not an identifier read from
# the data - confirm the column name is what is wanted.
blocked_text_sample=pd.DataFrame(np.delete(temp_arr,0,0), columns=['patient_id', 'blocked_text'])
print(blocked_text_sample.shape)
# This preview is not the last expression of the cell, so the notebook does not display it.
blocked_text_sample.head(5)
# Save the frame as CSV (the path starts with '~', presumably the user's home directory).
blocked_text_sample.to_csv('~/blocked_text_sample.csv')