In [None]:
import pandas as pd
import random
import re
import numpy as np
from collections import OrderedDict

In [None]:
# Load the extracted notes CSV into the working frame `notes`;
# every later cell reads from and reassigns this name.
notes = pd.read_csv(
    filepath_or_buffer="~/desktop/capstone_repo/data/fake_notes_extracted.csv"
)

In [None]:
# some functions to help with detection

def getMatches(col, pat=None):
    '''Find every regex match in a text and return (matched_text, start) pairs.

    Parameters
    ----------
    col : str
        Text to scan (a single cell value from a notes column).
    pat : str or compiled pattern, optional
        Regular expression to search for. When omitted, the module-level
        ``pattern`` variable is looked up at call time, which is how the
        notebook cells use this helper. Passing ``pat`` explicitly avoids
        depending on whichever cell last assigned ``pattern``.

    Returns
    -------
    list of (str, int)
        One tuple per match, in order of appearance: the full matched text
        and the offset where it starts. Note this is a list of tuples, not
        a dictionary. Empty when nothing matches.

    Example
    -------
    Matches such as
        <re.Match object; span=(126, 152), match='History of Present Illness'>
        <re.Match object; span=(154, 169), match='FIRST_NAME_FULL'>
    become
        [('History of Present Illness', 126), ('FIRST_NAME_FULL', 154)]
    '''
    if pat is None:
        # Backward-compatible default: fall back to the notebook-level pattern.
        pat = pattern
    return [(match[0], match.start()) for match in re.finditer(pat, col)]

def getMatchDict(col):
    '''Turn a list of regex match objects into (matched_text, start) tuples.

    Despite the name, the result is a list of 2-tuples rather than a dict.
    ``None`` is passed through unchanged.
    '''
    if col is None:
        return None
    # found[0] is the whole matched text; found.start() is its offset.
    return [(found[0], found.start()) for found in col]


def getLastItem(col):
    '''Return the final element of an indexable sequence.'''
    last = col[-1]
    return last

def getEnd(col):
    '''Return the element at index 1 of ``col``.

    For the (matched_text, start) tuples produced by getMatchDict this is
    the match's start offset, despite the function's name.
    '''
    second = col[1]
    return second

## Splitting up 'Objective'

The 'Objective' section in a SOAP note consists of several subsections that we will separate in this notebook.
Subsections: vitals, physical exam constitutional, other (labs and medications)

In [None]:
'''Separate the 'other' (labs, medications, etc) section'''

# Headers that mark where the labs / medications part of 'Objective' begins.
# getMatches reads this module-level `pattern`, so it must be assigned
# before the getMatches call below.
pattern=r'Medication Administration Report|DATA|LABS:|Intake and Output last 24 hours|Lab Review|Laboratory Results|LABS REVIEWED:|Pertinent Diagnostic Studies|Labs/Studies:|Recent Labs|SCHEDULED MEDICATIONS|Immunization History|Scheduled Meds:|Current Medications:|Data:|Medications:|Lab Review:|Scheduled Medications|Lab Results|Labs and Imaging|Relevant Labs and Imaging:|Medications|Pertinent Labs and Studies:|Labs and Studies:|Labs:|Laboratory Studies:'
notes['contains_labs'] = notes['Objective'].str.contains(pattern)
# The label says "percent", so scale the matching fraction by 100.
print("percent of notes that have a lab/medication section in objective: "+str(100*notes['contains_labs'].sum()/notes.shape[0]))
# Each entry of 'labs' is a list of (matched_text, start) tuples.
notes['labs'] = notes['Objective'].apply(getMatches)

# create two temporary dataframes: one with notes that have this section
# and one with notes that are missing this section,
# because each group is handled differently
df2=notes[notes.contains_labs].copy()
df3=notes[notes.contains_labs==False].copy()

# if the note has labs/medications, split 'Objective' at the FIRST match:
# text before it stays in Objective_PhysExam, the rest goes to Objective_other.
# A list comprehension is used instead of DataFrame.apply(axis=1) because
# apply on an empty frame returns a DataFrame, which cannot be assigned to
# a single column (the cell would crash when no note matches).
df2["Objective_PhysExam"] = [
    text[0:hits[0][1]] for text, hits in zip(df2["Objective"], df2["labs"])
]
df2["Objective_other"] = [
    text[hits[0][1]:] for text, hits in zip(df2["Objective"], df2["labs"])
]

# if the note has no labs/medications info, keep the whole text and set
# the 'other' feature to nan
df3["Objective_PhysExam"]=df3.Objective
df3["Objective_other"]=np.nan

# concat temp dataframes back together and keep only the working columns
notes=pd.concat([df2,df3])
notes=notes[["note_text","type","Subjective","Objective","Objective_PhysExam","Objective_other","Assessment","Plan","diags"]]

We will look for 'Physical Exam Constitutional' next.
This section is the most difficult to find because it is sometimes labeled simply "Physical Exam", which is a poor keyword for us to use: the same label sometimes introduces the vitals section instead.

We will look for this section with two different methods and then append them together later.

Method 1: find all instances of 'Physical Exam' in the text that are NOT followed by any of a list of additional keywords that indicate that the following section is vitals.

Method 2: Look for additional headers of physical exam constitutional, such as "general appearance", "constitutional", "GA", etc.

In [None]:
'''Separate out Physical Exam Constitutional, method 1'''

# The pattern matches "Physical Exam" / "PHYSICAL EXAM" only when it is NOT
# followed by a vitals-style continuation (Vital, Temp, "s and NIH Stroke
# Scales", Current Vital). It also matches the abbreviation "PE " as a header.
# getMatches reads this module-level `pattern`.
pattern=r'(Physical Exam)(?!((:?)( *)(,?)Vital|(:?)( *)Temp|s and NIH Stroke Scales|(:?)( *)Current Vital))|PE |(PHYSICAL EXAM)(?!((:?)( *)(,?)Vital|(:?)( *)Temp|s and NIH Stroke Scales|(:?)( *)Current Vital))'
notes['contains_pe'] = notes['Objective_PhysExam'].str.contains(pattern)

# create two temp dataframes (reset_index adds an 'index' column that the
# column selections below discard again)
df2=notes[notes["contains_pe"]==True].reset_index()
df=notes[notes["contains_pe"]==False].reset_index()

# if PE is found, split at the LAST match. A last match at offset 0 is
# treated like "not found": Objective_PE gets the " " placeholder and
# Objective_PhysExam is left whole.
# List comprehensions replace DataFrame.apply(axis=1), which returns a
# DataFrame for an empty frame and would make these assignments raise.
df2['PE'] = df2['Objective_PhysExam'].apply(getMatches)
df2["Objective_PE"] = [
    text[hits[-1][1]:] if hits[-1][1] > 0 else " "
    for text, hits in zip(df2["Objective_PhysExam"], df2["PE"])
]
df2["Objective_PhysExam"] = [
    text[0:hits[-1][1]] if hits[-1][1] > 0 else text
    for text, hits in zip(df2["Objective_PhysExam"], df2["PE"])
]
df2=df2[["type","note_text","Subjective","Objective","Objective_PhysExam","Objective_PE","Objective_other","Assessment","Plan","diags","PE"]]

# if PE is not found, use the single-space placeholder for both columns
df["Objective_PE"]=" "
df["PE"]=" "
df=df[["type","note_text","Subjective","Objective","Objective_PhysExam","Objective_PE","Objective_other","Assessment","Plan","diags","PE"]]

# combine temps back into notes
notes=pd.concat([df,df2])

In [None]:
# a handful of the notes have the vitals section after constitutional,
# so we handle that here
# (all of these vitals sections are labeled with 'Current Vitals')

# getMatches reads this module-level `pattern`.
pattern=r'Current Vitals'
notes['contains_v'] = notes['Objective_PE'].str.contains(pattern)
notes['v'] = notes['Objective_PE'].apply(getMatches)

# create temp dataframes
df=notes[notes["contains_v"]].copy()
df2=notes[notes["contains_v"]==False].copy()

# split Objective_PE at the FIRST 'Current Vitals' match: the tail becomes
# Objective_vitals and the head stays in Objective_PE.
# List comprehensions are used instead of DataFrame.apply(axis=1) because
# apply on an empty frame returns a DataFrame, so the assignment would
# raise whenever no note contains 'Current Vitals'.
df["Objective_vitals"] = [
    text[hits[0][1]:] for text, hits in zip(df["Objective_PE"], df["v"])
]
df["Objective_PE"] = [
    text[0:hits[0][1]] for text, hits in zip(df["Objective_PE"], df["v"])
]
# notes without the header get an empty-string vitals placeholder
df2["Objective_vitals"]=""


# combine temps back into notes
notes=pd.concat([df,df2])

In [None]:
'''handle cases where the vitals section is missing'''

# create two temp dataframes, one with notes where we found vitals,
# one with notes where we did not ("" is the not-found placeholder)
novit=notes[notes["Objective_vitals"]==""]
vit=notes[notes["Objective_vitals"]!=""]


# We split this way because we want to work with
# the notes where we did not find vitals

# create two more temp dataframes out of the "no vitals" notes,
# one where physical exam was found and one where it was not
# (" " is the not-found placeholder for Objective_PE)
dfpe=novit[novit["Objective_PE"]!=" "].copy()
df=novit[novit["Objective_PE"]==" "].copy()

# if we found physical exam in a note where we did not find vitals,
# take the last 'physical exam' match and assign the text BEFORE its start
# offset to the vitals feature.
# NOTE(review): the earlier comment here said "after" the final match, but
# the slice is [0:PE_start]; confirm which was intended. Objective_PhysExam
# appears to be cut at this same offset by the method-1 cell already, so
# this slice likely returns the whole remaining text.
dfpe["PE_item"]=dfpe["PE"].apply(getLastItem)
dfpe["PE_start"]=dfpe["PE_item"].apply(getEnd)
# A list comprehension replaces DataFrame.apply(axis=1), which returns a
# DataFrame for an empty frame and would make this assignment raise.
dfpe["Objective_vitals"] = [
    text[0:start] for text, start in zip(dfpe["Objective_PhysExam"], dfpe["PE_start"])
]


# some of the notes in the "no vitals" temp dataframe are no longer missing vitals,
# so we concat them back together and re-separate to update
novit=pd.concat([dfpe,df])
notes=pd.concat([vit,novit])
novit=notes[notes["Objective_vitals"]==""].copy()
vit=notes[notes["Objective_vitals"]!=""].copy()


# if we find "Temp" in the "no vitals" dataframe now, we treat the entire
# section as vitals and assign it accordingly; otherwise vitals stays "".
pattern=r'Temp(:| )'
novit['contains_temp'] = novit['Objective_PhysExam'].str.contains(pattern)
novit["Objective_vitals"]=np.where(novit.contains_temp,novit.Objective_PhysExam,"")
notes=pd.concat([vit,novit])

In [None]:
'''Separate out Physical Exam Constitutional, method 2'''

# Alternative headers for the constitutional section.
# The period in `General(:| |\.)` is escaped so that it matches a literal
# "." only; unescaped, it matched ANY character, so words such as
# "Generally" or "Generalized" were treated as a section header.
# getMatches reads this module-level `pattern`.
pattern=r'Gen:|General(:| |\.)|GEN(:| )|Neurologic Exam: |Intake/Output Summary \(Last 24 hours\)|INITIAL Exam \(very first encounter\)|CONST:|Constitutional:|Lines, drains, airways:|Mental Status Exam|GA:|PHYSICAL EXAM:  Constitutional:|Exam:( *)GE:|MENTAL STATUS:|Appearance:|GENERAL(:| )|CONSTITUTIONAL:|GENERAL APPEARANCE:?|Exam:( *)Cardiac=|Exam: Heart|Musculoskeletal Exam|ENT: '
notes['contains_gen'] = notes['Objective_PhysExam'].str.contains(pattern)
notes['gen'] = notes['Objective_PhysExam'].apply(getMatches)

# create temp dataframes, one where we found a gen keyword and one where we did not
df=notes[notes["contains_gen"]].copy()
df2=notes[notes["contains_gen"]==False].copy()

# if we found gen, split Objective_PhysExam at the FIRST match: the tail is
# the constitutional text and the head becomes the vitals text.
# NOTE(review): this overwrites any Objective_vitals value these rows
# already had from earlier cells; confirm that is intended.
# List comprehensions replace DataFrame.apply(axis=1), which returns a
# DataFrame for an empty frame and would make these assignments raise.
df["Objective_gen"] = [
    text[hits[0][1]:] for text, hits in zip(df["Objective_PhysExam"], df["gen"])
]
df["Objective_vitals"] = [
    text[0:hits[0][1]] for text, hits in zip(df["Objective_PhysExam"], df["gen"])
]
df=df[["type","note_text","Subjective","Objective","Objective_PhysExam","Objective_PE","Objective_vitals","Objective_gen","Objective_other","Assessment","Plan","diags","PE","gen"]]

# if we did not find it, set to empty string
df2["Objective_gen"]=""
df2=df2.drop(columns=["contains_gen","contains_v"])

# combine temp dataframes back into notes
notes=pd.concat([df,df2])


In [None]:
# Build the physical exam constitutional feature by concatenating the
# method-1 text (Objective_PE) with the method-2 text (Objective_gen).
notes["Objective_const"] = notes["Objective_PE"] + notes["Objective_gen"]

# Keep only the final columns, drop rows whose combined text is just the
# single-space placeholder, and renumber the rows from zero.
notes = (
    notes.loc[:, ["type","note_text","Subjective","Objective","Objective_vitals","Objective_const","Objective_other","Assessment","Plan","diags"]]
    .loc[lambda frame: frame["Objective_const"] != " "]
    .reset_index(drop=True)
)
notes

In [None]:
# Write the final frame to CSV. `index` is left at its default, so the
# row index is written as the first, unnamed column.
notes.to_csv(
    path_or_buf="~/projects/PACE - MIDS Student Portfolio Capstone/data/notes_extracted_obj.csv"
)