In [40]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from collections import OrderedDict
import random

In [41]:
# Load the synthetic ("fake") notes into a DataFrame.
# Later cells read its `note_text` and `type` columns and drop an "Unnamed: 0" column.
notes=pd.read_csv("~/data/fake_notes.csv")

# 1. Filtering
    The following cells filter out the kinds of notes that are guaranteed to ignore the SOAP format, such as covid vaccines.

In [42]:
'''create indicators for note groups we want to remove'''

# Indicator column name -> regular expression.
# A note is flagged in a column when its `note_text` contains a match for that
# column's pattern. The dictionary order is the order in which the columns are created.
REMOVAL_PATTERNS = {
    # covid vaccines
    'covid_vacc': r'COVID-19 vaccine|COVID-19 Vaccine',
    # OBGYN Discharge Summaries
    # (the last alternative, "Discharge Summary", also matches on its own)
    'ob_discharge': r'Obstetrics Postpartum Discharge Summary|Obstetrics Antepartum Discharge Summary|Discharge Summary',
    # studies
    'study': r'Study Title:',
    # newborn notes (the trailing space is part of the pattern)
    'newborn': r'Duke University Newborn Nursery ',
    # clinical skills
    'skills': r'Clinical Skills Foundation 1',
    # only blood pressure
    'bp': r'Patient is here for blood pressure',
    # HCC score
    'gap': r'HCC Score Gap Refresh',
    # fetal monitoring
    'gest': r'Gest age [0-9]*w0d today|Fetal Monitoring|Fetal Heart Monitoring|Procedure Note NST|Fetal heart monitoring|EFM Monitoring|Fetal Tracing Review|Tracing Review|Intrapartum Monitoring',
    # labor note
    'labor': r'Labor progress note|Brief Progress Note|Labor Progress Note',
}

# One boolean indicator column per note group.
# (`pattern` is deliberately the loop variable: it stays a module-level name, as before.)
for indicator, pattern in REMOVAL_PATTERNS.items():
    notes[indicator] = notes['note_text'].str.contains(pattern)

# incorporate type COVID VACCINATION into the covid indicator
notes['covid_vacc'] = np.where(notes["type"]=="COVID VACCINATION", True, notes['covid_vacc'])

In [43]:
'''removing notes that we know we wont find objective in'''

# Every indicator column that marks a note group to remove.
# "ob_discharge" is listed explicitly: its update used to be fused onto the end
# of a comment line, so it never ran and discharge summaries were never removed.
indicator_cols=["ob_discharge","covid_vacc","study","newborn","skills","bp","gap","gest","labor"]

notes["no_obj"]=0 # instantiate "no objective" column (aggregates the indicators above):
                  # any note that has a positive value in any of the indicators
                  # will have a positive value in this column
for col in indicator_cols:
    # `== True` keeps missing (NaN) indicator values from counting as a hit
    notes["no_obj"]=np.where(notes[col]==True,1,notes["no_obj"])


no_obj_notes=notes[notes["no_obj"]==1].reset_index() # dataframe of notes that don't have objective
notes_obj=notes[notes["no_obj"]==0].reset_index() # dataframe of notes that do have objective
print("percent of notes with Objective: "+str(notes_obj.shape[0]/notes.shape[0]))

# keep only columns we need ("index" is the column added by reset_index above)
notes_obj=notes_obj.drop(columns=["no_obj"]+indicator_cols+["index","Unnamed: 0"])

percent of notes with Objective: 0.995


# 2. Detection
    Many of the notes treat Assessment and Plan as one component. We will handle these notes first, then filter them out, and then handle the notes that keep the two sections separate.

In [44]:
# some functions to help with detection

def getMatches(col, pattern=None):
    '''Find every regex match in one piece of text.

    col: the text to search (one value of the `note_text` column).
    pattern: regular expression to search for. When omitted, the module-level
        variable `pattern` is used, which is what the existing one-argument
        calls `getMatches(x)` rely on.

    Returns a list (not a dictionary) of (matched_text, start_index) tuples in
    the order the matches occur, e.g.
    [('History of Present Illness', 126), ('FIRST_NAME_FULL', 154)].
    '''
    if pattern is None:
        # Backward-compatible fallback: read whatever the notebook last assigned
        # to the global `pattern`. Passing the pattern explicitly avoids this
        # dependency on cell execution order.
        pattern = globals()["pattern"]
    # m[0] is the full matched text, m.start() its position in col
    return [(m[0], m.start()) for m in re.finditer(pattern, col)]

def getMatchDict(col):
    '''Turn a list of regex match objects into (matched_text, start_index) pairs.

    Despite the name, the result is a list of tuples, not a dictionary.
    Returns None when col is None.
    '''
    if col is None:
        return None
    return [(found[0], found.start()) for found in col]

In [45]:
# Subjective
# NOTE: getMatches(x) is called without a pattern; it reads the module-level
# variable `pattern`. Each `pattern = ...` assignment below must therefore run
# before the `.apply(...)` that follows it.
pattern = r'((Brief History of Present ,Illness:)|Chief Complaint|(?<![A-Z])S:|(concerns at this time include:)|subjective(?=[^a-zA-Z])+|Subjective(?=[^a-zA-Z])+|Interval Hx:|INTERVAL HISTORY|Interval Events|[Hh]istory of [Pp]resent [Ii]llness|Hospital Day:|Hosp day:|HISTORY OF PRESENT ILLNESS|HPI(?=[^a-zA-Z])+|FIRST_NAME_FULL|Interval History|SUBJECTIVE)'
# boolean flag: does the note contain any Subjective header?
notes_obj['contains_subjective'] = notes_obj['note_text'].str.contains(pattern) 
# printed value is a fraction of the notes in notes_obj (0-1), not a percentage
print("subjective: "+str(notes_obj['contains_subjective'].sum()/notes_obj.shape[0]))
# 'sub' holds, per note, the list of (matched header text, start index) tuples
notes_obj['sub']= notes_obj['note_text'].apply(lambda x:getMatches(x))

# Objective
# Runs of several spaces inside the alternatives below are part of the pattern.
pattern = r'(Objective)|(?<![A-Z])O:|(?<![A-Z])PE:|Record of Physical Exams ,and NIH Stroke Scales|Physical exam:  Temp:|PHYSICAL EXAM     VITALS|OBJECTIVE|PHYSICAL EXAM AT DISCHARGE|Vital Sings last 24 hours:|Physical Exam   BP|Physical Exam   Pulse|Physical Exam   Temp:|Physical Exam at Discharge:|Physical exam:  BP|Physical Exam  Vitals|Record of Physical Exams and NIH Stroke Scales|EXAMINATION|Exam:|EXAM:|Current Vital Signs|Physical Exam[s]*:|Vital[s]*:|Vital signs:|Vital Signs:'
# boolean flag: does the note contain any Objective header?
notes_obj['contains_objective'] = notes_obj['note_text'].str.contains(pattern) 
print("objective: "+str(notes_obj['contains_objective'].sum()/notes_obj.shape[0]))
# 'obj' holds, per note, the list of (matched header text, start index) tuples
notes_obj['obj']= notes_obj['note_text'].apply(lambda x:getMatches(x))

subjective: 1.0
objective: 1.0


  notes_obj['contains_subjective'] = notes_obj['note_text'].str.contains(pattern)
  notes_obj['contains_objective'] = notes_obj['note_text'].str.contains(pattern)


In [46]:
# Assessment and Plan (together)
# the value printed here is the fraction (0-1) of notes in notes_obj in which we found
# assessment and plan grouped together

# getMatches(x) below reads this module-level `pattern`, so the assignment must come first
pattern=r'(Assessment\/Plan)|(Assessment\/ Plan)|(?<![A-Z])A/P:|(Plan and Assessment)|(Assessment and Recommendations)|Assessment and plan:|Assessment &amp; Recommendations|(Assessment \/Plan)|(Assessments and Plans)|(Assessment \/ Plan)|(ASSESSMENT AND PLAN)|(Assessment &amp; Plan)|(Assessment and Plan)(?!\))|(ASSESSMENT &amp; PLAN)|(ASSESSMENT \/ RECOMMENDATIONS)|(ASSESSMENT \/ PLAN)|(ASSESSMENT/COORDINATION OF CARE:)|(ASSESSMENT AND RECOMMENDATIONS)|(ASSESSMENT\/PLAN)|(ASSESSMENT, RECOMMENDATIONS AND PLAN)'
# boolean flag: does the note contain a combined Assessment/Plan header?
notes_obj['contains_assessplan'] = notes_obj['note_text'].str.contains(pattern) 
print("assessplan: "+str(notes_obj['contains_assessplan'].sum()/notes_obj.shape[0]))
# 'assessplan' holds, per note, the list of (matched header text, start index) tuples
notes_obj['assessplan']= notes_obj['note_text'].apply(lambda x:getMatches(x))

assessplan: 0.5628140703517588


  notes_obj['contains_assessplan'] = notes_obj['note_text'].str.contains(pattern)


In [47]:
# Assessment and Plan (Separate)
# the values printed here are the fractions (0-1) of notes in notes_obj in which we found
# an assessment or plan header (respectively).
# NOTE: these patterns can still match inside a combined header (for example
# "Assessment" matches the start of "Assessments and Plans"), so the flags alone do
# not prove the sections are separate; the four-section extraction cell additionally
# requires contains_assessplan to be False.

# Assessment
# getMatches(x) reads the module-level `pattern`, so each assignment must precede its apply
pattern=r'(?<!\()Assessment(?!( and))|(?<!(SAFETY |ENERAL ))ASSESSMENT'
notes_obj['contains_assess'] = notes_obj['note_text'].str.contains(pattern) 
print("assess: "+str(notes_obj['contains_assess'].sum()/notes_obj.shape[0]))
# 'assess' holds, per note, the list of (matched header text, start index) tuples
notes_obj['assess']= notes_obj['note_text'].apply(lambda x:getMatches(x))

# Plan
pattern=r'((?<!Specific )Plan:)|PLAN(?![A-Z])|Recommendations|COORDINATION OF CARE:|(Plan)(?! [A-Za-z])'
notes_obj['contains_plan'] = notes_obj['note_text'].str.contains(pattern) 
print("plan: "+str(notes_obj['contains_plan'].sum()/notes_obj.shape[0]))
# 'plan' holds, per note, the list of (matched header text, start index) tuples
notes_obj['plan']= notes_obj['note_text'].apply(lambda x:getMatches(x))

assess: 0.6733668341708543
plan: 0.6733668341708543


  notes_obj['contains_assess'] = notes_obj['note_text'].str.contains(pattern)
  notes_obj['contains_plan'] = notes_obj['note_text'].str.contains(pattern)


In [48]:
# This cell is purely so we can calculate the fraction of notes that have assessment and plan,
# regardless if they are grouped together or not
# (not used for extraction)

# "a": any Assessment header, standalone or combined with Plan
pattern=r'(?<!\()Assessment(?!( and))|(?<!(SAFETY |ENERAL ))ASSESSMENT|(?<![A-Z])A\/P:|(Assessment\/Plan)|(Assessment\/ Plan)|(?<!\()Assessment and plan(?!\()|Assessment and plan:|Assessment &amp; Recommendations|(Plan and Assessment)|(Assessment and Recommendations)|(Assessment \/Plan)|(Assessments and Plans)|(Assessment \/ Plan)|(ASSESSMENT AND PLAN)|(Assessment &amp; Plan)|(Assessment and Plan)(?!\))|(ASSESSMENT &amp; PLAN)|(ASSESSMENT \/ RECOMMENDATIONS)|(ASSESSMENT \/ PLAN)|(ASSESSMENT\/COORDINATION OF CARE:)|(ASSESSMENT AND RECOMMENDATIONS)|(ASSESSMENT\/PLAN)|(ASSESSMENT, RECOMMENDATIONS AND PLAN)'
notes_obj['contains_a'] = notes_obj['note_text'].str.contains(pattern) 
print("a: "+str(notes_obj['contains_a'].sum()/notes_obj.shape[0]))
# "p": any Plan header, standalone or combined with Assessment
pattern=r'((?<!Specific )Plan:)|(?<!\()Assessment and plan(?!\()|(?<![A-Z])A\/P:|(Plan)(?! [A-Za-z])|PLAN(?![A-Z])|COORDINATION OF CARE:|Recommendations|(Assessment\/Plan)|(Assessment\/ Plan)|Assessment and plan:|Assessment &amp; Recommendations|(Plan and Assessment)|(Assessment and Recommendations)|(Assessment \/Plan)|(Assessments and Plans)|(Assessment \/ Plan)|(ASSESSMENT AND PLAN)|(Assessment &amp; Plan)|(Assessment and Plan)(?!\))|(ASSESSMENT &amp; PLAN)|(ASSESSMENT \/ RECOMMENDATIONS)|(ASSESSMENT \/ PLAN)|(ASSESSMENT\/COORDINATION OF CARE:)|(ASSESSMENT AND RECOMMENDATIONS)|(ASSESSMENT\/PLAN)|(ASSESSMENT, RECOMMENDATIONS AND PLAN)'
notes_obj['contains_p'] = notes_obj['note_text'].str.contains(pattern) 
print("p: "+str(notes_obj['contains_p'].sum()/notes_obj.shape[0]))
# the two helper flags only feed the printed fractions above, so remove them again
notes_obj=notes_obj.drop(columns=["contains_a","contains_p"])

  notes_obj['contains_a'] = notes_obj['note_text'].str.contains(pattern)
  notes_obj['contains_p'] = notes_obj['note_text'].str.contains(pattern)


a: 1.0
p: 0.8291457286432161


# 3. Extraction
Now that we have detected the keywords for each component, we will use them to separate the text.
To extract, we will assume that all four components of SOAP are present, so we will start by removing notes that are missing any.

Because some of the SOAP notes treat assessment and plan as one section, and therefore have three sections instead of four, we will handle them separately.
We will extract the sections from the notes that have the three section structure first, then handle the notes with four, and then append them together to get our final set of notes.

For consistency, we will extract the notes into the four sections: Subjective, Objective, Assessment and Plan for both the "three section" and the four section notes. For the "three section" notes, the Assessment and Plan columns will just have the same value. 

In [49]:
# these functions are to be used with lambda to modify columns
# these  are things that you may not normally need a function for, but are useful because we are operating on columns

def replaceTitle(col,title):
    '''Return a (title, position) pair: col's second element under a new label.

    col: a pair whose second element is a position, e.g. ("HPI:", 126).
    title: the label to use instead of col's first element.
    '''
    return (title, col[1])

def createOrderedDict(col):
    '''Build a {name: position} dict whose keys are ordered by ascending position.

    col: iterable of (name, position) pairs. When a name occurs more than once,
    its last position is the one kept.
    Returns a plain dict; entries with equal positions keep their input order.
    '''
    positions=OrderedDict((entry[0], entry[1]) for entry in col)
    by_position=sorted(positions.items(), key=lambda pair: pair[1])
    return dict(by_position)

def getStartEnd(col,name):
    '''Return the (start, end) positions of section `name`.

    col: dict mapping section name -> start position, iterated in section order.
    name: the section to look up; list.index raises ValueError if it is not a key.

    start is col[name]. end is the start of the section that follows `name` in
    col, or the sentinel -1 when `name` is the last section.
    NOTE: used directly as a slice end, -1 leaves out the final character of the text.
    '''
    section_names=list(col)
    position=section_names.index(name)
    start=col[name]

    if position+1==len(col):
        return (start,-1)
    return (start,col[section_names[position+1]])

def getStart(col):
    '''Return the first element of col (the start of a (start, end) pair, or the first match of a match list).'''
    first_item=col[0]
    return first_item

def getEnd(col):
    '''Return the second element of col (the end of a (start, end) pair).'''
    second_item=col[1]
    return second_item

In [50]:
''' Extraction of "three section" notes'''


# create temporary DataFrame that has subjective, objective, and
# assessment/plan treated as one group
df=notes_obj[notes_obj["contains_objective"]]
print("percent of notes with objective: "+str(df.shape[0]/notes_obj.shape[0]))
df=df[df["contains_subjective"]]
print("percent of notes with objective and subjective: "+str(df.shape[0]/notes_obj.shape[0]))
# .copy() gives this cell its own frame, so the column assignments below never
# write into a filtered slice of notes_obj
df=df[df["contains_assessplan"]].copy()
print("percent of notes with objective, subjective and assessment and plan treated as one group: "+str(df.shape[0]/notes_obj.shape[0]))


# If we got multiple matches for a header, we only want to look at the first match
# This is because alternate headers are sometimes found within the section, but the
# first appearance indicates the beginning.
# Instead of the match content (section header) we want the entry to indicate which
# section is starting. For example, instead of "HPI:", we want it to say "Subjective".
# -> take the first (header, start) match and relabel it with the section name
for item_col, match_col, title in (("obj_item","obj","Objective"),
                                   ("sub_item","sub","Subjective"),
                                   ("assessplan_item","assessplan","Assess_Plan")):
    df[item_col]=df[match_col].apply(lambda x: replaceTitle(getStart(x),title))

# put all sections together into a list, then convert to a dictionary ordered by start index
df["ind_list"] = df[["sub_item","obj_item","assessplan_item"]].values.tolist()
df["ind_list"]=df["ind_list"].apply(lambda x: createOrderedDict(x))

# use ordered dictionary to extract the (start, end) index pair of each section
for prefix, title in (("sub","Subjective"),("obj","Objective"),("ap","Assess_Plan")):
    df[prefix+"_index"]=df["ind_list"].apply(lambda x: getStartEnd(x,title))

# CREATE COLUMNS CONTAINING THE START AND END INDEX OF EACH COMPONENT
# (Subjective, Objective, Assessment/Plan)
for prefix in ("sub","obj","ap"):
    df[prefix+"_start"]=df[prefix+"_index"].apply(lambda x: getStart(x))
    df[prefix+"_end"]=df[prefix+"_index"].apply(lambda x: getEnd(x))


# ASSIGN COMPONENTS TO SUBSTRING, USING NEWLY GENERATED INDEXES
# getStartEnd marks the last section of a note with an end of -1. Slicing with -1
# would drop the note's final character, so that sentinel is translated to None
# (slice to the end of the text).
# Assessment and Plan get the same text, as they are the same section here.
for section, prefix in (("Subjective","sub"),("Objective","obj"),("Assessment","ap"),("Plan","ap")):
    df[section]=df.apply(
        lambda row: row.note_text[row[prefix+"_start"]:(None if row[prefix+"_end"]==-1 else row[prefix+"_end"])],
        axis=1)

# reset index and drop all the columns we no longer need
df=df.reset_index()
df=df.drop(columns=["index","sub_item","obj_item","assessplan_item","sub","obj","assessplan","ind_list"])
df

percent of notes with objective: 1.0
percent of notes with objective and subjective: 1.0
percent of notes with objective, subjective and assessment and plan treated as one group: 0.5628140703517588


Unnamed: 0,note_text,type,contains_subjective,contains_objective,contains_assessplan,contains_assess,assess,contains_plan,plan,sub_index,...,sub_start,sub_end,obj_start,obj_end,ap_start,ap_end,Subjective,Objective,Assessment,Plan
0,Assessments and Plans fbPzzrLIjfAlibfdEwvJjBR...,RADIATION ONCOLOGY,True,True,True,True,"[(Assessment, 1)]",True,"[(Plan, 17)]","(1855, -1)",...,1855,-1,234,1855,1,234,Subjective r gVdQxnQJuU.FVRAnInvIXIWCNxKdDwrjv...,Objective TdWaFoj.XRTuFYyrtDjaPERFcxoREKHCvkWc...,Assessments and Plans fbPzzrLIjfAlibfdEwvJjBRd...,Assessments and Plans fbPzzrLIjfAlibfdEwvJjBRd...
1,Objective WFohNVDa.pvBysVyyfsHdL.GngexBPJbOx....,SPORTS MEDICINE,True,True,True,False,[],False,[],"(2682, -1)",...,2682,-1,1,2532,2532,2682,"Brief History of Present ,Illness: IUUJxTcbwcP...",Objective WFohNVDa.pvBysVyyfsHdL.GngexBPJbOx.x...,Assessment and Plan TTC.SfLbLe.gbhOXmSKdtyPrAX...,Assessment and Plan TTC.SfLbLe.gbhOXmSKdtyPrAX...
2,"Record of Physical Exams ,and NIH Stroke Scal...",PEDIATRIC SURGERY,True,True,True,False,[],False,[],"(1152, -1)",...,1152,-1,1,1094,1094,1152,"Brief History of Present ,Illness: d bYZnYbzey...","Record of Physical Exams ,and NIH Stroke Scale...",Assessment and Plan FHbJDIJQLVKSPd.CzrKYqnTx.L...,Assessment and Plan FHbJDIJQLVKSPd.CzrKYqnTx.L...
3,Subjective HRhDXUQEAc ?TMhbEVbOhf.eHoYqpXTRsm...,PEDIATRIC NEPHROLOGY,True,True,True,True,"[(Assessment, 136)]",True,"[(Plan, 152)]","(1, 136)",...,1,136,301,-1,136,301,Subjective HRhDXUQEAc ?TMhbEVbOhf.eHoYqpXTRsmY...,Physical exam: Temp: rAkycKovwvvqKAeCQWoWZqV....,Assessments and Plans ?Jq DzwdYOmvAxGJvBCVBmZq...,Assessments and Plans ?Jq DzwdYOmvAxGJvBCVBmZq...
4,Assessments and Plans Dgfu #Asthma | Physical...,OTOLARYNGOLOGY,True,True,True,True,"[(Assessment, 1)]",True,"[(Plan, 17)]","(1692, -1)",...,1692,-1,38,1692,1,38,"Brief History of Present ,Illness: J",Physical exam: Temp: pEqF.BsCJxctymFsDbGoLciT...,Assessments and Plans Dgfu #Asthma |,Assessments and Plans Dgfu #Asthma |
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,Assessment and Recommendations sWfaPaFCYeMylg...,HEALTH AND FITNESS,True,True,True,False,[],True,"[(Recommendations, 16)]","(1098, -1)",...,1098,-1,112,1098,1,112,Subjective AfRt?IlV?YFvHhyuXRm?NPdOAIQNliyEPMk...,"Record of Physical Exams ,and NIH Stroke Scale...",Assessment and Recommendations sWfaPaFCYeMylgV...,Assessment and Recommendations sWfaPaFCYeMylgV...
108,Physical exam: Temp: CownpSCPxhVeOjlDiTvC ID...,EMERGENCY MEDICINE,True,True,True,False,[],True,"[(Recommendations, 1106)]","(70, 1091)",...,70,1091,1,70,1091,-1,"HPI: O: sudden, post viral , gradual D:acute o...",Physical exam: Temp: CownpSCPxhVeOjlDiTvC ID:...,Assessment and Recommendations Ehp.dKkEXXwJIqO...,Assessment and Recommendations Ehp.dKkEXXwJIqO...
109,"Brief History of Present ,Illness: pQFunyllcR...",PEDIATRIC NEONATOLOGY AND PERINATOLOGY,True,True,True,False,[],False,[],"(1, 218)",...,1,218,342,-1,218,342,"Brief History of Present ,Illness: pQFunyllcRO...",Physical exam: Temp: bVStXotnGGOvWgWDeMTASZdw...,Assessment and Plan tDqpLwPAaSiOCLBNCClZEnaIdg...,Assessment and Plan tDqpLwPAaSiOCLBNCClZEnaIdg...
110,Objective ciPRsIUMZthhuoXcgmSWQYMMChSIlxlHjvA...,RADIATION ONCOLOGY,True,True,True,False,[],False,[],"(691, 728)",...,691,728,1,691,728,-1,Subjective qXhHYaHdclpTbrMs.nu?yJNVz,Objective ciPRsIUMZthhuoXcgmSWQYMMChSIlxlHjvAU...,Assessment and Plan LCOoKjEeeWMpmqfQCO pnDAoTa...,Assessment and Plan LCOoKjEeeWMpmqfQCO pnDAoTa...


In [51]:
''' Extraction of four section notes'''

# create temporary DataFrame that has subjective, objective, and
# both assessment and plan, treated as separate sections
df2=notes_obj[notes_obj["contains_objective"]]
df2=df2[df2["contains_assessplan"]==False]
df2=df2[df2["contains_subjective"]]
df2=df2[df2["contains_assess"]]
# .copy() gives this cell its own frame, so the column assignments below never
# write into a filtered slice of notes_obj
df2=df2[df2["contains_plan"]].copy()
print("percent of notes that have Objective, Subjective, and both Assessment and Plan, treated as separate sections: "+str(df2.shape[0]/notes_obj.shape[0]))


# extract the first (header, start) match of each section and relabel it with the
# section name so we can build an easily readable dictionary with all sections
for item_col, match_col, title in (("obj_item","obj","Objective"),
                                   ("sub_item","sub","Subjective"),
                                   ("assess_item","assess","Assessment"),
                                   ("plan_item","plan","Plan")):
    df2[item_col]=df2[match_col].apply(lambda x: replaceTitle(getStart(x),title))

# put all sections together into a list, then convert to a dictionary ordered by start index
df2["ind_list"] = df2[["sub_item","obj_item","assess_item","plan_item"]].values.tolist()
df2["ind_list"]=df2["ind_list"].apply(lambda x: createOrderedDict(x))

# use ordered dictionary to extract the (start, end) index pair of each section
for prefix, title in (("sub","Subjective"),("obj","Objective"),("as","Assessment"),("p","Plan")):
    df2[prefix+"_index"]=df2["ind_list"].apply(lambda x: getStartEnd(x,title))


# CREATE COLUMNS CONTAINING THE START AND END INDEX OF EACH COMPONENT
# (Subjective, Objective, Assessment, Plan)
for prefix in ("sub","obj","as","p"):
    df2[prefix+"_start"]=df2[prefix+"_index"].apply(lambda x: getStart(x))
    df2[prefix+"_end"]=df2[prefix+"_index"].apply(lambda x: getEnd(x))

# ASSIGN COMPONENTS TO SUBSTRING, USING NEWLY GENERATED INDEXES
# getStartEnd marks the last section of a note with an end of -1. Slicing with -1
# would drop the note's final character, so that sentinel is translated to None
# (slice to the end of the text).
for section, prefix in (("Subjective","sub"),("Objective","obj"),("Assessment","as"),("Plan","p")):
    df2[section]=df2.apply(
        lambda row: row.note_text[row[prefix+"_start"]:(None if row[prefix+"_end"]==-1 else row[prefix+"_end"])],
        axis=1)


# reset index and drop all the columns we no longer need
df2=df2.reset_index()
df2=df2.drop(columns=["sub","obj","assess","plan","index","sub_item","obj_item","assess_item","plan_item","ind_list"])
df2

percent of notes that have Objective, Subjective, and both Assessment and Plan, treated as separate sections: 0.2663316582914573


Unnamed: 0,note_text,type,contains_subjective,contains_objective,contains_assessplan,assessplan,contains_assess,contains_plan,sub_index,obj_index,...,obj_start,obj_end,as_start,as_end,p_start,p_end,Subjective,Objective,Assessment,Plan
0,Physical exam: Temp: K?XWex.NWprnBXEaXo.gbNf...,PSYCHIATRY,True,True,False,[],True,True,"(189, 1242)","(1, 189)",...,1,189,1242,1299,1299,-1,"HPI: O: sudden, post viral , gradual D:acute o...",Physical exam: Temp: K?XWex.NWprnBXEaXo.gbNfW...,Assessment zG.BDV.gKFVyyDIveZaLWypzyEuvZGaKtNY...,PLAN Unfb?iDRuBzBYM?jwnPTuEvhpIHanloeWu.IlbuWT...
1,Physical exam: Temp: bt?xnBXg.YxfDxlWuImTvnM...,ORAL SURGERY,True,True,False,[],True,True,"(3111, -1)","(1, 111)",...,1,111,111,2863,2863,3111,Chief Complaint vRrMB.qBCkvNQHVCKrZDSMEqBcogTl...,Physical exam: Temp: bt?xnBXg.YxfDxlWuImTvnMJ...,ASSESSMENT 1. Stage of Chronic Kidney Disease ...,Recommendations .Iy QkpuktfpRnXMRmleBGPbPDwllo...
2,ASSESSMENT NpSqSSKAxMOVAaHbwPjZJZQGv?OwEJEHlU...,RADIATION ONCOLOGY,True,True,False,[],True,True,"(2353, -1)","(81, 1863)",...,81,1863,1,81,1863,2353,Chief Complaint olxjd K tIzi Fk tUIJK.AKEvbw,"Record of Physical Exams ,and NIH Stroke Scale...",ASSESSMENT NpSqSSKAxMOVAaHbwPjZJZQGv?OwEJEHlUA...,Plan: Labs: pending. EKG: pending. Labs: No ...
3,ASSESSMENT ?dVBNBCiOcLW?cyNwzmPXgTXoqlwan bri...,RHEUMATOLOGY,True,True,False,[],True,True,"(1781, -1)","(201, 1674)",...,201,1674,1,201,1674,1781,"Brief History of Present ,Illness: UYQuCQWBHgn...",Objective PzOXZyOSz FSWKWqWJSFdMCsiTEQnC Gener...,ASSESSMENT ?dVBNBCiOcLW?cyNwzmPXgTXoqlwan briI...,PLAN owR kgQPInYCECHTbLdPgHeCrvKnWurnFK.mmCGFX...
4,Subjective Yu?Qu.LqdLdKOG wtliffYZHTPJHJYtNpa...,PULMONOLOGY,True,True,False,[],True,True,"(1, 362)","(362, 2056)",...,362,2056,2434,-1,2056,2434,Subjective Yu?Qu.LqdLdKOG wtliffYZHTPJHJYtNpad...,Objective QrOVHhGyeQnwacTOrQILh NtRRTDJHLoeHKA...,Assessment amszDtvxXl kNVDifpJrGMNjHZ IhFgDhy,Plan: Labs: pending. EKG: pending. Labs: No ...
5,ASSESSMENT pvuel oQCygGgnShTnXyKi MjDXJDuWRuQ...,GYNECOLOGIC ONCOLOGY,True,True,False,[],True,True,"(213, 359)","(359, 2850)",...,359,2850,1,213,2850,-1,Chief Complaint uhRPVkxg.rCQMycoInugmJKcDOJsB....,Physical exam: Temp: zbRUKl?Tzfdd?KJvm twrtAO...,ASSESSMENT pvuel oQCygGgnShTnXyKi MjDXJDuWRuQ ...,Recommendations YsDPWlisUNGUjIiuWGTsc.ofTUgdFK...
6,Assessment IHDRUjBdmMnEP?mkDxalMhTl.LJgaiGJot...,TRAUMA SURGERY,True,True,False,[],True,True,"(3104, -1)","(254, 3104)",...,254,3104,1,60,60,254,"Brief History of Present ,Illness: oeHZZJaPpgg...",Objective nIdRScwSUO uqAXk BuoDW?mMCyfPXELvXCe...,Assessment IHDRUjBdmMnEP?mkDxalMhTl.LJgaiGJoto...,PLAN szDSZS.KdYjQAYav?ASTZ?FPqpAWuWDuVbAzjVWfk...
7,Chief Complaint kvleuQZGO zZWajSqcgRFznEyu Bt...,GENERAL SURGERY,True,True,False,[],True,True,"(1, 78)","(78, 1835)",...,78,1835,2333,-1,1835,2333,Chief Complaint kvleuQZGO zZWajSqcgRFznEyu Btf...,"Record of Physical Exams ,and NIH Stroke Scale...",ASSESSMENT xLJwFJOWGLcErdpr VfgSRDy J.X XYqQV ...,Plan: Labs: pending. EKG: pending. Labs: No ...
8,"Record of Physical Exams ,and NIH Stroke Scal...",PEDIATRIC PULMONOLOGY,True,True,False,[],True,True,"(890, 1081)","(1, 890)",...,1,890,1284,-1,1081,1284,Chief Complaint rXUPeRVaxFJ?inlnGBcQaPgjy issI...,"Record of Physical Exams ,and NIH Stroke Scale...",ASSESSMENT FRMtNUdkrexVqDjkYuGeYveGnqbXuVjpzrE...,Recommendations nGotBJfxjJnEDZkWfavufloPIMDFP....
9,Subjective JfnaXHMdegNpVCo.tErMcuPDoIaPqAsSEE...,PEDIATRIC SEMI-PRIVATE,True,True,False,[],True,True,"(1, 202)","(202, 862)",...,202,862,862,963,963,-1,Subjective JfnaXHMdegNpVCo.tErMcuPDoIaPqAsSEEd...,"Record of Physical Exams ,and NIH Stroke Scale...",ASSESSMENT BfK.gbuIFBQBrBaVxcddbXdJKzGLlpicHDd...,PLAN OzIPRzl proW.tsL #Asthma


In [52]:
'''combine all extracted notes'''

# Stack the "three section" frame (df) on top of the four section frame (df2) to get
# one dataframe of notes that have all four components extracted.
# reset_index() renumbers the rows and keeps the previous row labels in an "index" column.
notes_extracted=pd.concat([df,df2]).reset_index()
print(f"percent of total notes identified to have SOAP structure: {notes_extracted.shape[0]/notes_obj.shape[0]}")

percent of total notes identified to have SOAP structure: 0.8291457286432161


In [53]:
# get diagnoses codes

def getTags(col):
    '''Collect the "#"-prefixed diagnosis tags found in a piece of text.

    Returns None when col is None. Otherwise returns a list with, for every
    regex hit, the complete matched text (the outermost capture group).
    '''
    if col is None:
        return None
    hits = re.findall(r"((#+[a-zA-Z(_)]{1,}) ?((\(([^)]+)\))*)( ?(\b[A-Za-z].*?\b))*)",col)
    # findall yields one tuple of groups per hit; element 0 is the whole tag
    return [groups[0] for groups in hits]

# Tag every extracted note, then flag the notes for which at least one tag was found
# (an empty tag list counts as "not detected").
notes_extracted["diags"]=notes_extracted["note_text"].apply(getTags)
notes_extracted["diag_detected"]=np.where(notes_extracted["diags"],True,False)
print("percent of notes identified to have SOAP structure that have diag codes: "+str(notes_extracted['diag_detected'].sum()/notes_extracted.shape[0]))

percent of notes identified to have SOAP structure that have diag codes: 1.0


In [54]:
# save extracted notes to csv
# Keep only the original note columns, the four extracted sections and the detected tags.
notes_extracted=notes_extracted[["note_text","type","Subjective","Objective","Assessment","Plan","diags"]]
# No index=False is passed, so pandas' default of also writing the row index applies.
notes_extracted.to_csv("~/data/fake_notes_extracted.csv")