In [2]:
import microbex.microbex as me
import pandas as pd
import os

In [24]:
# Load the bundled example microbiology notes into a DataFrame.
sample_data = pd.read_csv(
    'microbex/sample_data/sample_data.csv'
)

In [25]:
## overwrite the note text at index label 4 with the example we want to audit
sample_data.loc[4, 'parsed_note'] = (
    'Aminoglycosides alone are not effective against enterococci. For this isolate'
)

# a me.Microbex object is instantiated using:

* dataframe with at least 3 cols: note_section, culture_id, and visit_id
* kwargs to specify the names of each of the required cols:
    * parsed_note or note_section:
        * microbiology report text, either raw or (preferably) chopped up into components (e.g. gram stain/growth report/ab susceptibility)
    * culture_id:
        * a primary key tied to a given sample/specimen + microbiological exam order.
        * Often a microbiology order can be tied to numerous components (e.g. gram stain/growth report/ab susceptibility). Additionally, these can be appended to the same report or added as a new report tied to the same sample + order. All reports tied to a given sample + order should share the same culture_id.
    * visit_id:
        * primary key for patient's visit/encounter
        * can be 1-many:1 to culture_id or 1:1 (in which case can specify as culture_id)
in some datasets a patient may have multiple cultures performed in a visit/encounter.

In [26]:
# Build the Microbex object from the sample notes, naming the columns that
# hold the note text, the culture key, and the visit key.
microbex_cols = {
    'text_col': 'parsed_note',
    'culture_id_col': 'culture_id',
    'visit_id_col': 'visit_id',
}
obj1 = me.Microbex(sample_data, **microbex_cols)


# a Microbex object can be annotated with the .annotate() method, which sets the .annotated_data and .annotate_params attributes

In [27]:
# Annotation settings for this run, gathered in one place so they are easy to tweak.
annotate_kwargs = {
    'staph_neg_correction': False,
    'specimen_col': None,
    'review_suggestions': False,
    'likelyneg_block_skip': False,
}
obj1.annotate(**annotate_kwargs)

step 0: simple cleaning of parsed_note + annotation of descriptive info for specimen and quantitative information
step1: parses blatent negative results and non-bacterial species.
     step1 runtime: 0.1212 seconds
step2: negative row species captures + sequestering the negative rows from those without any capture before downstream positive annotations.
n= 0 rows (0 unique cultures) added back from the neg list via virus/yeast + bacerial species exemption
     step2 runtime: 0.9196 seconds
step3: annotating unspecific positive rows.
     step3 runtime: 0.0569 seconds
step4: annotating species specific positives.
     step4 runtime: 0.3552 seconds
step5: annotating staph positives with (optional setting) handling for coagulase negative staph; setting = False
step6: annotating any previously positive row with unclear language.
     step6 runtime: 0.0824 seconds
step7: final datastructure management, mapping species captures to OHDSI ontology, and finalizing positive culture status annota

In [28]:
# Preview the first five annotated rows.
obj1.annotated_data.head(n=5)

Unnamed: 0,culture_id,visit_id,parsed_note,pos_culture_audit1,pos_culture_audit2,pos_culture_status,species_capt_all,OHDSI_ID,OHDSI_Concept,regex_source,...,species_regex,pos_quant_capt,pos_quant_regex,pos_qual_capt,pos_qual_regex,unclear_capt,unclear_regex,likelyneg_capt,likelyneg_regex,regex_capture_quant
0,0,0,Many species present,positive_unspecific1,positive,1,[positive_unspecific],[4001594],[Non-specific],positive_unspecific,...,[],[],[],"[many species present, species present]",[(multiple|many|numerous|\d\sor\smore|\d\+|\>\...,[],[],[],[],not_captured
1,1,1,alpha hemolytic streptococci in anaerobic bottle,species_positive19,positive,1,"[streptococci in, streptococci]","[4194525, 4240344]","[Streptococcus iniae, Streptococcus]",species_positive,...,"[streptococc[ius]{1,2}[ ]?[A-Za-z]*, (beta hem...",[],[],[],[],[],[],[],[],not_captured
2,1,1,coagulase negative Staphylococcus species in b...,pos_staph,positive,1,[staphylococcus coagulase negative],[4020318],"[Staphylococcus, coagulase negative]",species_specific_staph,...,[staphylococcus coagulase negative],[],[],[],[],[],[],[],[],not_captured
3,1,1,gram positive cocci in clusters in bottle with...,species_positive14,positive,1,[gram positive cocci],[4177803],[Gram positive coccobacillus],species_positive,...,"[gram positive cocc[ius]{1,2}]",[],[],[],[],[],[],[],[],not_captured
4,1,1,Aminoglycosides alone are not effective agains...,species_positive21,positive,1,[enterococci],[4174874],[Enterococcus canis],species_positive,...,"[enterococc[ius]{1,2}]",[],[],[],[],[],[],[],[],not_captured


In [29]:
# Inspect the annotate settings stored on the object; the output mirrors the
# keyword arguments passed to .annotate() above.
obj1.annotate_params

{'staph_neg_correction': False,
 'specimen_col': None,
 'review_suggestions': False,
 'likelyneg_block_skip': False}

# split into positive and negative bacterial culture status
* for each, look at the top x most frequent note texts
* if this isn't helpful or is not granular enough, can look at the top x ranks of:
    * result_binary: which will show the enumerated index num of the 
    * result_binary2

In [30]:
# Partition the annotated rows on the final pos_culture_status flag (1 vs. 0).
annotated = obj1.annotated_data
culture_status = annotated['pos_culture_status']
pos_df = annotated.loc[culture_status == 1]
neg_df = annotated.loc[culture_status == 0]

In [31]:
# Frequency of each distinct note text among the rows classified as positive.
pos_df['parsed_note'].value_counts()

gram negative bacilli in aerobic and anaerobic bottles                               2
gram positive cocci in clusters in bottle with antibiotic removal resin              2
Klebsiella pneumoniae in aerobic and anaerobic bottles                               2
Many species present                                                                 2
Klebsiella pneumoniae strain 2 in anaerobic bottle                                   1
Many Klebsiella pneumoniae                                                           1
gram negative bacilli in bottle with antibiotic removal resin                        1
Klebsiella pneumoniae in bottle with antibiotic removal resin                        1
alpha hemolytic streptococci in anaerobic bottle                                     1
gram negative bacilli in anaerobic bottle                                            1
coagulase negative Staphylococcus species strain 1 in bottle with antibiotic         1
coagulase negative Staphylococcus species i

In [32]:
# Frequency of each pos_culture_audit1 label among the rows classified as positive.
pos_df['pos_culture_audit1'].value_counts()

species_positive2       5
species_positive0       4
pos_staph               3
species_positive14      3
positive_unspecific1    2
species_positive19      2
species_positive21      1
species_positive6       1
species_positive1       1
Name: pos_culture_audit1, dtype: int64

In [33]:
# Pull up the audited note among the positives to see how it was classified.
audit_note = 'Aminoglycosides alone are not effective against enterococci. For this isolate'
pos_df.loc[pos_df['parsed_note'] == audit_note]

Unnamed: 0,culture_id,visit_id,parsed_note,pos_culture_audit1,pos_culture_audit2,pos_culture_status,species_capt_all,OHDSI_ID,OHDSI_Concept,regex_source,...,species_regex,pos_quant_capt,pos_quant_regex,pos_qual_capt,pos_qual_regex,unclear_capt,unclear_regex,likelyneg_capt,likelyneg_regex,regex_capture_quant
4,1,1,Aminoglycosides alone are not effective agains...,species_positive21,positive,1,[enterococci],[4174874],[Enterococcus canis],species_positive,...,"[enterococc[ius]{1,2}]",[],[],[],[],[],[],[],[],not_captured


### example:
* Above we see that "Aminoglycosides alone are not effective against enterococci. For this isolate..." was classified as positive.
    * This looks like a note section that provides context for a result rather than being a primary result, so it should be ignored.
    
### todo: add an exemption so that this and other similar cases are classified as negative from here on
* We have two choices for where to account for this. The first is negative_captures, which is the most far-reaching since it is applied first and dictates whether a case has a chance to be considered positive downstream. The other is "unclear", which is applied in the last step of the positive block and is used as a way to catch false positives.
* For this example, let's use "unclear".
* Please see supplimental_use_guide1.md for a detailed description of the classification logic flow for additional context.
* For this case, let's assume that "Aminoglycosides alone are not effective..." is the common string in these false positives.
    * To address this, we can make an enterococci capture with a positive lookbehind for the above phrase and add it to the unclear list.


### first, look up the regex list of interest:
* see supplimental_use_guide1.md for names and descriptions of all regex lists
* all regex lists are stored as class attributes, so a Microbex class must be instantiated first before adjusting the lists.

See below for an example of how to look at the lists and append a regex to a list.


* NOTE: this addition only appends the regex to the list for the current run; if the kernel is restarted, the addition needs to be done again.
* this same workflow can be performed to remove or modify a regex in the lists.

In [34]:
## negative regex list:
# Display the patterns currently held in obj1.negative_regex_list.

obj1.negative_regex_list

['negative for',
 'no\\sgrowth',
 'no acid fast bacilli',
 'acid fast bacilli negative',
 '(?<!\\bno\\b\\s)(?<!\\bnot\\b\\s)\\bnormal\\sflora\\b',
 'no\\s+(?!normal flora)([a-zA-Z]+\\s*){1,4}((\\bisolated\\b)|(\\bfound\\b)|(\\bgrow[nth]{1,2}\\b)|(\\bseen\\b)|(\\bpresent\\b)|(\\bdetected\\b)|(\\bgrown\\b)|(\\bseen\\b)|(\\bcultured\\b))',
 '(?<!\\bno\\b\\s)(?<!\\bnot\\b\\s)\\bnormal\\s(\\s?\\w{2,}\\s)flora',
 'no\\s(\\s?\\b\\w*[-()\\s]*\\b){0,6}\\s?isolated',
 'culture\\s(\\s?\\b\\w*[-()]*\\b){0,6}\\s?negative',
 'no\\sgrowth.*\\(detection\\slevel\\sof\\s\\d+,?\\d+\\s?colonies',
 '^negative$',
 'species\\snot\\sisolated',
 'mixed\\s\\w{0,}\\s?flora',
 '(?<!resistance)(?<!susceptibility)\\s+not\\sdetected|indicated',
 ':\\snegative$',
 'no\\s(predominant|prevelant|identifyable|isolated)\\s(organism|bacteria|colony|growth)',
 'parasite',
 '(?<!un)usual\\s(\\s?\\w{2,}\\s)flora',
 '^no normal flora\\s?((\\bisolated\\b)|(\\bfound\\b)|(\\bgrow[nth]{1,2}\\b)|(\\bseen\\b)|(\\bpresent\\b)|(\\bdet

In [35]:
# unclear regex list:
# Display the patterns currently held in obj1.unclear_regex_list (before any additions).
obj1.unclear_regex_list

['culture complete',
 '(?<!isolated)(?<!isolated )(?<!present)(?<!present )(?<!detected)(?<!detected )(unable to determine)(?!\\s?colony count)',
 'see (note|below|scanned|comment)',
 '(left|right) hand',
 'cannot be performed',
 'test not performed',
 "\\d\\+\\s?(wbc|rbc)[\\']?s\\sseen",
 '\\+\\sepithelial\\scells',
 'culture in progress',
 'neutrop',
 'contamin',
 "presence.{0,20}absence.{0,40}(cannot|can\\'?t)\\s?be\\s?(determined|detected)",
 'comments:\\s{0,5}validation studies at labcorp have demonstrated',
 'comments:\\s{0,4}this assay is specific for',
 '^comments:',
 'indeterminate',
 "cannot|can'?t be ruled out",
 'below the detection|lod|limit of detection',
 'no\\s(?=.{0,75},).{0,75},(?=.{0,75}or).{0,75}or(?=(.*?\\s.+?isolated)|(.*?\\s.+?detected)).*?\\.?']

In [36]:
### adding a regex to account for this type of language and avoid the false positive:
# Positive lookbehind: "enterococci" is only captured when it directly follows
# the advisory phrase, so other enterococci mentions are not matched by this pattern.
# NOTE(review): the pattern is lowercase while the raw note is capitalised; the
# captures shown by .annotate() are lowercase, so matching appears to run on
# lowercased text -- confirm against the library.
aminoglycoside_note_regex = r'(?<=aminoglycosides alone are not effective against )enterococci'

# Guard the append so re-running this cell does not add a duplicate entry to the list.
if aminoglycoside_note_regex not in obj1.unclear_regex_list:
    obj1.unclear_regex_list.append(aminoglycoside_note_regex)

In [37]:
### confirm the regex has indeed been added:
# .append() adds to the end, so the new pattern should be the last element.
obj1.unclear_regex_list

['culture complete',
 '(?<!isolated)(?<!isolated )(?<!present)(?<!present )(?<!detected)(?<!detected )(unable to determine)(?!\\s?colony count)',
 'see (note|below|scanned|comment)',
 '(left|right) hand',
 'cannot be performed',
 'test not performed',
 "\\d\\+\\s?(wbc|rbc)[\\']?s\\sseen",
 '\\+\\sepithelial\\scells',
 'culture in progress',
 'neutrop',
 'contamin',
 "presence.{0,20}absence.{0,40}(cannot|can\\'?t)\\s?be\\s?(determined|detected)",
 'comments:\\s{0,5}validation studies at labcorp have demonstrated',
 'comments:\\s{0,4}this assay is specific for',
 '^comments:',
 'indeterminate',
 "cannot|can'?t be ruled out",
 'below the detection|lod|limit of detection',
 'no\\s(?=.{0,75},).{0,75},(?=.{0,75}or).{0,75}or(?=(.*?\\s.+?isolated)|(.*?\\s.+?detected)).*?\\.?',
 '(?<=aminoglycosides alone are not effective against )enterococci']

In [38]:
### rerun annotate with the same settings so the updated unclear list is applied:
rerun_kwargs = dict(
    staph_neg_correction=False,
    specimen_col=None,
    review_suggestions=False,
    likelyneg_block_skip=False,
)
obj1.annotate(**rerun_kwargs)

step 0: simple cleaning of parsed_note + annotation of descriptive info for specimen and quantitative information
step1: parses blatent negative results and non-bacterial species.
     step1 runtime: 0.1070 seconds
step2: negative row species captures + sequestering the negative rows from those without any capture before downstream positive annotations.
n= 0 rows (0 unique cultures) added back from the neg list via virus/yeast + bacerial species exemption
     step2 runtime: 0.7161 seconds
step3: annotating unspecific positive rows.
     step3 runtime: 0.0380 seconds
step4: annotating species specific positives.
     step4 runtime: 0.2560 seconds
step5: annotating staph positives with (optional setting) handling for coagulase negative staph; setting = False
step6: annotating any previously positive row with unclear language.
     step6 runtime: 0.0699 seconds
step7: final datastructure management, mapping species captures to OHDSI ontology, and finalizing positive culture status annota

In [40]:
# confirm the classification changed and the correct negative classification is now reflected:
audit_note = 'Aminoglycosides alone are not effective against enterococci. For this isolate'
audit_cols = ['pos_culture_audit1','unclear_capt','OHDSI_Concept','species_capt','pos_culture_status']
is_audit_row = obj1.annotated_data['parsed_note'] == audit_note
display(obj1.annotated_data.loc[is_audit_row, audit_cols])

Unnamed: 0,pos_culture_audit1,unclear_capt,OHDSI_Concept,species_capt,pos_culture_status
4,unclear19,[enterococci],[Enterococcus canis],[enterococci],0
