In [1]:
import pandas as pd
import os
import numpy as np

import pathlib
import re
import sys, warnings
#import xlrd
from datetime import date

import rbmce
# from .rbmce import *
import time  


### running rbmce emits some unnecessary warnings when used in jupyter, so they are silenced here.
### NOTE: filterwarnings('ignore') with no further arguments suppresses ALL warnings for the rest of
### the session, not only those raised by rbmce.
import warnings
warnings.filterwarnings('ignore')

### rbmce column requirements reminder:
* parsed_note or note_section:
    * microbiology report text, either raw or (preferably) chopped up into components (eg gram stain/growth report/ab susceptibility)
* culture_id:
    * a primary key tied to a given sample/specimen + microbiological exam order.
    * Often a microbiology order can be tied to numerous components (eg gram stain/growth report/ab susceptibility). Additionally, these can be appended to the same report or added as a new report tied to the same sample + order. All of the components tied to a sample + order should share the same culture_id.
* visit_id:
    * primary key for patient's visit/encounter
    * can be 1-many:1 to culture_id or 1:1 (in which case can specify as culture_id)
in some datasets a patient may have multiple cultures performed in a visit/encounter.

# importing in a short dataframe to serve as an example for auditing
* this test_df is an example input for RBMCE


In [5]:
# Load the example RBMCE input; the path is resolved relative to the current working directory.
audit_example_path = os.path.join(os.getcwd(), 'rbmce/use_guides', 'audit_example.csv')
test_df = pd.read_csv(audit_example_path)

# running rbmce on test_df:

In [7]:
# Run rbmce on the three input columns (culture_id, visit_id, parsed_note).
# NOTE: the result is assigned back to test_df, so from this cell onward test_df holds the
# rbmce output rather than the raw csv contents.
test_df=rbmce.run(test_df[['culture_id','visit_id','parsed_note']])

step0.1
step0.2
step0.3
step0.4
step1
negative_classifier: 0.1473 seconds
step2
virus, neg, yeast, etc...: 0.0130 seconds
step2.1
n= 0 rows (0 unique cultures) added back from the neg list via virus/yeast + bacerial species exemption
negative species capture...: 0.8238 seconds
step3
unspecific positive: 0.0449 seconds
step4
species specific captures: 0.2955 seconds
step5
staph classifier: 0.0643 seconds
step6
unclear and likely negative: 0.0894 seconds


# split into pos and neg bacteria positive status
* for each look at the top x most frequent note texts
* if this isn't helpful or not granular enough, can look at top x rank of:
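    * result_binary: which will show the classification category together with the enumerated index number of the regex that matched within that category's regex list (e.g. `species_positive70`, `unclear20`)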
    * result_binary2

In [8]:
# Split the rbmce output on result_num: rows equal to 1 go to pos_df, rows equal to 0 go to neg_df.
# Rows with any other result_num value end up in neither frame.
pos_df = test_df.loc[test_df['result_num'].eq(1)]
neg_df = test_df.loc[test_df['result_num'].eq(0)]

In [9]:
# Check through the most frequent positive note texts and ensure there are no glaring misclassifications.
# If a misclassification is spotted, filter pos_df on that note text to inspect which captures fired.
pos_df['parsed_note'].value_counts()

gram positive cocci in clusters in bottle with antibiotic removal resin          3
Many species present                                                             3
gram negative bacilli in aerobic and anaerobic bottles                           3
Many Pseudomonas aeruginosa strain 1                                             2
Many Pseudomonas aeruginosa strain 2                                             2
Klebsiella pneumoniae in bottle with antibiotic removal resin                    2
gram negative bacilli in anaerobic bottle                                        2
Moderate Pseudomonas aeruginosa                                                  2
Many Pseudomonas aeruginosa                                                      2
Mixed gram negative bacilli                                                      2
Klebsiella oxytoca in aerobic and anaerobic bottles                              2
Klebsiella pneumoniae in aerobic and anaerobic bottles                           2
> 10

In [6]:
# Tally the positive rows by their result_binary label.
pos_df['result_binary'].value_counts()

species_positive1       12
species_positive2       10
species_positive0        8
species_positive70       7
pos_staph                6
species_positive14       6
species_positive93       3
species_positive19       3
positive_unspecific1     3
species_positive21       3
species_positive6        2
species_positive11       1
species_positive27       1
species_positive62       1
species_positive3        1
Name: result_binary, dtype: int64

In [7]:
# Tally the positive rows by their result_binary2 label.
pos_df['result_binary2'].value_counts()

positive    67
Name: result_binary2, dtype: int64

In [12]:
# Pull the positive row(s) whose parsed_note exactly equals this string, to see every capture column for it.
pos_df[pos_df['parsed_note']=='Aminoglycosides alone are not effective against enterococci. For this isolate']

Unnamed: 0,culture_id,likelyneg_capt,likelyneg_regex,negative_capt,negative_regex,parsed_note,pos_qual_capt,pos_qual_regex,pos_quant_capt,pos_quant_regex,...,unclear_regex,virus_capt,virus_regex,visit_id,yeast_capt,yeast_regex,species_capt_all,OHDSI_ID,OHDSI_Concept,flora_flag
46,16,[],[],[],[],Aminoglycosides alone are not effective agains...,[],[],[],[],...,[],[],[],16,[],[],[enterococci],[4174874],[Enterococcus canis],0


### example:
* above we see "Aminoglycosides alone are not effective against enterococci. For this isolate..." was classified as positive.
    * this looks like a note section that provides context for a result rather than being a primary result, and thus it should be ignored.
    
### todo: need to add exemption for this such that it and other similar cases will be classified as negative from here on
* we have two choices for where to account for this. The first is negative_captures, which is the most far-reaching since it is applied first and dictates whether a case has a chance to be considered positive downstream. The other option is "unclear", which is applied in the last step of the positive block and is used as a way to catch false positives.
* for this example, let's use the "unclear" list.
* please see supplimental_use_guide1.md for a detailed description of the classification logic flow for additional context.
* for this case, let's assume that "Aminoglycosides alone are not effective..." is the common string in these false positives.
    * to address this, we can make an enterococci capture with a positive lookbehind for the above phrase and add it to the unclear list.


### first, look up the regex list of interest:
* see supplimental_use_guide1.md for names and descriptions of all regex lists
* all regex lists are stored in rbmce.regex_blocks, see below for example of how to look at lists, and append a regex to the list. 
* NOTE: this addition only appends the regex to the list for the current run; if the kernel instance were to be reset, the addition would need to be done again.
* this same workflow can be performed to remove or modify a regex in the lists.

In [13]:
# negative regex list: display the current contents of rbmce.regex_blocks.negative_regex_list
rbmce.regex_blocks.negative_regex_list

['negative for',
 'no\\sgrowth',
 'no acid fast bacilli',
 'acid fast bacilli negative',
 '(?<!\\bno\\b\\s)(?<!\\bnot\\b\\s)\\bnormal\\sflora\\b',
 'no\\s+(?!normal flora)([a-zA-Z]+\\s*){1,4}((\\bisolated\\b)|(\\bfound\\b)|(\\bgrow[nth]{1,2}\\b)|(\\bseen\\b)|(\\bpresent\\b)|(\\bdetected\\b)|(\\bgrown\\b)|(\\bseen\\b)|(\\bcultured\\b))',
 '(?<!\\bno\\b\\s)(?<!\\bnot\\b\\s)\\bnormal\\s(\\s?\\w{2,}\\s)flora',
 'no\\s(\\s?\\b\\w*[-()\\s]*\\b){0,6}\\s?isolated',
 'culture\\s(\\s?\\b\\w*[-()]*\\b){0,6}\\s?negative',
 'no\\sgrowth.*\\(detection\\slevel\\sof\\s\\d+,?\\d+\\s?colonies',
 '^negative$',
 'species\\snot\\sisolated',
 'mixed\\s\\w{0,}\\s?flora',
 '(?<!resistance)(?<!susceptibility)\\s+not\\sdetected|indicated',
 ':\\snegative$',
 'no\\s(predominant|prevelant|identifyable|isolated)\\s(organism|bacteria|colony|growth)',
 'parasite',
 '(?<!un)usual\\s(\\s?\\w{2,}\\s)flora',
 '^no normal flora\\s?((\\bisolated\\b)|(\\bfound\\b)|(\\bgrow[nth]{1,2}\\b)|(\\bseen\\b)|(\\bpresent\\b)|(\\bdet

In [10]:
# unclear regex list: display the current contents of rbmce.regex_blocks.unclear_regex_list
# (this is the list the new regex will be appended to)
rbmce.regex_blocks.unclear_regex_list

['culture complete',
 '(?<!isolated)(?<!isolated )(?<!present)(?<!present )(?<!detected)(?<!detected )(unable to determine)(?!\\s?colony count)',
 'see (note|below|scanned|comment)',
 '(left|right) hand',
 'cannot be performed',
 'test not performed',
 "\\d\\+\\s?(wbc|rbc)[\\']?s\\sseen",
 '\\+\\sepithelial\\scells',
 'culture in progress',
 'neutrop',
 'contamin',
 "presence.{0,20}absence.{0,40}(cannot|can\\'?t)\\s?be\\s?(determined|detected)",
 'comments:\\s{0,5}validation studies at labcorp have demonstrated',
 'comments:\\s{0,4}this assay is specific for',
 '^comments:',
 'indeterminate',
 "cannot|can'?t be ruled out",
 'below the detection|lod|limit of detection',
 'no\\s(?=.{0,75},).{0,75},(?=.{0,75}or).{0,75}or(?=(.*?\\s.+?isolated)|(.*?\\s.+?detected)).*?\\.?']

In [23]:
# Capture "enterococci" only when it directly follows the aminoglycoside boilerplate phrase
# (positive lookbehind), so that this context note is caught by the "unclear" list.
aminoglycoside_context_regex = r'(?<=aminoglycosides alone are not effective against )enterococci'

# Guard the append so that re-running this cell does not add the same pattern again:
# the list lives on the imported module and persists for the life of the kernel.
# NOTE(review): the recorded list output below also contains a negative-lookbehind `(?<!...)`
# variant that this cell does not add; it appears to be left over from an earlier execution
# in the same kernel and would not be present after a restart - confirm before relying on it.
if aminoglycoside_context_regex not in rbmce.regex_blocks.unclear_regex_list:
    rbmce.regex_blocks.unclear_regex_list.append(aminoglycoside_context_regex)

In [24]:
# Re-display the unclear list to confirm the new pattern is now its last entry.
rbmce.regex_blocks.unclear_regex_list

['culture complete',
 '(?<!isolated)(?<!isolated )(?<!present)(?<!present )(?<!detected)(?<!detected )(unable to determine)(?!\\s?colony count)',
 'see (note|below|scanned|comment)',
 '(left|right) hand',
 'cannot be performed',
 'test not performed',
 "\\d\\+\\s?(wbc|rbc)[\\']?s\\sseen",
 '\\+\\sepithelial\\scells',
 'culture in progress',
 'neutrop',
 'contamin',
 "presence.{0,20}absence.{0,40}(cannot|can\\'?t)\\s?be\\s?(determined|detected)",
 'comments:\\s{0,5}validation studies at labcorp have demonstrated',
 'comments:\\s{0,4}this assay is specific for',
 '^comments:',
 'indeterminate',
 "cannot|can'?t be ruled out",
 'below the detection|lod|limit of detection',
 'no\\s(?=.{0,75},).{0,75},(?=.{0,75}or).{0,75}or(?=(.*?\\s.+?isolated)|(.*?\\s.+?detected)).*?\\.?',
 '(?<!aminoglycosides alone are not effective against )enterococci',
 '(?<=aminoglycosides alone are not effective against )enterococci']

In [25]:
### running rbmce emits some unnecessary warnings when used in jupyter, so they are silenced here.
### NOTE: this repeats the import and filter already applied in the first cell; on a top-to-bottom
### run it has no additional effect. filterwarnings('ignore') suppresses ALL warnings, not only rbmce's.
import warnings
warnings.filterwarnings('ignore')

In [26]:
### Re-run rbmce so that the pattern appended to unclear_regex_list is applied.
### test_df already holds the output of the first run, so only the three input columns are passed back in;
### the result goes to a new name (test_df_rerun) so that the first run's output is kept.
test_df_rerun=rbmce.run(test_df[['culture_id','visit_id','parsed_note']])

step0.1
step0.2
step0.3
step0.4
step1
negative_classifier: 0.1402 seconds
step2
virus, neg, yeast, etc...: 0.0108 seconds
step2.1
n= 0 rows (0 unique cultures) added back from the neg list via virus/yeast + bacerial species exemption
negative species capture...: 0.7291 seconds
step3
unspecific positive: 0.0408 seconds
step4
species specific captures: 0.2750 seconds
step5
staph classifier: 0.0651 seconds
step6
unclear and likely negative: 0.0845 seconds


In [28]:
# Verify the re-run: show the classification columns for the aminoglycoside context note,
# which should now carry an "unclear" label and result_num 0 instead of a positive result.
audited_note_text = 'Aminoglycosides alone are not effective against enterococci. For this isolate'
audit_columns = ['result_binary','unclear_capt','OHDSI_Concept','species_capt','result_num']

is_audited_note = test_df_rerun['parsed_note'] == audited_note_text
display(test_df_rerun.loc[is_audited_note, audit_columns])

Unnamed: 0,result_binary,unclear_capt,OHDSI_Concept,species_capt,result_num
46,unclear20,[enterococci],[Enterococcus canis],[enterococci],0
