# NLP System

In [1]:
# Main NLP Class

from pipeUtils import Document
from pipeUtils import Annotation
import re

class PadClassificationSystem:
    """Rule-based tagger for peripheral arterial disease (PAD) mentions.

    Target mentions are found with case-insensitive regular expressions. A
    mention is flagged as negated when any negation pattern occurs in the
    characters immediately preceding it.
    """

    def __init__(self, negation_window=30):
        """Compile the target and negation rules.

        Args:
            negation_window: number of characters before a mention that are
                searched for a negation cue (default 30).
        """
        self.negation_window = negation_window
        self.target_rules = self.getTargetRegexes()
        self.negation_rules = self.getNegRegexes()

    def process(self, document):
        """Annotate every PAD mention found in ``document.text``.

        One ``Annotation`` of type ``'pad_annotation'`` is appended to
        ``document.annotations`` per regex match. Annotation ids follow the
        pattern ``NLP_<document_id>_<n>``, with ``n`` counting matches in the
        order they are produced (rule by rule, then left to right).

        Args:
            document: object exposing ``document_id``, ``text`` and an
                ``annotations`` list.

        Returns:
            The same ``document`` object, with the new annotations appended.
        """
        document_id = document.document_id
        text = document.text
        ann_index = 0
        for reg in self.target_rules:
            for match in reg.finditer(text):
                ann_id = 'NLP_' + str(document_id) + '_' + str(ann_index)
                ann_index += 1
                new_annotation = Annotation(start_index=int(match.start()),
                                            end_index=int(match.end()),
                                            type='pad_annotation',
                                            ann_id=ann_id)
                new_annotation.spanned_text = text[new_annotation.start_index:new_annotation.end_index]

                # Negation context: up to `negation_window` characters right
                # before the mention, clamped at the start of the text. It is
                # taken from the document being processed, never from a
                # notebook-level variable.
                pre_text_start = max(0, new_annotation.start_index - self.negation_window)
                pre_text = text[pre_text_start:new_annotation.start_index]

                # Only the presence of a cue matters, not its position, so
                # stop at the first rule that hits.
                if any(neg_regex.search(pre_text) for neg_regex in self.negation_rules):
                    new_annotation.attributes["Negation"] = "Negated"

                document.annotations.append(new_annotation)

        return document

    def getTargetRegexes(self):
        """Return the compiled, case-insensitive patterns for PAD mentions."""
        regexes = [
            r'(peripheral\s*(arter\w*|vasc\w*)\s*disease)',
            r'\bpvd\b',
            r'femoral.{1,50}occlusion'
        ]
        return [re.compile(reg, re.IGNORECASE) for reg in regexes]

    def getNegRegexes(self):
        """Return the compiled, case-insensitive patterns for negation cues."""
        regexes = [
            r'\bno\b',
            r'no\s*evidence\s*of',
            r'does\s*not\s*have',
            r'denies'
        ]
        return [re.compile(reg, re.IGNORECASE) for reg in regexes]

In [2]:
%%time
# Smoke test: run the system on a small hand-written note that mixes affirmed
# and negated PAD mentions, then print the resulting annotations.
nlp_system = PadClassificationSystem()
# The exact characters of this literal (including the trailing spaces and the
# embedded \n escapes) determine the character offsets of the annotations.
doc_text = '''
Patient has peripheral artery disease. ---------- \nPatient also has PVD or peripheral vascular\ndisease or pvd . 
\n The patient does not have any peripheral artery disease 
but has peripheral arterial disease . The patient denies having peripheral vascular disease . \n 
The patient has a femoral and illiac occlusion which is suggestive of peripheral arterial disease.
'''
doc=Document(text=doc_text, document_id='Doc1')
 
# process() returns the document it was given, with annotations appended.
out_doc=nlp_system.process(doc)
print(out_doc.toString())

Doc1
-------

Patient has peripheral artery disease. ---------- 
Patient also has PVD or peripheral vascular
disease or pvd . 

 The patient does not have any peripheral artery disease 
but has peripheral arterial disease . The patient denies having peripheral vascular disease . 
 
The patient has a femoral and illiac occlusion which is suggestive of peripheral arterial disease.

-------
NLP_Doc1_0 pad_annotation 13 38 peripheral artery disease 
NLP_Doc1_1 pad_annotation 76 103 peripheral vascular
disease 
NLP_Doc1_2 pad_annotation 146 171 peripheral artery disease [Negation:Negated]
NLP_Doc1_3 pad_annotation 181 208 peripheral arterial disease 
NLP_Doc1_4 pad_annotation 237 264 peripheral vascular disease [Negation:Negated]
NLP_Doc1_5 pad_annotation 340 367 peripheral arterial disease 
NLP_Doc1_6 pad_annotation 69 72 PVD 
NLP_Doc1_7 pad_annotation 107 110 pvd 
NLP_Doc1_8 pad_annotation 288 316 femoral and illiac occlusion 

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wal

## Validation

In [5]:
from pipeUtils import Annotation
from pipeUtils import Document
 
import os
import glob 

In [6]:
# Locations of the BRAT annotation projects used for validation.
# NOTE(review): path_1 is rooted in a different user's home directory than
# path_2 -- confirm this is intentional.
unid = "u6008208"
project_1 = "Project_abi_v1"
project_2 = "PAD_ann_test"
path_1 = "/".join(["/home", "u1166466", "BRAT", unid, project_1])
path_2 = "/".join(["/home", unid, "BRAT", unid, project_2])

In [7]:
# Load every .txt note under path_1 together with its BRAT .ann file,
# keyed by the note's file name.
test_docs = {}
test_doc_paths = glob.glob(path_1 + '/*.txt')
for d in test_doc_paths:
    doc = Document()
    doc.load_document_from_file(d)
    # Swap the trailing 'txt' for 'ann' to get the sibling annotation file.
    doc.load_annotations_from_brat(d[:-3] + 'ann')
    test_docs[os.path.basename(d)] = doc


test_docs

{'_0.txt': <pipeUtils.Document at 0x7fe7c85df7f0>,
 '_1.txt': <pipeUtils.Document at 0x7fe7c8577668>,
 '_10.txt': <pipeUtils.Document at 0x7fe7c8572630>,
 '_11.txt': <pipeUtils.Document at 0x7fe7c8572828>,
 '_12.txt': <pipeUtils.Document at 0x7fe7c857f828>,
 '_13.txt': <pipeUtils.Document at 0x7fe7c8577e48>,
 '_14.txt': <pipeUtils.Document at 0x7fe7c85b8e80>,
 '_15.txt': <pipeUtils.Document at 0x7fe7c857b358>,
 '_16.txt': <pipeUtils.Document at 0x7fe7c85e5198>,
 '_17.txt': <pipeUtils.Document at 0x7fe7c85c9898>,
 '_18.txt': <pipeUtils.Document at 0x7fe7c85e54a8>,
 '_19.txt': <pipeUtils.Document at 0x7fe7c85c9780>,
 '_2.txt': <pipeUtils.Document at 0x7fe7c85df828>,
 '_20.txt': <pipeUtils.Document at 0x7fe7c85d9a58>,
 '_21.txt': <pipeUtils.Document at 0x7fe7c856e828>,
 '_22.txt': <pipeUtils.Document at 0x7fe7c857b8d0>,
 '_23.txt': <pipeUtils.Document at 0x7fe7c856ecf8>,
 '_24.txt': <pipeUtils.Document at 0x7fe7c8572ef0>,
 '_25.txt': <pipeUtils.Document at 0x7fe7c85b8c18>,
 '_26.txt': <pi

In [8]:
# Run the rule-based system over every loaded validation note.
# process() appends to each document's annotation list, so re-running this
# cell without reloading test_docs adds the same annotations again.
nlp_system = PadClassificationSystem()

for doc_id, test_doc in test_docs.items():
    nlp_system.process(test_doc)

In [9]:
# Span-level evaluation: compare 'PAD' annotations against the system's
# 'pad_annotation' output, accumulating counts and examples over all notes.
tp_total, fp_total, fn_total = 0, 0, 0
tp_list_total, fp_list_total, fn_list_total = [], [], []

for doc_id, test_doc in test_docs.items():
    tp, fp, fn, tp_list, fp_list, fn_list = test_doc.compare_types_by_span('PAD', 'pad_annotation', False)
    tp_total += tp
    fp_total += fp
    fn_total += fn
    tp_list_total += tp_list
    fp_list_total += fp_list
    fn_list_total += fn_list

print('TP =',tp_total, 'FP =',fp_total, 'FN =',fn_total)

# Precision and recall are reported only when there is at least one true positive.
if tp_total > 0:
    precision = tp_total / (tp_total + fp_total)
    print('Precision=',round(precision,3))
    recall = tp_total / (tp_total + fn_total)
    print('Recall=',round(recall,3))

# Matched pairs first, then the false positives, then the false negatives.
for a in tp_list_total:
    print(a[0].toString(),'||', a[1].toString())
for a in fp_list_total + fn_list_total:
    print(a.toString())

TP = 0 FP = 38 FN = 0
NLP__51.txt_0 pad_annotation 424 427 pvd 
NLP__38.txt_0 pad_annotation 414 417 PVD 
NLP__54.txt_0 pad_annotation 426 429 pvd 
NLP__25.txt_0 pad_annotation 457 460 PVD 
NLP__36.txt_0 pad_annotation 431 434 PVD 
NLP__2.txt_0 pad_annotation 19647 19674 Peripheral Vascular Disease 
NLP__2.txt_1 pad_annotation 427 430 pvd 
NLP__2.txt_2 pad_annotation 1919 1922 PVD 
NLP__2.txt_3 pad_annotation 3980 3983 PVD 
NLP__2.txt_4 pad_annotation 17927 17930 PVD 
NLP__44.txt_0 pad_annotation 455 458 pvd 
NLP__4.txt_0 pad_annotation 241 268 PERIPHERAL VASCULAR DISEASE 
NLP__49.txt_0 pad_annotation 215 218 PVD 
NLP__49.txt_1 pad_annotation 545 548 PVD 
NLP__30.txt_0 pad_annotation 269 272 PVD 
NLP__30.txt_1 pad_annotation 467 470 PVD 
NLP__5.txt_0 pad_annotation 761 788 peripheral vascular disease 
NLP__5.txt_1 pad_annotation 357 360 PVD 
NLP__55.txt_0 pad_annotation 251 278 PERIPHERAL VASCULAR DISEASE 
NLP__55.txt_1 pad_annotation 822 849 peripheral vascular disease 
NLP__55.txt_2 

In [None]:
# Span + attribute evaluation: a match additionally requires the listed
# attribute pairs to agree between the two annotation types.
tp_total, fp_total, fn_total = 0, 0, 0
tp_list_total, fp_list_total, fn_list_total = [], [], []

# Each entry pairs one (type, attribute_name, attribute_value) tuple per
# annotation type:
# [(A1_type, A1_att_name, A1_att_value), (A2_type, A2_att_name, A2_att_value)]
attributes_to_compare = [
    [('PAD', 'Negation', 'Negated'), ('pad_annotation', 'Negation', 'Negated')],
]

for doc_id, test_doc in test_docs.items():
    tp, fp, fn, tp_list, fp_list, fn_list = test_doc.compare_types_by_span_and_attributes(
        'PAD', 'pad_annotation', attributes_to_compare, False)
    tp_total += tp
    fp_total += fp
    fn_total += fn
    tp_list_total += tp_list
    fp_list_total += fp_list
    fn_list_total += fn_list

print('TP =',tp_total, 'FP =',fp_total, 'FN =',fn_total)

# Precision and recall are reported only when there is at least one true positive.
if tp_total > 0:
    precision = tp_total / (tp_total + fp_total)
    print('Precision=',round(precision,3))
    recall = tp_total / (tp_total + fn_total)
    print('Recall=',round(recall,3))

# Only the errors are listed here; matched pairs are not printed.
for a in fp_list_total + fn_list_total:
    print(a.toString())

## System deployment

In [None]:
# imports
import pymysql
import pandas as pd
import getpass

In [None]:
# Open the MySQL connection. The password is prompted for interactively so it
# is never stored in the notebook. `password` / `database` are the current
# PyMySQL keyword names; `passwd` / `db` are deprecated aliases for them.
conn = pymysql.connect(host="mysql",
                       port=3306,
                       user="jovyan",
                       password=getpass.getpass("Enter MySQL passwd for jovyan"),
                       database='mimic2')
cursor = conn.cursor()

In [None]:
# Pull up to 100,000 notes (subject id, category, text) from noteevents.
# The query itself applies no PAD filter; PAD mentions are identified later
# by running the NLP system over the note text.
# NOTE(review): LIMIT without ORDER BY -- which rows come back is up to the
# database, so the result may differ between runs.
pad_data = pd.read_sql("""SELECT noteevents.subject_id, 
                      noteevents.category, 
                      noteevents.text FROM noteevents limit 100000 """,conn)

In [None]:
# Release the MySQL connection once the notes are loaded.
try:
    conn.close()
except pymysql.Error:
    # PyMySQL raises Error when close() is called on a connection that is
    # already closed (e.g. when this cell is run twice). Any other failure,
    # such as `conn` never having been created, is left to surface.
    print("Connection is already closed!")

In [None]:
# Quick sanity check of what came back from the database:
# column names, row count, and the first record.
print(pad_data.columns)
print("Number of records = ", pad_data.shape[0])

pad_data.head(1)

In [None]:
# Separate system instance used for the deployment run over the database notes.
final_nlp_system = PadClassificationSystem()

In [None]:
# Run the final system over a random sample of notes and flatten every PAD
# mention into one row of `output`:
# [record_id, subject_id, note_id, annotation_type, start, end, snippet, neg_flag]
output = []
counter = 0
# DataFrame.sample raises when asked for more rows than exist, so cap the request.
sample_size = min(10000, len(pad_data))
for index, row in pad_data.sample(sample_size).iterrows():
    note_id = str(row.subject_id) + '_' + str(index)
    doc = Document(document_id=note_id, text=row.text)
    final_nlp_system.process(doc)
    i = 1  # 1-based position of the mention within its note
    for a in doc.annotations:
        if a.type != 'pad_annotation':
            continue
        # PadClassificationSystem records a negated mention as
        # attributes["Negation"] = "Negated"; test for exactly that.
        is_negated = 'Negation' in a.attributes and a.attributes['Negation'] == 'Negated'
        neg_flag = 1 if is_negated else 0
        ### Each row in the output table
        record_id = note_id + '_' + str(i)
        subject_id = row.subject_id
        annotation_type = a.type
        snippet = doc.text[int(a.start_index): int(a.end_index)]
        out_list = [record_id, subject_id, note_id, annotation_type,
                    a.start_index, a.end_index,
                    snippet, neg_flag]
        output.append(out_list)
        i += 1
        counter += 1
        # Print . after every 10 identified records
        if counter % 10 == 0:
            print('.', end='')
    

# Output to CSV file

In [None]:
# Column order matches the order of the values in each row of `output`.
columns = [
    'record_id',
    'subject_id',
    'note_id',
    'annotation_type',
    'span_start',
    'span_end',
    'PAD_snippet',
    'neg_flag',
]
result_data_frame = pd.DataFrame(output, columns=columns)

result_data_frame.describe()


In [None]:
# Display the assembled mention table (one row per PAD mention).
result_data_frame

In [None]:
# Write the mention table to disk. to_csv does not create missing parent
# directories, so make sure tmp/ exists first.
os.makedirs('tmp', exist_ok=True)
result_data_frame.to_csv('tmp/out_table.csv', index=False)
print('Done')

## This completes the development and deployment of the Classification system