# Detection Model and Visualization

In [46]:
# Install spacy: https://spacy.io/usage
# !pip install -U pip setuptools wheel
# !pip3 install spacy==3.4.4
# !pip3 install scispacy
# !pip3 install negspacy

In [44]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

import spacy
import scispacy
from spacy import displacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
from negspacy.negation import Negex
from negspacy.termsets import termset

In [23]:
# Download models
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bc5cdr_md-0.5.1.tar.gz
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz

## Detection Model

In [25]:
# Read the extracted notes, then discard the "Unnamed: 0" column
# (presumably an index saved by an earlier export -- confirm against the CSV).
notes = pd.read_csv("~/data/fake_notes_extracted.csv")
notes = notes.drop("Unnamed: 0", axis=1)
notes.head()

Unnamed: 0,note_text,type,Subjective,Objective,Assessment,Plan,diags
0,Assessments and Plans fbPzzrLIjfAlibfdEwvJjBR...,RADIATION ONCOLOGY,Subjective r gVdQxnQJuU.FVRAnInvIXIWCNxKdDwrjv...,Objective TdWaFoj.XRTuFYyrtDjaPERFcxoREKHCvkWc...,Assessments and Plans fbPzzrLIjfAlibfdEwvJjBRd...,Assessments and Plans fbPzzrLIjfAlibfdEwvJjBRd...,['#COVID']
1,Objective WFohNVDa.pvBysVyyfsHdL.GngexBPJbOx....,SPORTS MEDICINE,"Brief History of Present ,Illness: IUUJxTcbwcP...",Objective WFohNVDa.pvBysVyyfsHdL.GngexBPJbOx.x...,Assessment and Plan TTC.SfLbLe.gbhOXmSKdtyPrAX...,Assessment and Plan TTC.SfLbLe.gbhOXmSKdtyPrAX...,['#Asthma ']
2,"Record of Physical Exams ,and NIH Stroke Scal...",PEDIATRIC SURGERY,"Brief History of Present ,Illness: d bYZnYbzey...","Record of Physical Exams ,and NIH Stroke Scale...",Assessment and Plan FHbJDIJQLVKSPd.CzrKYqnTx.L...,Assessment and Plan FHbJDIJQLVKSPd.CzrKYqnTx.L...,['#Asthma ']
3,Subjective HRhDXUQEAc ?TMhbEVbOhf.eHoYqpXTRsm...,PEDIATRIC NEPHROLOGY,Subjective HRhDXUQEAc ?TMhbEVbOhf.eHoYqpXTRsmY...,Physical exam: Temp: rAkycKovwvvqKAeCQWoWZqV....,Assessments and Plans ?Jq DzwdYOmvAxGJvBCVBmZq...,Assessments and Plans ?Jq DzwdYOmvAxGJvBCVBmZq...,['#COVID']
4,Assessments and Plans Dgfu #Asthma | Physical...,OTOLARYNGOLOGY,"Brief History of Present ,Illness: J",Physical exam: Temp: pEqF.BsCJxctymFsDbGoLciT...,Assessments and Plans Dgfu #Asthma |,Assessments and Plans Dgfu #Asthma |,['#Asthma ']


In [45]:
# load models
# nlp0 (en_core_sci_sm) is the pipeline handed to lemmatize() by the visualization helpers
nlp0 = spacy.load("en_core_sci_sm")
# nlp1 (en_ner_bc5cdr_md) is the NER pipeline that is extended with SYMPTOM patterns and negation below
nlp1 = spacy.load("en_ner_bc5cdr_md")

In [47]:
# Lemmatize a note so that every form of a negation cue (e.g., deny: denies, denying)
# is reduced to one form
def lemmatize(note, nlp):
    """Return `note` rewritten as a string of space-separated lemmas.

    Args:
        note: Text of the note.
        nlp: Callable that turns the text into an iterable of tokens exposing
            a ``lemma_`` attribute (a loaded spaCy pipeline in this notebook).

    Returns:
        str: The lemmas joined by single spaces, with every run of two spaces
        replaced by ". " to mark the end of a sentence.
    """
    lemmas = []
    for token in nlp(note):
        lemmas.append(token.lemma_)
    spaced = " ".join(lemmas)
    # a double space in the joined text is turned into a sentence boundary
    return spaced.replace("  ", ". ")

# Build the options passed to the displacy NER visualization
def get_entity_options():
    """Return the displacy options dict for entity rendering.

    Returns:
        dict: ``{"ents": [...], "colors": {...}}`` where "ents" lists the
        labels to display (DISEASE, SYMPTOM, NEG_ENTITY) and "colors" maps
        each of those labels to a CSS gradient.
    """
    # one gradient per label; the key order also defines the "ents" list
    gradient_by_label = {
        "DISEASE": 'linear-gradient(180deg, #66ffcc, #abf763)',
        "SYMPTOM": 'linear-gradient(90deg, #aa9cfc, #fc9ce7)',
        "NEG_ENTITY": 'linear-gradient(90deg, #ffff66, #ff6600)',
    }
    return {"ents": list(gradient_by_label), "colors": gradient_by_label}

def add_label_pattern(nlp, label, patterns):
    """Register `label` with the NER component and add patterns for it.

    Args:
        nlp: Pipeline that has an "ner" component. An "entity_ruler"
            component is reused when present and added otherwise.
        label: Entity label to add to the NER component and to attach to
            every pattern.
        patterns: Iterable of patterns; each becomes one
            ``{"label": label, "pattern": p}`` entry on the entity ruler.

    Returns:
        None. `nlp` is modified in place.
    """
    # make the new label known to the NER component
    nlp.get_pipe("ner").add_label(label)

    # reuse the pipeline's entity ruler if there is one, otherwise create it
    if "entity_ruler" in nlp.pipe_names:
        ruler = nlp.get_pipe("entity_ruler")
    else:
        ruler = nlp.add_pipe("entity_ruler")

    # hand all entries to the ruler in a single call instead of one call per pattern
    entries = [{"label": label, "pattern": p} for p in patterns]
    if entries:
        ruler.add_patterns(entries)

# Extend the pipeline with the components needed to identify negation
def neg_model(nlp):
    """Add "sentencizer" and "negex" components to `nlp` when they are missing.

    The negex component is configured with the "en_clinical" termset extended
    by a few custom preceding/following negation cues.

    Args:
        nlp: Pipeline to extend; modified in place.

    Returns:
        The same `nlp` object that was passed in.
    """
    if "sentencizer" not in nlp.pipe_names:
        nlp.add_pipe('sentencizer')

    # custom negation cues added on top of the clinical termset
    extra_cues = {
        "preceding_negations": ["unable", "w/o", "no"],
        "following_negations": ["was negative"],
    }
    clinical_terms = termset("en_clinical")
    clinical_terms.add_patterns(extra_cues)

    if "negex" in nlp.pipe_names:
        return nlp
    nlp.add_pipe("negex", config={"neg_termset": clinical_terms.get_patterns()})
    return nlp

# """
# Negspacy sets a new attribute e._.negex to True if a negative concept is encountered
# """
def negation_handling(nlp, note):
    """Collect the text of every entity that is flagged as negated in `note`.

    The note is split on "." and each non-empty piece is run through `nlp`
    separately; an entity is kept when its ``_.negex`` extension is truthy.

    Args:
        nlp: Callable returning an object with an ``ents`` iterable whose
            items expose ``text`` and ``_.negex`` (a spaCy pipeline with the
            negex component in this notebook).
        note: Text of the note, with "." as the sentence delimiter.

    Returns:
        list[str]: Negated entity texts in order of appearance (duplicates kept).
    """
    results = []
    # sentence tokenizing based on the "." delimiter
    # NOTE: this also splits at any other "." in the text, e.g. inside "36.8"
    for sentence in note.split("."):
        # remove extra spaces at the beginning and end of the sentence
        sentence = sentence.strip()
        if not sentence:
            # nothing to analyse; avoids a pipeline call on an empty string
            continue
        for ent in nlp(sentence).ents:
            if ent._.negex:
                results.append(ent.text)
    return results

# Build a matcher that finds the negated phrases in a clinical note
def match(nlp, terms, label):
    """Create a PhraseMatcher that matches every phrase in `terms` under `label`.

    Args:
        nlp: spaCy pipeline; its tokenizer (``make_doc``) and vocab are used.
        terms: Iterable of phrase strings to match.
        label: Match key under which all phrases are registered.

    Returns:
        PhraseMatcher: Matcher holding one pattern per term.
    """
    patterns = [nlp.make_doc(text) for text in terms]
    matcher = PhraseMatcher(nlp.vocab)
    # spaCy v3 signature: the patterns are passed as one list
    # (the v2 form was matcher.add(label, None, *patterns))
    matcher.add(label, patterns)
    return matcher
# Replace the labels of the entities that were identified as negated
def overwrite_ent_lbl(matcher, doc):
    """Relabel matched spans in `doc`, dropping the entities they overlap.

    Each match becomes a new Span labelled with the match id. Existing
    entities that overlap an accepted match are removed. A match is skipped
    when any of its tokens already belongs to a previously accepted match.

    Args:
        matcher: Callable returning ``(match_id, start, end)`` triples for `doc`.
        doc: Document whose ``ents`` are rewritten in place.

    Returns:
        The same `doc`, with ``doc.ents`` set to the surviving original
        entities followed by the new spans.
    """
    claimed_tokens = set()
    new_entities = []
    entities = doc.ents
    for match_id, start, end in matcher(doc):
        # Check every token of the span, not just its first and last one: a
        # match that strictly contains an accepted match would otherwise be
        # added too, and assigning overlapping spans to doc.ents fails.
        if not claimed_tokens.isdisjoint(range(start, end)):
            continue
        new_entities.append(Span(doc, start, end, label=match_id))
        # keep only the entities that lie entirely before or after the match
        entities = [e for e in entities if e.end <= start or e.start >= end]
        claimed_tokens.update(range(start, end))
    doc.ents = tuple(entities) + tuple(new_entities)
    return doc
def note_visualization(note):
    """Render the named entities of one note and return them as a table.

    The note is lemmatized with `nlp0`, processed by `nlp1` (BC5CDR model),
    rendered with displacy, and then passed through `nlp1` once more before
    the entities are tabulated.

    Args:
        note: Text of the clinical note.

    Returns:
        pandas.DataFrame: One row per entity with columns 'Entity' (label)
        and 'Identified' (entity text).
    """
    lemmatized = lemmatize(note, nlp0)
    # doc object created with the BC5CDR model
    parsed = nlp1(lemmatized)
    # visualize the identified named entities in the input text
    displacy.render(parsed, style='ent', options=get_entity_options())
    # output dataframe
    reparsed = nlp1(parsed)
    rows = []
    for ent in reparsed.ents:
        rows.append((ent.label_, ent.text))
    return pd.DataFrame(rows, columns=['Entity', 'Identified'])

def negation_visualization(note, visualize=True):
    """Detect entities in one note, relabel the negated ones, and tabulate them.

    The note is lemmatized with `nlp0` and processed by `nlp1`. Entity texts
    reported as negated by negation_handling are matched in the document and
    relabelled "NEG_ENTITY". The document is then passed through `nlp1` once
    more before the entities are tabulated.

    Args:
        note: Text of the clinical note.
        visualize: When true, render the relabelled document with displacy.

    Returns:
        pandas.DataFrame: One row per entity with columns 'Entity' (label)
        and 'Identified' (entity text).
    """
    lemmatized = lemmatize(note, nlp0)
    parsed = nlp1(lemmatized)
    # negative concepts in the note, as identified by negspacy
    negated_terms = negation_handling(nlp1, lemmatized)
    neg_matcher = match(nlp1, negated_terms, "NEG_ENTITY")
    # same document, now carrying the "NEG_ENTITY" label on matched spans
    relabeled = overwrite_ent_lbl(neg_matcher, parsed)
    if visualize:
        # visualize the identified named entities in the input text
        displacy.render(relabeled, style='ent', options=get_entity_options())
    # output dataframe
    reparsed = nlp1(relabeled)
    rows = []
    for ent in reparsed.ents:
        rows.append((ent.label_, ent.text))
    return pd.DataFrame(rows, columns=['Entity', 'Identified'])

In [48]:
# Phrases to add to the existing model as extra entity patterns.
# Manually add more to this list; keep one entry per line with a trailing
# comma, because adjacent string literals without a comma are silently
# concatenated into a single entry.
patterns = [
    "murmur",
    "rub",
    "rale",
    "gallop",
    "LAD",
    "JVD",
    "jvp",
    "lymphadenopathy",
    "focal deficit",
    "tachycardic",
    "congestion",
    "rhonchi",
    "soft",
    "poor tone",
    "reflexes abnormal ",
    "ck",
    "tsh",
    "tpn-2 wln",
    "intact downward gaze",
    "neck flexion",
    "lesion",
    "click",
]

# add label to model: register SYMPTOM and its phrase patterns on the NER pipeline
add_label_pattern(nlp1, "SYMPTOM", patterns)
# add the negation components; neg_model returns the pipeline it was given
nlp1 = neg_model(nlp1)

In [50]:
# Example: the Objective section of the note in row 0
note = notes.loc[0, 'Objective']
# Render its entities and keep the resulting entity table
vis = negation_visualization(note)

In [51]:
# Preview the first rows of the entity table built for the example note
vis.head()

Unnamed: 0,Entity,Identified
0,NEG_ENTITY,lesion
1,NEG_ENTITY,ptosis
2,NEG_ENTITY,edema
3,NEG_ENTITY,erythema
4,DISEASE,palate


In [53]:
# apply model to all notes in the data set
# Collect one entity table per note and concatenate once at the end; calling
# pd.concat inside the loop re-copies all accumulated rows on every iteration.
neg_frames = []
for i, note in enumerate(notes['Objective']):
    if (i+1) % 10 == 0:
        print(i+1, 'notes completed')
    neg_df = negation_visualization(note, False)
    # remember which note (0-based position in the column) the entities came from
    neg_df['note_num'] = i
    neg_frames.append(neg_df)
# pd.concat raises on an empty list, so fall back to an empty frame
new_df = pd.concat(neg_frames, axis=0) if neg_frames else pd.DataFrame()

10 notes completed
20 notes completed
30 notes completed
40 notes completed
50 notes completed
60 notes completed
70 notes completed
80 notes completed
90 notes completed
100 notes completed
110 notes completed
120 notes completed
130 notes completed
140 notes completed
150 notes completed
160 notes completed


In [57]:
# Drop rows whose identified text is one of the known unmeaningful patterns
is_unmeaningful = new_df['Identified'].isin(['avg:36.8', 'pain', 'MMM'])
new_df = new_df[~is_unmeaningful]