# Systems Tagging

The purpose of the code in this notebook is to perform systems tagging for the extracted entities (e.g. instead of "weakness" we will have "weakness (Motor)"). The result is an indicator table with "1" indicating that a symptom/disease is present in the note and "0" indicating its absence.

In [4]:
import pandas as pd
import numpy as np
import re
import spacy
import scispacy
from spacy import displacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

ModuleNotFoundError: No module named 'spacy'

In [3]:
# Read the notes CSV into a DataFrame and preview the first rows.
NOTES_PATH = "data/fake_notes_extracted_obj.csv"
notes = pd.read_csv(NOTES_PATH)
notes.head()

FileNotFoundError: [Errno 2] No such file or directory: 'data/fake_notes_extracted_obj.csv'

In [None]:
# Load the two spaCy pipelines used by negation_visualization:
# nlp0 is passed to lemmatize(), nlp1 produces the entities.
SCI_MODEL_NAME, NER_MODEL_NAME = "en_core_sci_sm", "en_ner_bc5cdr_md"
nlp0 = spacy.load(SCI_MODEL_NAME)
nlp1 = spacy.load(NER_MODEL_NAME)

In [None]:
def lemmatize(note, nlp):
    """Return ``note`` rewritten with every token replaced by its lemma.

    Lemmatizing lets later negation matching catch all inflections of a cue
    (e.g. deny: denies, denying).

    Args:
        note: Raw note text.
        nlp: Callable that turns text into an iterable of tokens exposing
            ``lemma_`` (a spaCy pipeline in this notebook).

    Returns:
        str: The lemmas joined by single spaces, with every remaining double
        space turned into ``". "`` to mark the end of a sentence.
    """
    lemmas = (token.lemma_ for token in nlp(note))
    return " ".join(lemmas).replace("  ", ". ")


def get_entity_options():
    """Build the ``options`` dict for the displaCy entity visualization.

    Returns:
        dict: ``{"ents": [...], "colors": {...}}`` listing the DISEASE,
        SYMPTOM and NEG_ENTITY labels and the gradient assigned to each.
    """
    # One gradient per entity label; the key order is also the label order.
    label_colors = {
        "DISEASE": "linear-gradient(180deg, #66ffcc, #abf763)",
        "SYMPTOM": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
        "NEG_ENTITY": "linear-gradient(90deg, #ffff66, #ff6600)",
    }
    return {"ents": list(label_colors), "colors": label_colors}

def add_label_pattern(nlp, label, patterns):
    """Register ``label`` with the NER pipe and add ruler patterns for it.

    Args:
        nlp: Pipeline that has an ``"ner"`` pipe. It is modified in place: an
            ``"entity_ruler"`` pipe is created if it does not exist yet.
        label: Entity label to assign to every pattern.
        patterns: Iterable of patterns (plain strings in this notebook).

    Returns:
        None.
    """
    # Make the NER component aware of the new label.
    nlp.get_pipe("ner").add_label(label)

    # Reuse the existing EntityRuler so repeated calls keep extending it.
    if "entity_ruler" in nlp.pipe_names:
        ruler = nlp.get_pipe("entity_ruler")
    else:
        ruler = nlp.add_pipe("entity_ruler")

    # Hand all patterns over in one call instead of one call per pattern.
    entries = [{"label": label, "pattern": p} for p in patterns]
    if entries:
        ruler.add_patterns(entries)


def neg_model(nlp):
    """Add a sentencizer and the negspacy ``"negex"`` pipe to ``nlp``.

    Each pipe is only added when it is not already in ``nlp.pipe_names``, so
    calling this function repeatedly on the same pipeline is safe.

    Args:
        nlp: spaCy pipeline to extend (modified in place).

    Returns:
        The same ``nlp`` object.
    """
    # Nothing else in the notebook imports negspacy, so bring it into scope
    # here: importing Negex registers the "negex" factory used by add_pipe,
    # and termset provides the clinical negation vocabulary.
    from negspacy.negation import Negex  # noqa: F401
    from negspacy.termsets import termset

    if "sentencizer" not in nlp.pipe_names:
        nlp.add_pipe("sentencizer")

    if "negex" not in nlp.pipe_names:
        ts = termset("en_clinical")
        # Extend the stock clinical term set with custom negation cues.
        ts.add_patterns(
            {
                "preceding_negations": ["unable", "w/o"],
                "following_negations": ["was negative"],
            }
        )
        nlp.add_pipe("negex", config={"neg_termset": ts.get_patterns()})
    return nlp


def negation_handling(nlp, note):
    """Collect the text of every entity in ``note`` that is flagged as negated.

    Negspacy sets the attribute ``e._.negex`` to True on an entity when a
    negated concept is encountered, so ``nlp`` must already contain that pipe.

    Args:
        nlp: Callable pipeline returning a doc whose ``ents`` expose
            ``text`` and ``_.negex``.
        note: Note text; it is split into sentences on ``"."``.

    Returns:
        list[str]: Entity texts with a truthy ``_.negex``, in order of
        appearance (duplicates are kept).
    """
    negated = []
    # Sentence-tokenize on the delimiter and process each sentence separately.
    for sentence in note.split("."):
        sentence = sentence.strip()
        if not sentence:
            # Blank fragments (e.g. after a trailing ".") contain no entities.
            continue
        doc = nlp(sentence)
        negated.extend(ent.text for ent in doc.ents if ent._.negex)
    return negated

def match(nlp, terms, label):
    """Build a PhraseMatcher that finds ``terms`` and reports them as ``label``.

    Used to locate the spans of the negated phrases in a clinical note.

    Args:
        nlp: spaCy pipeline; its tokenizer and vocab are used for the patterns.
        terms: Iterable of phrases to match.
        label: Match key under which all phrases are registered.

    Returns:
        PhraseMatcher: Matcher ready to be called on a doc.
    """
    matcher = PhraseMatcher(nlp.vocab)
    patterns = [nlp.make_doc(text) for text in terms]
    # spaCy v3 signature: the patterns are passed as one list.
    matcher.add(label, patterns)
    return matcher


def overwrite_ent_lbl(matcher, doc):
    """Relabel the spans found by ``matcher`` and store them in ``doc.ents``.

    Every accepted match becomes a new entity labelled with the match id, and
    any existing entity overlapping it is dropped.

    Args:
        matcher: Callable returning ``(match_id, start, end)`` tuples for ``doc``.
        doc: spaCy Doc whose entities are rewritten in place.

    Returns:
        The same ``doc`` with updated ``ents``.
    """
    claimed_tokens = set()
    new_entities = []
    entities = list(doc.ents)
    for match_id, start, end in matcher(doc):
        span_tokens = range(start, end)
        # Skip a match that touches any token already claimed by an earlier
        # match. Checking only the two end tokens would let a longer match
        # enclose a shorter one, producing overlapping entities.
        if not claimed_tokens.isdisjoint(span_tokens):
            continue
        new_entities.append(Span(doc, start, end, label=match_id))
        # Drop previously recognised entities that overlap the new span.
        entities = [e for e in entities if not (e.start < end and e.end > start)]
        claimed_tokens.update(span_tokens)
    doc.ents = tuple(entities) + tuple(new_entities)
    return doc

def _tag_entities_by_system(note, labelled_entities):
    """Pair each ``(label, text)`` entity with the system section(s) containing it."""
    # With one capture group re.split returns
    # [preamble, name1, body1, name2, body2, ...].
    pieces = re.split(r"([A-Za-z]+):", note)
    sections = list(zip(pieces[1::2], pieces[2::2]))

    rows = []
    for label, text in labelled_entities:
        systems = [name for name, body in sections if text in body]
        if not systems:
            # Only in the text before the first header, or not found verbatim:
            # keep the entity untagged so labels and texts stay aligned.
            rows.append((label, text))
            continue
        for name in systems:
            rows.append((label, text + " (" + name + ")"))
    return rows


def negation_visualization(note, visualize=True):
    """Extract entities from ``note``, mark negated ones and tag their system.

    The note is lemmatized with the module-level ``nlp0`` and processed with
    the module-level ``nlp1``. Entities reported by ``negation_handling`` are
    relabelled ``NEG_ENTITY``. Each entity is then tagged with the name of every
    ``"<System>:"`` section of the original note whose text contains it.

    ``negation_handling`` reads ``e._.negex``, so ``nlp1`` must already carry
    the negex pipe (see ``neg_model``) when this function is called.

    Args:
        note: Raw note text with ``"<System>:"`` section headers.
        visualize: When True, render the entities with displaCy.

    Returns:
        pandas.DataFrame: Columns ``Entity`` (entity label) and ``Identified``
        (entity text, followed by ``" (<System>)"`` when a section contains
        it). An entity found in several sections yields one row per section.
    """
    lem_clinical_note = lemmatize(note, nlp0)
    doc = nlp1(lem_clinical_note)

    # Negated concepts identified by negspacy, relabelled as NEG_ENTITY.
    negated_terms = negation_handling(nlp1, lem_clinical_note)
    matcher = match(nlp1, negated_terms, "NEG_ENTITY")
    doc0 = overwrite_ent_lbl(matcher, doc)

    if visualize:
        displacy.render(doc0, style='ent', options=get_entity_options())

    # Build label/text pairs together so every row keeps its own label.
    labelled = [(e.label_, e.text) for e in doc0.ents]
    rows = _tag_entities_by_system(note, labelled)
    return pd.DataFrame(rows, columns=['Entity', 'Identified'])

### Testing code on the first note

In [None]:
# Smoke-test the pipeline on the first note with rendering switched off.
# NOTE(review): negation_handling reads `e._.negex`, and the "negex" pipe is
# only added by neg_model(nlp1) in the "Add customized patterns" cell below.
# Confirm this cell is run after that one on a fresh kernel.
first_note = notes["Objective_const"][0]
negation_visualization(first_note, visualize=False)

### Add customized patterns

In [None]:
# Extra phrases to recognise as SYMPTOM entities on top of the loaded model.
# Extend this list manually as new terms come up.
# NOTE(review): these are presumably matched against the lemmatized note text,
# so casing may matter ("LAD"/"JVD" vs "jvp") — confirm against real notes.
patterns = [
    "murmur",
    "rub",
    "rale",
    "gallop",
    "LAD",
    "JVD",
    "jvp",
    "s3",
    "s4",
    "lymphadenopathy",
    "focal deficit",
    "tachycardic",
    "congestion",
    "rhonchi",
    "poor tone",
    "ck",
    "tsh",
    "tpn-2 wln",
    "intact downward gaze",
    "neck flexion",
    "lesion",
    "click",
    "carotid bruit",
    "varicosity",
    "nodule",
    "deformity",
    "eruption",
    "suicidal",
    "drainage",
    "crackle",
    "retraction",
    "distention",
    "distress",
]

# Register the SYMPTOM label and its patterns, then attach negation detection.
add_label_pattern(nlp1, "SYMPTOM", patterns)
nlp1 = neg_model(nlp1)

### Run function on all notes

In [None]:
# Run the pipeline on the first N_NOTES notes and stack the per-note results.
N_NOTES = 100
PROGRESS_EVERY = 25  # must be smaller than N_NOTES for any progress line to print

note_frames = []
for i, note in enumerate(notes['Objective_const'][:N_NOTES]):
    if (i + 1) % PROGRESS_EVERY == 0:
        print(i + 1, 'notes completed')
    neg_df = negation_visualization(note, False)
    neg_df['note_num'] = i
    note_frames.append(neg_df)

# Concatenate once (growing a frame inside the loop is quadratic) and keep the
# first occurrence of each tagged entity per note.
if note_frames:
    new_df = pd.concat(note_frames, axis=0).drop_duplicates(["Identified", "note_num"])
else:
    new_df = pd.DataFrame(columns=["Entity", "Identified", "note_num"])

### Create indicator table from extracted entities

In [None]:
# Polarity flag: 0 for negated entities (NEG_ENTITY), 1 for everything else.
new_df['positive'] = (new_df['Entity'] != 'NEG_ENTITY').astype('int64')

# One row per note and one column per tagged entity; entities that a note
# does not contain are filled with 0.
result_df = new_df.pivot_table(
    index='note_num', columns=["Identified"], values='positive'
).fillna(0)
result_df