# A rule-based NLP pipeline using medspaCy
The following script defines a medspaCy pipeline which identifies mentions of 
alcohol and family members in clinical notes.

In [None]:
import medspacy
from medspacy.ner import TargetRule
from medspacy.preprocess import Preprocessor, PreprocessingRule
from medspacy.postprocess import (Postprocessor, PostprocessingRule, 
    PostprocessingPattern, postprocessing_functions)
from medspacy.visualization import visualize_ent
from spacy.tokens import Doc, Span

In [None]:
# Instantiate the medspaCy pipeline and keep a handle on its
# "medspacy_target_matcher" component, which the entity rules are added to.
nlp = medspacy.load()
target_matcher = nlp.get_pipe("medspacy_target_matcher")

# Define extensions
def has_alcohol(doc):
    """Report whether a doc or span contains an ALCOHOL named entity.

    Walks ``doc.ents`` and returns True as soon as one entity's ``label_``
    contains the substring 'ALCOHOL'. Returns False otherwise, including
    when there are no entities at all.
    """
    for entity in doc.ents:
        if 'ALCOHOL' in entity.label_:
            return True
    return False

# Register a plain `id` attribute on Doc (defaulting to None), then a computed
# `has_alcohol` attribute backed by the has_alcohol getter on both Doc and
# Span. Every registration passes force=True.
Doc.set_extension("id", default=None, force=True)
for _container in (Doc, Span):
    _container.set_extension("has_alcohol", getter=has_alcohol, force=True)

# Define rule-based patterns
# Each tuple is (literal, category, pattern). Four rules produce ALCOHOL
# entities and one produces FAMILY entities. The FAMILY pattern ends with a
# negative lookahead that rejects a match followed by any lowercase letter
# other than "s".
rules = [
    TargetRule(literal, category, pattern=pattern)
    for literal, category, pattern in (
        ("alcohol", "ALCOHOL", r"alcohol[a-z]*"),
        ("drink", "ALCOHOL", r"dr[aiu]nk"),
        ("aud", "ALCOHOL", [{"LOWER": "aud"}]),
        ("fetal_alcohol", "ALCOHOL", r"fetal alcohol( syndrome)?"),
        (
            "family",
            "FAMILY",
            r"((father)|(mother)|(aunt)|(uncle)|(brother)|(sister)|(sibling)|(cousin))(?![a-rt-z])",
        ),
    )
]
target_matcher.add(rules)

# Pre-processing rules
# The Preprocessor wraps the pipeline's existing tokenizer and is then
# installed in its place. The rules rewrite bracketed MIMIC placeholders
# ("[** ... **]"): dates, years, names and telephone numbers get fixed
# stand-in values, and the final catch-all rule (no `repl` given) handles any
# remaining placeholder.
# NOTE(review): the catch-all pattern also matches everything the earlier
# patterns target, so this presumably relies on rules being applied in list
# order; confirm against the Preprocessor implementation.
preprocessor = Preprocessor(nlp.tokenizer)
preprocess_rules = [
    PreprocessingRule(
        r"\[\*\*[\d]{1,4}-[\d]{1,2}(-[\d]{1,2})?\*\*\]",
        desc="Replace MIMIC date brackets with a generic date.",
        repl="01-01-2010",
    ),
    PreprocessingRule(
        r"\[\*\*[\d]{4}\*\*\]",
        desc="Replace MIMIC year brackets with a generic year.",
        repl="2010",
    ),
    PreprocessingRule(
        r"\[\*\*[^\]]*(name)[^\]]+\]",
        desc="Remove all name bracketed placeholder text from MIMIC",
        repl="FRED",
    ),
    PreprocessingRule(
        r"\[\*\*[^\]]*(telephone)[^\]]+\]",
        desc="Remove all telephone bracketed placeholder text from MIMIC",
        repl="555-555-5555",
    ),
    PreprocessingRule(
        r"\[\*\*[^\]]+\]",
        desc="Remove all other bracketed placeholder text from MIMIC",
    ),
]
preprocessor.add(preprocess_rules)
nlp.tokenizer = preprocessor

# Add medical section parser
# The component is registered by name with no explicit position argument, and
# the object returned by add_pipe is bound to `sectionizer`.
sectionizer = nlp.add_pipe("medspacy_sectionizer")

In [None]:
# Run pipeline on a sample note.
# The note has three headed sections (allergies, social history, family
# history). Each contains text that the ALCOHOL or FAMILY rules can match.
note = """
allergies: 
alcohol

social history: 
Patient was born with fetal alcohol syndrome.

family history: 
Mother was diagnosed with AUD.
"""
# The note is lowercased before it is handed to the pipeline.
doc = nlp(note.lower())
# Render the entities found in the processed document.
visualize_ent(doc)