In [None]:

from spacy.tokens import DocBin
import os
import spacy
from spacy import displacy, Language

from scripts.entity_recognition.components import block_matcher, intersection_matcher, street_vs_neighborhood
from scripts.utils.config import Config
from scripts.utils.spacy import load_spacy

In [2]:
# Resolve every path this notebook needs from the project configuration.
config = Config()

# Serialized spaCy docs for the train and dev splits.
train_path = config.get_data_path("entity_recognition.article_text_train")
dev_path = config.get_data_path("entity_recognition.article_text_dev")

# The trained pipeline is loaded from the 'model-best' subdirectory of the
# configured trained-model location.
trained_model_dir = config.get_file_path("entity_recognition.trained_model")
er_model = os.path.join(trained_model_dir, 'model-best')

In [None]:
# Register the project's custom components under the names used here.
# NOTE(review): presumably the trained pipeline refers to these names, which is
# why they are registered before spacy.load() - confirm against the model config.
Language.component('block_matcher', func=block_matcher)
Language.component('intersection_matcher', func=intersection_matcher)
Language.component('street_vs_neighborhood', func=street_vs_neighborhood)

# Two pipelines to compare: the stock small English model and the trained one.
nlp_base = load_spacy("en_core_web_sm")
nlp = spacy.load(er_model)

# Both pipelines modify the docs they process in place, so each one needs its
# own set of Doc objects. The file only has to be read from disk once:
# every get_docs() call rebuilds fresh Doc objects against the given vocab.
train_doc_bin = DocBin().from_disk(train_path)
docs_base = list(train_doc_bin.get_docs(nlp_base.vocab))
# Optional filter (disabled):
# docs_base = [d for d in docs_base if 'WHERE' in d.user_data and d.user_data['WHERE'] > .5]
docs = list(train_doc_bin.get_docs(nlp.vocab))
# Optional filter (disabled):
# docs = [d for d in docs if 'WHERE' in d.user_data and d.user_data['WHERE'] > .5]


In [4]:
# Run one sample sentence through successively longer prefixes of the trained
# pipeline and print the entities after each prefix, so it is visible which
# component adds or changes which entity.
text = "I drove down Chicago Avenue and West 35th Street"
for stage_count, _ in enumerate(nlp.pipeline, start=1):
    # Names of the first `stage_count` components, in pipeline order.
    components = nlp.pipe_names[:stage_count]
    first_stage, last_stage = components[0], components[-1]
    print(first_stage, ">...>", last_stage)
    # Everything outside `components` is disabled only inside this block.
    with nlp.select_pipes(enable=components) as nlp_inspect:
        doc = nlp_inspect.nlp(text)
        print(doc.ents)
        # For a visual check instead of the printed tuple:
        # displacy.render(nlp_inspect.nlp(text), style='ent')

tok2vec >...> tok2vec
()
tok2vec >...> street_matcher
(Chicago Avenue,)
tok2vec >...> gpe_matcher
(Chicago Avenue,)
tok2vec >...> street_to_neighborhood
(Chicago Avenue,)
tok2vec >...> age_matcher
(Chicago Avenue,)
tok2vec >...> ner
(Chicago Avenue, West 35th)
tok2vec >...> block_matcher
(Chicago Avenue, West 35th)
tok2vec >...> intersection_matcher
(Chicago Avenue, West 35th)


In [5]:
from IPython.display import clear_output

def ents_eq(e1, e2):
    """Return True when two entity spans agree on text, boundaries and label.

    The spans are compared on ``text``, ``start``, ``end``, ``label`` and
    ``label_``, in that order, stopping at the first attribute that differs.
    """
    compared_attrs = ("text", "start", "end", "label", "label_")
    return all(getattr(e1, attr) == getattr(e2, attr) for attr in compared_attrs)

# Walk the paired docs and show, one at a time, every document on which the two
# pipelines (nlp_base and nlp) predict different entities.
for doc_base, doc in zip(docs_base, docs):
    pred_base = nlp_base(doc_base)
    pred = nlp(doc)
    # zip() stops at the shorter sequence, so compare the entity counts first;
    # otherwise extra trailing entities on one side (or no entities at all on
    # one side) would be treated as "no difference" and silently skipped.
    same_ents = len(pred_base.ents) == len(pred.ents) and all(
        ents_eq(e1, e2) for e1, e2 in zip(pred_base.ents, pred.ents)
    )
    if not same_ents:
        # Replace the previous pair of renderings instead of stacking them up.
        clear_output(wait=True)
        displacy.render(pred_base, style="ent")
        displacy.render(pred, style="ent")
        # Block until the user responds before moving on to the next difference.
        input("Press any key to continue")

KeyboardInterrupt: Interrupted by user