In [1]:
import spacy
from spacy.pipeline import EntityRuler
from spacy.training import Example
import warnings

warnings.filterwarnings('ignore')
nlp = spacy.load('en_core_web_md')
pipe_exceptions = ['tok2vec', 'tagger', 'parser']
not_required_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
nlp.disable_pipes(*not_required_pipes)
entity_ruler = nlp.add_pipe("entity_ruler")
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'entity_ruler']

In [2]:
# Laden der XML-Datei

import xml.etree.ElementTree as ElementTree

entity_linker_export = ElementTree.parse("D:/Git_Fachpraktikum/MedExtractor/resources/med_entity_linker_export.xml")
root = entity_linker_export.getroot()

In [3]:
# Training of Entity_Ruler

train_data_ruler = []

for entity in [item.text for item in root.iter("entity")]:
    to_train = {"label": "DISEASE", "pattern": entity}
    train_data_ruler.append(to_train)

for alias in [item.text for item in root.iter("alias")]:
    to_train = {"label": "SYMPTOM", "pattern": alias}
    train_data_ruler.append(to_train)
      
entity_ruler.add_patterns(train_data_ruler)

In [4]:
# Training of Entity Linker

import random
import math
from spacy.kb import KnowledgeBase

vocab = nlp.vocab

def create_kb(vocab):
    kb = KnowledgeBase(vocab=vocab, entity_vector_length=300)

    for entity in [item.text for item in root.iter("entity")]:
        vector = nlp.vocab.get_vector(entity)
        kb.add_entity(entity = entity, freq = 50, entity_vector = vector)

    for alias in [item for item in root.iter("alias")]:
        entities = [entity.text for entity in alias.find("alias_entities")]
        kb.add_alias(alias = alias.text, entities = entities,probabilities = [0.001*math.floor(1000/len(entities))]*len(entities))
        
    return kb

TRAIN_DATA = []

for sample in [item for item in root.iter("sample")]:
    sample_data = [sample.text]
    
    annotations = []
    links = {}
    aliases = []
    
    alias_type = sample.find("links/alias_type").text
    links_node = sample.find("links")
    
    for pos in links_node.iter("position"):
        entities = {}
        position = eval(pos.text)
        aliases.append((position[0],position[1],alias_type))
        entities_training = pos.find("entities_training")

        for entity in entities_training:
            entities[entity.text] = float(entity.find("probability").find("prob").text)
        links[position] = entities

    sample_data.append(aliases)
    sample_data.append(links)
    TRAIN_DATA.append(sample_data)

for i in range(20):
    random.shuffle(TRAIN_DATA)
    examples = []
    for text, annotations, links in TRAIN_DATA:
        doc = nlp(text)
        gold_dict = {"entities": annotations, "links": links}
        examples.append(Example.from_dict(doc, gold_dict))

def give_examples():
    return examples

entity_linker = nlp.add_pipe('entity_linker')
entity_linker.set_kb(create_kb)
optimizer = entity_linker.create_optimizer()
entity_linker.initialize(give_examples)
entity_linker.update(examples, sgd=optimizer)

{'entity_linker': 1.0223745346069335}

In [5]:
doc = nlp('Yesterday I entered the shopping mall when out of a sudden I had a panic attack.'
          'I was hyperventilating for a moment and felt hot and sweaty. '
          'And I started to feel chest pain.')

In [6]:
for ent in doc.ents:
    candidates = entity_linker.kb.get_alias_candidates(ent.text)
    print(ent,ent.label_,[cand.entity_ for cand in candidates])

panic attack SYMPTOM ['agoraphobia', 'dysphagia', 'panic disorder', 'shock', 'social anxiety disorder']
hyperventilating SYMPTOM ['agoraphobia']
hot and sweaty SYMPTOM ['agoraphobia']
chest pain SYMPTOM ['agoraphobia']


In [7]:
predicts = entity_linker.predict(doc)
for i,ent in enumerate(doc.ents):
    print(ent,ent.label_,predicts[i])

panic attack SYMPTOM panic disorder
hyperventilating SYMPTOM agoraphobia
hot and sweaty SYMPTOM agoraphobia
chest pain SYMPTOM agoraphobia
