In [1]:
import os
from pathlib import Path
import re
import string
import json
import random
import sys
import time

from tqdm import tqdm
import spacy
from spacy.tokens import DocBin
from spacy.util import minibatch, compounding
from spacy.training import Example

In [2]:
# Entity labels. Each label string is also used as a sub-directory name under
# NER_PATH (model loading and load_data both join NER_PATH with the label).
LABEL_DRUG_PROTEIN = 'DRUG-PROTEIN'
LABEL_CHEMICAL = 'CHEMICAL'
LABEL_DISEASE = 'DISEASE'
LABEL_SPECIES = 'SPECIES'

# All labels, in the order the evaluation loop iterates over them.
LABEL_LIST = [LABEL_DRUG_PROTEIN,
              LABEL_CHEMICAL,
              LABEL_DISEASE,    
              LABEL_SPECIES]

# Label -> list of names, two per label.
# NOTE(review): presumably the source corpus directories under DATASET_PATH;
# this mapping is not read by any cell visible in this part of the notebook — confirm.
LABEL_TO_DIR = {
    LABEL_DRUG_PROTEIN: ['BC2GM', 'JNLPBA'],
    LABEL_CHEMICAL: ['BC4CHEMD','BC5CDR-chem'],
    LABEL_DISEASE: ['BC5CDR-disease', 'NCBI-disease'],    
    LABEL_SPECIES: ['linnaeus', 's800']
}

# DATASET_PATH is not read by the visible cells (presumably the raw-data root — confirm).
# NER_PATH is the root that holds "<label>/best_model" and "<label>/<label>-<split>.json".
DATASET_PATH = "NER-Data"
NER_PATH = "NER-Process"

# Split names; load_data uses them as the "<split>" part of the JSON file name.
TRAIN_DEV_DATASET = "train_dev"
TRAIN_DATASET = "train"
VALIDATE_DATASET = "devel"
TEST_DATASET = "test"

# The three individual splits (TRAIN_DEV_DATASET is deliberately not listed here).
DATASET_TYPE = [TRAIN_DATASET, VALIDATE_DATASET, TEST_DATASET]

In [40]:
# Base pipeline that will host the per-label NER components. It starts empty;
# the commented-out line is the alternative of building on a pretrained pipeline.
#main_nlp = spacy.load("en_core_web_sm")
main_nlp = spacy.blank("en")

# Load each label's "best_model" directory, in the same order as before:
# species, drug/protein, chemical, disease.
_loaded_models = []
for _label in (LABEL_SPECIES, LABEL_DRUG_PROTEIN, LABEL_CHEMICAL, LABEL_DISEASE):
    path = os.path.join(NER_PATH, _label, "best_model")
    _loaded_models.append(spacy.load(path))

nlp_species, nlp_drug_protein, nlp_chemical, nlp_disease = _loaded_models


In [44]:
# Copy the trained NER component of every specialist pipeline into main_nlp.
#
# With `source=`, the first argument of add_pipe is the component's name inside
# the source pipeline, so each specialist model must expose "<LABEL>_ner".
# NOTE(review): this assumes each sourced component carries its own tok2vec
# rather than listening to a shared one in its original pipeline — confirm.
_ner_sources = [
    (LABEL_SPECIES, nlp_species),
    (LABEL_DRUG_PROTEIN, nlp_drug_protein),
    (LABEL_CHEMICAL, nlp_chemical),
    (LABEL_DISEASE, nlp_disease),
]

# Only a base pipeline that ships its own generic "ner" needs the
# insert-before-then-remove treatment. Testing for that component directly
# (instead of "is the pipeline non-empty?") keeps this cell working for any
# base pipeline and makes it safe to re-run after the components were added.
_has_generic_ner = "ner" in main_nlp.pipe_names

for _label, _source_nlp in _ner_sources:
    _component_name = _label + "_ner"
    if _component_name in main_nlp.pipe_names:
        # Already added by an earlier run of this cell.
        continue
    if _has_generic_ner:
        main_nlp.add_pipe(_component_name, source=_source_nlp, before="ner")
    else:
        main_nlp.add_pipe(_component_name, source=_source_nlp)

# The generic recognizer is replaced by the four specialist ones.
if _has_generic_ner:
    main_nlp.remove_pipe("ner")

In [45]:
# Display the component names of the merged pipeline, in execution order.
main_nlp.pipe_names

['SPECIES_ner', 'DRUG-PROTEIN_ner', 'CHEMICAL_ner', 'DISEASE_ner']

In [26]:
def format_data_for_evalation(data, nlp_model):
    """Convert (text, annotations) pairs into spaCy ``Example`` objects.

    Args:
        data: Iterable of ``(text, annotations)`` pairs, where ``annotations``
            is the dict handed to ``Example.from_dict``.
        nlp_model: Pipeline whose ``make_doc`` builds the Doc for each text.

    Returns:
        list: One ``Example`` per input pair, in input order.
    """
    return [
        Example.from_dict(nlp_model.make_doc(text), annotations)
        for text, annotations in data
    ]

In [27]:
def load_data(label, data_set_type=TRAIN_DATASET):
    """Read one label's JSON split and return it as (text, annotations) pairs.

    The file read is ``NER_PATH/<label>/<label>-<data_set_type>.json``. It must
    contain a list of objects with a "texto" string and an "entities" list whose
    items have "start", "end" and "label" keys.

    Args:
        label: Entity label; selects both the directory and the file prefix.
        data_set_type: Split name used in the file name.

    Returns:
        list: ``(texto, {"entities": [(start, end, label), ...]})`` tuples,
        in file order.
    """
    json_path = os.path.join(NER_PATH, label, label + "-" + data_set_type + ".json")
    with open(json_path, "r", encoding="utf-8") as handle:
        records = json.load(handle)

    return [
        (
            record["texto"],
            {
                "entities": [
                    (span["start"], span["end"], span["label"])
                    for span in record["entities"]
                ]
            },
        )
        for record in records
    ]

In [46]:
# Score the merged pipeline on each label's test split, one label at a time.
for label in LABEL_LIST:
    test_data = load_data(label, TEST_DATASET)
    # Wrap the raw (text, annotations) pairs as spaCy Example objects.
    test_data_spacy=format_data_for_evalation(test_data, main_nlp)
    # NOTE(review): every NER component in main_nlp runs on every text, while each
    # test file presumably carries gold spans for its own label only. The overall
    # ents_p / ents_f therefore likely count the other components' predictions as
    # false positives; the matching entry in 'ents_per_type' is probably the
    # figure to read for this label — confirm.
    test_metrics = main_nlp.evaluate(test_data_spacy)
    # One metrics dict is printed per label, in LABEL_LIST order.
    print(test_metrics)

{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'ents_p': 0.45914078279382065, 'ents_r': 0.6693458538914532, 'ents_f': 0.5446656953213534, 'ents_per_type': {'SPECIES': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'DRUG-PROTEIN': {'p': 0.6717514575513138, 'r': 0.6693458538914532, 'f': 0.6705464981863116}, 'CHEMICAL': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'DISEASE': {'p': 0.0, 'r': 0.0, 'f': 0.0}}, 'speed': 9286.318412860264}
{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'ents_p': 0.3521415343394145, 'ents_r': 0.5816276723829358, 'ents_f': 0.438684975886316, 'ents_per_type': {'CHEMICAL': {'p': 0.8673330745341615, 'r': 0.5816276723829358, 'f': 0.6963127446970139}, 'SPECIES': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'DISEASE': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'DRUG-PROTEIN': {'p': 0.0, 'r': 0.0, 'f': 0.0}}, 'speed': 8987.40239940502}
{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'ents_p': 0.3240492273526418, 'ents_r': 0.6504457652303121, 'ents_f': 0.43258600