In [47]:
import os
from pathlib import Path
import re
import string
import json
import random
import sys
import time

from tqdm import tqdm
import spacy
from spacy.tokens import DocBin
from spacy.util import minibatch, compounding
from spacy.training import Example

In [48]:
# --- Entity labels ---------------------------------------------------------
LABEL_DRUG_PROTEIN = 'DRUG-PROTEIN'
LABEL_CHEMICAL = 'CHEMICAL'
LABEL_DISEASE = 'DISEASE'
LABEL_SPECIES = 'SPECIES'

# Directory names associated with each entity label.
LABEL_TO_DIR = {
    LABEL_DRUG_PROTEIN: ['BC2GM', 'JNLPBA'],
    LABEL_CHEMICAL: ['BC4CHEMD', 'BC5CDR-chem'],
    LABEL_DISEASE: ['BC5CDR-disease', 'NCBI-disease'],
    LABEL_SPECIES: ['linnaeus', 's800'],
}

# All labels, in the insertion order of LABEL_TO_DIR
# (DRUG-PROTEIN, CHEMICAL, DISEASE, SPECIES).
LABEL_LIST = list(LABEL_TO_DIR)

# --- Data locations --------------------------------------------------------
DATA_ORIGIN_PATH = os.path.join("data", "origin")
DATA_PREPARED_PATH = os.path.join("data", "prepared")
DATA_AGGREGATE_PATH = os.path.join(DATA_PREPARED_PATH, "aggregate")

# --- Word-vector artefacts -------------------------------------------------
WORD_VECTOR_PATH = "word_vec"
WORD_VECTOR_MODEL_NAME = os.path.join(WORD_VECTOR_PATH, "biomed.model")
WORD_VECTOR_FILE_NAME = os.path.join(WORD_VECTOR_PATH, "biomed_word2vec.txt")
WORD_VECTOR_EXPORT_TENSORBOARD_PATH = os.path.join(
    WORD_VECTOR_PATH, "tensorboard", "biomed_ner"
)

# --- Model locations -------------------------------------------------------
MODEL_PATH = "model"
MODEL_TRAIN_PATH = os.path.join(MODEL_PATH, "train")
MODEL_ACTUAL_PATH = os.path.join(MODEL_PATH, "actual")

# --- File extensions -------------------------------------------------------
TSV_EXTENSION = ".tsv"
JSON_EXTENSION = ".json"
SPACY_EXTENSION = ".spacy"

# --- Dataset split names ---------------------------------------------------
TRAIN_DEV_DATASET = "train_dev"
TRAIN_DATASET = "train"
VALIDATE_DATASET = "devel"
TEST_DATASET = "test"

# The three splits listed together; TRAIN_DEV_DATASET is not part of this list.
DATASET_TYPE = [TRAIN_DATASET, VALIDATE_DATASET, TEST_DATASET]

In [50]:
# Start from an empty English pipeline. A pretrained pipeline such as
# "en_core_web_sm" could be loaded here instead; the assembly cell below
# handles both cases.
main_nlp = spacy.blank("en")

# Load the trained pipeline stored under MODEL_ACTUAL_PATH/<label> for each
# entity type, in the order: species, drug/protein, chemical, disease.
entity_pipelines = {}
for entity_label in (LABEL_SPECIES, LABEL_DRUG_PROTEIN, LABEL_CHEMICAL, LABEL_DISEASE):
    path = os.path.join(MODEL_ACTUAL_PATH, entity_label)
    entity_pipelines[entity_label] = spacy.load(path)

# Bind each loaded pipeline to the name the rest of the notebook uses.
nlp_species = entity_pipelines[LABEL_SPECIES]
nlp_drug_protein = entity_pipelines[LABEL_DRUG_PROTEIN]
nlp_chemical = entity_pipelines[LABEL_CHEMICAL]
nlp_disease = entity_pipelines[LABEL_DISEASE]


In [51]:
# Combine the four single-entity NER components into main_nlp.
#
# Each trained pipeline contributes one component named "<LABEL>_ner"; the
# order of this list is the order in which the components are added.
ner_sources = [
    (LABEL_SPECIES, nlp_species),
    (LABEL_DRUG_PROTEIN, nlp_drug_protein),
    (LABEL_CHEMICAL, nlp_chemical),
    (LABEL_DISEASE, nlp_disease),
]

# Inserting before "ner" (and removing it afterwards) only makes sense when
# the base pipeline really has a component called "ner". Checking for that
# name directly, instead of "pipeline is non-empty", keeps a base pipeline
# that has components but no "ner" from failing on before="ner".
has_generic_ner = "ner" in main_nlp.pipe_names

for entity_label, source_nlp in ner_sources:
    component_name = entity_label + "_ner"
    if component_name in main_nlp.pipe_names:
        # Already present (e.g. this cell was run before); adding a component
        # under an existing name would raise, so skip it.
        continue
    if has_generic_ner:
        main_nlp.add_pipe(component_name, source=source_nlp, before="ner")
    else:
        main_nlp.add_pipe(component_name, source=source_nlp)

# The specialised components replace the generic recogniser.
if has_generic_ner:
    main_nlp.remove_pipe("ner")

In [52]:
# Display the names of the components currently registered in main_nlp.
main_nlp.pipe_names

['SPECIES_ner', 'DRUG-PROTEIN_ner', 'CHEMICAL_ner', 'DISEASE_ner']

In [58]:
def format_data_for_evaluation(data, nlp_model):
    """Convert (text, annotations) pairs into spaCy ``Example`` objects.

    Args:
        data: Iterable of ``(text, annotations)`` pairs. ``annotations`` is
            passed unchanged to ``Example.from_dict``.
        nlp_model: Object whose ``make_doc(text)`` builds the document each
            ``Example`` is created from.

    Returns:
        A list with one ``Example`` per input pair, in input order.
    """
    return [
        Example.from_dict(nlp_model.make_doc(raw_text), gold_annotations)
        for raw_text, gold_annotations in data
    ]

In [56]:
def load_data(label, dataset_type, group_by_label=True):
    """Read an annotated dataset from JSON and return it as training tuples.

    Args:
        label: Entity label; used to locate the file only when
            ``group_by_label`` is true
            (``DATA_PREPARED_PATH/<label>/<label>-<dataset_type>.json``).
        dataset_type: Split name used in the file name.
        group_by_label: When false, ``label`` is ignored and the file
            ``DATA_AGGREGATE_PATH/<dataset_type>.json`` is read instead.

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})`` tuples,
        one per record in the JSON file, in file order.
    """
    if not group_by_label:
        json_path = os.path.join(DATA_AGGREGATE_PATH, dataset_type + JSON_EXTENSION)
    else:
        json_path = os.path.join(
            DATA_PREPARED_PATH, label, label + "-" + dataset_type + JSON_EXTENSION
        )

    with open(json_path, "r", encoding="utf-8") as handle:
        records = json.load(handle)

    # Each record carries its text under "texto" and a list of entity dicts
    # under "entities"; flatten every entity dict into a (start, end, label) tuple.
    return [
        (
            record["texto"],
            {
                "entities": [
                    (entity["start"], entity["end"], entity["label"])
                    for entity in record["entities"]
                ]
            },
        )
        for record in records
    ]


In [59]:
# Evaluate the combined pipeline on the aggregated test split.
#
# With group_by_label=False, load_data reads
# DATA_AGGREGATE_PATH/<dataset_type>.json and never looks at its first
# argument, so None is passed explicitly. The previous call passed a `label`
# variable that is not defined in any cell shown here, so it raised NameError
# on a fresh kernel (Restart & Run All).
test_data = load_data(None, TEST_DATASET, group_by_label=False)
test_data_spacy = format_data_for_evaluation(test_data, main_nlp)
test_metrics = main_nlp.evaluate(test_data_spacy)
print(test_metrics)

{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'ents_p': 0.39227125998835793, 'ents_r': 0.5827519113224976, 'ents_f': 0.4689054234634575, 'ents_per_type': {'SPECIES': {'p': 0.12930886640370434, 'r': 0.6854545454545454, 'f': 0.2175732217573222}, 'DRUG-PROTEIN': {'p': 0.2538183755623964, 'r': 0.6823969441349673, 'f': 0.3700107874865156}, 'CHEMICAL': {'p': 0.6579564258135687, 'r': 0.6210666753441151, 'f': 0.6389795610907448}, 'DISEASE': {'p': 0.42504409171075835, 'r': 0.08952451708766716, 'f': 0.14789812826020254}}, 'speed': 8189.3018945219455}
