In [3]:
import os
from pathlib import Path
import re
import string
import json
import random
import sys
import time
import datetime

from tqdm import tqdm
import spacy
from spacy.tokens import DocBin
from spacy.util import minibatch, compounding
from spacy.training import Example


In [4]:
LABEL_DRUG_PROTEIN = 'DRUG-PROTEIN'
LABEL_CHEMICAL = 'CHEMICAL'
LABEL_DISEASE = 'DISEASE'
LABEL_SPECIES = 'SPECIES'

LABEL_LIST = [LABEL_DRUG_PROTEIN,
              LABEL_CHEMICAL,
              LABEL_DISEASE,    
              LABEL_SPECIES]

LABEL_TO_DIR = {
    LABEL_DRUG_PROTEIN: ['BC2GM', 'JNLPBA'],
    LABEL_CHEMICAL: ['BC4CHEMD','BC5CDR-chem'],
    LABEL_DISEASE: ['BC5CDR-disease', 'NCBI-disease'],    
    LABEL_SPECIES: ['linnaeus', 's800']
}

DATA_ORIGIN_PATH = os.path.join("data","origin")
DATA_PREPARED_PATH = os.path.join("data", "prepared")
DATA_AGGREGATE_PATH = os.path.join(DATA_PREPARED_PATH, "aggregate")

WORD_VECTOR_PATH = "word_vec"
WORD_VECTOR_MODEL_NAME = os.path.join(WORD_VECTOR_PATH, "biomed.model")
WORD_VECTOR_FILE_NAME = os.path.join(WORD_VECTOR_PATH, "biomed_word2vec.txt")
WORD_VECTOR_EXPORT_TENSORBOARD_PATH = os.path.join(WORD_VECTOR_PATH, "tensorboard", "biomed_ner")

MODEL_PATH = "model"
MODEL_TRAIN_PATH = os.path.join(MODEL_PATH, "train")
MODEL_ACTUAL_PATH = os.path.join(MODEL_PATH, "actual")

TSV_EXTENSION = ".tsv"
JSON_EXTENSION = ".json"
SPACY_EXTENSION = ".spacy"

TRAIN_DEV_DATASET = "train_dev"
TRAIN_DATASET = "train"
VALIDATE_DATASET = "devel"
TEST_DATASET = "test"

DATASET_TYPE = [TRAIN_DATASET, VALIDATE_DATASET, TEST_DATASET]

In [11]:
def load_data(label, dataset_type, group_by_label=True):
    data = []
    if group_by_label:
        file = os.path.join(DATA_PREPARED_PATH, label, label + "-" + dataset_type + JSON_EXTENSION)
    else:
        file = os.path.join(DATA_AGGREGATE_PATH, dataset_type + JSON_EXTENSION)
        
    with open (file, "r", encoding="utf-8") as f:
        json_data = json.load(f)
    for an in json_data:
        texto = an["texto"]
        entities = []
        for entidade in an["entities"]:
            info = (entidade["start"], entidade["end"], entidade["label"])
            entities.append(info)
        data.append((texto,{"entities":entities}))
    return data


In [12]:
def format_data_for_evaluation(data, nlp_model): 
    data_formated = []
    for text, annotations in data:
        doc = nlp_model.make_doc(text)
        example = Example.from_dict(doc, annotations)
        data_formated.append(example)
    return data_formated


### Treinando multiple labels com multiplas camadas de NER Pré-Treinadas

In [13]:
def train_spacy_multiple_label_multiple_ner(train_data, validate_data, iterations, lista_model_label, description):
    nlp_train = spacy.blank("en")
    lista_ner = []
    
    for sm in lista_model_label:
        nlp_train.add_pipe(sm["label"] + "_ner", source=sm["model"])
        lista_ner.append(sm["label"] + "_ner")
    
    best_ents_p = -1
    best_ents_r = -1
    best_ents_f = -1
    
    with open(os.path.join(MODEL_TRAIN_AGGRATE_PATH, "log_treino.txt" ) , "a") as log_file:
        log_file.write("\n")
        log_file.write(f"=======> [TREINO AGREGADO] Inicio Treino {time.strftime('%d/%m/%Y %H:%M:%S', time.gmtime(time.time()))} ===================================== \n")
        log_file.write(f"Detalhes: {description} \n\n")
        
        other_pipes = [pipe for pipe in nlp_train.pipe_names if pipe not in lista_ner]
        with nlp_train.disable_pipes(*other_pipes):
            optimizer = nlp_train.begin_training()
            print(f"{'#IT':5} | {'Loss':8} | {'Prec':6} | {'Recall':6} | {'F_Scr':6} | {'Save Mod':8} | {'Duração':10} ")
            log_file.write(f"{'#IT':5} | {'Loss':8} | {'Prec':6} | {'Recall':6} | {'F_Scr':6} | {'Save Mod':8} | {'Duracao':10} \n")
            
            for itn in range(iterations):
                start_time = time.time()
                linha = f"{str(itn):5} | "
                random.shuffle(train_data)
                losses = {}
                batches = minibatch(train_data, size=512)

                for batch in batches:
                    for text, annotations in batch:
                        doc = nlp_train.make_doc(text)
                        example = Example.from_dict(doc, annotations)
                        nlp_train.update( [example],
                            drop=0.2,  
                            sgd=optimizer,
                            losses=losses)

                validate_metrics = nlp_train.evaluate(format_data_for_evaluation(validate_data, nlp_train))
                linha += f"{losses[list(losses)[0]]:08.2f} | {validate_metrics['ents_p']*100:06.2f} | {validate_metrics['ents_r']*100:06.2f} | {validate_metrics['ents_f']*100:06.2f} | "

                if (validate_metrics["ents_f"] > best_ents_f):
                    best_ents_r = validate_metrics["ents_r"]
                    best_ents_f = validate_metrics["ents_f"]
                    best_ents_p = validate_metrics["ents_p"]
                    linha += f"{'S':8} | "
                    
                    Path(MODEL_TRAIN_AGGRATE_PATH).mkdir(parents=True, exist_ok=True)
                    nlp_train.to_disk(path)                
                else:
                    linha += f"{'N':8} | "

                elapsed = time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))
                linha += elapsed

                print(linha)
                print(validate_metrics)
                log_file.write(linha + "\n")
                log_file.flush()
    return nlp_train

In [14]:
lista_model_label = []

path = os.path.join(MODEL_ACTUAL_PATH, LABEL_SPECIES)
nlp_species = spacy.load(path)
lista_model_label.append({"label": LABEL_SPECIES, "model": nlp_species})

path = os.path.join(MODEL_ACTUAL_PATH, LABEL_DRUG_PROTEIN)
nlp_drug_protein = spacy.load(path)
lista_model_label.append({"label": LABEL_DRUG_PROTEIN, "model": nlp_drug_protein})

path = os.path.join(MODEL_ACTUAL_PATH, LABEL_CHEMICAL)
nlp_chemical = spacy.load(path)
lista_model_label.append({"label": LABEL_CHEMICAL, "model": nlp_chemical})

path = os.path.join(MODEL_ACTUAL_PATH, LABEL_DISEASE)
nlp_disease = spacy.load(path)
lista_model_label.append({"label": LABEL_DISEASE, "model": nlp_disease})


In [16]:
train_data =  load_data(LABEL_SPECIES, TRAIN_DATASET, group_by_label=False)
validate_data = load_data(LABEL_SPECIES, VALIDATE_DATASET, group_by_label=False)
nlp = train_spacy_multiple_label(train_data,validate_data, 10,  lista_model_label, "Treino Agregado sem Word Vector")

#IT   | Loss     | Prec   | Recall | F_Scr  | Save Mod | Duração    
0     | 129170.33 | 048.57 | 062.19 | 054.54 | S        | 02:17:54
{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'ents_p': 0.48572913572913573, 'ents_r': 0.6218795748806967, 'ents_f': 0.5454363540858951, 'ents_per_type': {'DRUG-PROTEIN': {'p': 0.3947340482004459, 'r': 0.639051220350636, 'f': 0.48802257662269477}, 'CHEMICAL': {'p': 0.5895818913584044, 'r': 0.6602646915281486, 'f': 0.6229246228433682}, 'SPECIES': {'p': 0.14986376021798364, 'r': 0.2511415525114155, 'f': 0.18771331058020477}, 'DISEASE': {'p': 0.26103421258814313, 'r': 0.39717862110073515, 'f': 0.31502639665904975}}, 'speed': 8751.493417262382}
1     | 115415.92 | 056.26 | 061.39 | 058.71 | S        | 02:18:17
{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'ents_p': 0.5626001811972959, 'ents_r': 0.6139323535562865, 'ents_f': 0.5871464547743948, 'ents_per_type': {'DRUG-PROTEIN': {'p': 0.45229796023754193, 'r': 0.6021

KeyboardInterrupt: 