# Desafio NER - Treino Spacy 
* Treinamento com Entidades Separadas, um modelo para cada label - Bons resultados quando os modelos são executados separadamente e testados com datasets separados, mas resultados ruins quando é utilizado um dataset com todos os labels.
* Treinamento com Todas as Entidades - Em andamento

In [2]:
import os
from pathlib import Path
import re
import string
import json
import random
import sys
import subprocess
import time
import datetime
import shutil

from tqdm import tqdm
import spacy
from spacy.tokens import DocBin
from spacy.util import minibatch, compounding
from spacy.training import Example

# The OpenMP runtime reads KMP_DUPLICATE_LIB_OK from the process environment;
# a plain Python variable with that name has no effect on it.
# NOTE(review): if the duplicate-runtime error is raised while the imports
# above are running, this assignment has to move ahead of them.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
# Kept so that any existing reference to the module-level name keeps working.
KMP_DUPLICATE_LIB_OK = True

In [3]:
# Entity labels handled by the notebook.
LABEL_DRUG_PROTEIN = 'DRUG-PROTEIN'
LABEL_CHEMICAL = 'CHEMICAL'
LABEL_DISEASE = 'DISEASE'
LABEL_SPECIES = 'SPECIES'

LABEL_LIST = [LABEL_DRUG_PROTEIN,
              LABEL_CHEMICAL,
              LABEL_DISEASE,
              LABEL_SPECIES]

# Source corpus directories grouped under each label.
LABEL_TO_DIR = {
    LABEL_DRUG_PROTEIN: ['BC2GM', 'JNLPBA'],
    LABEL_CHEMICAL: ['BC4CHEMD','BC5CDR-chem'],
    LABEL_DISEASE: ['BC5CDR-disease', 'NCBI-disease'],
    LABEL_SPECIES: ['linnaeus', 's800']
}

# Dataset locations.
DATA_ORIGIN_PATH = os.path.join("data","origin")
DATA_PREPARED_PATH = os.path.join("data", "prepared")
DATA_AGGREGATE_PATH = os.path.join(DATA_PREPARED_PATH, "aggregate")

# Word-vector artefacts.
WORD_VECTOR_PATH = "word_vec"
WORD_VECTOR_MODEL_NAME = os.path.join(WORD_VECTOR_PATH, "biomed.model")
WORD_VECTOR_GENSIM = os.path.join(WORD_VECTOR_PATH, "biomed_word2vec.txt")
WORD_VECTOR_GLOVE = os.path.join(WORD_VECTOR_PATH, "glove.840B.300d.txt")
WORD_VECTOR_EXPORT_TENSORBOARD_PATH = os.path.join(WORD_VECTOR_PATH, "tensorboard", "biomed_ner")

# Model output locations.
MODEL_PATH = "model"
MODEL_TRAIN_PATH = os.path.join(MODEL_PATH, "train")
MODEL_ACTUAL_PATH = os.path.join(MODEL_PATH, "actual")
# The aggregate (all-labels) training functions read this name, so it must
# exist at module level or they fail with NameError.
# NOTE(review): the directory name mirrors DATA_AGGREGATE_PATH and is an
# assumption - confirm the intended location.
MODEL_TRAIN_AGGRATE_PATH = os.path.join(MODEL_TRAIN_PATH, "aggregate")

TSV_EXTENSION = ".tsv"
JSON_EXTENSION = ".json"
SPACY_EXTENSION = ".spacy"

# Dataset split names as they appear in the prepared file names.
TRAIN_DEV_DATASET = "train_dev"
TRAIN_DATASET = "train"
VALIDATE_DATASET = "devel"
TEST_DATASET = "test"

DATASET_TYPE = [TRAIN_DATASET, VALIDATE_DATASET, TEST_DATASET]

#### Carrega os datasets convertidos para o formato JSON conhecido do spaCy. A versão 3 utiliza a classe *Example*, e essa conversão será tratada no treinamento

In [4]:
def load_data(label, dataset_type, group_by_label=True):
    """Read one prepared JSON split and return it as spaCy-style tuples.

    Args:
        label: Entity label; selects the sub-directory and file prefix when
            ``group_by_label`` is true and is not used otherwise.
        dataset_type: Split name used in the file name.
        group_by_label: When true read the per-label file under
            ``DATA_PREPARED_PATH``; otherwise read the file under
            ``DATA_AGGREGATE_PATH``.

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})`` tuples.
    """
    if group_by_label:
        file_name = "".join([label, "-", dataset_type, JSON_EXTENSION])
        json_path = os.path.join(DATA_PREPARED_PATH, label, file_name)
    else:
        json_path = os.path.join(DATA_AGGREGATE_PATH, dataset_type + JSON_EXTENSION)

    with open(json_path, "r", encoding="utf-8") as handle:
        records = json.load(handle)

    return [
        (
            record["texto"],
            {
                "entities": [
                    (span["start"], span["end"], span["label"])
                    for span in record["entities"]
                ]
            },
        )
        for record in records
    ]


#### Converte os dados do formato válido até a versão 2.0 do spaCy para a classe Example, que passou a ser utilizada na versão 3

In [5]:
def format_data_for_evaluation(data, nlp_model):
    """Turn ``(text, annotations)`` tuples into a list of ``Example`` objects.

    Each text is tokenised with ``nlp_model.make_doc`` and paired with its
    annotations through ``Example.from_dict``.
    """
    return [
        Example.from_dict(nlp_model.make_doc(text), annotations)
        for text, annotations in data
    ]


### Treinamento de um Label utilizando sua respectiva base para treinamento

In [6]:
def create_model_with_word_vector(label, word_vec):
    """Build an English pipeline initialised with external word vectors.

    Recreates ``MODEL_TRAIN_PATH/<label>`` from scratch, runs
    ``python -m spacy init vectors en <word_vec> <path>`` in a subprocess and
    loads the pipeline written there.

    Args:
        label: Entity label; used as the output directory name.
        word_vec: Path of the word-vector file handed to ``spacy init vectors``.

    Returns:
        The pipeline loaded from the freshly written directory.

    Raises:
        subprocess.CalledProcessError: If the ``spacy init vectors`` command
            exits with a non-zero status.
    """
    path = os.path.join(MODEL_TRAIN_PATH, label)
    # Start from an empty directory so files from a previous run are not mixed in.
    if os.path.exists(path):
        shutil.rmtree(path, ignore_errors=True)

    Path(path).mkdir(parents=True, exist_ok=True)

    # check=True surfaces a failed conversion here instead of as a confusing
    # load error on the next line.
    subprocess.run(
        [sys.executable, "-m", "spacy", "init", "vectors", "en", word_vec, path],
        check=True,
    )

    return spacy.load(path)

In [9]:
def train_spacy_one_label(train_data, validate_data, iterations, label, word_embedding=None, description=""):
    """Train an NER pipeline for a single label, keeping the best epoch on disk.

    A blank English pipeline (or the one built by
    ``create_model_with_word_vector`` when ``word_embedding`` is given)
    receives one ``ner`` component named ``<label>_ner``. After every
    iteration the pipeline is scored on ``validate_data``; whenever the entity
    F-score beats the best one seen so far the pipeline is written to
    ``MODEL_TRAIN_PATH/<label>``. One row per iteration is printed and
    appended to ``MODEL_TRAIN_PATH/log_treino.txt``.

    Args:
        train_data: Sequence of ``(text, annotations)`` tuples accepted by
            ``Example.from_dict``. It is copied before shuffling, so the
            caller's list keeps its order.
        validate_data: Tuples in the same format, used for scoring only.
        iterations: Number of passes over ``train_data``.
        label: The single entity label the component is trained on.
        word_embedding: Optional path to a word-vector file.
        description: Free text written to the log header.

    Returns:
        The pipeline as it stands after the last iteration, which is not
        necessarily the best-scoring one saved on disk.
    """
    ner_name = label + "_ner"
    if word_embedding is None:
        nlp_train = spacy.blank("en")
    else:
        nlp_train = create_model_with_word_vector(label, word_embedding)

    ner = nlp_train.add_pipe("ner", name=ner_name)
    ner.add_label(label)

    # Shuffle a private copy so the caller's list is not reordered.
    train_data = list(train_data)
    best_ents_f = -1
    model_dir = os.path.join(MODEL_TRAIN_PATH, label)

    # The log sits directly under MODEL_TRAIN_PATH, which nothing has created
    # yet when no word vectors are used.
    Path(MODEL_TRAIN_PATH).mkdir(parents=True, exist_ok=True)
    with open(os.path.join(MODEL_TRAIN_PATH, "log_treino.txt"), "a", encoding="utf-8") as log_file:
        log_file.write("\n")
        log_file.write(f"=======> [LABELS ISOLADOS] Inicio Treino {time.strftime('%d/%m/%Y %H:%M:%S', time.gmtime(time.time()))} ===================================== \n")
        log_file.write(f"Detalhes: {description} \n\n")

        other_pipes = [pipe for pipe in nlp_train.pipe_names if pipe != ner_name]
        with nlp_train.select_pipes(disable=other_pipes):
            optimizer = nlp_train.initialize()
            print(f"{'#IT':5} | {'Loss':8} | {'Prec':6} | {'Recall':6} | {'F_Scr':6} | {'Save Mod':8} | {'Duração':10} ")
            log_file.write(f"{'#IT':5} | {'Loss':8} | {'Prec':6} | {'Recall':6} | {'F_Scr':6} | {'Save Mod':8} | {'Duracao':10} \n")

            for itn in range(iterations):
                start_time = time.time()
                linha = f"{str(itn):5} | "
                random.shuffle(train_data)
                losses = {}

                # NOTE(review): each update receives a single example, so the
                # 512-item batches only chunk the iteration.
                for batch in minibatch(train_data, size=512):
                    for text, annotations in batch:
                        doc = nlp_train.make_doc(text)
                        example = Example.from_dict(doc, annotations)
                        nlp_train.update([example],
                                         drop=0.2,
                                         sgd=optimizer,
                                         losses=losses)

                validate_metrics = nlp_train.evaluate(format_data_for_evaluation(validate_data, nlp_train))
                # Look the loss up by component name; 0.0 covers an empty training set.
                linha += f"{losses.get(ner_name, 0.0):08.2f} | {validate_metrics['ents_p']*100:06.2f} | {validate_metrics['ents_r']*100:06.2f} | {validate_metrics['ents_f']*100:06.2f} | "

                if validate_metrics["ents_f"] > best_ents_f:
                    best_ents_f = validate_metrics["ents_f"]
                    linha += f"{'S':8} | "

                    Path(model_dir).mkdir(parents=True, exist_ok=True)
                    nlp_train.to_disk(model_dir)
                else:
                    linha += f"{'N':8} | "

                elapsed = time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))
                linha += elapsed

                # str.replace returns a new string; keep it so the decimal
                # commas actually reach the output.
                linha = linha.replace(".", ",")
                print(linha)
                log_file.write(linha + "\n")
                log_file.flush()
    return nlp_train

In [None]:
def executa_teste_modelo(model_path, label, test_data=None):
    """Score the model saved for ``label`` on test data and log the result.

    Args:
        model_path: Directory holding one saved pipeline per label.
        label: Entity label; selects the pipeline sub-directory.
        test_data: Raw ``(text, annotations)`` tuples, not yet converted to
            ``Example`` objects. When ``None`` the label's test split is
            loaded with ``load_data``.

    Returns:
        The metrics dictionary produced by ``nlp.evaluate``.
    """
    nlp_test = spacy.load(os.path.join(model_path, label))
    # Honour the data passed in; load from disk only when none was given.
    if test_data is None:
        test_data = load_data(label, TEST_DATASET, group_by_label=True)
    test_data_spacy = format_data_for_evaluation(test_data, nlp_test)
    test_metrics = nlp_test.evaluate(test_data_spacy)

    linha = f"{'Val.':5} | "
    linha += f"{'':8} | {test_metrics['ents_p']*100:06.2f} | {test_metrics['ents_r']*100:06.2f} | {test_metrics['ents_f']*100:06.2f} | {'':8} | "
    # str.replace returns a new string; keep it so decimal commas are emitted.
    linha = linha.replace(".", ",")

    with open(os.path.join(MODEL_TRAIN_PATH, "log_treino.txt"), "a", encoding="utf-8") as log_file:
        # Terminate the row so consecutive evaluations do not share a line.
        log_file.write(linha + "\n")
    print(linha)
    return test_metrics

### Treinamento para Labels e Bases Separadas - Validação após cada epoch (10 no total) e medição final, com a base de teste, do melhor modelo.

In [10]:
# Isolated SPECIES run: 10 iterations starting from the GloVe vectors.
train_data = load_data(LABEL_SPECIES, TRAIN_DATASET, group_by_label=True)
validate_data = load_data(LABEL_SPECIES, VALIDATE_DATASET, group_by_label=True)
nlp = train_spacy_one_label(
    train_data,
    validate_data,
    10,
    LABEL_SPECIES,
    word_embedding=WORD_VECTOR_GLOVE,
    description="Treino Isolado de Species com WordVec Glove 840B.300d",
)

#IT   | Loss     | Prec   | Recall | F_Scr  | Save Mod | Duração    
0     | 04194.89 | 090.64 | 068.13 | 077.79 | S        | 00:06:09
1     | 02881.98 | 069.26 | 065.84 | 067.51 | N        | 00:06:10
2     | 02411.55 | 087.22 | 074.16 | 080.16 | S        | 00:06:16
3     | 02064.36 | 080.93 | 072.88 | 076.69 | N        | 00:06:10
4     | 01943.09 | 086.64 | 072.24 | 078.78 | N        | 00:06:07
5     | 01731.40 | 082.46 | 071.69 | 076.70 | N        | 00:06:09
6     | 01784.68 | 076.08 | 072.33 | 074.16 | N        | 00:06:12
7     | 01691.52 | 084.88 | 073.33 | 078.69 | N        | 00:06:10
8     | 01609.53 | 086.99 | 072.69 | 079.20 | N        | 00:06:08
9     | 01611.48 | 087.64 | 051.14 | 064.59 | N        | 00:06:14


In [11]:
# Reload the best SPECIES model saved during training and score it on the test split.
nlp = spacy.load(os.path.join(MODEL_TRAIN_PATH, LABEL_SPECIES))
test_data = load_data(LABEL_SPECIES, TEST_DATASET, group_by_label=True)
test_data_spacy = format_data_for_evaluation(test_data, nlp)
test_metrics = nlp.evaluate(test_data_spacy)

# Append a blank line to the training log.
log_path = os.path.join(MODEL_TRAIN_PATH, "log_treino.txt")
with open(log_path, "a") as log_file:
    log_file.write("\n")

print(test_metrics)

{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'ents_p': 0.8052434456928839, 'ents_r': 0.6840909090909091, 'ents_f': 0.7397394937331039, 'ents_per_type': {'SPECIES': {'p': 0.8052434456928839, 'r': 0.6840909090909091, 'f': 0.7397394937331039}}, 'speed': 41456.909509558565}


In [12]:
# Isolated DRUG-PROTEIN run: 10 iterations starting from the GloVe vectors.
train_data = load_data(LABEL_DRUG_PROTEIN, TRAIN_DATASET, group_by_label=True)
validate_data = load_data(LABEL_DRUG_PROTEIN, VALIDATE_DATASET, group_by_label=True)
nlp = train_spacy_one_label(
    train_data,
    validate_data,
    10,
    LABEL_DRUG_PROTEIN,
    word_embedding=WORD_VECTOR_GLOVE,
    description="Treino Isolado de DRUG PROTEIN com WordVec Glove 840B.300d",
)

#IT   | Loss     | Prec   | Recall | F_Scr  | Save Mod | Duração    
0     | 42964.92 | 077.45 | 065.27 | 070.84 | S        | 00:12:32
1     | 36260.78 | 071.13 | 074.26 | 072.66 | S        | 00:12:33
2     | 34466.13 | 075.71 | 071.02 | 073.29 | S        | 00:12:31
3     | 34089.25 | 075.65 | 063.67 | 069.15 | N        | 00:12:22
4     | 33634.95 | 076.77 | 068.06 | 072.15 | N        | 00:12:24
5     | 33369.66 | 071.94 | 071.68 | 071.81 | N        | 00:12:31
6     | 32956.11 | 075.50 | 071.97 | 073.70 | S        | 00:12:32
7     | 32605.39 | 075.28 | 069.89 | 072.48 | N        | 00:12:31
8     | 32440.36 | 076.64 | 070.37 | 073.37 | N        | 00:12:28
9     | 32386.97 | 076.41 | 070.52 | 073.35 | N        | 00:12:25


In [13]:
# Reload the best DRUG-PROTEIN model saved during training and score it on the test split.
nlp = spacy.load(os.path.join(MODEL_TRAIN_PATH, LABEL_DRUG_PROTEIN))
test_data = load_data(LABEL_DRUG_PROTEIN, TEST_DATASET, group_by_label=True)
test_data_spacy = format_data_for_evaluation(test_data, nlp)
test_metrics = nlp.evaluate(test_data_spacy)
print(test_metrics)

{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'ents_p': 0.6930966469428008, 'ents_r': 0.6991087060321503, 'ents_f': 0.6960896953369518, 'ents_per_type': {'DRUG-PROTEIN': {'p': 0.6930966469428008, 'r': 0.6991087060321503, 'f': 0.6960896953369518}}, 'speed': 40232.105882322045}


In [14]:
# Isolated CHEMICAL run: 10 iterations starting from the GloVe vectors.
train_data = load_data(LABEL_CHEMICAL, TRAIN_DATASET, group_by_label=True)
validate_data = load_data(LABEL_CHEMICAL, VALIDATE_DATASET, group_by_label=True)
nlp = train_spacy_one_label(
    train_data,
    validate_data,
    10,
    LABEL_CHEMICAL,
    word_embedding=WORD_VECTOR_GLOVE,
    description="Treino Isolado de Chemical com WordVec Glove 840B.300d",
)

#IT   | Loss     | Prec   | Recall | F_Scr  | Save Mod | Duração    
0     | 30403.22 | 082.41 | 069.40 | 075.35 | S        | 00:15:17
1     | 20341.30 | 074.54 | 080.48 | 077.39 | S        | 00:15:19
2     | 17704.59 | 080.42 | 076.18 | 078.25 | S        | 00:15:25
3     | 15935.56 | 083.54 | 075.80 | 079.48 | S        | 00:15:31
4     | 15167.09 | 081.22 | 079.55 | 080.37 | S        | 00:15:34
5     | 14732.48 | 083.85 | 075.96 | 079.71 | N        | 00:15:28
6     | 14471.01 | 080.07 | 078.07 | 079.05 | N        | 00:15:41
7     | 14193.32 | 082.22 | 079.32 | 080.74 | S        | 00:15:54
8     | 13959.60 | 072.66 | 075.11 | 073.86 | N        | 00:15:33
9     | 13646.91 | 080.70 | 078.33 | 079.50 | N        | 00:15:26


In [15]:
# Reload the best CHEMICAL model saved during training and score it on the test split.
nlp = spacy.load(os.path.join(MODEL_TRAIN_PATH, LABEL_CHEMICAL))
test_data = load_data(LABEL_CHEMICAL, TEST_DATASET, group_by_label=True)
test_data_spacy = format_data_for_evaluation(test_data, nlp)
test_metrics = nlp.evaluate(test_data_spacy)
print(test_metrics)

{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'ents_p': 0.8280228758169934, 'ents_r': 0.7915134554684196, 'ents_f': 0.8093566473122931, 'ents_per_type': {'CHEMICAL': {'p': 0.8280228758169934, 'r': 0.7915134554684196, 'f': 0.8093566473122931}}, 'speed': 40307.3784585951}


In [16]:
# Isolated DISEASE run: 10 iterations starting from the GloVe vectors.
train_data = load_data(LABEL_DISEASE, TRAIN_DATASET, group_by_label=True)
validate_data = load_data(LABEL_DISEASE, VALIDATE_DATASET, group_by_label=True)
nlp = train_spacy_one_label(
    train_data,
    validate_data,
    10,
    LABEL_DISEASE,
    word_embedding=WORD_VECTOR_GLOVE,
    description="Treino Isolado de Disease com WordVec Glove 840B.300d",
)

#IT   | Loss     | Prec   | Recall | F_Scr  | Save Mod | Duração    
0     | 09596.31 | 076.89 | 059.37 | 067.00 | S        | 00:04:14
1     | 06218.46 | 077.50 | 069.96 | 073.54 | S        | 00:04:14
2     | 05396.39 | 078.43 | 069.06 | 073.45 | N        | 00:04:14
3     | 04872.73 | 074.57 | 072.88 | 073.71 | S        | 00:04:19
4     | 04394.85 | 073.00 | 066.24 | 069.46 | N        | 00:04:19
5     | 03992.47 | 078.99 | 070.06 | 074.26 | S        | 00:04:25
6     | 03731.85 | 078.80 | 070.61 | 074.48 | S        | 00:04:17
7     | 03602.43 | 078.48 | 070.59 | 074.33 | N        | 00:04:13
8     | 03327.46 | 082.83 | 068.07 | 074.73 | S        | 00:04:16
9     | 03210.17 | 076.92 | 073.04 | 074.93 | S        | 00:04:25


In [17]:
# Reload the best DISEASE model saved during training and score it on the test split.
nlp = spacy.load(os.path.join(MODEL_TRAIN_PATH, LABEL_DISEASE))
test_data = load_data(LABEL_DISEASE, TEST_DATASET, group_by_label=True)
test_data_spacy = format_data_for_evaluation(test_data, nlp)
test_metrics = nlp.evaluate(test_data_spacy)
print(test_metrics)

{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'ents_p': 0.7606013878180417, 'ents_r': 0.7329123328380386, 'ents_f': 0.7465001891789633, 'ents_per_type': {'DISEASE': {'p': 0.7606013878180417, 'r': 0.7329123328380386, 'f': 0.7465001891789633}}, 'speed': 40236.75947895969}


### Treinamento de Múltiplos Labels com uma camada de NER

In [9]:
def train_spacy_muliple_label(train_data, validate_data, iterations, lista_label, description):
    """Train one NER component on several labels, keeping the best epoch on disk.

    A blank English pipeline receives a single ``ner`` component with every
    label in ``lista_label``. After each iteration the pipeline is scored on
    ``validate_data``; whenever the overall entity F-score beats the best one
    seen so far the pipeline is written to ``MODEL_TRAIN_AGGRATE_PATH``.
    Overall and per-label scores are printed and appended to
    ``log_treino.txt`` in that directory.

    Args:
        train_data: Sequence of ``(text, annotations)`` tuples accepted by
            ``Example.from_dict``. It is copied before shuffling, so the
            caller's list keeps its order.
        validate_data: Tuples in the same format, used for scoring only.
        iterations: Number of passes over ``train_data``.
        lista_label: Labels to add to the component and to report on.
        description: Free text written to the log header.

    Returns:
        The pipeline as it stands after the last iteration, which is not
        necessarily the best-scoring one saved on disk.
    """
    nlp_train = spacy.blank("en")

    ner = nlp_train.add_pipe("ner")
    for label in lista_label:
        ner.add_label(label)

    # Shuffle a private copy so the caller's list is not reordered.
    train_data = list(train_data)
    best_ents_f = -1

    # The log is opened before any model is saved, so the directory has to
    # exist up front.
    Path(MODEL_TRAIN_AGGRATE_PATH).mkdir(parents=True, exist_ok=True)
    with open(os.path.join(MODEL_TRAIN_AGGRATE_PATH, "log_treino.txt"), "a", encoding="utf-8") as log_file:
        log_file.write("\n")
        log_file.write(f"=======> [LABELS AGREGADOS] Inicio Treino {time.strftime('%d/%m/%Y %H:%M:%S', time.gmtime(time.time()))} ===================================== \n")
        log_file.write(f"Detalhes: {description} \n\n")

        other_pipes = [pipe for pipe in nlp_train.pipe_names if pipe != "ner"]
        with nlp_train.select_pipes(disable=other_pipes):
            optimizer = nlp_train.initialize()
            print(f"{'#IT':5} | {'Loss':10} | {'Label':12} | {'Prec':6} | {'Recall':6} | {'F_Scr':6} | {'Save Mod':8} | {'Duração':10} ")
            # The rows carry a Label column, so the logged header needs it too.
            log_file.write(f"{'#IT':5} | {'Loss':10} | {'Label':12} | {'Prec':6} | {'Recall':6} | {'F_Scr':6} | {'Save Mod':8} | {'Duracao':10} \n")

            for itn in range(iterations):
                start_time = time.time()
                linha = f"{str(itn):5} | "
                random.shuffle(train_data)
                losses = {}

                # NOTE(review): each update receives a single example, so the
                # 512-item batches only chunk the iteration.
                for batch in minibatch(train_data, size=512):
                    for text, annotations in batch:
                        doc = nlp_train.make_doc(text)
                        example = Example.from_dict(doc, annotations)
                        nlp_train.update([example],
                                         drop=0.2,
                                         sgd=optimizer,
                                         losses=losses)

                validate_metrics = nlp_train.evaluate(format_data_for_evaluation(validate_data, nlp_train))
                # Summing is identical for the single component and yields 0
                # when nothing was trained.
                total_loss = sum(losses.values())
                linha += f"{total_loss:10.2f} | {'Geral':12} | {validate_metrics['ents_p']*100:06.2f} | {validate_metrics['ents_r']*100:06.2f} | {validate_metrics['ents_f']*100:06.2f} | "

                if validate_metrics["ents_f"] > best_ents_f:
                    best_ents_f = validate_metrics["ents_f"]
                    linha += f"{'S':8} | "
                    nlp_train.to_disk(MODEL_TRAIN_AGGRATE_PATH)
                else:
                    linha += f"{'N':8} | "

                elapsed = time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))
                linha += elapsed + " \n"

                # A label missing from the per-type scores is reported as
                # zeros instead of raising KeyError.
                per_type = validate_metrics.get("ents_per_type") or {}
                for label in lista_label:
                    scores = per_type.get(label, {})
                    linha += f"{'':5} | {'':10} | {label:12} | {scores.get('p', 0.0)*100:06.2f} | {scores.get('r', 0.0)*100:06.2f} | {scores.get('f', 0.0)*100:06.2f} | {'':8} | \n"

                print(linha)
                log_file.write(linha + "\n")
                log_file.flush()
    return nlp_train

In [None]:
# Aggregate run: every label trained in one NER component.
# With group_by_label=False, load_data does not use the label argument.
train_data = load_data(LABEL_DISEASE, TRAIN_DATASET, group_by_label=False)
validate_data = load_data(LABEL_DISEASE, VALIDATE_DATASET, group_by_label=False)
nlp = train_spacy_muliple_label(
    train_data,
    validate_data,
    10,
    LABEL_LIST,
    "Treino Agregado, Multiplos Labels com uma camada de NER",
)

### Treinamento de Múltiplos Labels com Múltiplas camadas de NER

In [13]:
def train_spacy_muliple_label_multiple_ner_layer(train_data, validate_data, iterations, lista_label, description):
    """Train one NER component per label in a single pipeline.

    A blank English pipeline receives one ``ner`` component named
    ``<label>_ner`` for every label in ``lista_label``. After each iteration
    the pipeline is scored on ``validate_data``; whenever the overall entity
    F-score beats the best one seen so far the pipeline is written to
    ``MODEL_TRAIN_AGGRATE_PATH``. Overall and per-label scores are printed
    and appended to ``log_treino.txt`` in that directory.

    Args:
        train_data: Sequence of ``(text, annotations)`` tuples accepted by
            ``Example.from_dict``. It is copied before shuffling, so the
            caller's list keeps its order.
        validate_data: Tuples in the same format, used for scoring only.
        iterations: Number of passes over ``train_data``.
        lista_label: Labels; each one gets its own NER component.
        description: Free text written to the log header.

    Returns:
        The pipeline as it stands after the last iteration, which is not
        necessarily the best-scoring one saved on disk.
    """
    nlp_train = spacy.blank("en")

    lista_ner_pipes = []
    for label in lista_label:
        ner_name = label + "_ner"
        ner = nlp_train.add_pipe("ner", name=ner_name)
        ner.add_label(label)
        lista_ner_pipes.append(ner_name)

    # Shuffle a private copy so the caller's list is not reordered.
    train_data = list(train_data)
    best_ents_f = -1

    # The log is opened before any model is saved, so the directory has to
    # exist up front.
    Path(MODEL_TRAIN_AGGRATE_PATH).mkdir(parents=True, exist_ok=True)
    with open(os.path.join(MODEL_TRAIN_AGGRATE_PATH, "log_treino.txt"), "a", encoding="utf-8") as log_file:
        log_file.write("\n")
        log_file.write(f"=======> [LABELS AGREGADOS] Inicio Treino {time.strftime('%d/%m/%Y %H:%M:%S', time.gmtime(time.time()))} ===================================== \n")
        log_file.write(f"Detalhes: {description} \n\n")

        other_pipes = [pipe for pipe in nlp_train.pipe_names if pipe not in lista_ner_pipes]
        with nlp_train.select_pipes(disable=other_pipes):
            optimizer = nlp_train.initialize()
            print(f"{'#IT':5} | {'Loss':10} | {'Label':12} | {'Prec':6} | {'Recall':6} | {'F_Scr':6} | {'Save Mod':8} | {'Duração':10} ")
            # The rows carry a Label column, so the logged header needs it too.
            log_file.write(f"{'#IT':5} | {'Loss':10} | {'Label':12} | {'Prec':6} | {'Recall':6} | {'F_Scr':6} | {'Save Mod':8} | {'Duracao':10} \n")

            for itn in range(iterations):
                start_time = time.time()
                linha = f"{str(itn):5} | "
                random.shuffle(train_data)
                losses = {}

                # NOTE(review): each update receives a single example, so the
                # 512-item batches only chunk the iteration.
                for batch in minibatch(train_data, size=512):
                    for text, annotations in batch:
                        doc = nlp_train.make_doc(text)
                        example = Example.from_dict(doc, annotations)
                        nlp_train.update([example],
                                         drop=0.2,
                                         sgd=optimizer,
                                         losses=losses)

                validate_metrics = nlp_train.evaluate(format_data_for_evaluation(validate_data, nlp_train))
                # Report the loss of all NER components together, not just
                # whichever one happens to come first in the dict.
                total_loss = sum(losses.values())
                linha += f"{total_loss:10.2f} | {'Geral':12} | {validate_metrics['ents_p']*100:06.2f} | {validate_metrics['ents_r']*100:06.2f} | {validate_metrics['ents_f']*100:06.2f} | "

                if validate_metrics["ents_f"] > best_ents_f:
                    best_ents_f = validate_metrics["ents_f"]
                    linha += f"{'S':8} | "
                    nlp_train.to_disk(MODEL_TRAIN_AGGRATE_PATH)
                else:
                    linha += f"{'N':8} | "

                elapsed = time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))
                linha += elapsed + " \n"

                # A label missing from the per-type scores is reported as
                # zeros instead of raising KeyError.
                per_type = validate_metrics.get("ents_per_type") or {}
                for label in lista_label:
                    scores = per_type.get(label, {})
                    linha += f"{'':5} | {'':10} | {label:12} | {scores.get('p', 0.0)*100:06.2f} | {scores.get('r', 0.0)*100:06.2f} | {scores.get('f', 0.0)*100:06.2f} | {'':8} | \n"

                print(linha)
                log_file.write(linha + "\n")
                log_file.flush()
    return nlp_train

In [None]:
# Aggregate run: one NER component per label inside a single pipeline.
# With group_by_label=False, load_data does not use the label argument.
train_data = load_data(LABEL_DISEASE, TRAIN_DATASET, group_by_label=False)
validate_data = load_data(LABEL_DISEASE, VALIDATE_DATASET, group_by_label=False)
nlp = train_spacy_muliple_label_multiple_ner_layer(
    train_data,
    validate_data,
    10,
    LABEL_LIST,
    "Treino Agregado, Multiplos Labels com multiplas camadas  NER",
)