# Desafio NER - Treino Spacy 
* Treinamento com Entidades Separadas, um modelo para cada label - Bons resultados quando executados separados, bons resultados se testados com datasets separados, mas resultados ruins quando utilizado um dataset com todos os labels.
* Treinamento com Todas as Entidades - Em andamento

In [1]:
import os
from pathlib import Path
import re
import string
import json
import random
import sys
import subprocess
import time
import datetime
import shutil

from tqdm import tqdm
import spacy
from spacy.tokens import DocBin
from spacy.util import minibatch, compounding
from spacy.training import Example

KMP_DUPLICATE_LIB_OK=True

In [12]:
LABEL_DRUG_PROTEIN = 'DRUG-PROTEIN'
LABEL_CHEMICAL = 'CHEMICAL'
LABEL_DISEASE = 'DISEASE'
LABEL_SPECIES = 'SPECIES'

LABEL_LIST = [LABEL_DRUG_PROTEIN,
              LABEL_CHEMICAL,
              LABEL_DISEASE,    
              LABEL_SPECIES]

LABEL_TO_DIR = {
    LABEL_DRUG_PROTEIN: ['BC2GM', 'JNLPBA'],
    LABEL_CHEMICAL: ['BC4CHEMD','BC5CDR-chem'],
    LABEL_DISEASE: ['BC5CDR-disease', 'NCBI-disease'],    
    LABEL_SPECIES: ['linnaeus', 's800']
}

DATA_ORIGIN_PATH = os.path.join("data","origin")
DATA_PREPARED_PATH = os.path.join("data", "prepared")
DATA_AGGREGATE_PATH = os.path.join(DATA_PREPARED_PATH, "aggregate")

WORD_VECTOR_PATH = "word_vec"
WORD_VECTOR_MODEL_NAME = os.path.join(WORD_VECTOR_PATH,"biomed.model")
WORD_VECTOR_FILE_NAME = os.path.join(WORD_VECTOR_PATH,"biomed_word2vec.txt")
WORD_VECTOR_GLOVE = os.path.join(WORD_VECTOR_PATH,"glove.840B.300d.txt")
WORD_VECTOR_PUBMED_PMC_ORIGINAL = os.path.join(WORD_VECTOR_PATH, "ri-3gram-400-tsv", "vectors.tsv")
VOCAB_PUBMED_PMC = os.path.join(WORD_VECTOR_PATH, "ri-3gram-400-tsv", "vocab.tsv")
WORD_VECTOR_PUBMED_PMC_PREPARED = os.path.join(WORD_VECTOR_PATH, "pubmed_pmc_word2vec.txt")


MODEL_PATH = "model"
MODEL_TRAIN_PATH = os.path.join(MODEL_PATH, "train")
MODEL_ACTUAL_PATH = os.path.join(MODEL_PATH, "actual")

TSV_EXTENSION = ".tsv"
JSON_EXTENSION = ".json"
SPACY_EXTENSION = ".spacy"

AUG_SUFFIXE = "_aug"

TRAIN_DEV_DATASET = "train_dev"
TRAIN_DATASET = "train"
TRAIN_AUGMENT_DATASET = "train_aug"
VALIDATE_DATASET = "devel"
TEST_DATASET = "test"
TRAIN_AUG_VALIDATE_DATASET = "train_dev_aug"

DATASET_TYPE = [TRAIN_DATASET, VALIDATE_DATASET, TEST_DATASET]

#### Carrega os datasets convertidos para formato json conhecido do spacy. A versão 3 utiliza a classe *Example* e será tratada no treinamento

In [3]:
def read_file(file):
    data = []
    with open (file, "r", encoding="utf-8") as f:
        json_data = json.load(f)
    for an in json_data:
        texto = an["texto"].lower()
        entities = []
        for entidade in an["entities"]:
            info = (entidade["start"], entidade["end"], entidade["label"])
            entities.append(info)
        data.append((texto,{"entities":entities}))
    return data

In [4]:
def load_data(label, dataset_type, load_aug_file=False):
    file = os.path.join(DATA_PREPARED_PATH, label, label + "-" + dataset_type + JSON_EXTENSION)
    data = read_file(file) 
    
    if load_aug_file:
        file_aug = os.path.join(DATA_PREPARED_PATH, label, label + "-" + dataset_type + AUG_SUFFIXE + JSON_EXTENSION)
        data.extend(read_file(file_aug))
    return data

#### Converte os dados em formato válido até a versão 2.0 do Spacy para a classe Example, que passou a ser utilizada na versão 3

In [5]:
def format_data_for_evaluation(data, nlp_model): 
    data_formated = []
    for text, annotations in data:
        doc = nlp_model.make_doc(text)
        example = Example.from_dict(doc, annotations)
        data_formated.append(example)
    return data_formated


### Treinamento de um Label utilizando sua respectiva base para treinamento

In [6]:
def create_model_with_word_vector(label, word_vec):
    path = os.path.join(MODEL_TRAIN_PATH, label)
    if (os.path.exists):
        shutil.rmtree(path, ignore_errors=True)
    
    Path(path).mkdir(parents=True, exist_ok=True)
    
    subprocess.run([sys.executable,
                    "-m",
                    "spacy",
                    "init",
                    "vectors",
                    "en",
                    word_vec,
                    path,
                    ])
    
    return spacy.load(path)

In [7]:
def train_spacy_one_label(train_data, validate_data, iterations, label, word_embedding=None, description=""):
    ner_name = label + "_ner"    
    if word_embedding == None:
        nlp_train = spacy.blank("en")
    else:
        nlp_train = create_model_with_word_vector(label, word_embedding)
    
    ner = nlp_train.add_pipe("ner",name=ner_name)
    ner.add_label(label)    
    
    best_ents_p = -1
    best_ents_r = -1
    best_ents_f = -1
    
    with open(os.path.join(MODEL_TRAIN_PATH, "log_treino.txt" ) , "a") as log_file:
        log_file.write("\n")
        log_file.write(f"=======> [LABELS ISOLADOS] Inicio Treino {time.strftime('%d/%m/%Y %H:%M:%S', time.gmtime(time.time()))} ===================================== \n")
        log_file.write(f"Detalhes: {description} \n\n")
        
        other_pipes = [pipe for pipe in nlp_train.pipe_names if pipe != ner_name]
        with nlp_train.disable_pipes(*other_pipes):
            optimizer = nlp_train.begin_training()
            #optimizer = nlp_train.create_optimizer()
            print(f"{'#IT':5} | {'Loss':8} | {'Prec':6} | {'Recall':6} | {'F_Scr':6} | {'Save Mod':8} | {'Duração':10} ")
            log_file.write(f"{'#IT':5} | {'Loss':8} | {'Prec':6} | {'Recall':6} | {'F_Scr':6} | {'Save Mod':8} | {'Duracao':10} \n")
            
            for itn in range(iterations):
                start_time = time.time()
                linha = f"{str(itn):5} | "
                random.shuffle(train_data)
                losses = {}
                batches = minibatch(train_data, size=512)

                for batch in batches:
                    for text, annotations in batch:
                        doc = nlp_train.make_doc(text)
                        example = Example.from_dict(doc, annotations)
                        nlp_train.update( [example],
                            drop=0.2,  
                            sgd=optimizer,
                            losses=losses)                
                
                validate_metrics = nlp_train.evaluate(format_data_for_evaluation(validate_data, nlp_train))
                linha += f"{losses[list(losses)[0]]:08.2f} | {validate_metrics['ents_p']*100:06.2f} | {validate_metrics['ents_r']*100:06.2f} | {validate_metrics['ents_f']*100:06.2f} | "

                if (validate_metrics["ents_f"] > best_ents_f):
                    best_ents_r = validate_metrics["ents_r"]
                    best_ents_f = validate_metrics["ents_f"]
                    best_ents_p = validate_metrics["ents_p"]
                    linha += f"{'S':8} | "

                    path = os.path.join(MODEL_TRAIN_PATH, label)
                    Path(path).mkdir(parents=True, exist_ok=True)
                    nlp_train.to_disk(path)                
                else:
                    linha += f"{'N':8} | "

                elapsed = time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))
                linha += elapsed

                linha.replace(".", ",")
                print(linha)
                log_file.write(linha + "\n")
                log_file.flush()
                
    return nlp_train

In [8]:
def executa_teste_modelo(model_path, label):
    start_time = time.time()
    nlp_test = spacy.load(os.path.join(model_path, label))
    test_data = load_data(label, TEST_DATASET)
    test_data_spacy=format_data_for_evaluation(test_data, nlp_test)
    test_metrics = nlp_test.evaluate(test_data_spacy)
    elapsed = time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))
    with open(os.path.join(MODEL_TRAIN_PATH, "log_treino.txt" ) , "a") as log_file:
        linha = f"{'Val.':5} | "
        linha += f"{'':8} | {test_metrics['ents_p']*100:06.2f} | {test_metrics['ents_r']*100:06.2f} | {test_metrics['ents_f']*100:06.2f} | {'':8} | {elapsed} "
        linha.replace(".", ",")
        log_file.write(linha)
        print(linha)
    

In [9]:
def treina_modelo(label, word_embedding=None, load_aug_file=False, description=""):
    
    validate_data = load_data(label, VALIDATE_DATASET)    
    train_data =  load_data(label, TRAIN_DATASET, load_aug_file)
    nlp = train_spacy_one_label(train_data,
                                validate_data,                                 
                                10, 
                                label, 
                                word_embedding,
                                description=description)

### Treinamento - Validação após cada epoch (10 no total) e medição final com a base de teste do melhor modelo.

In [14]:
treina_modelo(LABEL_SPECIES, 
              word_embedding=WORD_VECTOR_GLOVE, 
              load_aug_file=False, 
              description="Treinamento com Embedding Glove, Lowercase")

#IT   | Loss     | Prec   | Recall | F_Scr  | Save Mod | Duração    
0     | 04724.02 | 093.16 | 054.70 | 068.93 | S        | 00:06:10
1     | 03265.02 | 084.49 | 068.68 | 075.77 | S        | 00:06:15
2     | 02691.76 | 083.18 | 066.85 | 074.13 | N        | 00:06:10
3     | 02478.83 | 080.96 | 070.68 | 075.48 | N        | 00:06:11
4     | 02172.41 | 074.71 | 070.68 | 072.64 | N        | 00:06:10
5     | 02118.62 | 081.97 | 065.21 | 072.63 | N        | 00:06:10
6     | 01970.32 | 077.36 | 070.23 | 073.62 | N        | 00:06:12
7     | 01817.55 | 082.87 | 068.49 | 075.00 | N        | 00:06:12
8     | 01845.41 | 082.90 | 069.95 | 075.88 | S        | 00:06:15
9     | 01820.76 | 083.85 | 068.77 | 075.56 | N        | 00:06:11


In [10]:
executa_teste_modelo(MODEL_TRAIN_PATH, LABEL_SPECIES)

Val.  |          | 080.77 | 065.86 | 072.56 |          | 00:00:10 


In [38]:
treina_modelo(LABEL_DRUG_PROTEIN, 
              word_embedding=None, 
              load_aug_file=True, 
              description="Treinamento DRUG PROTEIN com arquivo augment_file, com Validação")

#IT   | Loss     | Prec   | Recall | F_Scr  | Save Mod | Duração    




ValueError: [E103] Trying to set conflicting doc.ents: '(10, 20, 'DRUG-PROTEIN')' and '(-1, 29, 'DRUG-PROTEIN')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap. To work with overlapping entities, consider using doc.spans instead.

In [13]:
nlp = spacy.load(os.path.join(MODEL_TRAIN_PATH, LABEL_DRUG_PROTEIN))
test_data = load_data(LABEL_DRUG_PROTEIN, TEST_DATASET, group_by_label=True)
test_data_spacy=format_data_for_evaluation(test_data, nlp)
test_metrics = nlp.evaluate(test_data_spacy)
print(test_metrics)

{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'ents_p': 0.6930966469428008, 'ents_r': 0.6991087060321503, 'ents_f': 0.6960896953369518, 'ents_per_type': {'DRUG-PROTEIN': {'p': 0.6930966469428008, 'r': 0.6991087060321503, 'f': 0.6960896953369518}}, 'speed': 40232.105882322045}


In [14]:
train_data =  load_data(LABEL_CHEMICAL, TRAIN_DATASET, group_by_label=True)
validate_data = load_data(LABEL_CHEMICAL, VALIDATE_DATASET, group_by_label=True)
nlp = train_spacy_one_label(train_data,validate_data, 10, LABEL_CHEMICAL, word_embedding=WORD_VECTOR_GLOVE ,description="Treino Isolado de Chemical com WordVec Glove 840B.300d")

#IT   | Loss     | Prec   | Recall | F_Scr  | Save Mod | Duração    
0     | 30403.22 | 082.41 | 069.40 | 075.35 | S        | 00:15:17
1     | 20341.30 | 074.54 | 080.48 | 077.39 | S        | 00:15:19
2     | 17704.59 | 080.42 | 076.18 | 078.25 | S        | 00:15:25
3     | 15935.56 | 083.54 | 075.80 | 079.48 | S        | 00:15:31
4     | 15167.09 | 081.22 | 079.55 | 080.37 | S        | 00:15:34
5     | 14732.48 | 083.85 | 075.96 | 079.71 | N        | 00:15:28
6     | 14471.01 | 080.07 | 078.07 | 079.05 | N        | 00:15:41
7     | 14193.32 | 082.22 | 079.32 | 080.74 | S        | 00:15:54
8     | 13959.60 | 072.66 | 075.11 | 073.86 | N        | 00:15:33
9     | 13646.91 | 080.70 | 078.33 | 079.50 | N        | 00:15:26


In [15]:
nlp = spacy.load(os.path.join(MODEL_TRAIN_PATH, LABEL_CHEMICAL))
test_data = load_data(LABEL_CHEMICAL, TEST_DATASET, group_by_label=True)
test_data_spacy=format_data_for_evaluation(test_data, nlp)
test_metrics = nlp.evaluate(test_data_spacy)
print(test_metrics)

{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'ents_p': 0.8280228758169934, 'ents_r': 0.7915134554684196, 'ents_f': 0.8093566473122931, 'ents_per_type': {'CHEMICAL': {'p': 0.8280228758169934, 'r': 0.7915134554684196, 'f': 0.8093566473122931}}, 'speed': 40307.3784585951}


In [16]:
train_data =  load_data(LABEL_DISEASE, TRAIN_DATASET, group_by_label=True)
validate_data = load_data(LABEL_DISEASE, VALIDATE_DATASET, group_by_label=True)
nlp = train_spacy_one_label(train_data,validate_data, 10, LABEL_DISEASE, word_embedding=WORD_VECTOR_GLOVE ,description="Treino Isolado de Disease com WordVec Glove 840B.300d")

#IT   | Loss     | Prec   | Recall | F_Scr  | Save Mod | Duração    
0     | 09596.31 | 076.89 | 059.37 | 067.00 | S        | 00:04:14
1     | 06218.46 | 077.50 | 069.96 | 073.54 | S        | 00:04:14
2     | 05396.39 | 078.43 | 069.06 | 073.45 | N        | 00:04:14
3     | 04872.73 | 074.57 | 072.88 | 073.71 | S        | 00:04:19
4     | 04394.85 | 073.00 | 066.24 | 069.46 | N        | 00:04:19
5     | 03992.47 | 078.99 | 070.06 | 074.26 | S        | 00:04:25
6     | 03731.85 | 078.80 | 070.61 | 074.48 | S        | 00:04:17
7     | 03602.43 | 078.48 | 070.59 | 074.33 | N        | 00:04:13
8     | 03327.46 | 082.83 | 068.07 | 074.73 | S        | 00:04:16
9     | 03210.17 | 076.92 | 073.04 | 074.93 | S        | 00:04:25


In [17]:
nlp = spacy.load(os.path.join(MODEL_TRAIN_PATH, LABEL_DISEASE))
test_data = load_data(LABEL_DISEASE, TEST_DATASET, group_by_label=True)
test_data_spacy=format_data_for_evaluation(test_data, nlp)
test_metrics = nlp.evaluate(test_data_spacy)
print(test_metrics)

{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'ents_p': 0.7606013878180417, 'ents_r': 0.7329123328380386, 'ents_f': 0.7465001891789633, 'ents_per_type': {'DISEASE': {'p': 0.7606013878180417, 'r': 0.7329123328380386, 'f': 0.7465001891789633}}, 'speed': 40236.75947895969}


In [None]:
train_data =  load_data(LABEL_DISEASE, TRAIN_DATASET, group_by_label=False)
validate_data = load_data(LABEL_DISEASE, VALIDATE_DATASET, group_by_label=False)
nlp = train_spacy_muliple_label(train_data,validate_data, 10, LABEL_LIST, "Treino Agregado, Multiplos Labels com uma camada de NER")