# Desafio NER - Treino Spacy 
* Treinamento com Entidades Separadas, um modelo para cada label - Bons resultados quando os modelos são executados separadamente e testados com datasets separados, mas resultados ruins quando é utilizado um dataset com todos os labels.
* Treinamento com Todas as Entidades - Em andamento

In [1]:
import os
from pathlib import Path
import re
import string
import json
import random
import sys
import time
import datetime

from tqdm import tqdm
import spacy
from spacy.tokens import DocBin
from spacy.util import minibatch, compounding
from spacy.training import Example


In [2]:
# Entity labels handled by this notebook.
LABEL_DRUG_PROTEIN = "DRUG-PROTEIN"
LABEL_CHEMICAL = "CHEMICAL"
LABEL_DISEASE = "DISEASE"
LABEL_SPECIES = "SPECIES"

LABEL_LIST = [
    LABEL_DRUG_PROTEIN,
    LABEL_CHEMICAL,
    LABEL_DISEASE,
    LABEL_SPECIES,
]

# Maps each label to a list of directory names (presumably the source corpora
# under DATA_ORIGIN_PATH -- confirm against the data-preparation code).
LABEL_TO_DIR = {
    LABEL_DRUG_PROTEIN: ["BC2GM", "JNLPBA"],
    LABEL_CHEMICAL: ["BC4CHEMD", "BC5CDR-chem"],
    LABEL_DISEASE: ["BC5CDR-disease", "NCBI-disease"],
    LABEL_SPECIES: ["linnaeus", "s800"],
}

# Data folders (relative to the working directory).
DATA_ORIGIN_PATH = os.path.join("data", "origin")
DATA_PREPARED_PATH = os.path.join("data", "prepared")
DATA_AGGREGATE_PATH = os.path.join(DATA_PREPARED_PATH, "aggregate")

# Model folders. The spelling of MODEL_TRAIN_AGGRATE_PATH is kept as-is because
# other cells reference the constant by this name.
MODEL_PATH = "model"
MODEL_TRAIN_PATH = os.path.join(MODEL_PATH, "train")
MODEL_TRAIN_AGGRATE_PATH = os.path.join(MODEL_PATH, "train-aggregate")
MODEL_ACTUAL_PATH = os.path.join(MODEL_PATH, "actual")

# File extensions.
TSV_EXTENSION = ".tsv"
JSON_EXTENSION = ".json"
SPACY_EXTENSION = ".spacy"

# Dataset split names used when building file names.
TRAIN_DEV_DATASET = "train_dev"
TRAIN_DATASET = "train"
VALIDATE_DATASET = "devel"
TEST_DATASET = "test"

DATASET_TYPE = [TRAIN_DATASET, VALIDATE_DATASET, TEST_DATASET]

#### Carrega os datasets convertidos para formato json conhecido do spacy. A versão 3 utiliza a classe *Example* e será tratada no treinamento

In [3]:
def load_data(label, dataset_type, group_by_label=True):
    """Load one prepared JSON dataset as ``(text, annotations)`` tuples.

    Args:
        label: Entity label whose dataset is read. Only used to build the
            path when ``group_by_label`` is true.
        dataset_type: Dataset split name used to build the file name.
        group_by_label: When true, read
            ``DATA_PREPARED_PATH/<label>/<label>-<dataset_type>.json``;
            otherwise read ``DATA_AGGREGATE_PATH/<dataset_type>.json``.

    Returns:
        A list with one ``(text, {"entities": [(start, end, label), ...]})``
        tuple per record of the JSON file.
    """
    if not group_by_label:
        json_path = os.path.join(DATA_AGGREGATE_PATH, dataset_type + JSON_EXTENSION)
    else:
        file_name = "".join((label, "-", dataset_type, JSON_EXTENSION))
        json_path = os.path.join(DATA_PREPARED_PATH, label, file_name)

    with open(json_path, "r", encoding="utf-8") as handle:
        records = json.load(handle)

    # Each record carries its text under "texto" and a list of entity dicts.
    return [
        (
            record["texto"],
            {
                "entities": [
                    (ent["start"], ent["end"], ent["label"])
                    for ent in record["entities"]
                ]
            },
        )
        for record in records
    ]


#### Converte os dados em formato válido até a versão 2.0 do Spacy para a classe Example, que passou a ser utilizada na versão 3

In [4]:
def format_data_for_evaluation(data, nlp_model):
    """Convert ``(text, annotations)`` tuples into spaCy ``Example`` objects.

    Args:
        data: Iterable of ``(text, annotations)`` pairs, where ``annotations``
            is the dict handed to ``Example.from_dict``.
        nlp_model: Pipeline whose ``make_doc`` is used to build each document.

    Returns:
        A list with one ``Example`` per input pair, in input order.
    """
    return [
        Example.from_dict(nlp_model.make_doc(text), annotations)
        for text, annotations in data
    ]


### Treinamento de um Label utilizando sua respectiva base para treinamento

In [5]:
def train_spacy_one_label(train_data, validate_data, iterations, label, description):
    """Train a blank English spaCy NER pipeline for a single entity label.

    After every iteration the pipeline is evaluated on ``validate_data``.
    Whenever the entity F-score beats the best value seen so far, the pipeline
    is saved to ``MODEL_TRAIN_PATH/<label>``. A progress table is printed and
    appended to ``MODEL_TRAIN_PATH/log_treino.txt``.

    Args:
        train_data: ``(text, {"entities": [...]})`` pairs. The sequence is
            copied before shuffling, so the caller's list is left untouched.
        validate_data: Pairs in the same format, used for the per-iteration
            evaluation.
        iterations: Number of passes over ``train_data``.
        label: The entity label to train; also names the NER pipe
            (``<label>_ner``) and the output folder.
        description: Free text written to the log header.

    Returns:
        The pipeline as it stands after the LAST iteration. The best-scoring
        checkpoint is the one on disk, which may be from an earlier iteration.
    """
    ner_name = label + "_ner"
    nlp_train = spacy.blank("en")

    ner = nlp_train.add_pipe("ner", name=ner_name)
    ner.add_label(label)

    best_ents_f = -1

    # random.shuffle below reorders in place; work on a private copy so that
    # training does not reorder the caller's list as a side effect.
    train_data = list(train_data)

    # Opening the log in append mode does not create missing parent folders.
    Path(MODEL_TRAIN_PATH).mkdir(parents=True, exist_ok=True)
    model_path = os.path.join(MODEL_TRAIN_PATH, label)

    with open(os.path.join(MODEL_TRAIN_PATH, "log_treino.txt"), "a") as log_file:
        log_file.write("\n")
        log_file.write(f"=======> [LABELS ISOLADOS] Inicio Treino {time.strftime('%d/%m/%Y %H:%M:%S', time.gmtime(time.time()))} ===================================== \n")
        log_file.write(f"Detalhes: {description} \n\n")

        other_pipes = [pipe for pipe in nlp_train.pipe_names if pipe != ner_name]
        # select_pipes/initialize are the spaCy v3 replacements for the
        # deprecated disable_pipes/begin_training calls.
        with nlp_train.select_pipes(disable=other_pipes):
            optimizer = nlp_train.initialize()
            print(f"{'#IT':5} | {'Loss':8} | {'Prec':6} | {'Recall':6} | {'F_Scr':6} | {'Save Mod':8} | {'Duração':10} ")
            log_file.write(f"{'#IT':5} | {'Loss':8} | {'Prec':6} | {'Recall':6} | {'F_Scr':6} | {'Save Mod':8} | {'Duracao':10} \n")

            for itn in range(iterations):
                start_time = time.time()
                linha = f"{str(itn):5} | "
                random.shuffle(train_data)
                losses = {}
                batches = minibatch(train_data, size=512)

                # NOTE(review): the model is updated one example at a time, so
                # size=512 only groups the iteration and is not an optimizer
                # batch size. Kept as-is to preserve the training behaviour the
                # recorded results were produced with.
                for batch in batches:
                    for text, annotations in batch:
                        doc = nlp_train.make_doc(text)
                        example = Example.from_dict(doc, annotations)
                        nlp_train.update(
                            [example],
                            drop=0.2,
                            sgd=optimizer,
                            losses=losses,
                        )

                validate_metrics = nlp_train.evaluate(format_data_for_evaluation(validate_data, nlp_train))
                # The NER pipe reports its loss under its own name; default to
                # 0.0 when no update ran (empty training data).
                loss = losses.get(ner_name, 0.0)
                linha += f"{loss:08.2f} | {validate_metrics['ents_p']*100:06.2f} | {validate_metrics['ents_r']*100:06.2f} | {validate_metrics['ents_f']*100:06.2f} | "

                if validate_metrics["ents_f"] > best_ents_f:
                    # New best F-score: persist this checkpoint.
                    best_ents_f = validate_metrics["ents_f"]
                    linha += f"{'S':8} | "

                    Path(model_path).mkdir(parents=True, exist_ok=True)
                    nlp_train.to_disk(model_path)
                else:
                    linha += f"{'N':8} | "

                elapsed = time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))
                linha += elapsed

                print(linha)
                log_file.write(linha + "\n")
                # Flush so progress survives an interrupted long-running cell.
                log_file.flush()
    return nlp_train

### Treinamento para Labels e Bases Separadas  - Validação após cada epoch (10 no total) e medição final com a base de teste do melhor modelo.

In [38]:
# Train an isolated SPECIES model for 10 iterations; train_spacy_one_label
# saves the best-scoring checkpoint under MODEL_TRAIN_PATH/<label>.
train_data =  load_data(LABEL_SPECIES, TRAIN_DATASET, group_by_label=True)
validate_data = load_data(LABEL_SPECIES, VALIDATE_DATASET, group_by_label=True)
nlp = train_spacy_one_label(train_data,validate_data, 10, LABEL_SPECIES, "Treino Isolado de Species Sem WordVec")

#IT   | Loss     | Prec   | Recall | F_Scr  | Save Mod | Duração    
0     | 04188.94 | 076.83 | 070.87 | 073.73 | S          | 00:05:30
1     | 02844.82 | 081.21 | 062.74 | 070.79 | N        | 00:05:30
2     | 02529.17 | 087.76 | 073.33 | 079.90 | S          | 00:05:30
3     | 02122.87 | 082.96 | 081.83 | 082.39 | S          | 00:05:33
4     | 01942.68 | 087.16 | 081.19 | 084.07 | S          | 00:05:33
5     | 01837.41 | 088.96 | 077.99 | 083.11 | N        | 00:05:34
6     | 01751.14 | 080.30 | 073.33 | 076.66 | N        | 00:05:34
7     | 01720.78 | 080.21 | 068.86 | 074.10 | N        | 00:05:34
8     | 01672.09 | 077.00 | 065.11 | 070.56 | N        | 00:05:35
9     | 01602.27 | 070.52 | 065.75 | 068.05 | N        | 00:05:42


In [39]:
# Reload the SPECIES checkpoint saved during training (the best validation
# F-score, not necessarily the last iteration) and score it on the test split.
nlp = spacy.load(os.path.join(MODEL_TRAIN_PATH, LABEL_SPECIES))
test_data = load_data(LABEL_SPECIES, TEST_DATASET, group_by_label=True)
test_data_spacy=format_data_for_evaluation(test_data, nlp)
test_metrics = nlp.evaluate(test_data_spacy)
print(test_metrics)

{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'ents_p': 0.7764876632801161, 'ents_r': 0.7295454545454545, 'ents_f': 0.7522849777361142, 'ents_per_type': {'SPECIES': {'p': 0.7764876632801161, 'r': 0.7295454545454545, 'f': 0.7522849777361142}}, 'speed': 34624.15112357422}


In [42]:
# Train an isolated DRUG-PROTEIN model for 10 iterations; train_spacy_one_label
# saves the best-scoring checkpoint under MODEL_TRAIN_PATH/<label>.
train_data =  load_data(LABEL_DRUG_PROTEIN, TRAIN_DATASET, group_by_label=True)
validate_data = load_data(LABEL_DRUG_PROTEIN, VALIDATE_DATASET, group_by_label=True)
nlp = train_spacy_one_label(train_data,validate_data, 10, LABEL_DRUG_PROTEIN, "Treino Isolado de DRUG PROTEIN Sem WordVec")

#IT   | Loss     | Prec   | Recall | F_Scr  | Save Mod | Duração    
0     | 43113.34 | 070.26 | 063.12 | 066.50 | S        | 00:11:50
1     | 36124.18 | 075.39 | 070.30 | 072.76 | S        | 00:11:48
2     | 34349.47 | 071.76 | 070.02 | 070.88 | N        | 00:11:46
3     | 33830.05 | 075.77 | 068.31 | 071.85 | N        | 00:11:45
4     | 33232.88 | 075.05 | 073.14 | 074.08 | S        | 00:11:44
5     | 32905.07 | 074.65 | 069.36 | 071.91 | N        | 00:11:46
6     | 33033.97 | 072.32 | 075.34 | 073.80 | N        | 00:11:44
7     | 32860.86 | 079.15 | 061.21 | 069.03 | N        | 00:11:44
8     | 32480.39 | 071.40 | 074.10 | 072.73 | N        | 00:11:45
9     | 32548.00 | 069.61 | 076.86 | 073.05 | N        | 00:11:46


In [43]:
# Reload the DRUG-PROTEIN checkpoint saved during training (the best validation
# F-score, not necessarily the last iteration) and score it on the test split.
nlp = spacy.load(os.path.join(MODEL_TRAIN_PATH, LABEL_DRUG_PROTEIN))
test_data = load_data(LABEL_DRUG_PROTEIN, TEST_DATASET, group_by_label=True)
test_data_spacy=format_data_for_evaluation(test_data, nlp)
test_metrics = nlp.evaluate(test_data_spacy)
print(test_metrics)

{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'ents_p': 0.6799419049075065, 'ents_r': 0.7078624860735317, 'ents_f': 0.6936213349968808, 'ents_per_type': {'DRUG-PROTEIN': {'p': 0.6799419049075065, 'r': 0.7078624860735317, 'f': 0.6936213349968808}}, 'speed': 34590.965523060884}


In [45]:
# Train an isolated CHEMICAL model for 10 iterations; train_spacy_one_label
# saves the best-scoring checkpoint under MODEL_TRAIN_PATH/<label>.
train_data =  load_data(LABEL_CHEMICAL, TRAIN_DATASET, group_by_label=True)
validate_data = load_data(LABEL_CHEMICAL, VALIDATE_DATASET, group_by_label=True)
nlp = train_spacy_one_label(train_data,validate_data, 10, LABEL_CHEMICAL, "Treino Isolado de Chemical Sem WordVec")

#IT   | Loss     | Prec   | Recall | F_Scr  | Save Mod | Duração    
0     | 30452.82 | 081.58 | 070.28 | 075.51 | S        | 00:14:14
1     | 20290.45 | 083.21 | 071.02 | 076.63 | S        | 00:14:20
2     | 17549.54 | 082.34 | 070.25 | 075.81 | N        | 00:14:18
3     | 16225.91 | 078.94 | 076.82 | 077.86 | S        | 00:14:21
4     | 15679.11 | 081.17 | 078.86 | 080.00 | S        | 00:14:23
5     | 14946.28 | 082.43 | 075.89 | 079.03 | N        | 00:14:22
6     | 14709.57 | 077.19 | 080.89 | 078.99 | N        | 00:14:30
7     | 14459.19 | 084.41 | 076.28 | 080.14 | S        | 00:14:25
8     | 14340.50 | 085.50 | 073.83 | 079.24 | N        | 00:14:28
9     | 13740.24 | 081.07 | 078.77 | 079.90 | N        | 00:14:25


In [47]:
# Reload the CHEMICAL checkpoint saved during training (the best validation
# F-score, not necessarily the last iteration) and score it on the test split.
nlp = spacy.load(os.path.join(MODEL_TRAIN_PATH, LABEL_CHEMICAL))
test_data = load_data(LABEL_CHEMICAL, TEST_DATASET, group_by_label=True)
test_data_spacy=format_data_for_evaluation(test_data, nlp)
test_metrics = nlp.evaluate(test_data_spacy)
print(test_metrics)

{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'ents_p': 0.8547992254010012, 'ents_r': 0.7612833946178126, 'ents_f': 0.8053356282271945, 'ents_per_type': {'CHEMICAL': {'p': 0.8547992254010012, 'r': 0.7612833946178126, 'f': 0.8053356282271945}}, 'speed': 34570.738044853344}


In [48]:
# Train an isolated DISEASE model for 10 iterations; train_spacy_one_label
# saves the best-scoring checkpoint under MODEL_TRAIN_PATH/<label>.
train_data =  load_data(LABEL_DISEASE, TRAIN_DATASET, group_by_label=True)
validate_data = load_data(LABEL_DISEASE, VALIDATE_DATASET, group_by_label=True)
nlp = train_spacy_one_label(train_data,validate_data, 10, LABEL_DISEASE, "Treino Isolado de Disease Sem WordVec")

#IT   | Loss     | Prec   | Recall | F_Scr  | Save Mod | Duração    
0     | 09495.49 | 070.20 | 063.98 | 066.94 | S        | 00:04:04
1     | 06466.31 | 077.18 | 067.26 | 071.88 | S        | 00:04:12
2     | 05428.19 | 078.90 | 066.20 | 072.00 | S        | 00:04:03
3     | 04840.86 | 081.92 | 066.18 | 073.22 | S        | 00:04:04
4     | 04337.82 | 067.65 | 074.39 | 070.86 | N        | 00:04:02
5     | 03950.55 | 076.51 | 073.14 | 074.79 | S        | 00:04:02
6     | 03720.81 | 081.56 | 069.06 | 074.79 | S        | 00:04:01
7     | 03622.13 | 080.62 | 072.14 | 076.15 | S        | 00:04:02
8     | 03312.85 | 081.04 | 069.38 | 074.76 | N        | 00:04:02
9     | 03222.92 | 078.87 | 071.57 | 075.04 | N        | 00:04:02


In [50]:
# Reload the DISEASE checkpoint saved during training (the best validation
# F-score, not necessarily the last iteration) and score it on the test split.
nlp = spacy.load(os.path.join(MODEL_TRAIN_PATH, LABEL_DISEASE))
test_data = load_data(LABEL_DISEASE, TEST_DATASET, group_by_label=True)
test_data_spacy=format_data_for_evaluation(test_data, nlp)
test_metrics = nlp.evaluate(test_data_spacy)
print(test_metrics)

{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'ents_p': 0.8004568106312292, 'ents_r': 0.7160104011887073, 'ents_f': 0.7558823529411764, 'ents_per_type': {'DISEASE': {'p': 0.8004568106312292, 'r': 0.7160104011887073, 'f': 0.7558823529411764}}, 'speed': 38052.47472579074}


### Treinamento de Múltiplos Labels com uma Camada de NER

In [9]:
def train_spacy_muliple_label(train_data, validate_data, iterations, lista_label, description):
    """Train one blank English spaCy NER pipe on several entity labels at once.

    After every iteration the pipeline is evaluated on ``validate_data``.
    Whenever the overall entity F-score beats the best value seen so far, the
    pipeline is saved to ``MODEL_TRAIN_AGGRATE_PATH``. A progress table with
    one overall row plus one row per label is printed and appended to
    ``MODEL_TRAIN_AGGRATE_PATH/log_treino.txt``.

    Args:
        train_data: ``(text, {"entities": [...]})`` pairs. The sequence is
            copied before shuffling, so the caller's list is left untouched.
        validate_data: Pairs in the same format, used for the per-iteration
            evaluation.
        iterations: Number of passes over ``train_data``.
        lista_label: Entity labels to register on the NER pipe and to report
            per-label scores for.
        description: Free text written to the log header.

    Returns:
        The pipeline as it stands after the LAST iteration. The best-scoring
        checkpoint is the one on disk, which may be from an earlier iteration.
    """
    nlp_train = spacy.blank("en")

    ner = nlp_train.add_pipe("ner")
    for label in lista_label:
        ner.add_label(label)

    best_ents_f = -1

    # random.shuffle below reorders in place; work on a private copy so that
    # training does not reorder the caller's list as a side effect.
    train_data = list(train_data)

    # The aggregate model gets its own folder. Saving under
    # MODEL_TRAIN_PATH/<label> (with whatever label the loop above ended on)
    # would overwrite that label's single-label model.
    model_path = MODEL_TRAIN_AGGRATE_PATH
    # Opening the log in append mode does not create missing parent folders.
    Path(MODEL_TRAIN_AGGRATE_PATH).mkdir(parents=True, exist_ok=True)

    with open(os.path.join(MODEL_TRAIN_AGGRATE_PATH, "log_treino.txt"), "a") as log_file:
        log_file.write("\n")
        log_file.write(f"=======> [LABELS AGREGADOS] Inicio Treino {time.strftime('%d/%m/%Y %H:%M:%S', time.gmtime(time.time()))} ===================================== \n")
        log_file.write(f"Detalhes: {description} \n\n")

        other_pipes = [pipe for pipe in nlp_train.pipe_names if pipe != "ner"]
        # select_pipes/initialize are the spaCy v3 replacements for the
        # deprecated disable_pipes/begin_training calls.
        with nlp_train.select_pipes(disable=other_pipes):
            optimizer = nlp_train.initialize()
            print(f"{'#IT':5} | {'Loss':8} | {'Label':12} | {'Prec':6} | {'Recall':6} | {'F_Scr':6} | {'Save Mod':8} | {'Duração':10} ")
            # The log header carries the same columns as the rows below,
            # including 'Label'.
            log_file.write(f"{'#IT':5} | {'Loss':8} | {'Label':12} | {'Prec':6} | {'Recall':6} | {'F_Scr':6} | {'Save Mod':8} | {'Duracao':10} \n")

            for itn in range(iterations):
                start_time = time.time()
                linha = f"{str(itn):5} | "
                random.shuffle(train_data)
                losses = {}
                batches = minibatch(train_data, size=512)

                # NOTE(review): the model is updated one example at a time, so
                # size=512 only groups the iteration and is not an optimizer
                # batch size. Kept as-is to preserve training behaviour.
                for batch in batches:
                    for text, annotations in batch:
                        doc = nlp_train.make_doc(text)
                        example = Example.from_dict(doc, annotations)
                        nlp_train.update(
                            [example],
                            drop=0.2,
                            sgd=optimizer,
                            losses=losses,
                        )

                validate_metrics = nlp_train.evaluate(format_data_for_evaluation(validate_data, nlp_train))
                # The pipe was added under its default name "ner"; default to
                # 0.0 when no update ran (empty training data).
                loss = losses.get("ner", 0.0)
                linha += f"{loss:08.2f} | {'Geral':12} | {validate_metrics['ents_p']*100:06.2f} | {validate_metrics['ents_r']*100:06.2f} | {validate_metrics['ents_f']*100:06.2f} | "

                if validate_metrics["ents_f"] > best_ents_f:
                    # New best overall F-score: persist this checkpoint.
                    best_ents_f = validate_metrics["ents_f"]
                    linha += f"{'S':8} | \n"

                    Path(model_path).mkdir(parents=True, exist_ok=True)
                    nlp_train.to_disk(model_path)
                else:
                    linha += f"{'N':8} | \n"

                # A label can be missing from the per-type scores; report
                # zeros for it instead of failing on the lookup.
                per_type = validate_metrics.get("ents_per_type") or {}
                for label_name in lista_label:
                    scores = per_type.get(label_name, {"p": 0.0, "r": 0.0, "f": 0.0})
                    linha += f"{'':8} | {label_name:12} | {scores['p']*100:06.2f} | {scores['r']*100:06.2f} | {scores['f']*100:06.2f} | {'':8} | \n"

                elapsed = time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))
                linha += elapsed

                print(linha)
                log_file.write(linha + "\n")
                # Flush so progress survives an interrupted long-running cell.
                log_file.flush()
    return nlp_train

In [None]:
# Train a single NER pipe on every label using the aggregate datasets.
# With group_by_label=False, load_data builds the path from
# DATA_AGGREGATE_PATH and dataset_type only, so the LABEL_DISEASE argument
# is ignored here.
train_data =  load_data(LABEL_DISEASE, TRAIN_DATASET, group_by_label=False)
validate_data = load_data(LABEL_DISEASE, VALIDATE_DATASET, group_by_label=False)
nlp = train_spacy_muliple_label(train_data,validate_data, 10, LABEL_LIST, "Treino Agregado, Multiplos Labels com uma camada de NER")

#IT   | Loss     | Label        | Prec   | Recall | F_Scr  | Save Mod | Duração    
