In [48]:
import os
from pathlib import Path
import re
import string
import json
import random
import sys


from tqdm import tqdm
import spacy
from spacy.tokens import DocBin
from spacy.util import minibatch, compounding
from spacy.training import Example

In [57]:
# Entity labels handled by this notebook.
LABEL_DRUG_PROTEIN = 'DRUG-PROTEIN'
LABEL_CHEMICAL = 'CHEMICAL'
LABEL_DISEASE = 'DISEASE'
LABEL_SPECIES = 'SPECIES'

# Corpus sub-directories (inside DATASET_PATH) whose files are merged for each label.
LABEL_TO_DIR = {
    LABEL_DRUG_PROTEIN: ['BC2GM', 'JNLPBA'],
    LABEL_CHEMICAL: ['BC4CHEMD', 'BC5CDR-chem'],
    LABEL_DISEASE: ['BC5CDR-disease', 'NCBI-disease'],
    LABEL_SPECIES: ['linnaeus', 's800'],
}

# Labels in processing order; a dict iterates in insertion order, so this is
# exactly the key order written above.
LABEL_LIST = list(LABEL_TO_DIR)

# Root folder of the raw corpora and root folder for everything this notebook writes.
DATASET_PATH = "NER-Data"
NER_PATH = "NER-Process"

# Split names; each one is also the stem of the corresponding "<split>.tsv" file.
TRAIN_DEV_DATASET = "train_dev"  # not included in DATASET_TYPE below
TRAIN_DATASET = "train"
VALIDATE_DATASET = "devel"
TEST_DATASET = "test"

# Splits that get converted.
DATASET_TYPE = [TRAIN_DATASET, VALIDATE_DATASET, TEST_DATASET]

## Conversão para JSON

In [58]:
nlp = spacy.blank("en")  # blank English pipeline; only make_doc (tokenisation) is used below


def _entity_record(texto, inicio, label):
    """Build the JSON record for one entity that starts at character ``inicio``."""
    return {"entidade": texto,
            "start": inicio,
            "end": inicio + len(texto),
            "label": label}


def _parse_bio_lines(linhas, label):
    """Convert ``token<TAB>tag`` lines into sentence records.

    Tokens of a sentence are joined with single spaces. A tag starting with
    ``B`` opens an entity, ``I`` extends the open entity, and any other tag
    closes it. An empty line ends the sentence.

    Args:
        linhas: iterable of text lines (for example an open file).
        label: entity label stored in every entity record.

    Returns:
        A list of ``{"texto": str, "entities": [record, ...]}`` dicts, where
        each record holds ``entidade``, ``start``, ``end`` and ``label`` with
        character offsets relative to ``texto``.
    """
    registros = []
    sentenca = ""
    entities = []
    entidade_atual = ""
    ini_entity_atual = -1

    for linha in linhas:
        linha = linha.rstrip("\r\n")

        if linha == "":
            # Sentence boundary: close any open entity and emit the sentence.
            if entidade_atual:
                entities.append(_entity_record(entidade_atual, ini_entity_atual, label))
                entidade_atual = ""
                ini_entity_atual = -1
            registros.append({"texto": sentenca, "entities": entities})
            sentenca = ""
            entities = []
            continue

        # Split on the LAST tab so the tag is read from its own column instead
        # of being searched for anywhere inside the line.
        token, separador, tag = linha.rpartition("\t")
        if not separador:
            # No tag column: the token/tag cannot be told apart, so skip it
            # rather than silently reusing the previous token.
            print("Skipping line without a tab-separated tag:", repr(linha))
            continue
        marca = tag[:1]

        # Anything other than a continuation tag ends the entity in progress.
        if entidade_atual and marca != "I":
            entities.append(_entity_record(entidade_atual, ini_entity_atual, label))
            entidade_atual = ""
            ini_entity_atual = -1

        if sentenca:
            sentenca += " "

        if marca == "B" or (marca == "I" and not entidade_atual):
            # An "I" with no open entity is treated as a "B"; otherwise the
            # entity would be recorded with a start offset of -1.
            ini_entity_atual = len(sentenca)
            entidade_atual = token
        elif marca == "I":
            entidade_atual += " " + token

        sentenca += token

    # Input may end without a final blank line: flush what is still pending so
    # the last sentence is kept and nothing leaks into the next corpus file.
    if entidade_atual:
        entities.append(_entity_record(entidade_atual, ini_entity_atual, label))
    if sentenca or entities:
        registros.append({"texto": sentenca, "entities": entities})

    return registros


for dataset_type in DATASET_TYPE:
    for label in LABEL_LIST:
        # Merge every corpus mapped to this label into one list of sentences.
        lista_ner = []
        for dir_dataset in LABEL_TO_DIR[label]:
            dataset_ner_file = os.path.join(DATASET_PATH, dir_dataset, dataset_type + ".tsv")
            # Explicit UTF-8 so the conversion does not depend on the platform
            # default encoding (load_data reads the JSON back as UTF-8 too).
            with open(dataset_ner_file, encoding="utf-8") as f_ner:
                lista_ner.extend(_parse_bio_lines(f_ner, label))

        path_label = os.path.join(NER_PATH, label)
        file_json = os.path.join(path_label, label + "-" + dataset_type + ".json")
        Path(path_label).mkdir(parents=True, exist_ok=True)
        with open(file_json, 'w', encoding="utf-8") as json_file:
            json.dump(lista_ner, json_file)

        # Also store the same annotations in spaCy's binary DocBin format.
        db = DocBin()
        for ner in lista_ner:
            doc = nlp.make_doc(ner['texto'])
            ents = []
            for entidade in ner['entities']:
                span = doc.char_span(entidade['start'], entidade['end'], label=entidade['label'], alignment_mode="contract")
                if span is None:
                    # Offsets could not be mapped onto token boundaries;
                    # report the entity and leave it out of the Doc.
                    print ("Span None")
                    print(ner['texto'])
                    print(entidade)
                else:
                    ents.append(span)

            doc.ents = ents
            db.add(doc)

        file_spacy = os.path.join(path_label, label + "-" + dataset_type + ".spacy")
        db.to_disk(file_spacy)

In [59]:
def load_data(label, data_set_type=TRAIN_DATASET):
    """Read the converted JSON file of one label/split as spaCy-style tuples.

    The file is ``NER_PATH/<label>/<label>-<data_set_type>.json``.

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})`` tuples,
        one per record in the file, in file order.
    """
    json_name = "-".join((label, data_set_type)) + ".json"
    json_path = os.path.join(NER_PATH, label, json_name)
    with open(json_path, "r", encoding="utf-8") as handle:
        records = json.load(handle)

    return [
        (
            record["texto"],
            {"entities": [(ent["start"], ent["end"], ent["label"])
                          for ent in record["entities"]]},
        )
        for record in records
    ]


In [64]:
def format_data_for_evalation(data, nlp_model):
    """Wrap ``(text, annotations)`` pairs into spaCy ``Example`` objects.

    Each text is tokenised with ``nlp_model.make_doc`` and paired with its
    annotations through ``Example.from_dict``. Returns the examples as a list
    in input order.
    """
    return [
        Example.from_dict(nlp_model.make_doc(text), annotations)
        for text, annotations in data
    ]


In [130]:
def train_spacy(train_data, validate_data, iterations, label, batch_size=1, drop=0.2):
    """Train a single-label NER pipeline and keep the best checkpoint on disk.

    A blank English pipeline with one ``ner`` component (named
    ``<label>_ner``) is trained for ``iterations`` passes over ``train_data``.
    After every pass the pipeline is evaluated on ``validate_data``. When both
    precision and F-score beat the best values seen so far, the pipeline is
    saved to ``NER_PATH/<label>/best_model``.

    Args:
        train_data: list of ``(text, {"entities": [...]})`` tuples. It is not
            modified; shuffling happens on a private copy.
        validate_data: tuples in the same format, used for evaluation.
        iterations: number of passes over the training data.
        label: entity label to learn.
        batch_size: number of examples per optimiser step. The default of 1
            means one update per example.
        drop: dropout rate passed to ``nlp.update``.

    Returns:
        The pipeline as it is after the LAST iteration, which is not
        necessarily the best one; the best one is the checkpoint on disk.
    """
    ner_name = label + "_ner"

    nlp_train = spacy.blank("en")

    ner = nlp_train.add_pipe("ner", name=ner_name)
    ner.add_label(label)

    best_ents_p = -1
    best_ents_f = -1
    best_model_path = os.path.join(NER_PATH, label, "best_model")

    # Private copy: the per-epoch shuffle must not reorder the caller's list.
    amostras = list(train_data)

    other_pipes = [pipe for pipe in nlp_train.pipe_names if pipe != ner_name]
    # select_pipes / initialize are the spaCy v3 names for the deprecated
    # disable_pipes / begin_training.
    with nlp_train.select_pipes(disable=other_pipes):
        optimizer = nlp_train.initialize()
        print(f"{'#IT':5} | {'Loss':10} | {'Prec':10} | {'Recall':10} | {'F_Score':10} | {'Save Mod':10} ")
        for itn in range(iterations):
            linha = f"{str(itn):5} | "
            random.shuffle(amostras)
            losses = {}
            for batch in minibatch(amostras, size=batch_size):
                examples = [
                    Example.from_dict(nlp_train.make_doc(text), annotations)
                    for text, annotations in batch
                ]
                nlp_train.update(examples,
                                 drop=drop,
                                 sgd=optimizer,
                                 losses=losses)

            validate_metrics = nlp_train.evaluate(format_data_for_evalation(validate_data, nlp_train))
            # A missing or None score is shown and compared as 0.0 so the
            # formatting and the comparison below cannot fail on it.
            ents_p = validate_metrics.get("ents_p") or 0.0
            ents_r = validate_metrics.get("ents_r") or 0.0
            ents_f = validate_metrics.get("ents_f") or 0.0
            # No update happened (empty training data) -> no loss entry.
            loss = losses.get(ner_name, 0.0)
            linha += f"{loss:08.2f} | {ents_p*100:05.2f} | {ents_r*100:05.2f} | {ents_f*100:05.2f} | "

            # Checkpoint only when precision AND F-score both improve.
            if ents_p > best_ents_p and ents_f > best_ents_f:
                best_ents_f = ents_f
                best_ents_p = ents_p
                linha += "S \n"

                Path(best_model_path).mkdir(parents=True, exist_ok=True)
                nlp_train.to_disk(best_model_path)
            else:
                linha += "N"
            print(linha)
    return nlp_train

In [128]:
# SPECIES: load the train/devel splits and train for 20 iterations.
# train_spacy writes its best checkpoint to NER_PATH/SPECIES/best_model.
train_data =  load_data(LABEL_SPECIES, TRAIN_DATASET)
validate_data = load_data(LABEL_SPECIES, VALIDATE_DATASET)
nlp = train_spacy(train_data,validate_data, 20, LABEL_SPECIES)

#IT   | Loss       | Prec       | Recall     | F_Score    | Save Mod   

0     | 
{'SPECIES_ner': 4382.3382432607} | 0.7147613762486127 | 0.5881278538812785 | 0.6452905811623246 | 
S 

1     | 
{'SPECIES_ner': 2864.2779926537396} | 0.6951735817104149 | 0.7497716894977169 | 0.7214411247803164 | 
N 

2     | 
{'SPECIES_ner': 2456.4296717148363} | 0.8433079434167573 | 0.7077625570776256 | 0.7696127110228401 | 
S 

3     | 
{'SPECIES_ner': 2099.9769765368787} | 0.8920086393088553 | 0.754337899543379 | 0.8174171202375062 | 
S 

4     | 
{'SPECIES_ner': 1899.7548692341943} | 0.8140703517587939 | 0.7397260273972602 | 0.7751196172248802 | 
N 

5     | 
{'SPECIES_ner': 1814.0921068344476} | 0.8736842105263158 | 0.6063926940639269 | 0.7159029649595687 | 
N 

6     | 
{'SPECIES_ner': 1838.4631381303398} | 0.8461538461538461 | 0.7132420091324201 | 0.7740336967294351 | 
N 

7     | 
{'SPECIES_ner': 1711.0829395561345} | 0.8929936305732484 | 0.6401826484018265 | 0.7457446808510639 | 
N 

8     | 
{'

KeyboardInterrupt: 

In [129]:
# SPECIES: reload the saved best checkpoint (not the pipeline returned by
# train_spacy) and score it on the test split.
nlp = spacy.load(os.path.join(NER_PATH, LABEL_SPECIES, "best_model"))
test_data = load_data(LABEL_SPECIES, TEST_DATASET)
test_data_spacy=format_data_for_evalation(test_data, nlp)
test_metrics = nlp.evaluate(test_data_spacy)
print(test_metrics)

{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'ents_p': 0.8217605248769819, 'ents_r': 0.6831818181818182, 'ents_f': 0.746090841399851, 'ents_per_type': {'SPECIES': {'p': 0.8217605248769819, 'r': 0.6831818181818182, 'f': 0.746090841399851}}, 'speed': 38213.13912196184}


In [None]:
# DRUG-PROTEIN: load the train/devel splits and train for 10 iterations.
# train_spacy writes its best checkpoint to NER_PATH/DRUG-PROTEIN/best_model.
train_data =  load_data(LABEL_DRUG_PROTEIN, TRAIN_DATASET)
validate_data = load_data(LABEL_DRUG_PROTEIN, VALIDATE_DATASET)
nlp = train_spacy(train_data,validate_data, 10, LABEL_DRUG_PROTEIN)

#IT   | Loss       | Prec       | Recall     | F_Score    | Save Mod   


In [None]:
# DRUG-PROTEIN: reload the saved best checkpoint and score it on the test split.
nlp = spacy.load(os.path.join(NER_PATH, LABEL_DRUG_PROTEIN, "best_model"))
test_data = load_data(LABEL_DRUG_PROTEIN, TEST_DATASET)
test_data_spacy=format_data_for_evalation(test_data, nlp)
test_metrics = nlp.evaluate(test_data_spacy)
print(test_metrics)

In [None]:
# CHEMICAL: load the train/devel splits and train for 10 iterations.
# train_spacy writes its best checkpoint to NER_PATH/CHEMICAL/best_model.
train_data =  load_data(LABEL_CHEMICAL, TRAIN_DATASET)
validate_data = load_data(LABEL_CHEMICAL, VALIDATE_DATASET)
nlp = train_spacy(train_data,validate_data, 10, LABEL_CHEMICAL)

In [None]:
# CHEMICAL: reload the saved best checkpoint and score it on the test split.
nlp = spacy.load(os.path.join(NER_PATH, LABEL_CHEMICAL, "best_model"))
test_data = load_data(LABEL_CHEMICAL, TEST_DATASET)
test_data_spacy=format_data_for_evalation(test_data, nlp)
test_metrics = nlp.evaluate(test_data_spacy)
print(test_metrics)

In [None]:
# DISEASE: load the train/devel splits and train for 10 iterations.
# train_spacy writes its best checkpoint to NER_PATH/DISEASE/best_model.
train_data =  load_data(LABEL_DISEASE, TRAIN_DATASET)
validate_data = load_data(LABEL_DISEASE, VALIDATE_DATASET)
nlp = train_spacy(train_data,validate_data, 10, LABEL_DISEASE)

In [None]:
# DISEASE: reload the saved best checkpoint and score it on the test split.
nlp = spacy.load(os.path.join(NER_PATH, LABEL_DISEASE, "best_model"))
test_data = load_data(LABEL_DISEASE, TEST_DATASET)
test_data_spacy=format_data_for_evalation(test_data, nlp)
test_metrics = nlp.evaluate(test_data_spacy)
print(test_metrics)