In [1]:
import os
import re
import string
import json

from tqdm import tqdm
import spacy
from spacy.tokens import DocBin

In [2]:
# Entity labels attached to every span found in the corresponding datasets.
LABEL_DRUG_PROTEIN = "DRUG-PROTEIN"
LABEL_CHEMICAL = "CHEMICAL"
LABEL_DISEASE = "DISEASE"
LABEL_SPECIES = "SPECIES"

# Dataset directories (relative to DATASET_PATH) that are merged into each label.
LABEL_TO_DIR = {
    LABEL_DRUG_PROTEIN: ["BC2GM", "JNLPBA"],
    LABEL_CHEMICAL: ["BC4CHEMD", "BC5CDR-chem"],
    LABEL_DISEASE: ["BC5CDR-disease", "NCBI-disease"],
    LABEL_SPECIES: ["linnaeus", "s800"],
}

# Labels in processing order; dicts keep insertion order, so this mirrors the
# key order of LABEL_TO_DIR above.
LABEL_LIST = list(LABEL_TO_DIR)

# Root directory holding the input datasets; generated files are written here too.
DATASET_PATH = "NER-Data"

# Base names (without the ".tsv" extension) of the split files read per dataset.
DATASET_TYPE = ["train_dev", "test"]



## Conversão para JSON

In [4]:
def _split_token_tag(linha):
    """Split one TSV line into ``(token, tag)``.

    The text after the last tab is taken as the tag and reduced to its first
    character, so both plain ``B``/``I``/``O`` tags and prefixed forms such
    as ``B-Disease`` are accepted.

    Returns ``None`` when the line has no tab separator or when the tag does
    not start with ``B``, ``I`` or ``O``.
    """
    token, separador, tag = linha.rstrip("\n").rpartition("\t")
    tag = tag.strip()[:1]
    if not separador or tag not in ("B", "I", "O"):
        return None
    return token, tag


def _parse_ner_lines(linhas, label, origem="<lines>"):
    """Convert BIO-tagged TSV lines into a list of sentence records.

    Args:
        linhas: iterable of text lines; each non-blank line is expected to be
            ``token<TAB>tag`` and a blank line ends the current sentence.
        label: entity label stored on every entity that is found.
        origem: name of the source, used only in the skipped-line message.

    Returns:
        A list of ``{"texto": str, "entities": [...]}`` dicts. Tokens are
        joined with single spaces to build ``texto``; each entity is
        ``{"entidade", "start", "end", "label"}`` where ``start``/``end`` are
        character offsets into ``texto``.

    Pending state is flushed when the input ends, so a source without a
    trailing blank line still yields its last sentence. Empty sentences
    (e.g. consecutive blank lines) are not emitted, and lines that cannot be
    parsed are reported and skipped.
    """
    lista_ner = []
    sentenca = ""
    entities = []
    entidade_atual = ""
    ini_entity_atual = -1

    def fecha_entidade():
        # Store the entity currently being accumulated, if there is one.
        nonlocal entidade_atual, ini_entity_atual
        if entidade_atual:
            entities.append({"entidade": entidade_atual,
                             "start": ini_entity_atual,
                             "end": ini_entity_atual + len(entidade_atual),
                             "label": label})
        entidade_atual = ""
        ini_entity_atual = -1

    def fecha_sentenca():
        # Store the current sentence (unless empty) and reset for the next one.
        nonlocal sentenca, entities
        fecha_entidade()
        if sentenca:
            lista_ner.append({"texto": sentenca, "entities": entities})
        sentenca = ""
        entities = []

    for num_linha, linha in enumerate(linhas, start=1):
        if not linha.strip():
            fecha_sentenca()
            continue

        parsed = _split_token_tag(linha)
        if parsed is None:
            print(f"Skipping malformed line {num_linha} in {origem}: {linha!r}")
            continue
        token, tag = parsed

        # Any tag other than "I" ends the entity that was open, if any.
        if tag != "I":
            fecha_entidade()

        if sentenca:
            sentenca += " "

        if tag == "B" or (tag == "I" and not entidade_atual):
            # An "I" with no open entity is treated as the start of a new
            # entity so that it never ends up with a negative start offset.
            ini_entity_atual = len(sentenca)
            entidade_atual = token
        elif tag == "I":
            # Same single-space join as the sentence, so
            # texto[start:end] == entidade.
            entidade_atual += " " + token

        sentenca += token

    # Flush whatever is still pending when the input ends.
    fecha_sentenca()
    return lista_ner


nlp = spacy.blank("en")  # blank English pipeline; only make_doc (tokenization) is used

for dataset_type in DATASET_TYPE:
    for label in LABEL_LIST:
        # Each source file is parsed on its own, so no half-built sentence
        # can leak from one dataset directory into the next.
        lista_ner = []
        for dir_dataset in LABEL_TO_DIR[label]:
            dataset_ner_file = os.path.join(DATASET_PATH, dir_dataset, dataset_type + ".tsv")
            with open(dataset_ner_file, encoding="utf-8") as f_ner:
                lista_ner.extend(_parse_ner_lines(f_ner, label, origem=dataset_ner_file))

        saida_base = os.path.join(DATASET_PATH, label + "-" + dataset_type)

        with open(saida_base + ".json", "w", encoding="utf-8") as json_file:
            json.dump(lista_ner, json_file)

        db = DocBin()  # container for the serialized Doc objects
        for ner in lista_ner:
            doc = nlp.make_doc(ner["texto"])  # tokenize the sentence text
            ents = []
            for entidade in ner["entities"]:
                span = doc.char_span(entidade["start"], entidade["end"],
                                     label=entidade["label"], alignment_mode="contract")
                if span is None:
                    # The character offsets could not be mapped to a token
                    # span; report the entity and leave it out.
                    print("Span None")
                    print(ner["texto"])
                    print(entidade)
                else:
                    ents.append(span)

            doc.ents = ents
            db.add(doc)
        db.to_disk(saida_base + ".spacy")