# Converter o dataset fornecido para o formato spaCy

In [1]:
import subprocess
import sys
from glob import glob
from os import system
from os.path import basename, join, splitext

from tqdm import tqdm

In [2]:
# Entity label used for each dataset, keyed by the dataset's folder name.
# NOTE(review): BC2GM and JNLPBA are mapped to 'DRUG'; confirm that this is the
# intended label for those corpora before training on them.
DATASET_NAME_2_ENTITY_TYPE = {
    'BC2GM': 'DRUG',
    'BC4CHEMD': 'CHEMICAL',
    'BC5CDR-disease': 'DISEASE',
    'BC5CDR-chem': 'CHEMICAL',
    'JNLPBA': 'DRUG',
    'linnaeus': 'SPECIES',
    'NCBI-disease': 'DISEASE',
    's800': 'SPECIES',
}

In [None]:
def convert_to_iob_format(input_file, output_file, entity_type):
    """Convert a two-column TSV annotation file into a one-sentence-per-line IOB file.

    The input holds one ``word<TAB>tag`` pair per line, with blank lines
    separating sentences. Tags ``B`` and ``I`` are expanded to
    ``B-<entity_type>`` and ``I-<entity_type>``; every other tag is kept as is.
    Each sentence is written on its own line as space-separated ``word||tag``
    tokens (the field between the two pipes is left empty).

    Args:
        input_file: Path of the TSV annotation file to read.
        output_file: Path of the IOB file to write (overwritten if it exists).
        entity_type: Label appended to the ``B``/``I`` tags, e.g. ``'DISEASE'``.

    Raises:
        ValueError: If a non-blank line does not contain exactly two
            tab-separated fields.
    """
    sents = []  # All completed sentences
    sent = []   # Tokens of the sentence currently being collected

    # Read as UTF-8 explicitly: the default encoding is platform dependent and
    # would corrupt or reject non-ASCII tokens on some systems.
    with open(input_file, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            line = line.strip()

            if not line:
                # A blank line ends the current sentence. Repeated blank lines
                # must not produce empty sentences in the output.
                if sent:
                    sents.append(sent)
                    sent = []
                continue

            fields = line.split('\t')
            if len(fields) != 2:
                raise ValueError(
                    f'{input_file}:{line_number}: expected "word<TAB>tag", got {line!r}'
                )
            word, tag = fields

            # Attach the entity label: I -> I-<entity_type>, B -> B-<entity_type>
            if tag in ('I', 'B'):
                tag = f'{tag}-{entity_type}'

            sent.append(f'{word}||{tag}')

    # The file may not end with a blank line; keep the last sentence anyway.
    if sent:
        sents.append(sent)

    # Tokens are joined by a space, sentences by a line break.
    sents_content = '\n'.join(' '.join(tokens) for tokens in sents)

    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(sents_content)

# Processamos todos os arquivos usando a CLI do spaCy:
```bash
python -m spacy convert ...
```

Documentação: [https://spacy.io/api/cli#convert](https://spacy.io/api/cli#convert)

In [None]:
# Convert every dataset folder under ../NERdata: each *.tsv file is rewritten
# as an .iob file next to it and then handed to `spacy convert`, which writes
# its output into the same dataset folder.
# NOTE(review): the training command further down reads ./NERdata, not
# ../NERdata -- confirm the intended working directory.
folder_paths = sorted(glob(join('..', 'NERdata', '*')))  # join() keeps the pattern portable
for folder_path in tqdm(folder_paths, total=len(folder_paths)):

    dataset_name = basename(folder_path)

    # Skip stray files/folders that are not configured datasets instead of
    # aborting the whole run with a KeyError.
    entity_type = DATASET_NAME_2_ENTITY_TYPE.get(dataset_name)
    if entity_type is None:
        tqdm.write(f'Skipping {folder_path!r}: no entity type configured for it')
        continue

    for input_file in sorted(glob(join(folder_path, '*.tsv'))):

        # Swap only the file extension; str.replace would also rewrite any
        # '.tsv' occurring elsewhere in the path.
        iob_file = splitext(input_file)[0] + '.iob'

        convert_to_iob_format(input_file, iob_file, entity_type)

        # An argument list (no shell) keeps paths with spaces intact, and
        # sys.executable runs spaCy from the kernel's own environment.
        # check=True surfaces a failed conversion instead of ignoring it.
        subprocess.run(
            [
                sys.executable, '-m', 'spacy', 'convert',
                '-c', 'iob', '-s', '-n', '10', '-b', 'en_core_web_sm',
                iob_file, folder_path,
            ],
            check=True,
        )  # spaCy v3 CLI

# Treinamento do dataset BC2GM (Usar máquina com GPU)

Inicializar configuração de treinamento: [https://spacy.io/usage/training#config](https://spacy.io/usage/training#config)

!python -m spacy init fill-config base_config.cfg config.cfg

Inicializamos o treinamento

In [None]:
!python -m spacy train config.cfg --verbose --paths.train ./NERdata/BC2GM/train.spacy --paths.dev ./NERdata/BC2GM/devel.spacy

# Inferência

In [None]:
import spacy
from spacy import displacy

# Sample sentence, already tokenized with single spaces between tokens.
# Adjacent string literals are concatenated, so each piece except the last
# must end with a space to keep the words apart.
text = (
    "Immunohistochemical staining was positive for S - 100 in all 9 cases stained , positive for HMB - 45 "
    "in 9 ( 90 % ) of 10 , and negative for cytokeratin in all 9 cases in which myxoid melanoma remained "
    "in the block after previous sections ."
)

# Forward slashes work on every platform, unlike a hard-coded backslash path.
model_path = "./output/model-best"
nlp1 = spacy.load(model_path)  # load the best model
doc = nlp1(text)  # run the pipeline on the sample text
displacy.render(doc, style="ent", jupyter=True)  # highlight the entities inline in Jupyter