In [None]:
!pip install pycorenlp
!pip install pyneuroner[gpu]
!python -m spacy download es

In [1]:
# -*- coding: utf-8 -*-
#source code available at https://github.com/juand-r/entity-recognition-datasets/tree/master/data/i2b2_2014
#another source : https://github.com/Franck-Dernoncourt/NeuroNER
import codecs
import glob
import json
import os

from pycorenlp import StanfordCoreNLP
import spacy

from neuroner import utils_nlp


def get_start_and_end_offset_of_token_from_spacy(token):
    start = token.idx
    end = start + len(token)
    return start, end

def get_sentences_and_tokens_from_spacy(text, spacy_nlp):
    document = spacy_nlp(text)
    # sentences
    sentences = []
    for span in document.sents:
        sentence = [document[i] for i in range(span.start, span.end)]
        sentence_tokens = []
        for token in sentence:
            token_dict = {}
            token_dict['start'], token_dict['end'] = get_start_and_end_offset_of_token_from_spacy(token)
            token_dict['text'] = text[token_dict['start']:token_dict['end']]
            if token_dict['text'].strip() in ['\n', '\t', ' ', '']:
                continue
            # Make sure that the token text does not contain any space
            if len(token_dict['text'].split(' ')) != 1:
                print("WARNING: the text of the token contains space character, replaced with hyphen\n\t{0}\n\t{1}".format(token_dict['text'], 
                                                                                                                           token_dict['text'].replace(' ', '-')))
                token_dict['text'] = token_dict['text'].replace(' ', '-')
            sentence_tokens.append(token_dict)
        sentences.append(sentence_tokens)
    return sentences

def get_stanford_annotations(text, core_nlp, port=9000, annotators='tokenize,ssplit,pos,lemma'):
    output = core_nlp.annotate(text, properties={
        "timeout": "10000",
        "ssplit.newlineIsSentenceBreak": "two",
        'annotators': annotators,
        'outputFormat': 'json'
    })
    if type(output) is str:
        output = json.loads(output, strict=False)
    return output

def get_sentences_and_tokens_from_stanford(text, core_nlp):
    stanford_output = get_stanford_annotations(text, core_nlp)
    sentences = []
    for sentence in stanford_output['sentences']:
        tokens = []
        for token in sentence['tokens']:
            token['start'] = int(token['characterOffsetBegin'])
            token['end'] = int(token['characterOffsetEnd'])
            token['text'] = text[token['start']:token['end']]
            if token['text'].strip() in ['\n', '\t', ' ', '']:
                continue
            # Make sure that the token text does not contain any space
            if len(token['text'].split(' ')) != 1:
                print("WARNING: the text of the token contains space character, replaced with hyphen\n\t{0}\n\t{1}".format(token['text'], 
                                                                                                                           token['text'].replace(' ', '-')))
                token['text'] = token['text'].replace(' ', '-')
            tokens.append(token)
        sentences.append(tokens)
    return sentences

def get_entities_from_brat(text_filepath, annotation_filepath, verbose=False):
    # load text
    with codecs.open(text_filepath, 'r', 'UTF-8') as f:
        text =f.read()
    if verbose: print("\ntext:\n{0}\n".format(text))

    # parse annotation file
    entities = []
    with codecs.open(annotation_filepath, 'r', 'UTF-8') as f:
        for line in f.read().splitlines():
            anno = line.split()
            id_anno = anno[0]
            # parse entity
            if id_anno[0] == 'T':
                entity = {}
                entity['id'] = id_anno
                entity['type'] = anno[1]
                entity['start'] = int(anno[2])
                entity['end'] = int(anno[3])
                entity['text'] = ' '.join(anno[4:])
                if verbose:
                    print("entity: {0}".format(entity))
                # Check compatibility between brat text and anootation
                if utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(text[entity['start']:entity['end']]) != \
                    utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(entity['text']):
                    print("Warning: brat text and annotation do not match.")
                    print("\ttext: {0}".format(text[entity['start']:entity['end']]))
                    print("\tanno: {0}".format(entity['text']))
                # add to entitys data
                entities.append(entity)
    if verbose: print("\n\n")
    
    return text, entities

def check_brat_annotation_and_text_compatibility(brat_folder):
    '''
    Check if brat annotation and text files are compatible.
    '''
    dataset_type =  os.path.basename(brat_folder)
    print("Checking the validity of BRAT-formatted {0} set... ".format(dataset_type), end='')
    text_filepaths = sorted(glob.glob(os.path.join(brat_folder, '*.txt')))
    for text_filepath in text_filepaths:
        base_filename = os.path.splitext(os.path.basename(text_filepath))[0]
        annotation_filepath = os.path.join(os.path.dirname(text_filepath), base_filename + '.ann')
        # check if annotation file exists
        if not os.path.exists(annotation_filepath):
            raise IOError("Annotation file does not exist: {0}".format(annotation_filepath))
        text, entities = get_entities_from_brat(text_filepath, annotation_filepath)
    print("Done.")

def brat_to_conll(input_folder, output_filepath, tokenizer, language):
    '''
    Assumes '.txt' and '.ann' files are in the input_folder.
    Checks for the compatibility between .txt and .ann at the same time.
    '''
    if tokenizer == 'spacy':
        spacy_nlp = spacy.load(language)
    elif tokenizer == 'stanford':
        core_nlp = StanfordCoreNLP('http://localhost:{0}'.format(9000))
    else:
        raise ValueError("tokenizer should be either 'spacy' or 'stanford'.")
    verbose = False
    dataset_type =  os.path.basename(input_folder)
    print("Formatting {0} set from BRAT to CONLL... ".format(dataset_type), end='')
    text_filepaths = sorted(glob.glob(os.path.join(input_folder, '*.txt')))
    output_file = codecs.open(output_filepath, 'w', 'utf-8')
    for text_filepath in text_filepaths:
        base_filename = os.path.splitext(os.path.basename(text_filepath))[0]
        annotation_filepath = os.path.join(os.path.dirname(text_filepath), base_filename + '.ann')
        # create annotation file if it does not exist
        if not os.path.exists(annotation_filepath):
            codecs.open(annotation_filepath, 'w', 'UTF-8').close()

        text, entities = get_entities_from_brat(text_filepath, annotation_filepath)
        entities = sorted(entities, key=lambda entity:entity["start"])
        
        if tokenizer == 'spacy':
            sentences = get_sentences_and_tokens_from_spacy(text, spacy_nlp)
        elif tokenizer == 'stanford':
            sentences = get_sentences_and_tokens_from_stanford(text, core_nlp)
        
        for sentence in sentences:
            inside = False
            previous_token_label = 'O'
            for token in sentence:
                token['label'] = 'O'
                for entity in entities:
                    if entity['start'] <= token['start'] < entity['end'] or \
                       entity['start'] < token['end'] <= entity['end'] or \
                       token['start'] < entity['start'] < entity['end'] < token['end']:

                        token['label'] = entity['type'].replace('-', '_') # Because the ANN doesn't support tag with '-' in it

                        break
                    elif token['end'] < entity['start']:
                        break
                        
                if len(entities) == 0:
                    entity={'end':0}
                if token['label'] == 'O':
                    gold_label = 'O'
                    inside = False
                elif inside and token['label'] == previous_token_label:
                    gold_label = 'I-{0}'.format(token['label'])
                else:
                    inside = True
                    gold_label = 'B-{0}'.format(token['label'])
                if token['end'] == entity['end']:
                    inside = False
                previous_token_label = token['label']
                if verbose: print('{0} {1} {2} {3} {4}\n'.format(token['text'], base_filename, token['start'], token['end'], gold_label))
                output_file.write('{0} {1} {2} {3} {4}\n'.format(token['text'], base_filename, token['start'], token['end'], gold_label))
            if verbose: print('\n')
            output_file.write('\n')

    output_file.close()
    print('Done.')
    if tokenizer == 'spacy':
        del spacy_nlp
    elif tokenizer == 'stanford':
        del core_nlp

In [2]:
brat_to_conll('../input/meddoprofner-task/experiments/experiments/test', './test.txt', 'spacy', 'es')
brat_to_conll('../input/meddoprofner-task/meddoprof-train-set/task1', './train.txt', 'spacy', 'es')

Formatting test set from BRAT to CONLL... Done.
Formatting task1 set from BRAT to CONLL... Done.


In [3]:
import pandas as pd
test=pd.read_csv('./test.txt',sep=' ',names=['words','fileId','start','end','label'],quoting=3, error_bad_lines=False)
test.head()

Unnamed: 0,words,fileId,start,end,label
0,Nombre,S0034-98872006000200012-1,0,6,O
1,:,S0034-98872006000200012-1,6,7,O
2,Gabriel,S0034-98872006000200012-1,8,15,O
3,.,S0034-98872006000200012-1,16,17,O
4,Apellidos,S0034-98872006000200012-1,18,27,O


In [4]:
import pandas as pd
train=pd.read_csv('./train.txt',sep=' ',names=['words','fileId','start','end','label'],quoting=3, error_bad_lines=False)
train.head()

Unnamed: 0,words,fileId,start,end,label
0,Un,32247016_ES,0,2,O
1,paciente,32247016_ES,3,11,O
2,de,32247016_ES,12,14,O
3,unos,32247016_ES,15,19,O
4,cincuenta,32247016_ES,20,29,O


In [5]:
set(train.label)

{'B-ACTIVIDAD',
 'B-PROFESION',
 'B-SITUACION_LABORAL',
 'I-ACTIVIDAD',
 'I-PROFESION',
 'I-SITUACION_LABORAL',
 'O'}

In [6]:
!pip install -q flair

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pyldavis 3.3.1 requires numpy>=1.20.0, but you have numpy 1.19.5 which is incompatible.
pyldavis 3.3.1 requires pandas>=1.2.0, but you have pandas 1.1.5 which is incompatible.[0m


In [7]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus

In [8]:
columns = {0: 'words', 4: 'label'}
corpus: Corpus = ColumnCorpus('./', columns,
                              train_file='./train.txt',
                              test_file='./test.txt',
                              dev_file='./test.txt')

2021-06-06 03:30:03,448 Reading data from .
2021-06-06 03:30:03,449 Train: train.txt
2021-06-06 03:30:03,450 Dev: test.txt
2021-06-06 03:30:03,451 Test: test.txt


In [9]:
import pandas as pd
data = [[len(corpus.train), len(corpus.test), len(corpus.dev)]]
# Prints out the dataset sizes of train test and development in a table.
pd.DataFrame(data, columns=["Train", "Test", "Development"])

Unnamed: 0,Train,Test,Development
0,49640,10002,10002


In [10]:
from flair.embeddings import TransformerWordEmbeddings

embeddings = TransformerWordEmbeddings(
    model='dccuchile/bert-base-spanish-wwm-cased',
    layers="-1",
    subtoken_pooling="first",
    fine_tune=True,
    use_context=True,
)

Downloading:   0%|          | 0.00/648 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/242k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/480k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/134 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/364 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# 5. initialize bare-bones sequence tagger (no CRF, no RNN, no reprojection)
from flair.models import SequenceTagger
# 2. what tag do we want to predict?
tag_type = 'label'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

tagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type='label',
    use_crf=False,
    use_rnn=False,
    reproject_embeddings=False,
)

In [12]:
# 6. initialize trainer with AdamW optimizer
from flair.trainers import ModelTrainer
import torch

trainer = ModelTrainer(tagger, corpus, optimizer=torch.optim.AdamW)

In [13]:
# 7. run training with XLM parameters (20 epochs, small LR)
from torch.optim.lr_scheduler import OneCycleLR

trainer.train('resources/taggers/ner-spanish-large',
              learning_rate=5.0e-6,
              mini_batch_size=4,
              mini_batch_chunk_size=1,
              max_epochs=10,
              scheduler=OneCycleLR,
              embeddings_storage_mode='gpu',
              weight_decay=0.,
              )

2021-06-06 03:31:06,262 ----------------------------------------------------------------------------------------------------
2021-06-06 03:31:06,265 Model: "SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(31002, 768, padding_idx=1)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): D

Token indices sequence length is longer than the specified maximum sequence length for this model (660 > 512). Running this sequence through the model will result in indexing errors


2021-06-06 03:40:22,020 epoch 1 - iter 2482/12410 - loss 0.16222971 - samples/sec: 17.65 - lr: 0.000005
2021-06-06 03:45:03,125 epoch 1 - iter 3723/12410 - loss 0.14997034 - samples/sec: 17.66 - lr: 0.000005
2021-06-06 03:49:43,210 epoch 1 - iter 4964/12410 - loss 0.14941224 - samples/sec: 17.73 - lr: 0.000005
2021-06-06 03:54:24,074 epoch 1 - iter 6205/12410 - loss 0.14672297 - samples/sec: 17.68 - lr: 0.000005
2021-06-06 03:59:07,540 epoch 1 - iter 7446/12410 - loss 0.14585816 - samples/sec: 17.51 - lr: 0.000005
2021-06-06 04:03:50,721 epoch 1 - iter 8687/12410 - loss 0.14434159 - samples/sec: 17.53 - lr: 0.000005
2021-06-06 04:08:35,427 epoch 1 - iter 9928/12410 - loss 0.14224814 - samples/sec: 17.44 - lr: 0.000005
2021-06-06 04:13:17,061 epoch 1 - iter 11169/12410 - loss 0.13942542 - samples/sec: 17.63 - lr: 0.000005
2021-06-06 04:18:03,482 epoch 1 - iter 12410/12410 - loss 0.13699907 - samples/sec: 17.33 - lr: 0.000005
2021-06-06 04:18:03,484 --------------------------------------

{'test_score': 0.9914529914529914,
 'dev_score_history': [0.8721804511278195,
  0.9298429319371727,
  0.9603429796355841,
  0.9661733615221988,
  0.985042735042735,
  0.9914529914529914,
  0.9925293489861259,
  0.9903948772678762,
  0.9914529914529914,
  0.9914529914529914],
 'train_loss_history': [0.13699906963480882,
  0.11770114456034944,
  0.10816561263953835,
  0.10411377371401635,
  0.09948162731419896,
  0.09748751203627926,
  0.09346435220860712,
  0.09360180739185398,
  0.09264584391795248,
  0.0944850795910929],
 'dev_loss_history': [0.0042916531674563885,
  0.0014818062772974372,
  0.0008463345584459603,
  0.0006835437961854041,
  0.00042431947076693177,
  0.00021610480325762182,
  0.00020059198141098022,
  0.00012177122698631138,
  0.00010632267367327586,
  9.802718705032021e-05]}

In [14]:
from flair.data import Sentence
from flair.models import SequenceTagger

input_sentence = 'Un paciente de unos cincuenta años sin antecedentes personales acudió a su médico de cabecera en la zona de Marsella el 6 de marzo de 2020 , reportando fiebre , cefalea y tos con una evolución desde el 29 de febrero .' 
tagger: SequenceTagger = SequenceTagger.load("./resources/taggers/ner-spanish-large/final-model.pt")
sentence: Sentence = Sentence(input_sentence)
tagger.predict(sentence)
print(sentence.to_tagged_string())

2021-06-06 12:19:36,203 loading file ./resources/taggers/ner-spanish-large/final-model.pt
Un paciente de unos cincuenta años sin antecedentes personales acudió a su médico <B-PROFESION> de <I-PROFESION> cabecera <I-PROFESION> en la zona de Marsella el 6 de marzo de 2020 , reportando fiebre , cefalea y tos con una evolución desde el 29 de febrero .


In [None]:
#sentence.get_spans()

In [15]:
!zip -r ./flair_bert_meddoprof.zip ./resources/taggers/ner-spanish-large

  adding: resources/taggers/ner-spanish-large/ (stored 0%)
  adding: resources/taggers/ner-spanish-large/training.log (deflated 88%)
  adding: resources/taggers/ner-spanish-large/dev.tsv (deflated 77%)
  adding: resources/taggers/ner-spanish-large/loss.tsv (deflated 53%)
  adding: resources/taggers/ner-spanish-large/test.tsv (deflated 77%)
  adding: resources/taggers/ner-spanish-large/final-model.pt (deflated 7%)
  adding: resources/taggers/ner-spanish-large/weights.txt (stored 0%)


In [16]:
#tagger: SequenceTagger = SequenceTagger.load("./resources/taggers/ner-spanish-large/final-model.pt")

def ftagger(txt):
    sentence: Sentence = Sentence(txt)
    tagger.predict(sentence)
    return sentence.get_spans()#sentence.to_tagged_string()

In [17]:
#!rm -r ./out
!mkdir ./out
def format(path):
    file=open(path).read()
    out=ftagger(file)
    fname=str(path).split('/')[-1][:-4]
    ann=open("./out/"+fname+".ann", "w")
    i=1
    for ntt in out:
        ntt=ntt.to_dict()
        text=ntt['text']
        start=ntt['start_pos']
        end=ntt['end_pos']
        tag=str(ntt['labels']).split(' ')[0][1:]
        ann.write('T'+str(i)+'\t'+tag+' '+str(start)+' '+str(end)+'\t'+text+'\n')
        i=i+1
        
        

In [None]:
#!rm -r ./resources

In [18]:
import os
from pathlib import Path
path=Path('../input/meddoprofner-task/meddoprof_test_txt/meddoprof_test_txt')

In [19]:

for user in os.listdir(path):
            if str(user)[-1]=='n' : 
                continue
            print("\t\t Working on text: ", user )
            format(os.path.join(path, user))
            

Token indices sequence length is longer than the specified maximum sequence length for this model (740 > 512). Running this sequence through the model will result in indexing errors


		 Working on text:  casos_clinicos_profesiones181.txt
		 Working on text:  caso_clinico_psiquiatria438.txt
		 Working on text:  casos_clinicos_profesiones113.txt
		 Working on text:  cc_onco884.txt
		 Working on text:  casos_clinicos_profesiones134.txt
		 Working on text:  caso_clinico_dermatologia456.txt
		 Working on text:  casos_clinicos_profesiones222.txt
		 Working on text:  cc_onco1574.txt
		 Working on text:  caso_clinico_psiquiatria471.txt
		 Working on text:  casos_clinicos_profesiones145.txt
		 Working on text:  casos_clinicos_profesiones141.txt
		 Working on text:  caso_clinico_medicina_interna407.txt
		 Working on text:  casos_clinicos_profesiones167.txt
		 Working on text:  caso_clinico_endocrinologia47.txt
		 Working on text:  caso_clinico_psiquiatria22.txt
		 Working on text:  casos_clinicos_profesiones138.txt
		 Working on text:  casos_clinicos_profesiones232.txt
		 Working on text:  caso_clinico_psiquiatria69.txt
		 Working on text:  caso_clinico_psiquiatria360.txt
		

In [None]:
!git clone https://github.com/TeMU-BSC/meddoprof-evaluation-library.git

In [None]:
!python ./meddoprof-evaluation-library/src/main.py -g ../input/meddoprofner-task/experiments/experiments/test -p ./out -c ./meddoprof-evaluation-library/meddoprof_valid_codes.tsv -s ner


In [20]:
!zip -r ./flair_bert_meddoprof_submission.zip ./out

  adding: out/ (stored 0%)
  adding: out/casos_clinicos_profesiones123.ann (stored 0%)
  adding: out/S0034-98872006000200011-1.ann (deflated 10%)
  adding: out/S0034-98872012001100010-1.ann (deflated 9%)
  adding: out/casos_clinicos_profesiones188.ann (deflated 37%)
  adding: out/caso_clinico_dermatologia469.ann (stored 0%)
  adding: out/casos_clinicos_profesiones124.ann (stored 0%)
  adding: out/cc_reuma56.ann (stored 0%)
  adding: out/caso_clinico_psiquiatria440.ann (stored 0%)
  adding: out/caso_clinico_psiquiatria474.ann (deflated 14%)
  adding: out/caso_clinico_urologia270.ann (stored 0%)
  adding: out/casos_clinicos_profesiones212.ann (stored 0%)
  adding: out/casos_clinicos_profesiones222.ann (deflated 39%)
  adding: out/casos_clinicos_profesiones204.ann (deflated 41%)
  adding: out/casos_clinicos_profesiones138.ann (stored 0%)
  adding: out/casos_clinicos_profesiones139.ann (deflated 11%)
  adding: out/caso_clinico_medtropical28.ann (stored 0%)
  adding: out/casos_clinicos_prof