<a href="https://colab.research.google.com/github/deivismartinez/NER-Medical-Uninorte/blob/main/NER2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the runtime dependencies for this notebook: transformers and sentencepiece.
# The commented-out lines below are optional installs that are currently disabled.
#!pip install transformers
!pip install --no-cache-dir transformers sentencepiece
#!pip install spacy
#!python -m spacy download es_core_news_md
#!pip install flair
#!pip install stanza

In [None]:
from transformers import pipeline, AutoTokenizer
from transformers.models.bert.modeling_bert import BertModel,BertForMaskedLM
import json
from pathlib import Path
from google.colab import files
from os import listdir
from os.path import isfile, join
#import spacy
from matplotlib import pyplot as plt
from math import ceil
import shutil
import os
import zipfile
import glob
#import stanza
import sys

In [None]:
# Download the MEDDOCAN archive from Zenodo and store it locally as meddocan.zip.
!wget https://zenodo.org/record/4279323/files/meddocan.zip?download=1 -O meddocan.zip

In [None]:
# Extract the archive into the working directory; later cells read from the meddocan/ folder.
!unzip meddocan.zip

In [None]:
# Load every document of the dev/test/train splits into memory.
# `dataset` maps the split name to a list of records with the keys:
#   "text"           - full content of the .txt file
#   "file"           - file name without the ".txt" extension
#   "file_name_path" - Path to the .txt file
dataset = {}
for path in (Path('dev'), Path('test'), Path('train')):
    dir_path = Path('meddocan')/path/Path('brat')
    # Sorted so that "the first N documents" is the same on every run;
    # os.listdir returns entries in arbitrary order.
    filenames = tuple(sorted(f[:-4] for f in listdir(dir_path) if isfile(join(dir_path, f)) if f[-4:] == '.txt'))
    dataset[str(path)] = []
    for file_name in filenames:
      file_name_path = dir_path/Path(file_name+'.txt')
      # Explicit encoding: the documents contain accented characters, so do not
      # rely on the platform default.
      with open(file_name_path, 'r', encoding='utf-8') as f:
        dataset[str(path)].append({"text":f.read(),"file":file_name,"file_name_path":file_name_path})

In [None]:
# Fetch tag.json (read below to build the label mapping) and models.json from the project repository.
# NOTE(review): models.json does not appear to be read by the visible code (get_models hard-codes the list) — confirm it is still needed.
!wget https://raw.githubusercontent.com/deivismartinez/NER-Medical-Uninorte/main/tag.json
!wget https://raw.githubusercontent.com/deivismartinez/NER-Medical-Uninorte/main/models.json

In [None]:
def get_models():
  """Return the catalogue of candidate NER models.

  Returns:
    A new list of dicts, one per model, each with the keys ``"id"``
    (1-based position in the catalogue), ``"name"`` and ``"folder"``
    (the owner prefix, including its trailing slash).
  """
  # (folder, name) pairs; the list position (starting at 1) is the model id.
  catalogue = (
    ("51la5/", "roberta-large-NER"),
    ("alvarobartt/", "bert-base-multilingual-cased-ner-spanish"),
    ("aymurai/", "anonymizer-beto-cased-flair"),
    ("aymurai/", "flair-ner-spanish-judicial"),
    ("Babelscape/", "wikineural-multilingual-ner"),
    ("bertin-project/", "bertin-base-ner-conll2002-es"),
    ("BSC-LT/", "roberta_model_for_anonimization"),
    ("BSC-LT/", "roberta-base-bne-capitel-ner-plus"),
    ("Dnidof/", "NER-MEDDOCAN"),
    ("DunnBC22/", "bert-base-multilingual-cased-fine_tuned-ner-WikiNeural_Multilingual"),
    ("FacebookAI/", "xlm-roberta-large-finetuned-conll03-english"),
    ("FacebookAI/", "xlm-roberta-large-finetuned-conll03-german"),
    ("flair/", "ner-multi"),
    ("flair/", "ner-multi-fast"),
    ("flair/", "ner-spanish-large"),
    ("GEOcite/", "AuthorParserModel"),
    ("gunghio/", "distilbert-base-multilingual-cased-finetuned-conll2003-ner"),
    ("gunghio/", "xlm-roberta-base-finetuned-panx-ner"),
    ("ifis/", "BETO-finetuned-ner-3"),
    ("lxyuan/", "span-marker-bert-base-multilingual-cased-multinerd"),
    ("lxyuan/", "span-marker-bert-base-multilingual-uncased-multinerd"),
    ("MMG/", "xlm-roberta-large-ner-spanish"),
    ("mrm8488/", "bert-spanish-cased-finetuned-ner"),
    ("mrm8488/", "RuPERTa-base-finetuned-ner"),
    ("mrm8488/", "TinyBERT-spanish-uncased-finetuned-ner"),
    ("NazaGara/", "NER-fine-tuned-BETO"),
    ("Omnifact/", "flair-ner-multi"),
    ("PlanTL-GOB-ES/", "ca_anonimization_core_lg"),
    ("PlanTL-GOB-ES/", "es_anonimization_core_lg"),
    ("PlanTL-GOB-ES/", "roberta-base-bne-capitel-ner"),
    ("PlanTL-GOB-ES/", "roberta-base-bne-capitel-ner-plus"),
    ("PlanTL-GOB-ES/", "roberta-large-bne-capitel-ner"),
    ("projecte-aina/", "DEBERTA_CIEL"),
    ("sagorsarker/", "codeswitch-spaeng-ner-lince"),
    ("sepulm01/", "span-marker-bert-base-conll2002-es"),
    ("spacy/", "es_core_news_lg"),
    ("spacy/", "es_core_news_md"),
    ("spacy/", "es_core_news_sm"),
    ("stanfordnlp/", "stanza-es"),
    ("urchade/", "gliner_multi_pii-v1"),
  )
  return [
    {"id": position, "name": name, "folder": folder}
    for position, (folder, name) in enumerate(catalogue, start=1)
  ]

def select_model(model_index):
  """Return the "<folder><name>" identifier of the model at a 1-based index.

  Args:
    model_index: 1-based position in the list returned by get_models().

  Returns:
    The concatenated folder and name of the selected model, or None when
    the index is outside ``1..len(models)``.
  """
  models = get_models()
  # Reject indices below 1 as well: Python's negative indexing would otherwise
  # silently return a model from the end of the list (index 0 -> last model).
  if not 1 <= model_index <= len(models):
    return None
  entry = models[model_index - 1]
  return entry.get("folder") + entry.get("name")

def save_models(folder_base):
  """Write the model catalogue as JSON to ``folder_base + "models.json"``.

  Args:
    folder_base: Directory prefix, concatenated as-is with the file name
      (so it is expected to end with a path separator).
  """
  catalogue = {"models": get_models()}
  target = folder_base + "models.json"
  with open(target, "w") as handle:
    json.dump(catalogue, handle)

def get_ner_pipe(model):
  """Build a ``"ner"`` pipeline for the given model identifier.

  Args:
    model: Value forwarded as the ``model`` argument of ``pipeline``.

  Returns:
    Whatever ``pipeline(task="ner", model=model)`` returns.
  """
  return pipeline(task="ner", model = model)

In [None]:
# Load the label mapping consumed by get_tag. The context manager guarantees the
# handle is closed even if the JSON is malformed, and the encoding is explicit
# so the result does not depend on the platform default.
with open("tag.json", "r", encoding="utf-8") as file:
  tag = json.load(file)
def get_tag(entity):
  """Map a model label to the output tag configured in the global ``tag`` table.

  The label is matched by substring against a fixed, ordered list of keys;
  the first key contained in ``entity`` wins.

  Args:
    entity: Label string produced by the model.

  Returns:
    ``tag[key]`` for the first matching key, or the literal ``"NEW TAG "``
    when none of the keys occurs in ``entity``.
  """
  known_keys = ("LOC", "PER", "ORG", "OTH", "MISC")
  # Lazy generator: the tag table is only indexed for the first matching key.
  return next((tag[key] for key in known_keys if key in entity), "NEW TAG ")

In [None]:
def get_tokenizer(model):
  """Load the tokenizer for ``model`` with ``add_prefix_space=True``.

  Args:
    model: Identifier forwarded to ``AutoTokenizer.from_pretrained``.

  Returns:
    The tokenizer object returned by ``AutoTokenizer.from_pretrained``.
  """
  return AutoTokenizer.from_pretrained(model, add_prefix_space=True)

def do_grafic(tokenizer):
  """Plot a histogram of token counts for the documents in ``dataset['test']``.

  Args:
    tokenizer: Callable that takes a text and returns a mapping with an
      ``'input_ids'`` entry; the length of that entry is the token count.
  """
  token_length = []
  for sample in dataset['test']:
    encoded = tokenizer(sample.get('text'))
    token_length.append(len(encoded['input_ids']))
  plt.hist(token_length)

def change_start_end(list_values, n_string):
  """Shift the ``"start"`` and ``"end"`` of every entity by ``n_string``.

  The dicts are modified in place.

  Args:
    list_values: Iterable of entity dicts carrying ``"start"`` and ``"end"``.
    n_string: Amount added to both positions.

  Returns:
    The same ``list_values`` object that was passed in.
  """
  for item in list_values:
    for bound in ("start", "end"):
      item[bound] = item.get(bound) + n_string
  return list_values

# Substrings removed from / rewritten in the surface text before it is written,
# applied in this order.
# NOTE(review): these look like tokenizer artefacts (sub-word markers and
# mis-decoded accented letters) — confirm against the models in use.
_WORD_ARTIFACTS = (
  ('Ġ', ' '), ('Ã±', 'ñ'), ('Ã¡', 'á'), ('Ã©', 'é'), ('ÃŃ', 'í'),
  ('Ã³', 'ó'), ('Ãº', 'ú'), ('▁', ' '), ('##', ''),
)


def _clean_word(word):
  """Apply every replacement of _WORD_ARTIFACTS to ``word``, in table order."""
  for raw, clean in _WORD_ARTIFACTS:
    word = word.replace(raw, clean)
  return word


def _ann_row(t, label, start, end, word):
  """Format one annotation line: T<t> TAB <label> <start> <end> TAB <cleaned word>."""
  return ('T' + str(t) + '\t' + str(label) + ' ' + str(start) + ' '
          + str(end) + '\t' + _clean_word(word) + '\n')


def build_file(entity_list, file_name, min_score = 0.5):
  """Merge token-level predictions into spans and write them to ``file_name``.

  Entities whose score is not above ``min_score`` are ignored. A kept entity
  extends the current span when it starts exactly where the previous one
  ended, or one character later with the same mapped tag (then a space is
  inserted between the words). Otherwise the current span is flushed and a
  new one starts. Spans whose mapped tag is ``'OTHER'`` are not written, but
  they still consume an annotation number.

  Args:
    entity_list: Iterable of dicts with ``'score'``, ``'start'``, ``'end'``,
      ``'word'`` and ``'entity'`` keys.
    file_name: Path of the annotation file to create (overwritten).
    min_score: Exclusive lower bound on the score of the entities to keep.

  The file is always created; it is left empty when no entity passes the
  threshold (previously this case raised UnboundLocalError after creating
  the file).
  """
  t = 0
  end_last = -1
  entity_last = ""
  word_last = ""
  start_first = None
  entity_first = None  # stays None until the first entity passes the threshold
  # Context manager so the handle is closed even if an entity is malformed.
  with open(file_name, "w", encoding="utf-8") as file:
    for entity in entity_list:
      if not (entity['score'] > min_score):
        continue
      touches_previous = entity['start'] == end_last or end_last == -1
      if touches_previous or (entity['start'] == (end_last + 1)
                              and get_tag(entity['entity']) == get_tag(entity_last)):
        # Same span: glue sub-word pieces directly, separate words by a space.
        if touches_previous:
          word_last += entity['word']
        else:
          word_last += ' ' + entity['word']
        if end_last == -1:
          start_first = entity['start']
          entity_first = entity['entity']
      else:
        # Gap or tag change: flush the span collected so far and start a new one.
        t += 1
        entity_now = get_tag(entity_first)
        if entity_now != 'OTHER':
          file.write(_ann_row(t, entity_now, start_first, end_last, word_last))
        word_last = entity['word']
        start_first = entity['start']
        entity_first = entity['entity']
      end_last = entity['end']
      entity_last = entity['entity']
    # Flush the trailing span, if any entity was kept at all.
    if entity_first is not None:
      t += 1
      entity_now = get_tag(entity_first)
      if entity_now != 'OTHER':
        file.write(_ann_row(t, entity_now, start_first, end_last, word_last))

def get_values_sents(nlp_var, ner_pipe):
  """Run ``ner_pipe`` sentence by sentence and return document-level offsets.

  Args:
    nlp_var: Object exposing a ``sents`` iterable; ``str()`` of each item is
      the sentence text sent to the pipeline.
    ner_pipe: Callable returning a list of entity dicts with ``"start"`` and
      ``"end"`` positions relative to the text it receives.

  Returns:
    A flat list of the entity dicts, with ``"start"``/``"end"`` shifted by the
    offset of their sentence (the dicts are modified in place).
  """
  entity_list = []
  running_offset = 0
  for sentence in nlp_var.sents:
    sentence_text = str(sentence)
    # Prefer the sentence's own document offset when it exposes one: summing
    # sentence lengths ignores any text between sentences, so the positions
    # drift further with every sentence.
    # NOTE(review): assumes `start_char` is a character offset into the
    # original document, as on spaCy spans — confirm for other back-ends.
    offset = getattr(sentence, "start_char", None)
    if offset is None:
      offset = running_offset
    for entity in ner_pipe(sentence_text):
      entity["start"] = entity.get("start") + offset
      entity["end"] = entity.get("end") + offset
      entity_list.append(entity)
    running_offset = offset + len(sentence_text)
  return entity_list

def get_values_split(text, ner_pipe):
  """Run ``ner_pipe`` on each '.'-separated piece and return document offsets.

  Args:
    text: Full document text.
    ner_pipe: Callable returning a list of entity dicts with ``"start"`` and
      ``"end"`` positions relative to the piece it receives.

  Returns:
    A flat list of the entity dicts, with ``"start"``/``"end"`` shifted so
    they index into ``text`` (the dicts are modified in place).
  """
  entity_list = []
  offset = 0
  for piece in text.split('.'):
    for entity in ner_pipe(str(piece)):
      entity["start"] = entity.get("start") + offset
      entity["end"] = entity.get("end") + offset
      entity_list.append(entity)
    # +1 accounts for the '.' that split() removed; without it every
    # following piece is shifted one more character to the left.
    offset += len(piece) + 1
  return entity_list

def get_values_512(text, ner_pipe, model_index, tokenizer, chunk_size = 512):
  """Run ``ner_pipe`` over fixed-size character chunks of ``text``.

  Args:
    text: Full document text.
    ner_pipe: Callable returning a list of entity dicts with ``"start"`` and
      ``"end"`` positions relative to the chunk it receives.
    model_index: 1-based model index; when select_model() does not know it,
      nothing is processed.
    tokenizer: Accepted for interface compatibility; not used here.
    chunk_size: Number of characters per chunk (default 512, the previously
      hard-coded value).

  Returns:
    A flat list of entity dicts whose positions index into ``text``.

  Raises:
    ValueError: If ``chunk_size`` is smaller than 1.
  """
  # NOTE(review): chunks are measured in characters, not tokens, and a chunk
  # boundary may fall inside a word or entity — confirm this is acceptable.
  if chunk_size < 1:
    raise ValueError("chunk_size must be a positive integer")
  entity_list = []
  if select_model(model_index) is None:
    return entity_list
  for i in range(0, len(text), chunk_size):
    found = ner_pipe(text[i: i + chunk_size])
    # Positions come back relative to the chunk; shift them by its start.
    entity_list.extend(change_start_end(found, i))
  return entity_list

def prepare_file(text, file_name, folder_system, min_score,
                 type_text, ner_pipe, model_index, tokenizer):
  """Extract entities from ``text`` and write ``<folder_system><file_name>.ann``.

  ``type_text`` selects how the text is fed to the pipeline:
    0 - the whole text, after passing through ``nlp``
    1 - sentence by sentence (get_values_sents)
    2 - fixed-size chunks (get_values_512)
    3 - pieces split on '.' (get_values_split)
  Any other value prints a message and extracts nothing.

  The annotation file is only written when at least one entity was found.
  ``model_index`` and ``tokenizer`` are only forwarded in mode 2.
  """
  file_ann_name = folder_system + file_name + ".ann"

  def _extract():
    # NOTE(review): `nlp` is not defined in the visible part of the notebook,
    # so modes 0 and 1 depend on it being created elsewhere — confirm.
    if type_text == 0:
      return ner_pipe(str(nlp(text)))
    if type_text == 1:
      return get_values_sents(nlp(text), ner_pipe = ner_pipe)
    if type_text == 2:
      # Mode meant for the atypical cases (per the original note).
      return get_values_512(text, ner_pipe = ner_pipe,
                            model_index=model_index, tokenizer=tokenizer)
    if type_text == 3:
      return get_values_split(text, ner_pipe = ner_pipe)
    print("Type not exist")
    return []

  found = _extract()
  if len(found) > 0:
    build_file(found, file_ann_name, min_score = min_score)

def builder(min_score, quantity = 1, type_text = 0, model_index = 1, model_type = 1):
  """Run one model over the test split and lay out system/gold folders.

  For each processed document the predicted ``.ann`` goes to
  ``system/model_<model_index>/test<type_text>/`` together with a copy of the
  ``.txt``; the original ``.txt`` and ``.ann`` are copied to ``system/gold/``.
  The model catalogue is saved to ``system/models.json``.

  Args:
    min_score: Score threshold forwarded to prepare_file.
    quantity: Number of documents to attempt; at least one is always
      attempted when the test split is non-empty.
    type_text: Text-feeding mode forwarded to prepare_file.
    model_index: 1-based index resolved by select_model; nothing happens
      when it is unknown.
    model_type: Accepted for interface compatibility; not used here.
  """
  model = select_model(model_index)
  if model is None:
    return
  tokenizer = get_tokenizer(model)
  ner_pipe = get_ner_pipe(model)

  folder_base = "system/"
  folder_system = folder_base+"model_"+str(model_index)+"/test"+str(type_text)+"/"
  folder_gold = folder_base+"gold/"
  # exist_ok avoids the check-then-create pattern and makes re-runs safe.
  Path(folder_system).mkdir(parents=True, exist_ok=True)
  Path(folder_gold).mkdir(parents=True, exist_ok=True)
  save_models(folder_base)
  for text in dataset['test']:
    try:
      # Strip the ".txt" suffix so the sibling ".ann" can be addressed too.
      path_completo = str(text.get('file_name_path'))[:-4]
      prepare_file(text.get('text'), text.get('file'), folder_system,
                   min_score = min_score, type_text = type_text,
                   ner_pipe = ner_pipe, model_index = model_index, tokenizer = tokenizer)
      shutil.copy(path_completo+".txt", folder_gold+text.get('file')+".txt")
      shutil.copy(path_completo+".ann", folder_gold+text.get('file')+".ann")
      shutil.copy(path_completo+".txt", folder_system+text.get('file')+".txt")
    except Exception as exc:
      # Best effort per document, but report why it failed and let
      # KeyboardInterrupt / SystemExit stop the run instead of being swallowed.
      print(f"NO PROCESADO   -   ----   --   {text.get('file')} ({exc!r})")
    quantity -= 1
    print(quantity)
    if quantity < 1:
      break

def builder_all():
  """Run builder() once for every (type_text, model_index) combination.

  Covers type_text 0..3 and model_index 1..24, always with ``min_score=0.0``
  and ``quantity=1``; all model indices are visited for one type_text before
  moving to the next.
  """
  combinations = (
    (type_text, model_index)
    for type_text in range(0, 4)
    for model_index in range(1, 25)
  )
  for type_text, model_index in combinations:
    builder(type_text = type_text, min_score = 0.0, quantity = 1, model_index = model_index)

#Type: {'1':'Normal Transformer','2':'span_marker'}
def builder_all_test(start_model = 1, end_model = 1, text = 2, quantity_test = 1):
  """Run builder() for each model index in ``range(start_model, end_model)``.

  Args:
    start_model: First model index to run (inclusive).
    end_model: Stop index (exclusive) — with the defaults the range is empty
      and nothing runs.
    text: Value passed to builder as ``type_text``.
    quantity_test: Value passed to builder as ``quantity``.
  """
  shared_options = {"min_score": 0.0, "quantity": quantity_test, "type_text": text}
  for model_index in range(start_model, end_model):
    builder(model_index = model_index, **shared_options)

In [None]:
# Run model 1 only (end_model is exclusive) on a single test document, using type_text 2.
builder_all_test(start_model = 1, end_model = 2, text = 2, quantity_test = 1)

In [None]:
def comprimir(source_dir = '/content/system', zip_path = '/content/system.zip'):
  """Zip the .conll, .txt and .ann files found under ``source_dir``.

  Args:
    source_dir: Directory tree to archive (defaults to the previously
      hard-coded ``/content/system``).
    zip_path: Archive to create or overwrite (defaults to the previously
      hard-coded ``/content/system.zip``).

  Entries are stored with paths relative to ``source_dir`` and compressed
  with DEFLATE. Files with any other extension are skipped.
  """
  wanted_suffixes = ('.conll', '.txt', '.ann')
  # Context manager: the archive is finalised and closed even if a file
  # cannot be read part-way through.
  with zipfile.ZipFile(zip_path, 'w') as fantasy_zip:
    for folder, subfolders, filesFolder in os.walk(source_dir):
      for file in filesFolder:
        if file.endswith(wanted_suffixes):
          full_path = os.path.join(folder, file)
          fantasy_zip.write(full_path, os.path.relpath(full_path, source_dir),
                            compress_type = zipfile.ZIP_DEFLATED)

In [None]:
# Bundle the generated /content/system tree into /content/system.zip.
comprimir()