In [1]:
# Instalaciones
!pip install pytorch-pretrained-bert
!pip install seqeval
!pip install langdetect


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch-pretrained-bert
  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123 kB)
[K     |████████████████████████████████| 123 kB 6.9 MB/s 
Collecting boto3
  Downloading boto3-1.24.66-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 53.1 MB/s 
Collecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting botocore<1.28.0,>=1.27.66
  Downloading botocore-1.27.66-py3-none-any.whl (9.1 MB)
[K     |████████████████████████████████| 9.1 MB 44.8 MB/s 
[?25hCollecting s3transfer<0.7.0,>=0.6.0
  Downloading s3transfer-0.6.0-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 8.2 MB/s 
[?25hCollecting urllib3<1.27,>=1.25.4
  Downloading urllib3-1.26.12-py2.py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 41.6 MB/s 
  Downloading urllib3-1.25.11-py2.py3-none-a

In [None]:
# Importaciones
# GENERAL Y PREPROCESADO
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import spacy
!python -m spacy download es_core_news_sm
import es_core_news_sm
from langdetect import detect
import os.path
import datetime
# BERT
import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertForTokenClassification, BertAdam
from seqeval.metrics import f1_score



In [None]:
# Mount Google Drive so the corpus archive stored there can be reached from
# this runtime. `google.colab` only exists inside Colab, so this cell is
# Colab-specific.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
!unzip /content/drive/MyDrive/COLAB\ -\ TFM/corpora-master.zip

## Funciones

In [None]:
def _detecta_idioma(frase):
  """Return the language code langdetect assigns to ``frase``, or None.

  langdetect raises ``LangDetectException`` when the text has no usable
  features (for example only digits or punctuation). Such text is reported
  as ``None`` so one odd line does not abort the whole classification.
  """
  # Imported here because the notebook's import cell only brings in `detect`.
  from langdetect.lang_detect_exception import LangDetectException
  try:
    return detect(frase)
  except LangDetectException:
    return None


def clasifica_data(file):
  """Split the lines of a text file into English and Spanish sentences.

  Args:
    file: path of a plain-text corpus with one sentence per line.

  Returns:
    A tuple ``(ingles, espanol)`` of lists holding the lines detected as
    ``"en"`` and ``"es"`` respectively, in file order. Blank lines and lines
    detected as any other language (or not detectable at all) are dropped.
  """
  ingles = []
  espanol = []
  # Explicit encoding so accented characters do not depend on the locale.
  with open(file, "r", encoding="utf-8") as f:
    texto = f.read().split("\n")
  for frase in texto:
    # The split leaves an empty string after a trailing newline; language
    # detection cannot handle empty text, so skip blank lines up front.
    if not frase.strip():
      continue
    idioma = _detecta_idioma(frase)
    if idioma == "en":
      ingles.append(frase)
    elif idioma == "es":
      espanol.append(frase)
  return ingles, espanol

  
def _tokeniza_corpus(texto, nlp):
  """Tokenize every sentence with ``nlp`` and return one row per kept token.

  The result has the columns "Sentence #", "Word In Sentence #", "Word"
  (lower-cased token text) and "POS". Tokens whose text is empty or a single
  space and tokens tagged "PUNCT" are removed; the index is not reset.
  """
  registros = [
      [i, j, token.lower_, token.pos_]
      for i, frase in enumerate([nlp(x) for x in texto])
      for j, token in enumerate(frase)
  ]
  # Passing the column names to the constructor also works for empty input.
  df_train = pd.DataFrame(
      registros, columns=["Sentence #", "Word In Sentence #", "Word", "POS"])
  es_hueco = df_train["Word"].isin([" ", ""])
  no_es_puntuacion = df_train["POS"] != "PUNCT"
  # One combined mask instead of chained indexing, which made pandas reindex
  # the second mask and emit a warning.
  return df_train[~es_hueco & no_es_puntuacion]


def _carga_terminos_anotados(ann, nlp):
  """Read an annotation file and return one row per annotated token.

  Each line of ``ann`` is split on tabs into (id, description, text). Only
  lines whose id contains "T" are kept. The entity type is the first
  space-separated field of the description. The text is tokenized with
  ``nlp``: the first token of a term is tagged "B-<type>", the rest
  "I-<type>".

  Returns a DataFrame with the columns "num_termino", "Word", "Tag" and
  "unico" (term id concatenated with the token text).
  """
  with open(ann, "r", encoding="utf-8") as f:
    texto_ann = [x.split("\t") for x in f.read().split("\n")]
  df_ann = pd.DataFrame(texto_ann)
  df_ann.columns = ["num_termino", "desc", "Word"]
  df_ann_terminos = df_ann[df_ann["num_termino"].str.contains("T")].copy()
  df_ann_terminos["TipoEntidad"] = df_ann_terminos["desc"].str.split(" ").apply(lambda x: x[0])
  df_ann_terminos["Word_list"] = df_ann_terminos["Word"].apply(lambda x: list(nlp(x)))
  # explode() repeats the original row label for every token of a term.
  df_ann_terminos = df_ann_terminos.explode("Word_list")
  df_ann_terminos["rn"] = df_ann_terminos.groupby("num_termino")["Word"].cumcount() + 1
  # Build the tag row by row instead of relying on index alignment over the
  # duplicated labels left by explode().
  prefijo = df_ann_terminos["rn"].eq(1).map({True: "B-", False: "I-"})
  df_ann_terminos["Tag"] = prefijo + df_ann_terminos["TipoEntidad"]
  # NOTE(review): "Word_list" holds the objects yielded by nlp(x), not plain
  # strings, so this comparison against " " / "" may never match - confirm.
  df_ann_terminos = df_ann_terminos[~df_ann_terminos["Word_list"].isin([" ", ""])]
  # Keep a single record per term number, token and tag.
  df_ann_cruce = df_ann_terminos[["num_termino", "Word_list", "Tag"]].drop_duplicates().reset_index(drop=True)
  df_ann_cruce.columns = ["num_termino", "Word", "Tag"]
  df_ann_cruce["unico"] = df_ann_cruce["num_termino"] + df_ann_cruce["Word"].apply(lambda x: str(x))
  return df_ann_cruce


def _alinea_anotaciones(df_train, df_ann_cruce, ventana_busqueda=20, frecuencia_purga=100):
  """Tag every corpus token by walking corpus and annotations in parallel.

  For each corpus token, only the first ``ventana_busqueda`` pending
  annotation tokens are searched. The comparison is case-insensitive and
  ignores dots. A matched annotation token is consumed; unmatched corpus
  tokens get the tag "O". Every ``frecuencia_purga`` corpus tokens, the
  pending annotation tokens whose term number is lower than the last matched
  term are discarded so they cannot block the search window.

  Returns ``(data, MAX_LEN)``: ``MAX_LEN`` is the 95th percentile of tokens
  per sentence and ``data`` ("Sentence #", "Word", "POS", "Tag") keeps only
  the sentences with at most ``MAX_LEN`` tokens.
  """
  registro = []
  # No term matched yet; nothing may be purged until the first match.
  max_termino = None
  for num_linea, (_, fila) in enumerate(df_train.iterrows()):
    palabra = fila["Word"]
    df_lookup = df_ann_cruce.head(ventana_busqueda)
    # regex=False: the dot must be removed literally on every pandas version.
    candidatas = df_lookup["Word"].\
                    apply(lambda x: str(x).lower()).\
                    str.replace(".", "", regex=False)
    ann_match = df_lookup[candidatas == palabra.replace(".", "")].head(1)
    if len(ann_match) > 0:
      registro.append([fila["Sentence #"], palabra, fila["POS"],
                       ann_match["Tag"].iloc[0], ann_match["num_termino"].iloc[0]])
      df_ann_cruce = df_ann_cruce.loc[df_ann_cruce["unico"] != ann_match["unico"].iloc[0], :]
      max_termino = ann_match["num_termino"].iloc[0]
    else:
      registro.append([fila["Sentence #"], palabra, fila["POS"], "O", "NA"])
    es_punto_de_purga = num_linea % frecuencia_purga == 0 and num_linea > 0
    # The original indexed the initial sentinel (-1) as a string here and
    # crashed when the first purge point came before any match.
    if es_punto_de_purga and max_termino is not None:
      limite = int(max_termino[1:])
      numeros = df_ann_cruce["num_termino"].apply(lambda x: int(x[1:]))
      df_ann_cruce = df_ann_cruce.loc[numeros >= limite, :]
  df = pd.DataFrame(registro, columns=["Sentence #", "Word", "POS", "Tag", "num_concepto"])
  data = df[["Sentence #", "Word", "POS", "Tag"]]
  longitudes = data["Sentence #"].value_counts()
  MAX_LEN = int(longitudes.quantile(0.95))
  frases_validas = longitudes[longitudes <= MAX_LEN].index.tolist()
  data = data[data["Sentence #"].isin(frases_validas)]
  return data, MAX_LEN


def _data_preprocessing(texto, ann, nlp):
  """Shared implementation behind the Spanish and English entry points."""
  df_train = _tokeniza_corpus(texto, nlp)
  if ann is None:
    return df_train[["Word", "POS"]]
  df_ann_cruce = _carga_terminos_anotados(ann, nlp)
  return _alinea_anotaciones(df_train, df_ann_cruce)


def data_preprocessing_es(texto, ann = None):
  """Tokenize Spanish sentences and, optionally, tag them from an annotation file.

  Args:
    texto: iterable of sentences (strings).
    ann: path of the tab-separated annotation file, or None.

  Returns:
    Without ``ann``: a DataFrame with the columns "Word" and "POS".
    With ``ann``: a tuple ``(data, MAX_LEN)`` where ``data`` has the columns
    "Sentence #", "Word", "POS" and "Tag", and ``MAX_LEN`` is the 95th
    percentile of tokens per sentence (longer sentences are dropped).
  """
  return _data_preprocessing(texto, ann, es_core_news_sm.load())


def data_preprocessing_en(texto, ann = None):
  """English counterpart of ``data_preprocessing_es``.

  Identical processing, using the spaCy pipeline "en_core_web_sm".

  Args:
    texto: iterable of sentences (strings).
    ann: path of the tab-separated annotation file, or None.

  Returns:
    Without ``ann``: a DataFrame with the columns "Word" and "POS".
    With ``ann``: a tuple ``(data, MAX_LEN)`` as described for
    ``data_preprocessing_es``.
  """
  return _data_preprocessing(texto, ann, spacy.load("en_core_web_sm"))


class SentenceGetter(object):
    """Group a token-level DataFrame into sentences.

    ``data`` must have the columns "Sentence #", "Word", "POS" and "Tag".
    After construction, ``sentences`` is a list with one entry per distinct
    "Sentence #" value (in groupby order); each entry is a list of
    ``(word, pos, tag)`` tuples.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the sentence labelled "Sentence: <n_sent>" and advance.

        Returns None when no sentence carries that label. This is always the
        case when "Sentence #" holds plain integers rather than
        "Sentence: N" strings; iterate ``sentences`` instead in that case.
        """
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
        except KeyError:
            # Only a missing label means "no more sentences"; any other
            # error is a real problem and must not be hidden.
            return None
        self.n_sent += 1
        return s

def bert_formating(data, MAX_LEN, test_size=0.2, batch_size=16):
  """Turn a tagged token table into padded tensors and train/validation loaders.

  Args:
    data: DataFrame with the columns "Sentence #", "Word", "POS" and "Tag".
      The tag "O" must be present, because it is used to pad the labels.
    MAX_LEN: length every sentence is padded to. If a sentence is longer,
      the length is raised to fit it (with a printed warning).
    test_size: fraction of the sentences held out for validation.
    batch_size: batch size of both data loaders.

  Returns:
    The tuple ``(train_data, train_sampler, train_dataloader, valid_data,
    valid_sampler, valid_dataloader, tag2idx, device, tags_vals, idx2word,
    val_inputs)``.

  NOTE(review): word ids come from a vocabulary built from ``data`` itself
  (0 = "PAD", 1 = "UNK"), not from a BERT tokenizer - confirm that this is
  what the downstream pretrained model is meant to receive.
  """
  getter = SentenceGetter(data)
  sentences = [[s[0] for s in sent] for sent in getter.sentences]
  labels = [[s[2] for s in sent] for sent in getter.sentences]
  # Sorted, so that the tag/word -> id mappings are the same in every kernel
  # session. With a bare set() the order depends on string hashing, and a
  # checkpoint reloaded later would see a different label mapping.
  tags_vals = sorted(set(data["Tag"].values))
  tag2idx = {t: i for i, t in enumerate(tags_vals)}
  words = sorted(set(data["Word"].values))
  word2idx = {w: i + 2 for i, w in enumerate(words)}
  word2idx["UNK"] = 1
  word2idx["PAD"] = 0
  idx2word = {i: w for w, i in word2idx.items()}
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  tokens_ids = [[word2idx[w] for w in s] for s in sentences]
  # Settle the final length BEFORE padding anything; the original enlarged
  # MAX_LEN after padding the inputs, leaving inputs and tags with
  # different widths.
  MAX_LEN = int(MAX_LEN)
  longest = max((len(ids) for ids in tokens_ids), default=0)
  if longest > MAX_LEN:
    print("need more max_len - defect after filtering")
    MAX_LEN = longest
  input_ids = pad_sequences(tokens_ids,
                            maxlen=MAX_LEN, dtype="int64", truncating="post", padding="post")
  tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                       maxlen=MAX_LEN, value=tag2idx["O"], padding="post",
                       dtype="int64", truncating="post")
  # 1.0 for real tokens, 0.0 for padding (id 0).
  attention_masks = (input_ids > 0).astype("float32")
  # A single call splits the three arrays with the same indices.
  tr_inputs, val_inputs, tr_tags, val_tags, tr_masks, val_masks = train_test_split(
      input_ids, tags, attention_masks, random_state=2022, test_size=test_size)
  tr_inputs = torch.tensor(tr_inputs)
  val_inputs = torch.tensor(val_inputs)
  tr_tags = torch.tensor(tr_tags)
  val_tags = torch.tensor(val_tags)
  tr_masks = torch.tensor(tr_masks)
  val_masks = torch.tensor(val_masks)

  train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
  train_sampler = RandomSampler(train_data)
  train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

  valid_data = TensorDataset(val_inputs, val_masks, val_tags)
  valid_sampler = SequentialSampler(valid_data)
  valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_size)
  return train_data, train_sampler, train_dataloader, valid_data, valid_sampler,\
            valid_dataloader, tag2idx, device, tags_vals, idx2word, val_inputs
def flat_accuracy(preds, labels):
    """Fraction of positions whose arg-max prediction equals the label.

    ``preds`` is reduced with arg-max over axis 2 and both arrays are
    flattened, so every position of every sequence counts once.
    """
    predicted = np.argmax(preds, axis=2).reshape(-1)
    expected = labels.reshape(-1)
    hits = np.equal(predicted, expected).sum()
    return hits / expected.size

def model_creation(tag2idx, device, FULL_FINETUNING = True, reentreno= False, save_path = ""):
  """Build the token-classification model and its Adam optimizer.

  Args:
    tag2idx: tag -> id mapping; its size is the number of output labels.
    device: torch device the model is moved to.
    FULL_FINETUNING: when True every parameter is optimized, otherwise only
      the parameters of ``model.classifier``.
    reentreno: when True the weights stored in ``save_path`` are loaded on
      top of the pretrained ones.
    save_path: checkpoint (state dict) to load when ``reentreno`` is True.

  Returns:
    ``(model, optimizer)``.
  """
  model = BertForTokenClassification.from_pretrained(u"bert-base-uncased", num_labels=len(tag2idx))
  # .to() also handles devices written as "cuda:0", which the previous
  # `device == torch.device("cuda")` comparison did not match.
  model.to(device)
  if reentreno:
    print("Loading existing model...")
    # map_location lets a checkpoint saved on GPU be reloaded on CPU.
    model.load_state_dict(torch.load(save_path, map_location=device))
  if FULL_FINETUNING:
      param_optimizer = list(model.named_parameters())
      # Biases and layer-norm parameters are excluded from weight decay.
      # 'gamma'/'beta' are kept from the original list; 'LayerNorm.weight'
      # covers the layer-norm scale where it is named that way.
      no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
      # torch.optim.Adam reads the option 'weight_decay'; the previous key
      # 'weight_decay_rate' is not an Adam option, so no decay was applied.
      optimizer_grouped_parameters = [
          {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
          'weight_decay': 0.01},
          {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
          'weight_decay': 0.0}
      ]
  else:
      param_optimizer = list(model.classifier.named_parameters())
      optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
  optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)
  return model, optimizer

def training(model, optimizer, train_dataloader, valid_dataloader, tags_vals, idx2word, epochs = 200,
             max_grad_norm = 1.0, save_path = "./bert1"):
  """Fine-tune ``model``, validating and checkpointing after every epoch.

  After each epoch the validation F1 is computed over all positions. When
  it equals the best F1 seen so far, the model state dict is saved to
  ``save_path``. Training stops early once the best F1 (rounded to two
  decimals) exceeds 0.1 and the internal counter started at the last best
  epoch exceeds 10.

  Args:
    model: called as ``model(ids, token_type_ids=None, attention_mask=...,
      labels=...)`` to get the loss and without ``labels`` to get logits.
    optimizer: optimizer over the model parameters.
    train_dataloader, valid_dataloader: loaders yielding
      ``(input_ids, attention_mask, labels)`` batches.
    tags_vals: list mapping a label id to its tag string.
    idx2word: unused; kept so existing calls keep working.
    epochs: maximum number of epochs.
    max_grad_norm: gradient-clipping threshold.
    save_path: where the best state dict is written.

  Returns:
    0 when early stopping triggers, otherwise None.
  """
  # Use the device the model already lives on instead of depending on a
  # notebook-level `device` variable that is not one of the parameters.
  device = next(model.parameters()).device
  train_f1 = []
  contador_max = -1
  for _ in trange(epochs, desc="Epoch"):
      # TRAIN loop
      model.train()
      tr_loss = 0
      nb_tr_steps = 0
      for batch in train_dataloader:
          b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
          # forward pass (returns the loss because labels are given)
          loss = model(b_input_ids, token_type_ids=None,
                      attention_mask=b_input_mask, labels=b_labels)
          loss.backward()
          tr_loss += loss.item()
          nb_tr_steps += 1
          # clip after backward and before the optimizer step
          torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
          optimizer.step()
          model.zero_grad()
      print("Train loss: {}".format(tr_loss/nb_tr_steps))
      # VALIDATION on validation set
      model.eval()
      eval_loss, eval_accuracy = 0, 0
      nb_eval_steps = 0
      predictions, true_labels = [], []
      for batch in valid_dataloader:
          b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
          with torch.no_grad():
              # Two passes: with labels the model yields the loss, without
              # them it yields the logits.
              tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                                    attention_mask=b_input_mask, labels=b_labels)
              logits = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)
          logits = logits.detach().cpu().numpy()
          label_ids = b_labels.to('cpu').numpy()

          predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
          true_labels.append(label_ids)

          eval_loss += tmp_eval_loss.mean().item()
          eval_accuracy += flat_accuracy(logits, label_ids)
          nb_eval_steps += 1
      eval_loss = eval_loss/nb_eval_steps
      print("Validation loss: {}".format(eval_loss))
      print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
      # Both tag lists are flattened over all sentences and positions.
      pred_tags = [tags_vals[p_i] for p in predictions for p_i in p]
      valid_tags = [tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
      f1 = f1_score([pred_tags], [valid_tags])
      train_f1.append(f1)

      max_f1 = max(train_f1)
      if f1 == max_f1:
        # New (or tied) best epoch: restart the counter and checkpoint.
        contador_max = 1
        torch.save(model.state_dict(), save_path)
      if contador_max > 0:
        contador_max += 1
      print("F1-Score: " + str(train_f1[-1]))
      if round(max_f1, 2) > 0.1 and contador_max > 10 :
        print("Early stopping...")
        return 0
def evaluate(model, valid_dataloader, tag2idx, device, tags_vals, idx2word, val_inputs, save_path = "./bert1",
             guarda_resultado="/content/drive/MyDrive/COLAB - TFM/resultado_entrenamiento1.csv"):
  """Run the checkpoint in ``save_path`` over a dataloader and dump the result.

  A fresh pretrained model is created and the state dict stored in
  ``save_path`` is loaded into it; the ``model`` argument itself is not
  used. One row per non-padding token (predicted tag, true tag, word) is
  written to ``guarda_resultado`` as a ";"-separated CSV.

  Args:
    model: ignored (kept so existing calls keep working).
    valid_dataloader: loader yielding ``(input_ids, attention_mask, labels)``.
    tag2idx: tag -> id mapping; its size is the number of output labels.
    device: torch device used for inference.
    tags_vals: list mapping a label id to its tag string.
    idx2word: word id -> word mapping.
    val_inputs: ignored; the words are taken from the evaluated batches
      (kept so existing calls keep working).
    save_path: checkpoint (state dict) to evaluate.
    guarda_resultado: path of the CSV to write.

  Returns:
    ``(pred_tags, valid_tags, valid_inputs)``: per-sentence lists of
    predicted tags, true tags and words (padding included).
  """
  model = BertForTokenClassification.from_pretrained(u"bert-base-uncased", num_labels=len(tag2idx))
  # map_location lets a checkpoint saved on GPU be evaluated on CPU.
  model.load_state_dict(torch.load(save_path, map_location=device))
  model.to(device)
  model.eval()
  predictions = []
  true_labels = []
  true_inputs = []

  print(len(valid_dataloader))
  for batch in tqdm(valid_dataloader):
      b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
      with torch.no_grad():
          logits = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask)
      logits = logits.detach().cpu().numpy()
      predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
      true_labels.extend(b_labels.to('cpu').numpy())
      true_inputs.extend(b_input_ids.to('cpu').numpy())

  pred_tags = [[tags_vals[p_i] for p_i in p] for p in predictions]
  valid_tags = [[tags_vals[l_i] for l_i in fila] for fila in true_labels]
  valid_inputs = [[idx2word[int(w_i)] for w_i in fila] for fila in true_inputs]

  # Pair every prediction with the word of the batch it was computed on.
  # The previous version paired them with `val_inputs`, which is wrong
  # whenever the dataloader is not the (unshuffled) validation one.
  registro_resultado = [
      [pred, real, palabra]
      for preds, reales, palabras in zip(pred_tags, valid_tags, valid_inputs)
      for pred, real, palabra in zip(preds, reales, palabras)
  ]
  df_alldata = pd.DataFrame(registro_resultado,
                            columns=["pred_tags", "valid_tags", "val_inputs"])
  df_alldata[df_alldata["val_inputs"] != "PAD"].\
        to_csv(guarda_resultado,sep=";", encoding = "utf-8")
  return pred_tags, valid_tags, valid_inputs




## Entrenamiento

In [None]:

# Drive folder used as the base path for models, logs and result files.
raiz = "/content/drive/MyDrive/COLAB - TFM/"
# NOTE(review): `sufijo` and `archivos` are not referenced by any cell visible
# in this notebook - presumably inputs for building the corpus/annotation
# pairs; confirm or remove.
sufijo = "_preprocessing.csv"
archivos = [("wikinews.25.txt","wikinews.75.txt","wikinews.300.es.txt"),
            ("medline.25.txt","medline.75.txt","medline.1200.es.txt"),
            ("cord.50.txt","cord.150.txt")]

In [None]:

# Checkpoint file names, one per language and sentence-length bucket.
modelo_en_len0_25 = "bert_en_len_0_25"
modelo_en_len25_50 = "bert_en_len_25_50"

modelo_es_len0_25 = "bert_es_len_0_25"
modelo_es_len25_50 = "bert_es_len_25_50"

raiz = "/content/drive/MyDrive/COLAB - TFM/"

# Every (corpus, model) combination processed is appended to this log.
log_file = "/content/drive/MyDrive/COLAB - TFM/full_training.log"

NUM_EPOCHS = 100
# NOTE(review): `corpus_anotacion` is not defined in any visible cell, so this
# cell fails on a fresh kernel. From its use below it is iterated as pairs of
# (corpus text path, annotation file path) - confirm and define it explicitly.
for corpus, anotacion in corpus_anotacion:
  # Prefix of the result files: corpus file name + "_resultado".
  resultado = corpus.split("/")[-1]+"_resultado"
  ingles, espanol = clasifica_data(file=corpus)
  if len(ingles) > 0:
    df_ingles_analiza, MAX_LEN = data_preprocessing_en(ingles, ann = anotacion)
    # Bucket the sentences by token count: <= 25 and 26..50. Longer sentences
    # get no bucket (len_grupo stays NaN).
    df_tmp = df_ingles_analiza.groupby("Sentence #").Word.count().reset_index()
    df_tmp["longitud"] = df_tmp["Word"].astype(int)
    df_tmp.loc[df_tmp["longitud"] <= 25, "len_grupo"] = "Menos25"
    df_tmp.loc[(df_tmp["longitud"] > 25) & (df_tmp["longitud"] <= 50), "len_grupo"] = "entre25_50"
    df_cruce_ingles = pd.merge(df_ingles_analiza, df_tmp[["Sentence #", "len_grupo"]], on=["Sentence #"], how="left")
  else:
    df_cruce_ingles = pd.DataFrame(columns=["Sentence #", "Word", "POS", "Tag", "len_grupo"])
  if len(espanol) > 0:
    df_espanol_analiza, MAX_LEN = data_preprocessing_es(espanol, ann = anotacion)
    # Same bucketing as for English.
    df_tmp = df_espanol_analiza.groupby("Sentence #").Word.count().reset_index()
    df_tmp["longitud"] = df_tmp["Word"].astype(int)
    df_tmp.loc[df_tmp["longitud"] <= 25, "len_grupo"] = "Menos25"
    df_tmp.loc[(df_tmp["longitud"] > 25) & (df_tmp["longitud"] <= 50), "len_grupo"] = "entre25_50"
    df_cruce_espanol = pd.merge(df_espanol_analiza, df_tmp[["Sentence #", "len_grupo"]], on=["Sentence #"], how="left")
  else:
    df_cruce_espanol = pd.DataFrame(columns=["Sentence #", "Word", "POS", "Tag", "len_grupo"])
  # Each entry is [data subset, checkpoint path, padding length, language].
  # Only the Spanish 26..50 bucket is active, and it is further restricted to
  # the sentence ids found in the first 25 rows of the table. The loop
  # variable MAX_LEN replaces the value returned by the preprocessing above.
  for data, modelo, MAX_LEN, lan in ([df_cruce_espanol.loc[(df_cruce_espanol["len_grupo"] == "entre25_50") & (df_cruce_espanol["Sentence #"].isin(df_cruce_espanol["Sentence #"].head(25).values)),["Sentence #", "Word", "POS", "Tag"]], raiz+modelo_es_len25_50, 50, "es"],
                                #[df_cruce_espanol.loc[df_cruce_espanol["len_grupo"] == "Menos25",["Sentence #", "Word", "POS", "Tag"]], raiz+modelo_es_len0_25, 25, "es"],
                                #[df_cruce_ingles.loc[df_cruce_ingles["len_grupo"] == "entre25_50",["Sentence #", "Word", "POS", "Tag"]], raiz+modelo_en_len25_50, 50, "en"],
                                #[df_cruce_ingles.loc[df_cruce_ingles["len_grupo"] == "mas50",["Sentence #", "Word", "POS", "Tag"]], raiz+modelo_en_len0_25, 25, "en"]
                                ):
    with open(log_file, "a+") as f:
      f.write(";".join([str(datetime.datetime.now()), corpus, modelo, "\n"]))
    if len(data) > 0:
      print(modelo)
      print(len(data))
      train_data, train_sampler, train_dataloader, valid_data, valid_sampler, valid_dataloader, tag2idx, device, tags_vals, idx2word, val_inputs = bert_formating(data, MAX_LEN, test_size=0.2)
      # Resume from the checkpoint when one already exists at this path.
      if os.path.exists(modelo):
        with open(log_file, "a+") as f:
          f.write("Escojo modelo existente: " +  modelo +"\n")
        model, optimizer = model_creation(tag2idx, device, FULL_FINETUNING = True, reentreno= True, save_path = modelo)
      else:
        model, optimizer = model_creation(tag2idx, device, FULL_FINETUNING = True)
      training(model, optimizer, train_dataloader, valid_dataloader, tags_vals, idx2word, epochs = NUM_EPOCHS,
                  max_grad_norm = 1.0, save_path = modelo)
      # First pass: validation split -> "<resultado>_validacion_<MAX_LEN>_<lan>.csv".
      pred_tags, valid_tags, valid_inputs = evaluate(model, valid_dataloader, tag2idx, device, tags_vals, idx2word, val_inputs, save_path = modelo,
                  guarda_resultado=raiz+resultado+"_validacion_"+str(MAX_LEN)+"_"+lan+".csv")
      # Second pass: training split -> "<resultado>_entrenamiento_...". Note
      # that `val_inputs` passed here is still the validation tensor.
      pred_tags, valid_tags, valid_inputs = evaluate(model, train_dataloader, tag2idx, device, tags_vals, idx2word, val_inputs, save_path = modelo,
                  guarda_resultado=raiz+resultado+"_entrenamiento_"+str(MAX_LEN)+"_"+lan+".csv")
    else:
      print("No hay datos")
