In [3]:
# Mount Google Drive; the last cell of the notebook copies the zipped model into it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Entrenamiento de los modelos BETO (Spanish BERT) y Tulio (Chilean Spanish BERT) para generar embeddings de texto orientados a la búsqueda semántica asimétrica en español

# Configuración de parámetros

In [4]:
# Central run configuration; a later cell writes it to train_config.json next to the checkpoints.
CONFIG = {
    # Base model available on the Hugging Face Hub
    'model_name': 'dccuchile/bert-base-spanish-wwm-cased', # alternatives: dccuchile/tulio-chilean-spanish-bert, dccuchile/bert-base-spanish-wwm-uncased
    'max_seq_length': 512,

    # Training hyper-parameters
    'epochs': 10,
    'warmup_steps': 1000,
    'batch_size': 16,
    'optimizer_params': {'lr': 2e-5},
    'loss': 'mnrl',  # one of {'mnrl', 'mse', 'tl'}; any other value selects CosineSimilarityLoss

    # Dataset
    'dataset_train_size': 120330, # rows taken from the start of the dataset; other values noted by the author: 48133, 4813 / 481335
    'dataset_name': "IIC/ms_marco_es",  # one of {"dariolopez/ms-marco-es-500k", "IIC/ms_marco_es"}

    # General
    'seed': 2023
}

# Instalación de bibliotecas necesarias
* Sentence Transformers: Biblioteca para entrenar modelos de similitud de oraciones.
* Datasets: Biblioteca para utilizar datasets de Hugging Face.

In [5]:
!pip install sentence-transformers datasets

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

# Importación de Bibliotecas

In [6]:
import os
from datetime import datetime

from sentence_transformers import InputExample, SentenceTransformer, models, losses
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

In [7]:
# Train on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Works unchanged on any other CUDA-capable (NVIDIA) machine. When training on macOS, use "mps" instead.

# Seed

In [8]:
import numpy as np


def set_seed(seed):
    '''Seed every random number generator used by the notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and every CUDA device), and switches cuDNN to deterministic mode so
    repeated runs produce the same results.

    Args:
        seed: Integer seed shared by all generators.
    '''
    import random  # local import: the notebook's import cell does not bring it in

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all visible GPUs, not only the currently selected one.
    torch.cuda.manual_seed_all(seed)
    # When running on the cuDNN backend, force deterministic kernels and
    # disable the auto-tuner, which may pick different algorithms per run.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Hash randomisation of the running interpreter is fixed at start-up, so
    # this value only takes effect in processes launched from here on.
    os.environ['PYTHONHASHSEED'] = str(seed)

# Apply the configured seed before the model and the data loader are created.
set_seed(CONFIG['seed'])

# Modelo

In [9]:
# Token-level encoder: the pretrained checkpoint together with its own tokenizer.
word_embedding_model = models.Transformer(
    CONFIG['model_name'],
    tokenizer_name_or_path=CONFIG['model_name'],
    max_seq_length=CONFIG['max_seq_length'],
)

# Pooling layer sized to the encoder output; it turns token embeddings into one sentence vector.
pooling_model = models.Pooling(
    word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(),
)

# Chain encoder and pooling into a sentence-embedding model on the selected device.
model = SentenceTransformer(
    device=device,
    modules=[word_embedding_model, pooling_model],
)

config.json:   0%|          | 0.00/648 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/364 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/242k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/480k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

In [10]:
# Record the encoder's embedding size in CONFIG so it is stored with the rest of the run configuration.
CONFIG['length_embedding'] = word_embedding_model.get_word_embedding_dimension()

In [11]:
# Show the final configuration, including the derived 'length_embedding' entry.
print(CONFIG)

{'model_name': 'dccuchile/bert-base-spanish-wwm-cased', 'max_seq_length': 512, 'epochs': 10, 'warmup_steps': 1000, 'batch_size': 16, 'optimizer_params': {'lr': 2e-05}, 'loss': 'mnrl', 'dataset_train_size': 120330, 'dataset_name': 'IIC/ms_marco_es', 'seed': 2023, 'length_embedding': 768}


In [12]:
# Show the module stack (Transformer encoder followed by Pooling) and its settings.
print(model)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)


# Cargar Dataset

In [13]:
%%time
import datasets

# Load the configured Hugging Face dataset; no split is requested, so all of its splits are returned.
marco_es = datasets.load_dataset(CONFIG['dataset_name'])

Downloading readme:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/981 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/96.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/388055 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/47535 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/45745 [00:00<?, ? examples/s]

CPU times: user 3.52 s, sys: 883 ms, total: 4.4 s
Wall time: 13.2 s


In [14]:
from datasets import concatenate_datasets

# Collapse the loaded splits into a single flat Dataset.
# The isinstance guard keeps this cell re-runnable: once marco_es has been
# rebound to a flat Dataset, indexing it by split name again would fail.
if isinstance(marco_es, datasets.DatasetDict):
    if CONFIG['dataset_name'] == 'dariolopez/ms-marco-es-500k':
        marco_es = marco_es['train']
    else:  # "IIC/ms_marco_es": pool train, validation and test together
        marco_es = concatenate_datasets(
            [marco_es[split] for split in ('train', 'validation', 'test')]
        )

In [15]:
# Show the column names and the row count of the flattened dataset.
print(marco_es)

Dataset({
    features: ['query', 'passages', 'labels'],
    num_rows: 481335
})


# Preparación para el entrenamiento

In [16]:
# Take the first N rows, but never ask for more rows than the dataset holds:
# Dataset.select raises on out-of-range indices.
train_size = min(CONFIG['dataset_train_size'], len(marco_es))
train_rows = marco_es.select(range(train_size))

if CONFIG['dataset_name'] == 'dariolopez/ms-marco-es-500k':
    # (query, positive, negative) triplets - https://huggingface.co/datasets/dariolopez/ms-marco-es
    train_samples = [
        InputExample(texts=[row['query'], row['positive'], row['negative']])
        for row in train_rows
    ]
else:
    # (query, passage) pairs carrying a label - https://huggingface.co/datasets/IIC/ms_marco_es
    # NOTE(review): with the 'mnrl' loss every pair is presumably treated as a
    # positive match regardless of its label - confirm the label semantics of this dataset.
    train_samples = [
        InputExample(texts=[row['query'], row['passages']], label=row['labels'])
        for row in train_rows
    ]

In [17]:
# Report how many training examples were built.
print(f"Largo de set de datos de entrenamiento: {len(train_samples)}")

Largo de set de datos de entrenamiento: 120330


In [18]:
# Serve the examples in shuffled batches of the configured size.
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=CONFIG['batch_size'])

In [19]:
# NOTE(author): this cell was flagged for removal ("eliminar").
import json

# Each run gets its own timestamped directory under ./output.
now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_save_path = os.path.join('output', now)
os.makedirs(model_save_path, exist_ok=True)

# Store the configuration used for this run next to its checkpoints.
config_path = os.path.join(model_save_path, 'train_config.json')
with open(config_path, 'w') as config_file:
    json.dump(CONFIG, config_file, indent=4)

In [20]:
import gc

def free_memory(*_callback_args):
    """Release unreachable Python objects and PyTorch's cached CUDA memory.

    Any positional arguments are accepted and ignored, so the function can be
    called directly or used as the ``callback`` of ``SentenceTransformer.fit``,
    which invokes its callback with ``(score, epoch, steps)``.

    Returns:
        None
    """
    # Collect first: tensors that just became unreachable go back to the
    # caching allocator, and only then can their blocks be released.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

In [21]:
# Map every supported CONFIG['loss'] key to its loss class; any other key
# falls back to CosineSimilarityLoss.
loss_classes = {
    'mnrl': losses.MultipleNegativesRankingLoss,
    'mse': losses.MarginMSELoss,
    'tl': losses.TripletLoss,
}
selected_loss_class = loss_classes.get(CONFIG['loss'], losses.CosineSimilarityLoss)
train_loss = selected_loss_class(model=model)

In [22]:
# Show which loss was selected and the model it wraps.
print(train_loss)

MultipleNegativesRankingLoss(
  (model): SentenceTransformer(
    (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
    (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  )
  (cross_entropy_loss): CrossEntropyLoss()
)


In [23]:
# Same device selection as in the earlier cell, repeated here so the value is displayed right before training.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

# Entrenamiento

In [None]:
# Fine-tune the model on the single (dataloader, loss) training objective.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=CONFIG['epochs'],
    warmup_steps=CONFIG['warmup_steps'],
    # Apply the configured optimizer settings so the learning rate stored in
    # train_config.json is the one actually used for training.
    optimizer_params=CONFIG['optimizer_params'],
    # NOTE(review): no evaluator or output_path is passed, so save_best_model
    # and callback presumably never trigger - confirm against the fit() docs.
    save_best_model=True,
    show_progress_bar=True,
    use_amp=True,  # automatic mixed precision
    callback=free_memory,
    # len(train_dataloader) is the number of batches per epoch, so a
    # checkpoint is written once per epoch.
    checkpoint_save_steps=len(train_dataloader),
    checkpoint_path=model_save_path,
)

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/7521 [00:00<?, ?it/s]

Iteration:   0%|          | 0/7521 [00:00<?, ?it/s]

Iteration:   0%|          | 0/7521 [00:00<?, ?it/s]

Iteration:   0%|          | 0/7521 [00:00<?, ?it/s]

Iteration:   0%|          | 0/7521 [00:00<?, ?it/s]

Iteration:   0%|          | 0/7521 [00:00<?, ?it/s]

In [None]:
# Save the trained model to a fixed directory; the zip cell below archives this same path.
model.save('/content/output/final')

In [None]:
# Show the timestamped directory that holds train_config.json and the training checkpoints.
print(model_save_path)

In [None]:
# Archive the saved final model directory into a single zip file.
!zip -r beto-mnrl-v3.zip /content/output/final/

In [None]:
# Copy the archive to the mounted Google Drive.
!cp beto-mnrl-v3.zip /content/drive/MyDrive/