# Fine tune and evaluate a Sentence Transformers Model in Spanish

In [None]:
%%capture
!pip install sentence-transformers transformers


## How to prepare your dataset for training a Sentence Transformers model


In [None]:
%%capture
!pip install datasets

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer


In [6]:
import os
import pandas as pd

In [1]:
from google.colab import drive

drive.mount('/content/drive',)

Mounted at /content/drive


In [5]:
#Set the path to the data folder, datafile and output folder and files
root_folder = '/content/drive/My Drive/'
data_folder = os.path.abspath(os.path.join(root_folder, 'datasets/MSCOCO_spanish/data'))
train_file = os.path.abspath(os.path.join(data_folder, 'train_human_spanish.xlsx'))

train_folder = os.path.abspath(os.path.join(root_folder, 'Projects/sentence-transformer-spanish/coco-es-train'))
validation_folder = os.path.abspath(os.path.join(root_folder, 'Projects/sentence-transformer-spanish/coco-es-val'))
test_folder = os.path.abspath(os.path.join(root_folder, 'Projects/sentence-transformer-spanish/coco-es-test'))

model_save_path= os.path.abspath(os.path.join(root_folder, 'Projects/sentence-transformer-spanish/bertin-sts-ft-coco-es'))

In [7]:
dataset = pd.read_excel(train_file, header=0)

In [10]:
df_counts = dataset.groupby(['image_id']).size().reset_index(name='counts')

In [20]:
df_counts[df_counts['counts']<5]['image_id'].values


array([10108])

In [21]:
dataset[dataset['image_id'] not in df_counts[df_counts['counts']<5]['image_id'].values]

  """Entry point for launching an IPython kernel.


ValueError: ignored

In [16]:
dataset.groupby(['image_id']).nth(1)

Unnamed: 0_level_0,caption
image_id,Unnamed: 1_level_1
10114,Una pequeña cocina residencial con armarios y ...
10115,Imagen a una sala de descanso pequeña y estrec...
10123,Una mujer golpeando una pelota de tenis con un...
10125,Un par de edificios con un cartel en la parte ...
10130,Conjunto de imágenes sobre un teléfono y un pa...
...,...
19817,Un inodoro cerrado en un baño junto a una cort...
19818,Un gran espejo sobre un lavabo.
19828,Varios coches circulando por la calle de una c...
19836,Un envoltorio de condón y una cascara de pláta...


In [None]:
from datasets import load_dataset

#dataset_id = "embedding-data/QQP_triplets"
#dataset_id = "embedding-data/sentence-compression"
dataset_name= 'LeoCordoba/CC-NEWS-ES-titles'

dataset = load_dataset(dataset_name)
#dataset = load_dataset("LeoCordoba/CC-NEWS-ES-titles",split='train[:25%]')
#print(f"- The {dataset_name} dataset has {dataset.num_rows} examples.")

Downloading builder script:   0%|          | 0.00/3.03k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading and preparing dataset cc-news-es-titles/default (download: 624.33 MiB, generated: 614.04 MiB, post-processed: Unknown size, total: 1.21 GiB) to /root/.cache/huggingface/datasets/LeoCordoba___cc-news-es-titles/default/0.0.0/4ce1747fb0af21e9f8f8b47a10039a2ea420c706adcb11d31c0edbbcbb3559f9...


Downloading data:   0%|          | 0.00/602M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/26.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/26.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/370125 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/16092 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/16093 [00:00<?, ? examples/s]

Dataset cc-news-es-titles downloaded and prepared to /root/.cache/huggingface/datasets/LeoCordoba___cc-news-es-titles/default/0.0.0/4ce1747fb0af21e9f8f8b47a10039a2ea420c706adcb11d31c0edbbcbb3559f9. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
print(f"- The {dataset_name} dataset has {dataset['train'].num_rows} examples.")
print(f"- Each example is a {type(dataset['train'][0])} with a {type(dataset['train'][0]['text'])} as value.")
print(f"- Examples look like this: {dataset['train'][0]}")

- The LeoCordoba/CC-NEWS-ES-titles dataset has 370125 examples.
- Each example is a <class 'dict'> with a <class 'str'> as value.
- Examples look like this: {'text': ', se designó a la virgen Santa Rosa de Lima, como su patrona, mediante D.S. N°0027-89-IN, con fecha de publicación el 18 de setiembre de 1989 y finalmente quedó plasmado en el calendario anual de festividades, designándose el 30 de agosto de cada año como el Día de Santa Rosa de Lima.\t', 'output_text': 'MPT reconoce ardua labor de efectivos policiales en favor de la población'}


In [None]:
dataset[0]
#dataset['train'][0]

{'text': ', se designó a la virgen Santa Rosa de Lima, como su patrona, mediante D.S. N°0027-89-IN, con fecha de publicación el 18 de setiembre de 1989 y finalmente quedó plasmado en el calendario anual de festividades, designándose el 30 de agosto de cada año como el Día de Santa Rosa de Lima.\t',
 'output_text': 'MPT reconoce ardua labor de efectivos policiales en favor de la población'}

## Analyze the dataset

Support for third party widgets (widgets outside of the ipywidgets package) needs to be enabled separately. Support for these widgets will be loaded from a CDN external from Colab.

In [None]:
from google.colab import output
output.enable_custom_widget_manager()

In [None]:
# tokenizer used in preprocessing
#tokenizer_name = "mrm8488/RuPERTa-base"
tokenizer_name = "bertin-project/bertin-roberta-base-spanish" #"dccuchile/bert-base-spanish-wwm-cased"
# download tokenizer
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

In [None]:
# map text and summary len to dict as well as if sample is longer than 256 tokens
def map_to_length(x):
  x["text_len"] = len(tokenizer(x["text"]).input_ids)
  x["text_longer_256"] = int(x["text_len"] > 256)
  x["output_len"] = len(tokenizer(x["output_text"]).input_ids)
  x["output_longer_16"] = int(x["output_len"] > 16)
  x["output_longer_32"] = int(x["output_len"] > 32)
  return x

sample_size = 50000
data_stats = dataset.select(range(sample_size)).map(map_to_length, num_proc=4)

      

#0:   0%|          | 0/12500 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (777 > 512). Running this sequence through the model will result in indexing errors


  

#1:   0%|          | 0/12500 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (583 > 512). Running this sequence through the model will result in indexing errors


#3:   0%|          | 0/12500 [00:00<?, ?ex/s]

#2:   0%|          | 0/12500 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (644 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (716 > 512). Running this sequence through the model will result in indexing errors


In [None]:
# Compute some basic stats like mean length of text and summary
def compute_and_print_stats(x):
  if len(x["text_len"]) == sample_size:
    print(
        "Article Mean: {}, %-Articles > 256:{}, Summary Mean:{}, %-Summary > 32:{}, %-Summary > 16:{}".format(
            sum(x["text_len"]) / sample_size,
            sum(x["text_longer_256"]) / sample_size, 
            sum(x["output_len"]) / sample_size,
            sum(x["output_longer_32"]) / sample_size,
            sum(x["output_longer_16"]) / sample_size,
        )
    )

output = data_stats.map(
  compute_and_print_stats, 
  batched=True,
  batch_size=-1,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

Article Mean: 317.98212, %-Articles > 256:0.41316, Summary Mean:18.3805, %-Summary > 32:0.03518, %-Summary > 16:0.54314


In [None]:
dataset['train'][0]['output_text']

'MPT reconoce ardua labor de efectivos policiales en favor de la población'

Convert the examples into `InputExample`s. It might around 10 seconds in Google Colab.

In [None]:
from sentence_transformers import InputExample

train_examples = []
train_data = dataset['train']
# For agility we only 1/2 of our available data
n_examples = dataset['train'].num_rows // 100
print('Rows to collect:', n_examples)

for i in range(n_examples):
  #if len(tokenizer(train_data[i]["text"]).input_ids)<256 and len(tokenizer(train_data[i]["output_text"]).input_ids)<32:
    #text_example = train_data[i]["text"]
    #output_example = train_data[i]["output_text"]
  train_examples.append(InputExample(texts=[train_data[i]["text"], train_data[i]["output_text"]]))

print(len(train_examples))

Rows to collect: 3701
3701


In [None]:
#train_examples[0]['texts']
print(train_examples[3])

<InputExample> label: 0, texts: Una camioneta blindada tipo militar “Sandcast” realiza un rondín de vigilancia por el Hotel Holiday Inn Reynosa, en la Zona Dorada de ese municipio.Del automóvil destaca el potente armamento que sólo el Ejército mexicano puede tener, con balas calibre .50, publicó El Universal.Se trata de un dispositivo de seguridad instalado, de manera inusitada, para la primera gira de trabajo del presidente Andrés Manuel López Obrador a ese municipio, uno de los más golpeados por la violencia y el crimen organizado.De acuerdo con versiones periodísticas, un día antes de la llegada del mandatario, quien viaja en vuelos comerciales y sin elementos del Estado Mayor Presidencial que cuiden su seguridad, se registró un enfrentamiento en el municipio entre fuerzas federales y grupos delincuenciales.Aquí es la tierra del Cártel del Golfo, por ello, además del Ejército, elementos de la policía Estatal ayudan a las labores de vigilancia.Con cartulinas, mantas o al grito de “ju

In [None]:
from sentence_transformers import InputExample

train_examples = []
train_data = dataset['train']['set']
# For agility we only 1/2 of our available data
n_examples = dataset['train'].num_rows // 2

for i in range(n_examples):
  example = train_data[i]
  train_examples.append(InputExample(texts=[example['query'], example['pos'][0], example['neg'][0]]))

In [None]:
print(f"We have a {type(train_examples)} of length {len(train_examples)} containing {type(train_examples[0])}'s.")

We have a <class 'list'> of length 20906 containing <class 'sentence_transformers.readers.InputExample.InputExample'>'s.


We wrap our training dataset into a Pytorch `Dataloader` to shuffle examples and get batch sizes.

In [None]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

## How Sentence Transformers models work


In [None]:
from sentence_transformers import SentenceTransformer, models

## Step 1: use an existing language model
word_embedding_model = models.Transformer("bertin-project/bertin-roberta-base-spanish")

## Step 2: use a pool function over the token embeddings
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

## Join steps 1 and 2 using the modules argument
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

Downloading config.json:   0%|          | 0.00/674 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/476M [00:00<?, ?B/s]

Some weights of the model checkpoint at bertin-project/bertin-roberta-base-spanish were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at bertin-project/bertin-roberta-base-spanish and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for

Downloading tokenizer_config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/831k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/497k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

## Loss functions for training a Sentence Transformers model


In [None]:
from sentence_transformers import losses

train_loss = losses.MultipleNegativesRankingLoss(model=model)

In [None]:
# ORIGINAL CODE
from sentence_transformers import losses

train_loss = losses.TripletLoss(model=model)

## How to evaluate a model

In [None]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(train_examples, name='sts-test')
test_evaluator(model, output_path='.')




nan

## How to train a Sentence Transformer model


In [None]:
len(train_dataloader)

1307

In [None]:
num_epochs = 5 # 10 original

warmup_steps = int(len(train_dataloader) * num_epochs * 0.1) #10% of train data

Training takes around 45 minutes with a Google Colab Pro account. Decrease the number of epochs and examples if you are using a free account or no GPU.

In [None]:
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=num_epochs,
          warmup_steps=warmup_steps) 

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1307 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1307 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1307 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1307 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1307 [00:00<?, ?it/s]

## How to share a Sentence Transformers to the Hugging Face Hub

In [None]:
!huggingface-cli login


        _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
        _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
        _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
        _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
        _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

        To login, `huggingface_hub` now requires a token generated from https://huggingface.co/settings/tokens .
        
Token: 
Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in yo

In [None]:
model.save_to_hub(
    "distilroberta-sentence-transformer-test", 
    #organization="embedding-data",
    train_datasets=["embedding-data/sentence-compression"],
    exist_ok=True, 
    )

Cloning https://huggingface.co/edumunozsala/distilroberta-sentence-transformer-test into local empty directory.


Upload file pytorch_model.bin:   0%|          | 3.34k/313M [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/edumunozsala/distilroberta-sentence-transformer-test
   615c920..aaeb0a5  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/edumunozsala/distilroberta-sentence-transformer-test
   615c920..aaeb0a5  main -> main



'https://huggingface.co/edumunozsala/distilroberta-sentence-transformer-test/commit/aaeb0a54b2259a207b631168d6b6488b3ab857db'

In [None]:
model.save_to_hub(
    "distilroberta-base-sentence-transformer", 
    organization="embedding-data",
    train_datasets=["embedding-data/QQP_triplets"],
    exist_ok=True, 
    )

## Extra: How to fine-tune a Sentence Transformer model


Now we will fine-tune our Sentence Transformer model.

In [None]:
modelB = SentenceTransformer('embedding-data/distilroberta-base-sentence-transformer')

In [None]:
dataset_id = "embedding-data/sentence-compression"
datasetB = load_dataset(dataset_id)

In [None]:
print(f"Examples look like this: {datasetB['train']['set'][0]}")

In [None]:
train_examplesB = []
train_dataB = dataset['train']['set']
n_examples = dataset['train'].num_rows

for i in range(n_examples):
  example = train_dataB[i]
  train_examplesB.append(InputExample(texts=[example[0], example[1]]))

In [None]:
train_dataloaderB = DataLoader(train_examplesB, shuffle=True, batch_size=64)
train_lossB = losses.MultipleNegativesRankingLoss(model=modelB)
num_epochsB = 10
warmup_stepsB = int(len(train_dataloaderB) * num_epochsB * 0.1) #10% of train data

In [None]:
model.fit(train_objectives=[(train_dataloaderB, train_lossB)],
          epochs=num_epochsB,
          warmup_steps=warmup_stepsB) 

In [None]:
model.save_to_hub(
    "distilroberta-base-sentence-transformer", 
    organization="embedding-data",
    train_datasets=["embedding-data/sentence-compression"],
    exist_ok=True, 
    )