# Train a Sentence Transformers Model in Spanish

In [None]:
%%capture
!pip install sentence-transformers
!pip install transformers datasets

In [None]:
%%capture
!pip install transformers datasets

## Load the dataset from the Hugging Face Hub


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
from sentence_transformers import InputExample, SentenceTransformer, models, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from torch.utils.data import DataLoader

import os

In [None]:
# Hub id of the dataset used for training.
# (Another dataset id kept for reference: "embedding-data/sentence-compression".)
dataset_name= 'LeoCordoba/CC-NEWS-ES-titles'

# Load the dataset; no split is requested here, so all splits are returned.
dataset = load_dataset(dataset_name)
# Variant that loads only the first 25% of the train split:
#dataset = load_dataset("LeoCordoba/CC-NEWS-ES-titles",split='train[:25%]')
# `dataset.num_rows` is interpolated as-is; in the recorded run it is a dict of per-split row counts.
print(f"- The {dataset_name} dataset has {dataset.num_rows} examples.")



  0%|          | 0/3 [00:00<?, ?it/s]

- The LeoCordoba/CC-NEWS-ES-titles dataset has {'train': 370125, 'validation': 16092, 'test': 16093} examples.


## Analyze the dataset

In [None]:
# Display the loaded dataset object (its splits, features and row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'output_text'],
        num_rows: 370125
    })
    validation: Dataset({
        features: ['text', 'output_text'],
        num_rows: 16092
    })
    test: Dataset({
        features: ['text', 'output_text'],
        num_rows: 16093
    })
})

Support for third party widgets (widgets outside of the ipywidgets package) needs to be enabled separately. Support for these widgets will be loaded from a CDN external from Colab.

In [None]:
# Enable support for third-party (custom) widgets.
# `google.colab` is imported here, so this cell targets a Google Colab runtime.
from google.colab import output
output.enable_custom_widget_manager()

We are interested in the maximum length of our texts and summaries, so we need a tokenizer to split our sentences into tokens.

In [None]:
# Tokenizer used in preprocessing, i.e. to measure text and title lengths in tokens.
# (A second checkpoint id is kept in the trailing comment for reference.)
tokenizer_name = "bertin-project/bertin-roberta-base-spanish" #"dccuchile/bert-base-spanish-wwm-cased"
# Download the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

Next we create a function to count how many examples have a text longer than the maximum length we would like to use. Then we repeat the operation with our summaries. And we define another function to show us the results.

In [None]:
# map text and summary len to dict as well as if sample is longer than 256 tokens
def map_to_length(x):
  """Annotate one example with token counts and length-threshold flags.

  Adds the following keys to `x` and returns it:
    - "text_len" / "output_len": number of input ids the tokenizer produces
      for "text" and "output_text".
    - "text_longer_max": 1 if the text has more than 256 tokens, else 0.
    - "output_longer_min" / "output_longer_max": 1 if the output text has more
      than 16 / 32 tokens, else 0.

  Relies on the notebook-level `tokenizer`.
  """
  n_text_tokens = len(tokenizer(x["text"]).input_ids)
  x["text_len"] = n_text_tokens
  x["text_longer_max"] = 1 if n_text_tokens > 256 else 0

  n_output_tokens = len(tokenizer(x["output_text"]).input_ids)
  x["output_len"] = n_output_tokens
  x["output_longer_min"] = 1 if n_output_tokens > 16 else 0
  x["output_longer_max"] = 1 if n_output_tokens > 32 else 0
  return x

# Compute some basic stats like mean length of text and summary
def compute_and_print_stats(x):
  """Print mean lengths and over-threshold ratios for a full-sample batch.

  `x` is a batch holding the columns written by `map_to_length`. Nothing is
  printed unless the batch has exactly `sample_size` rows (`sample_size` is a
  notebook-level variable). The function always returns None.
  """
  if len(x["text_len"]) != sample_size:
    return

  def _ratio(column):
    # Column total divided by the expected number of sampled rows.
    return sum(x[column]) / sample_size

  template = "Article Mean: {}, %-Articles > 256:{}, Summary Mean:{}, %-Summary > 32:{}, %-Summary > 16:{}"
  print(
      template.format(
          _ratio("text_len"),
          _ratio("text_longer_max"),
          _ratio("output_len"),
          _ratio("output_longer_max"),
          _ratio("output_longer_min"),
      )
  )

In [None]:
# To reduce compute time we only analyze a sample of the training split.
# `sample_size` is also read by `compute_and_print_stats`, so it has to stay a
# notebook-level variable.
sample_size = 50000
# Calculate the statistics: add the per-example length columns from `map_to_length`.
data_stats = dataset['train'].select(range(sample_size)).map(map_to_length, num_proc=4)
# Show the results. With `batched=True` and `batch_size=-1` the function
# receives the whole sample as a single batch.
# The result is bound to `stats_output` instead of `output`: `output` is the
# name under which the `google.colab` output module is imported earlier in
# this notebook, and rebinding it would break that cell on a re-run.
stats_output = data_stats.map(
  compute_and_print_stats,
  batched=True,
  batch_size=-1,
)

## Preprocess our datasets

Now we can filter our datasets to a maximum length of 256 tokens for the text feature and 32 tokens for the output_text feature.

In [None]:
def is_within_length_limits(example):
  """Return True when "text" has fewer than 256 tokens and "output_text" fewer than 32.

  Token counts come from the notebook-level `tokenizer`. Both comparisons are
  strict, so an example with exactly 256 / 32 tokens is dropped.
  """
  return (len(tokenizer(example["text"]).input_ids) < 256
          and len(tokenizer(example["output_text"]).input_ids) < 32)


# The same length filter is applied to all three splits, the test split included.
train_dataset = dataset['train'].filter(is_within_length_limits)
validation_dataset = dataset['validation'].filter(is_within_length_limits)
test_dataset = dataset['test'].filter(is_within_length_limits)



  0%|          | 0/17 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (743 > 512). Running this sequence through the model will result in indexing errors


In [None]:
# Report how many examples each split keeps after filtering.
for size_label, split_data in (
    ('Train dataset Length: ', train_dataset),
    ('Validation dataset Length: ', validation_dataset),
    ('Test dataset Length: ', test_dataset),
):
  print(size_label, split_data.num_rows)

Train dataset Length:  208855
Validation dataset Length:  9135
Test dataset Length:  9068


We save our filtered datasets to disk

In [None]:
# Mount Google Drive; the dataset and model folders used below live under this mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Base locations on the mounted Google Drive.
root_folder = '/content/drive/My Drive/'
project_folder = os.path.join(root_folder, 'Projects/sentence-transformer-spanish')

# One folder per filtered dataset split.
train_folder = os.path.abspath(os.path.join(project_folder, 'cc-news-es-train'))
validation_folder = os.path.abspath(os.path.join(project_folder, 'cc-news-es-val'))
test_folder = os.path.abspath(os.path.join(project_folder, 'cc-news-es-test'))

# Folder that receives the trained model.
model_save_path = os.path.abspath(os.path.join(project_folder, 'bertin-sts-cc-news'))

In [None]:
# Persist each filtered split to its own folder.
for split_data, target_folder in (
    (train_dataset, train_folder),
    (validation_dataset, validation_folder),
    (test_dataset, test_folder),
):
  split_data.save_to_disk(target_folder)



Flattening the indices:   0%|          | 0/10 [00:00<?, ?ba/s]



Convert the examples into `InputExample`s. It might take around 10 seconds in Google Colab.

In [None]:
from sentence_transformers import InputExample

def prepare_input_data(dataset, num_rows=None):
  """Build `InputExample` pairs from the leading rows of `dataset`.

  Args:
    dataset: Indexable collection of rows that supports `len()`; each row must
      provide the keys "text" and "output_text".
    num_rows: How many leading rows to convert. `None` (the default) converts
      every row. Values larger than `len(dataset)` are capped to the dataset
      size instead of raising IndexError, and negative values yield no examples.

  Returns:
    A list with one `InputExample(texts=[text, output_text])` per collected row.
  """
  available = len(dataset)
  if num_rows is None:
    n_examples = available
  else:
    n_examples = max(0, min(num_rows, available))
  print('Rows to collect:', n_examples)

  examples = []
  for i in range(n_examples):
    row = dataset[i]  # look the row up once instead of once per field
    examples.append(InputExample(texts=[row["text"], row["output_text"]]))

  return examples


Now it is time to prepare the dataset to be used for our sentence transformer training

In [None]:
# Only the first 3000 filtered training rows are converted here; use
# train_dataset.num_rows instead to convert the whole filtered split.
n_train_rows = 3000
train_examples = prepare_input_data(train_dataset, n_train_rows)
# The validation split can be converted the same way if it is needed, e.g.:
#validation_examples = prepare_input_data(validation_dataset, 1000)  # or validation_dataset.num_rows

Rows to collect: 3000


We wrap our training dataset into a PyTorch `DataLoader` to shuffle examples and get batch sizes.

In [None]:
# Wrap the training examples in a DataLoader: batches of 16, shuffled.
train_dataloader = DataLoader(
    dataset=train_examples,
    batch_size=16,
    shuffle=True,
)

## How Sentence Transformers models work


In [None]:
## Step 1: use an existing language model.
## The checkpoint id is the same one the tokenizer was loaded from earlier in this notebook.
word_embedding_model = models.Transformer("bertin-project/bertin-roberta-base-spanish")

## Step 2: use a pool function over the token embeddings.
## Only mean pooling is switched on (CLS-token and max pooling are off); the
## pooling layer is sized from the transformer's word embedding dimension.
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

## Join steps 1 and 2 using the modules argument (transformer first, then pooling).
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

Some weights of the model checkpoint at bertin-project/bertin-roberta-base-spanish were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at bertin-project/bertin-roberta-base-spanish and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for

## Loss functions for training a Sentence Transformers model


In [None]:
# Loss function for training. The training examples built above are
# (text, title) pairs without labels.
# NOTE(review): MultipleNegativesRankingLoss is understood to work on such
# positive pairs — confirm against the sentence-transformers loss documentation.
train_loss = losses.MultipleNegativesRankingLoss(model)

## Create an evaluator

In [None]:
def convert_dataset(dataset):
    """Turn rows of a scored sentence-pair dataset into `InputExample`s.

    Each row must provide 'sentence1', 'sentence2' and 'similarity_score'.
    The score is divided by 5.0 and stored as the example's label.
    """
    def _as_example(row):
        # Scale the score into 0 ... 1 (presumably scores are on a 0-5 scale; verify against the dataset).
        normalized = float(row['similarity_score']) / 5.0
        return InputExample(texts=[row['sentence1'], row['sentence2']], label=normalized)

    return [_as_example(row) for row in dataset]

In [None]:
def load_and_prepare_evaluator(dataset_path, dataset_name, split, eval_name):
  """Build an `EmbeddingSimilarityEvaluator` from one split of a dataset.

  Args:
    dataset_path: Dataset id passed to `load_dataset`.
    dataset_name: Configuration name passed as `name=` to `load_dataset`.
    split: Split to load.
    eval_name: Name given to the evaluator.

  Returns:
    The evaluator created from the converted samples.
  """
  eval_split = load_dataset(dataset_path, name=dataset_name, split=split)
  return EmbeddingSimilarityEvaluator.from_input_examples(
      convert_dataset(eval_split),
      name=eval_name,
  )


In [None]:
# Validation evaluator: "es" configuration of stsb_multi_mt, "dev" split.
evaluator = load_and_prepare_evaluator(
    dataset_path="stsb_multi_mt",
    dataset_name="es",
    split="dev",
    eval_name="sts-dev",
)



## How to train a Sentence Transformer model


In [None]:
# Number of batches the training DataLoader yields per epoch.
len(train_dataloader)

188

In [None]:
num_epochs = 2  # 10 in the original setup
# Warm up during the first 10% of all training steps (batches per epoch * epochs).
total_train_steps = len(train_dataloader) * num_epochs
warmup_steps = int(total_train_steps * 0.1)

Training takes around 45 minutes with a Google Colab Pro account. Decrease the number of epochs and examples if you are using a free account or no GPU.

In [None]:
# Fine-tune the model on the (dataloader, loss) objective; the dev evaluator
# and the model folder are handed to `fit` as its evaluator and output path.
train_objectives = [(train_dataloader, train_loss)]
model.fit(
    train_objectives=train_objectives,
    evaluator=evaluator,
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/188 [00:00<?, ?it/s]

Iteration:   0%|          | 0/188 [00:00<?, ?it/s]

## Save the model to disk

In [None]:
## Save the model
# NOTE(review): this writes to the same folder that was passed to `model.fit`
# as `output_path` — confirm that overwriting whatever `fit` stored there is intended.
model.save(
    path=model_save_path,
    model_name='sts-bertin-cc-news-es',
    train_datasets=[dataset_name],
)

## How to evaluate the model in STSBenchmark

In [None]:
# Create the test evaluator ("test" split, as opposed to the "dev" split used for validation).
test_evaluator = load_and_prepare_evaluator(
    dataset_path="stsb_multi_mt",
    dataset_name="es",
    split="test",
    eval_name="sts-test",
)
# Evaluate the in-memory model; the model folder is passed as the evaluator's output path.
test_evaluator(model, output_path=model_save_path)



0.6461483977067295