# Train a Sentence Transformers Model in Spanish
## Data Processing

## Installing the libraries

In [2]:
# Use the %pip magic so the packages are installed into the environment of the running kernel;
# the quotes keep the shell from treating the [s3] extra as a glob pattern.
%pip install sentence-transformers
%pip install transformers "datasets[s3]"

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m851.2 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Collecting torch>=1.6.0
  Downloading torch-1.12.1-cp37-cp37m-manylinux1_x86_64.whl (776.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.3/776.3 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting torchvision
  Downloading torchvision-0.13.1-cp37-cp37m-manylinux1_x86_64.whl (19.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.1/19.1 MB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0

In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer
#from sentence_transformers import InputExample, SentenceTransformer, models, losses
from torch.utils.data import DataLoader
from datasets.filesystems import S3FileSystem

## Loading the dataset from the Hugging Face Hub

In [4]:
# Hub identifier of the dataset to download
dataset_name = 'LeoCordoba/CC-NEWS-ES-titles'

# Download (or reuse the local cache of) every split of the dataset.
# To experiment on a slice only, pass e.g. split='train[:25%]' to load_dataset; note that
# this returns a single split instead of a dict of splits.
dataset = load_dataset(dataset_name)

# num_rows is a {split: row_count} mapping here, so report the total and the per-split breakdown
split_sizes = dataset.num_rows
print(f"- The {dataset_name} dataset has {sum(split_sizes.values())} examples: {split_sizes}.")

Downloading builder script:   0%|          | 0.00/3.03k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading and preparing dataset cc-news-es-titles/default (download: 624.33 MiB, generated: 614.04 MiB, post-processed: Unknown size, total: 1.21 GiB) to /root/.cache/huggingface/datasets/LeoCordoba___cc-news-es-titles/default/0.0.0/4ce1747fb0af21e9f8f8b47a10039a2ea420c706adcb11d31c0edbbcbb3559f9...


Downloading data:   0%|          | 0.00/602M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/26.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/26.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/370125 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/16092 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/16093 [00:00<?, ? examples/s]

Dataset cc-news-es-titles downloaded and prepared to /root/.cache/huggingface/datasets/LeoCordoba___cc-news-es-titles/default/0.0.0/4ce1747fb0af21e9f8f8b47a10039a2ea420c706adcb11d31c0edbbcbb3559f9. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

- The LeoCordoba/CC-NEWS-ES-titles dataset has {'train': 370125, 'validation': 16092, 'test': 16093} examples.


Let's take a look at the dataset structure

In [5]:
# Display the dataset's repr: its splits, their features and row counts
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'output_text'],
        num_rows: 370125
    })
    validation: Dataset({
        features: ['text', 'output_text'],
        num_rows: 16092
    })
    test: Dataset({
        features: ['text', 'output_text'],
        num_rows: 16093
    })
})

We are interested in the maximum length of our texts and summaries, so we need a tokenizer to split our sentences into tokens.

In [6]:
# Checkpoint whose tokenizer is used to measure lengths during preprocessing
tokenizer_name = "bertin-project/bertin-roberta-base-spanish" # alternative: "dccuchile/bert-base-spanish-wwm-cased"
# Download the tokenizer files for that checkpoint and build the tokenizer object
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

Downloading tokenizer_config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/831k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/497k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Next, we create a function that flags the examples whose text is longer than the maximum length we would like to use, and does the same for our summaries. Then we define another function to show us the results.

In [7]:
# map text and summary len to dict as well as if sample is longer than 256 tokens
def map_to_length(x):
  """Annotate one example with token counts and length-threshold flags.

  Tokenizes ``x["text"]`` and ``x["output_text"]`` with the notebook-level
  ``tokenizer`` and stores on ``x``:

  - ``text_len`` / ``output_len``: number of input ids of each field
  - ``text_longer_max``: 1 if the text has more than 256 ids, else 0
  - ``output_longer_min``: 1 if the summary has more than 8 ids, else 0
  - ``output_longer_max``: 1 if the summary has more than 32 ids, else 0

  The example is modified in place and returned.
  """
  # Article side
  n_text = len(tokenizer(x["text"]).input_ids)
  x["text_len"] = n_text
  x["text_longer_max"] = 1 if n_text > 256 else 0

  # Summary side
  n_summary = len(tokenizer(x["output_text"]).input_ids)
  x["output_len"] = n_summary
  x["output_longer_min"] = 1 if n_summary > 8 else 0
  x["output_longer_max"] = 1 if n_summary > 32 else 0
  return x

# Compute some basic stats like mean length of text and summary
def compute_and_print_stats(x):
  """Print the mean lengths and over-threshold ratios of a batch of annotated examples.

  ``x`` maps column names to lists of per-example values. Nothing is printed
  unless the batch holds exactly ``sample_size`` examples (a notebook-level
  variable). The ratios are fractions in [0, 1], even though the printed labels
  start with "%-". Always returns None.
  """
  # Only report when the batch holds exactly sample_size examples
  if len(x["text_len"]) != sample_size:
    return

  # Column order matches the placeholders of the report template
  columns = ("text_len", "text_longer_max", "output_len", "output_longer_max", "output_longer_min")
  averages = [sum(x[column]) / sample_size for column in columns]
  template = "Article Mean: {}, %-Articles > 256:{}, Summary Mean:{}, %-Summary > 32:{}, %-Summary > 8:{}"
  print(template.format(*averages))

In [8]:
# Work on the first `sample_size` training examples only, to keep the statistics cheap to compute
sample_size = 50000
# Add the length columns to every sampled example, using 4 worker processes
data_stats = (
  dataset['train']
  .select(range(sample_size))
  .map(map_to_length, num_proc=4)
)
# Print the aggregate statistics; batch_size=-1 makes map() pass the whole dataset as one batch
output = data_stats.map(compute_and_print_stats, batched=True, batch_size=-1)

      

#0:   0%|          | 0/12500 [00:00<?, ?ex/s]

  

#1:   0%|          | 0/12500 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (777 > 512). Running this sequence through the model will result in indexing errors


#2:   0%|          | 0/12500 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (583 > 512). Running this sequence through the model will result in indexing errors


#3:   0%|          | 0/12500 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (716 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (644 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/1 [00:00<?, ?ba/s]

Article Mean: 317.98212, %-Articles > 256:0.41316, Summary Mean:18.3805, %-Summary > 32:0.03518, %-Summary > 8:0.96516


Now we can set our maximum length to 256 tokens, which will reduce our training time.
Then we filter the train and validation datasets to that length (the test dataset is left unfiltered).

In [10]:
# Length limits (in tokens) that an example must stay below to be kept
max_text_tokens = 256
max_summary_tokens = 32

def is_within_length_limits(example):
  """Return True when both the text and its summary are shorter than the token limits.

  Uses the notebook-level ``tokenizer``; the summary is only tokenized when the
  text has already passed its own check.
  """
  # NOTE(review): the comparisons are strict, so examples of exactly 256 / 32 tokens are dropped
  # even though map_to_length only flags lengths > 256 / > 32 as too long — confirm this is intended.
  return (
      len(tokenizer(example["text"]).input_ids) < max_text_tokens
      and len(tokenizer(example["output_text"]).input_ids) < max_summary_tokens
  )

# One shared predicate keeps the train and validation filters from drifting apart
train_dataset = dataset['train'].filter(is_within_length_limits)
validation_dataset = dataset['validation'].filter(is_within_length_limits)
# The test dataset is deliberately left unfiltered in order to get more realistic evaluation results.
# To filter it as well: test_dataset = dataset['test'].filter(is_within_length_limits)


  0%|          | 0/371 [00:00<?, ?ba/s]

Loading cached processed dataset at /root/.cache/huggingface/datasets/LeoCordoba___cc-news-es-titles/default/0.0.0/4ce1747fb0af21e9f8f8b47a10039a2ea420c706adcb11d31c0edbbcbb3559f9/cache-dd92e11fa3b06385.arrow


In [11]:
# Report how many examples remain in each filtered split
print('Train dataset Length: ', train_dataset.num_rows)
print('Validation dataset Length: ', validation_dataset.num_rows)

Train dataset Length:  208855
Validation dataset Length:  9135


Save the datasets to an S3 bucket

In [12]:
# Destination prefix for the processed splits; change it here to target another bucket or folder
s3_output_prefix = 's3://edumunozsala-ml-sagemaker/cc-news-es'

# Create the S3FileSystem instance with anon=False (non-anonymous access);
# use S3FileSystem(anon=True) for an anonymous connection instead
s3 = S3FileSystem(anon=False)

# Save the filtered train/validation datasets and the unfiltered test dataset under the prefix
train_dataset.save_to_disk(f'{s3_output_prefix}/train', fs=s3)
validation_dataset.save_to_disk(f'{s3_output_prefix}/validation', fs=s3)
dataset['test'].save_to_disk(f'{s3_output_prefix}/test', fs=s3)

Flattening the indices:   0%|          | 0/209 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/10 [00:00<?, ?ba/s]