# **Fine tuning de un modelo de lenguaje Bert con textos para niños.**

Los textos utilizados pueden encontrarse
[aquí](https://github.com/franfram/Transformers-nlp/tree/main/books/Libros-en-txt).


## Entrenamiento

In [None]:
# Pin the versions this notebook was last run with (see the install log below)
# so that a fresh runtime reproduces the same results. %pip, unlike !pip,
# installs into the environment of the running kernel.
%pip install "transformers[torch]==4.33.3" "datasets==2.14.5"

Collecting transformers[torch]
  Downloading transformers-4.33.3-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m54.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m49.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers[torch])
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers[torch])
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m106.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from tr

### Fetch text data

In [None]:

# Remove any checkout left over from an interrupted run so that the clone
# below does not fail with "destination path already exists".
!rm -rf ./Transformers-nlp
!git clone https://github.com/franfram/Transformers-nlp.git

# -p: do not fail when ./data already exists (e.g. when the cell is re-run).
!mkdir -p ./data

!cp ./Transformers-nlp/books/Libros-en-txt/* data/

# The clone is only needed as a source for the text files copied above.
!rm -rf ./Transformers-nlp

Cloning into 'Transformers-nlp'...
remote: Enumerating objects: 177, done.[K
remote: Counting objects: 100% (177/177), done.[K
remote: Compressing objects: 100% (170/170), done.[K
remote: Total 177 (delta 47), reused 115 (delta 4), pack-reused 0[K
Receiving objects: 100% (177/177), 3.29 MiB | 26.36 MiB/s, done.
Resolving deltas: 100% (47/47), done.


### Merge data into a single file

In [None]:
import glob
import os

def merge_text_data(data_dir="./data", output_name="full_text_data.txt"):
    """Concatenate every ``*.txt`` file in ``data_dir`` into one corpus file.

    The merged file is written to ``<data_dir>/<output_name>``. Compared with
    shelling out to ``cat ... >>``:

    * file names containing spaces or shell metacharacters are handled,
      because no shell command line is built;
    * the output file is rewritten from scratch, so calling the function
      again does not duplicate the corpus;
    * the output file itself is excluded from the inputs (it also matches
      ``*.txt`` once it exists);
    * inputs are merged in sorted path order, so the result does not depend
      on the directory listing order.

    Files are concatenated byte for byte, without adding separators.

    Args:
        data_dir: Directory that holds the source ``.txt`` files.
        output_name: File name of the merged corpus inside ``data_dir``.

    Returns:
        None.
    """
    output_path = os.path.join(data_dir, output_name)
    output_abspath = os.path.abspath(output_path)

    # Resolve the inputs before the output file is (re)created.
    source_paths = sorted(
        path
        for path in glob.glob(os.path.join(data_dir, "*.txt"))
        if os.path.abspath(path) != output_abspath
    )

    # Binary mode reproduces a plain `cat`: no decoding, no newline translation.
    with open(output_path, "wb") as merged:
        for path in source_paths:
            with open(path, "rb") as source:
                merged.write(source.read())

In [None]:
merge_text_data()

### Load data and split into train/test

In [None]:
from datasets import load_dataset


def load_and_split_data(data_path):
  """Load a plain-text file as a dataset and split it 80/20 into train/test.

  ``data_path`` is read with the ``"text"`` loader of ``load_dataset``; the
  resulting ``"train"`` split is then divided with ``test_size=0.2`` and
  ``shuffle=False``, i.e. without shuffling the rows first.

  Returns the object produced by ``train_test_split`` (the notebook displays
  it as a ``DatasetDict`` with ``train`` and ``test`` entries).
  """
  full_corpus = load_dataset("text", data_files=data_path)["train"]
  return full_corpus.train_test_split(test_size=0.2, shuffle=False)


In [None]:
data_path = "./data/full_text_data.txt"

dataset = load_and_split_data(data_path)

dataset

Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-e6c81ecff02864b4/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-e6c81ecff02864b4/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 24649
    })
    test: Dataset({
        features: ['text'],
        num_rows: 6163
    })
})

In [None]:
from transformers import AutoModelForMaskedLM

# Identifier of the pretrained checkpoint; the tokenizer further down is loaded
# from the same identifier.
# NOTE(review): "distillbert" (double "l") is kept verbatim — presumably that is
# how the repository is named on the Hub; confirm before "correcting" it.
model_checkpoint = "CenIA/distillbert-base-spanish-uncased"
# Load the checkpoint through the masked-language-modeling auto class.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/530 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/269M [00:00<?, ?B/s]

In [None]:
distilbert_num_parameters = model.num_parameters() / 1_000_000
print(f"'>>> DistilBERT number of parameters: {round(distilbert_num_parameters)}M'")


'>>> DistilBERT number of parameters: 67M'


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/361 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/486k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
import torch

text = "Quiero comer un [MASK]."

inputs = tokenizer(text, return_tensors="pt")
# Inference only: disable autograd so no computation graph is built or kept
# alive for the logits.
with torch.no_grad():
    token_logits = model(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
top_n_tokens = torch.topk(mask_token_logits, 10, dim=1).indices[0].tolist()

# Show the sentence once per candidate, with [MASK] replaced by the decoded token.
for token in top_n_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

'>>> Quiero comer un helado.'
'>>> Quiero comer un filete.'
'>>> Quiero comer un huevo.'
'>>> Quiero comer un pollo.'
'>>> Quiero comer un pastel.'
'>>> Quiero comer un plato.'
'>>> Quiero comer un cerdo.'
'>>> Quiero comer un poco.'
'>>> Quiero comer un pavo.'
'>>> Quiero comer un conejo.'


In [None]:
sample = dataset["train"].shuffle(seed=42).select(range(7))

for row in sample:
    print(f"\n'>>> Text: {row['text']}'")


'>>> Text: Cuando las palabras no dan a entender, la acción sí lo '

'>>> Text: lágrimas  y el corazón en un puño por ver  en aquel '

'>>> Text: EL LEÓN , LA ZORRA  Y EL CIERVO  '

'>>> Text: Viendo entonces los perros lo que pasaba se dijeron entre '

'>>> Text:     Entonces el granizo dejó de bailar sobre su cabe za y el viento '

'>>> Text: buscarlo —dijo la cigüeña.  '

'>>> Text: lleguemos a la Ciudad Esmeralda.  '


# Concatenate all the examples and then split the whole corpus into chunks of equal size.

## Tokenize corpus

In [None]:
def tokenize_function(data):
  """Tokenize a batch of texts, attaching per-token word ids when available.

  ``data["text"]`` is passed to the module-level ``tokenizer``. When that
  tokenizer reports ``is_fast``, a ``"word_ids"`` entry is added that holds
  ``word_ids(i)`` of the encoding for every sequence ``i`` in the batch.

  Returns the (possibly extended) object produced by the tokenizer.
  """
  encoded = tokenizer(data["text"])
  if not tokenizer.is_fast:
    return encoded

  num_sequences = len(encoded["input_ids"])
  encoded["word_ids"] = list(map(encoded.word_ids, range(num_sequences)))
  return encoded



In [None]:
# Tokenize every split. The raw "text" column is removed from the result, so
# only the columns returned by tokenize_function remain.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True, # pass batches of rows per call (presumably what lets the fast tokenizer parallelize — TODO confirm)
    remove_columns=["text"]
)
tokenized_datasets


Map:   0%|          | 0/24649 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1697 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/6163 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 24649
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 6163
    })
})

In [None]:
tokenizer.model_max_length

512

In [None]:
# Number of tokens per training chunk. Chosen to fit the memory available on
# Google Colab's GPUs; the tokenizer's model_max_length reported above is 512.
chunk_size = 128

In [None]:
tokenized_samples = tokenized_datasets["train"][:3]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
  print(f"'>>> Review {idx} length: {len(sample)}")

'>>> Review 0 length: 4
'>>> Review 1 length: 2
'>>> Review 2 length: 5


## Concatenate all the samples

In [None]:
# Flatten each column of the sampled batch into one long list.
concatenated_examples = {}
for column in tokenized_samples.keys():
    concatenated_examples[column] = sum(tokenized_samples[column], [])

total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

'>>> Concatenated reviews length: 11'


In [None]:
tokenized_samples.keys()

dict_keys(['input_ids', 'attention_mask', 'word_ids'])

In [None]:
concatenated_examples.items()

dict_items([('input_ids', [4, 1444, 30980, 5, 4, 5, 4, 1032, 10565, 1081, 5]), ('attention_mask', [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), ('word_ids', [None, 0, 0, None, None, None, None, 0, 1, 2, None])])

In [None]:
# Start offsets of consecutive chunk_size-wide windows over the flattened sample.
chunk_starts = range(0, total_length, chunk_size)
chunks = {
    column: [values[start : start + chunk_size] for start in chunk_starts]
    for column, values in concatenated_examples.items()
}

# The last window may be shorter than chunk_size (here there is only one window).
for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 11'


In [None]:
def group_texts(examples):
    """Concatenate a batch of tokenized examples and re-split it into chunks.

    Every column of ``examples`` (a mapping from column name to a list of
    per-example lists) is flattened into one long list and then cut into
    consecutive pieces of ``chunk_size`` items (module-level constant). The
    trailing remainder that does not fill a whole chunk is dropped.

    Args:
        examples: Batch as passed by ``Dataset.map(..., batched=True)``; must
            contain an ``"input_ids"`` column.

    Returns:
        dict with the same columns as ``examples``, each a list of chunks,
        plus a ``"labels"`` column holding an independent copy of every
        ``"input_ids"`` chunk.
    """
    # Local import: the notebook has no shared import cell this could rely on.
    from itertools import chain

    # Concatenate all texts. chain runs in linear time, whereas
    # sum(list_of_lists, []) re-copies the accumulated list on every addition.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column. Each chunk is copied so that masking
    # input_ids in place later can never alter the labels (a shallow .copy()
    # of the outer list would share the inner chunk objects).
    result["labels"] = [list(chunk) for chunk in result["input_ids"]]
    return result

In [None]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Map:   0%|          | 0/24649 [00:00<?, ? examples/s]

Map:   0%|          | 0/6163 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 2640
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1504
    })
})

In [None]:
tokenizer.decode(lm_datasets["train"][10]["input_ids"])

'##zel ; se la llevó. rapunzel era la niña más hermosa que viera el sol. cuando cumplió los doce años, la hechicera la encerró en una torre que no tenía puertas ni escaleras y se alzaba en medio de un bosque ; únicamente en lo alto había una diminuta ventana. cuando la bruja quería entrar, se colocaba al pie y gritaba : [UNK] ¡ rapunzel, rapunzel, suéltame tu cabellera! rapunzel tenía un cabello magnífico y larguísimo, fino como hebras de oro. cuando oía la voz de la hechicera se soltaba las trenzas, las envol'

Add mask

In [None]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
data_collator # note that vocab size is now 31k instead of ~50k? check later, may be related to the fact that CenIA is not there anymore

DataCollatorForLanguageModeling(tokenizer=DistilBertTokenizerFast(name_or_path='CenIA/distillbert-base-spanish-uncased', vocab_size=31002, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True), mlm=True, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='pt')

In [None]:
# Take the first two training chunks and remove their "word_ids" entry
# before handing them to the collator.
samples = []
for row_index in range(2):
  sample = lm_datasets["train"][row_index]
  _ = sample.pop("word_ids")
  samples.append(sample)

# Decode the collated input ids to see where [MASK] tokens were inserted.
masked_batch = data_collator(samples)
for chunk in masked_batch["input_ids"]:
  print(f"\n'>>> {tokenizer.decode(chunk)}")

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



'>>> [CLS] 0á [SEP] [CLS] [SEP] [CLS] la camisa del [SEP] [CLS] hombre [MASK] [SEP] [CLS] león tolstoi [SEP] [CLS] ( 1828 - 1910 ) [SEP] [CLS] 1 [SEP] [CLS] [SEP] [CLS] la camisa del hombre feliz [SEP] [CLS] [SEP] [CLS] [SEP] [CLS] en las lejanas tierras del norte populares hace mucho tiempo, vivió un zar [SEP] [CLS] que enferm equivoco gravemente. [MASK] a los [MASK] médicos de [MASK] [SEP] [CLS] el imperio [MASK] que le aplicaron [MASK] los remedios que conocían y [SEP] [CLS] otros nuevos que [MASK]on sobre la marcha [MASK] pero [MASK] de mejorar [MASK] el estado del zar parecía cada [MASK] [MASK]. le hicieron [SEP] [CLS] tomar baños [MASK] y [MASK], ing

'>>> ##irió jarabes de eucalipto [MASK] [SEP] [CLS] menta y plantas exóticas traídas ayudantes caravanas de lejanos países. [SEP] [CLS] le aplicaron [UNK] y [MASK]samos con los ingredientes más insólito [MASK] [MASK] pero la15 del zar [MASK] mejoraba [MASK] tan desesperado [SEP] [CLS] [MASK] el hombre que prometió la mitad [MASK] [

Make data collator for whole word masking

In [None]:
import collections
import numpy as np

from transformers import default_data_collator

wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    """Collate ``features`` after masking randomly chosen whole words.

    For every feature (a dict with ``"input_ids"``, ``"labels"`` and
    ``"word_ids"``):

    1. ``"word_ids"`` is popped and used to group token positions into words.
       Positions whose word id is ``None`` belong to no word.
    2. Each word is selected independently with probability
       ``wwm_probability`` (module-level constant).
    3. All tokens of a selected word are replaced by
       ``tokenizer.mask_token_id``; ``"labels"`` keeps the original token id
       at those positions and is ``-100`` everywhere else.

    The feature dicts and their ``"input_ids"`` lists are modified in place.

    Args:
        features: List of feature dicts as described above.

    Returns:
        The result of ``default_data_collator(features)``.
    """
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is None:
                # A chunk holds several concatenated sequences and word ids
                # restart in each of them. Forget the previous word here so
                # that the first word of the next sequence is not merged with
                # a preceding word that happens to have the same id.
                current_word = None
                continue
            if word_id != current_word:
                current_word = word_id
                current_word_index += 1
            mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        # -100 marks positions without a prediction target.
        new_labels = [-100] * len(labels)
        for masked_word in np.where(mask)[0]:
            masked_word = masked_word.item()
            for idx in mapping[masked_word]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels

    return default_data_collator(features)

In [None]:
# Collate the first two training chunks with the whole-word masking collator.
samples = []
for row_index in range(2):
    samples.append(lm_datasets["train"][row_index])
batch = whole_word_masking_data_collator(samples)

# Decode the masked input ids to inspect which words were replaced by [MASK].
masked_input_ids = batch["input_ids"]
for chunk in masked_input_ids:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] 0á [SEP] [CLS] [SEP] [CLS] la camisa del [SEP] [CLS] hombre [MASK] [SEP] [CLS] león tolstoi [SEP] [CLS] ( 1828 [MASK] 1910 [MASK] [SEP] [CLS] 1 [SEP] [CLS] [SEP] [CLS] la camisa [MASK] [MASK] feliz [SEP] [CLS] [SEP] [CLS] [SEP] [CLS] en las [MASK] [MASK] tierras [MASK] norte, hace [MASK] [MASK], vivió un zar [SEP] [CLS] que enfermó gravemente. reunió a [MASK] mejores [MASK] de todo [SEP] [CLS] [MASK] imperio [MASK] que le [MASK] [MASK] todos los remedios que conocían y [SEP] [CLS] otros [MASK] [MASK] inventaron sobre la [MASK], pero lejos de mejorar, el estado del [MASK] parecía [MASK] vez peor. le hicieron [SEP] [CLS] tomar baños calientes y fríos, ing'

'>>> ##irió jarabes de eucalipto, [SEP] [CLS] menta y plantas [MASK] [MASK] traídas en caravanas de lejanos países. [SEP] [CLS] le aplicaron [UNK] [MASK] bálsamos con los ingredientes [MASK] insólitos, pero la salud del zar no mejoraba [MASK] tan desesperado [SEP] [CLS] estaba el hombre [MASK] prometió la mitad de lo que [

Downsample for faster training

In [None]:

downsampled_dataset = lm_datasets # no downsampling due to small dataset

# train_size = 2000
# test_size = int(0.1 * train_size)

# downsampled_dataset = lm_datasets["train"].train_test_split(
#     train_size=train_size, test_size=test_size, seed=42
# )
# downsampled_dataset

In [None]:
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 2640
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1504
    })
})

Aquí se necesita tener una cuenta en Hugging Face (HF) para obtener el token de acceso.

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
!pip install accelerate -U



In [None]:
from transformers import TrainingArguments

batch_size = 64
# Show the training loss with every epoch: rows // batch size is about one
# epoch's worth of steps (assuming a single device — TODO confirm).
logging_steps = len(downsampled_dataset["train"]) // batch_size
# Keep only the part of the checkpoint id after the last "/".
model_name = model_checkpoint.split("/")[-1]

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-spanish-corpus",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=3.571428571428572e-7,  # previously 2e-5; NOTE(review): the origin of this much smaller value is not recorded here — confirm it is intended
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    #push_to_hub=True,
    fp16=True, # boost in speed; NOTE(review): presumably requires a GPU runtime — confirm
    logging_steps=logging_steps,
    #report_to="wandb",
    run_name="finetuning-basic"
)

In [None]:
from transformers import Trainer

# NOTE: data_collator is the DataCollatorForLanguageModeling instance created
# earlier (mlm_probability=0.15), not whole_word_masking_data_collator, so this
# run does not use whole-word masking.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

Check perplexity (exp(CrossEntropyLoss)) before model training

In [None]:
import math

eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

>>> Perplexity: 150.85


In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,5.3374,4.914084
2,5.1641,4.776121
3,5.0907,4.814063


TrainOutput(global_step=126, training_loss=5.197472004663377, metrics={'train_runtime': 48.7344, 'train_samples_per_second': 162.513, 'train_steps_per_second': 2.585, 'total_flos': 262474008698880.0, 'train_loss': 5.197472004663377, 'epoch': 3.0})

In [None]:
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

>>> Perplexity: 117.91


Push to hub

In [None]:
#trainer.push_to_hub()

Using Accelerate to remove randomness in masking process

In [None]:
# TO DO

### Uso del modelo

In [None]:
from transformers import pipeline

mask_filler = pipeline(
    "fill-mask", model="franfram/distillbert-base-spanish-uncased-finetuned-spanish-corpus"
)

In [None]:
text = "quiero comer un [MASK]."

In [None]:
preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']}")

>>> quiero comer un poco.
>>> quiero comer un huevo.
>>> quiero comer un pastel.
>>> quiero comer un helado.
>>> quiero comer un plato.


# Nueva sección