## Vocabulary Analysis

In [2]:
%load_ext autoreload
%autoreload 2
import transformers
from transformers import AutoModel, AutoTokenizer

model_name = 'dccuchile/bert-base-spanish-wwm-uncased'

tokenizer = AutoTokenizer.from_pretrained(model_name)


In [3]:
print("Tenemos ahora ", len(tokenizer), " tokens")

Tenemos ahora  31002  tokens


In [4]:
tokenizer.tokenize("CHUPAME LA ***")

['chupa', '##me', 'la', '*', '*', '*']

In [None]:
from transformers import BertTokenizerFast

In [9]:
type(tokenizer)

transformers.models.bert.tokenization_bert_fast.BertTokenizerFast

In [4]:
tokenizer.tokenize("pija")

['pi', '##ja']

In [5]:
tokenizer.tokenize("trolo")

['tro', '##lo']

In [6]:
tokenizer.tokenize("maricón")

['maricón']

In [7]:
tokenizer.tokenize("marica")

['marica']

In [8]:
tokenizer.tokenize("puto")

['puto']

In [9]:
tokenizer.tokenize("Hacete ortear viejo trolazo")

['hace', '##te', 'or', '##tear', 'viejo', 'tro', '##laz', '##o']

Es un problema esto. Veamos cómo agregar posiblemente estos tokens...

In [10]:
tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

## Training new tokenizer

In [11]:
from hatedetection import load_datasets

train_dataset, dev_dataset, test_dataset = load_datasets()

In [12]:
from hatedetection.preprocessing import preprocess_tweet

preprocess_tweet("@clarincom jajajaja #NoVuelvenMas 🤣❌❌", hashtag_token="[HASHTAG]")

'[USER] jaja [HASHTAG] no vuelven mas [EMOJI] cara revolviéndose de la risa [EMOJI][EMOJI] marca de cruz [EMOJI][EMOJI] marca de cruz [EMOJI]'

Veamos los que están más de 10 veces

In [13]:
from tokenizers import BertWordPieceTokenizer

new_tokenizer = BertWordPieceTokenizer(lowercase=True)
texts = [ex["text"] for ex in train_dataset]

special_tokens = [
    "[USER]",
    "[HASHTAG]",
    "[EMOJI]",
]

new_tokenizer.add_special_tokens(special_tokens)

3

In [14]:
new_tokenizer.train_from_iterator(
    texts, min_frequency=10
)

In [15]:
old_tokens = set(tokenizer.get_vocab())

missing_tokens = [tok for tok in new_tokenizer.get_vocab() if tok not in old_tokens]

len(missing_tokens)

2099

In [16]:
preprocessed_text = preprocess_tweet("@clarincom jajajaja #NoVuelvenMas 🤣❌❌", hashtag_token="[HASHTAG]")

print(preprocessed_text)
ids = new_tokenizer.encode(preprocessed_text).ids
print("Ids = ", ids)
print("Decoded = ", new_tokenizer.decode(ids))

[USER] jaja [HASHTAG] no vuelven mas [EMOJI] cara revolviéndose de la risa [EMOJI][EMOJI] marca de cruz [EMOJI][EMOJI] marca de cruz [EMOJI]
Ids =  [0, 447, 1, 131, 3324, 206, 2, 172, 484, 113, 124, 272, 2, 2, 5649, 113, 4576, 2, 2, 5649, 113, 4576, 2]
Decoded =  jaja no vuelven mas cara revolviendose de la risa marca de cruz marca de cruz


Ojo que el `.decode` saca todos los `[USER]`, `[HASHTAG]` y demás

Antes había ~2900, sacamos casi 800. Bien!

In [17]:

for i, tok in enumerate(sorted(missing_tokens)):
    print(f"{i+1:<4} -- {tok}")

1    -- #
2    -- ##5n
3    -- ##aaa
4    -- ##aan
5    -- ##abon
6    -- ##acion
7    -- ##aj
8    -- ##aleza
9    -- ##amer
10   -- ##anal
11   -- ##anan
12   -- ##andose
13   -- ##arde
14   -- ##arent
15   -- ##arentena
16   -- ##arma
17   -- ##aroni
18   -- ##bacion
19   -- ##bajo
20   -- ##baron
21   -- ##bica
22   -- ##carce
23   -- ##cci
24   -- ##ccion
25   -- ##cciones
26   -- ##cepcion
27   -- ##cepto
28   -- ##cero
29   -- ##chando
30   -- ##charon
31   -- ##chazo
32   -- ##chera
33   -- ##chita
34   -- ##chner
35   -- ##chor
36   -- ##chorros
37   -- ##choso
38   -- ##chul
39   -- ##ciada
40   -- ##cian
41   -- ##ciando
42   -- ##ciaron
43   -- ##ciela
44   -- ##cien
45   -- ##ciendose
46   -- ##cog
47   -- ##cras
48   -- ##cridad
49   -- ##ct
50   -- ##cter
51   -- ##cto
52   -- ##ctor
53   -- ##ctora
54   -- ##ctores
55   -- ##ctos
56   -- ##ctu
57   -- ##ctura
58   -- ##cua
59   -- ##cues
60   -- ##cun
61   -- ##cuper
62   -- ##dable
63   -- ##dalla
64   -- ##dando
65   

Ok, agreguemos sólo los que son alpha

In [18]:
add_tokens = [t for t in missing_tokens if t.isalnum() or t.startswith("##")]

len(add_tokens), len(missing_tokens)

(2089, 2099)

## Agregar nuevos tokens

In [19]:
tokenizer.add_special_tokens({'additional_special_tokens': special_tokens})
tokenizer.add_tokens(add_tokens)



2089

In [20]:
len(special_tokens) + len(add_tokens) + 31002

33094

Bien

In [21]:
import os


os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

In [22]:
import torch
from transformers import BertForMaskedLM


model = BertForMaskedLM.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

tokenizer.model_max_length = 256

model.resize_token_embeddings(len(tokenizer))


Embedding(33094, 768)

In [23]:
def tokenize(batch):
    return tokenizer(batch['context'], batch['text'], padding='max_length', truncation=True)

batch_size = 16
eval_batch_size = 8

train_dataset = train_dataset.map(tokenize, batched=True, batch_size=batch_size)
dev_dataset = dev_dataset.map(tokenize, batched=True, batch_size=batch_size)
dev_dataset

HBox(children=(FloatProgress(value=0.0, max=2277.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=570.0), HTML(value='')))




Dataset({
    features: ['APPEARANCE', 'CALLS', 'CLASS', 'CRIMINAL', 'DISABLED', 'HATEFUL', 'LGBTI', 'POLITICS', 'RACISM', 'WOMEN', 'attention_mask', 'context', 'id', 'input_ids', 'text', 'token_type_ids'],
    num_rows: 9106
})

In [24]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [25]:
from transformers import Trainer, TrainingArguments

new_model_name = "beto-finetuned-context"

new_model_path = f"../models/{new_model_name}"

training_args = TrainingArguments(
    output_dir=new_model_path,
    overwrite_output_dir=True,
    num_train_epochs=5,
    evaluation_strategy="steps",
    eval_steps=500,
    # Smaller LR
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=eval_batch_size,
    save_steps=1500,
    logging_steps=50,
    do_eval= True,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
)

In [26]:
trainer.train()



Step,Training Loss,Validation Loss,Runtime,Samples Per Second
500,3.8485,3.586415,345.77,26.335
1000,3.3064,3.058932,346.1965,26.303
1500,2.9264,2.734172,345.813,26.332
2000,2.7247,2.506068,345.9198,26.324
2500,2.5076,2.366964,345.9981,26.318
3000,2.4402,2.251052,345.7121,26.34




RuntimeError: [enforce fail at inline_container.cc:274] . unexpected pos 740655808 vs 740655696

In [34]:
model.save_pretrained(new_model_path)
tokenizer.save_pretrained(new_model_path)

('../models/beto-finetuned-context/tokenizer_config.json',
 '../models/beto-finetuned-context/special_tokens_map.json',
 '../models/beto-finetuned-context/vocab.txt',
 '../models/beto-finetuned-context/added_tokens.json')