# Tokenization

### BERTweet

- fastBPE
- 64K subword vocabulary

### TWilBert
- SentencePiece (fastBPE)
- 30K subword vocabulary

In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
from itertools import islice

# NOTE(review): num_files is not used in the visible cells; kept in case later cells rely on it.
num_files = 100

# glob() returns matches in arbitrary, filesystem-dependent order; sorting makes
# the choice of "first file" (and of train_files) reproducible across runs.
tweet_files = sorted(glob("../../data/filtered_tweets/*.txt"))
if not tweet_files:
    raise FileNotFoundError(
        "No tweet files found matching ../../data/filtered_tweets/*.txt"
    )

train_files = tweet_files[:2]

# Read at most the first 1_000_000 lines of the first file. islice stops reading
# once the cap is reached instead of loading the whole file, and the context
# manager closes the handle. Only the trailing newline is stripped from each line.
with open(tweet_files[0], encoding="utf-8") as tweet_file:
    tweets = [line.strip("\n") for line in islice(tweet_file, 1_000_000)]

In [2]:
# Number of tweets loaded; the loading cell caps the list at 1_000_000 entries.
len(tweets)

1000000

In [4]:
from tokenizers.processors import TemplateProcessing

# IPython help query: displays the signature and docstring of TemplateProcessing.
# NOTE(review): exploratory lookup; TemplateProcessing is not used by the other visible cells.
TemplateProcessing?

Init signature: TemplateProcessing(self, single, pair, special_tokens)
Docstring:
Provides a way to specify templates in order to add the special tokens to each
input sequence as relevant.

Let's take :obj:`BERT` tokenizer as an example. It uses two special tokens, used to
delimitate each sequence. :obj:`[CLS]` is always used at the beginning of the first
sequence, and :obj:`[SEP]` is added at the end of both the first, and the pair
sequences. The final result looks like this:

    - Single sequence: :obj:`[CLS] Hello there [SEP]`
    - Pair sequences: :obj:`[CLS] My name is Anthony [SEP] What is my name? [SEP]`

With the type ids as following::

    [CLS]   ...   [SEP]   ...   [SEP]
      0      0      0      1      1

You can achieve such behavior using a TemplateProcessing::

    TemplateProcessing(
        single="[CLS] $0 [SEP]",
     

In [48]:
from tokenizers import SentencePieceBPETokenizer, BertWordPieceTokenizer, ByteLevelBPETokenizer
from tokenizers import normalizers 
from tokenizers.processors import RobertaProcessing
from finetune_vs_scratch.preprocessing import special_tokens
from finetune_vs_scratch.tokenizer import tokenizer_special_tokens

# Normalization switches consumed by the BertNormalizer below.
strip_accents = True
lowercase = True

# Fresh SentencePieceBPETokenizer. The special tokens are registered right away
# so that their ids can be looked up when the post-processor is built.
tokenizer = SentencePieceBPETokenizer()
tokenizer.add_special_tokens(tokenizer_special_tokens)

# Normalization pipeline: NFKC first, then the BertNormalizer configured with
# the switches defined above.
tokenizer_normalizers = [
    normalizers.NFKC(),
    normalizers.BertNormalizer(
        clean_text=True,
        handle_chinese_chars=True,
        strip_accents=strip_accents,
        lowercase=lowercase,
    ),
]
tokenizer.normalizer = normalizers.Sequence(tokenizer_normalizers)

# Snapshot of the vocabulary at this point, i.e. before any training has run.
vocab = tokenizer.get_vocab()

# Post-processor that wraps encoded sequences with the <s> / </s> markers,
# using the ids the tokenizer currently assigns to those two tokens.
tokenizer.post_processor = RobertaProcessing(
    sep=("</s>", tokenizer.token_to_id("</s>")),
    cls=("<s>", tokenizer.token_to_id("<s>")),
)

In [49]:
from finetune_vs_scratch.preprocessing import special_tokens
from finetune_vs_scratch.tokenizer import tokenizer_special_tokens

# Special tokens handed to the trainer: the tokenizer's own special tokens
# followed by the ones exported by the preprocessing module.
training_special_tokens = tokenizer_special_tokens + special_tokens

# Train the BPE model on the in-memory tweets.
tokenizer.train_from_iterator(
    tweets,
    special_tokens=training_special_tokens,
    vocab_size=30_000,
    limit_alphabet=300,
    min_frequency=5,
    show_progress=True,
)






## Alphabet

In [50]:
# Token -> id mapping of the tokenizer.
vocab = tokenizer.get_vocab()

# Tokens ordered by id. Sorting the tokens by their id avoids binding inv_vocab
# first to a dict and then to a list, and does not raise KeyError if the ids
# happen to be non-contiguous.
inv_vocab = sorted(vocab, key=vocab.get)

print(f"First tokens: {inv_vocab[:200]}")

# Every distinct character that occurs in at least one vocabulary token.
alphabet = sorted({char for token in vocab for char in token})
print("Alphabet = ", " ".join(alphabet))


First tokens: ['<s>', '<pad>', '</s>', '<unk>', '<mask>', '@usuario', 'url', 'hashtag', 'emoji', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', '@', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|', '}', '¡', '¬', '¿', 'ı', 'ɪ', 'а', 'е', 'и', 'к', 'н', 'о', 'с', 'т', '،', 'ء', 'ا', 'ب', 'ة', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ـ', 'ف', 'ق', 'ك', 'ل', 'م', 'ن', 'ه', 'و', 'ى', 'ي', 'ٹ', 'پ', 'چ', 'ڑ', 'ک', 'گ', 'ں', 'ھ', 'ہ', 'ۃ', 'ی', 'ے', '۔', 'क', 'ज', 'त', 'द', 'न', 'प', 'ब', 'म', 'य', 'र', 'ल', 'स', 'ह', 'ा', 'ि', 'ी', 'ो', 'ก', 'ค', 'ง', 'ด', 'น', 'ม', 'ย', 'ร', 'ล', 'ว', 'อ', 'า', 'เ', 'ᄀ', 'ᄁ', 'ᄂ', 'ᄃ', 'ᄄ', 'ᄅ', 'ᄆ', 'ᄇ', 'ᄉ', 'ᄊ', 'ᄋ', 'ᄌ', 'ᄍ', 'ᄎ', 'ᄏ', 'ᄐ', 'ᄑ', 'ᄒ', 'ᅡ', 'ᅢ', 'ᅣ', 'ᅥ', 'ᅦ', 'ᅧ', 'ᅨ', 'ᅩ', 'ᅪ', 'ᅬ

In [51]:
# Smoke test: encode a sentence pair and inspect the resulting tokens
# (normalization applied to the text and the <s> / </s> markers around each sequence).
tokenizer.encode("@usuario son UNA MIERDA", "Viva Perón").tokens

['<s>',
 '@usuario',
 '▁son',
 '▁una',
 '▁mierda',
 '</s>',
 '</s>',
 '▁viva',
 '▁peron',
 '</s>']

In [52]:
from transformers import PreTrainedTokenizerFast

# Special-token roles passed to the transformers wrapper. <s> is used for both
# bos and cls, and </s> for both eos and sep.
special_token_names = {
    "bos_token": "<s>",
    "eos_token": "</s>",
    "sep_token": "</s>",
    "cls_token": "<s>",
    "unk_token": "<unk>",
    "pad_token": "<pad>",
    "mask_token": "<mask>",
}

# Wrap the `tokenizers` object so it can be used through the transformers API.
transformer_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    **special_token_names,
)

In [53]:

# Write the tokenizer files into the local "small" directory; the returned tuple lists the files written.
transformer_tokenizer.save_pretrained("small")

('small/tokenizer_config.json',
 'small/special_tokens_map.json',
 'small/tokenizer.json')

In [54]:
from transformers import AutoTokenizer
# Reload the tokenizer from the "small" directory; this rebinds transformer_tokenizer to the loaded copy.
transformer_tokenizer = AutoTokenizer.from_pretrained("small")

() {'bos_token': '<s>', 'eos_token': '</s>', 'sep_token': '</s>', 'cls_token': '<s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'mask_token': '<mask>', 'special_tokens_map_file': 'small/special_tokens_map.json', 'tokenizer_file': 'small/tokenizer.json', 'name_or_path': 'small'}


In [65]:
# Encode a sentence pair with the underlying `tokenizers` object of the reloaded
# tokenizer and inspect the tokens. The public `backend_tokenizer` property is
# used instead of reaching into the private `_tokenizer` attribute.
transformer_tokenizer.backend_tokenizer.encode("Este es un forro @usuario impresionánte", "Corte gil corte basura").tokens


['<s>',
 '▁este',
 '▁es',
 '▁un',
 '▁forro',
 '▁',
 '@usuario',
 '▁impresionante',
 '</s>',
 '</s>',
 '▁corte',
 '▁gil',
 '▁corte',
 '▁basura',
 '</s>']

In [58]:
from transformers import AutoTokenizer

# Load the pretrained "roberta-base" tokenizer (presumably as a reference to compare against).
# NOTE(review): this rebinds `tokenizer`, which earlier cells use for the tokenizer
# trained in this notebook; re-running those cells after this one would use the wrong object.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

() {'model_max_length': 512, 'vocab_file': '/home/jmperez/.cache/huggingface/transformers/d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab', 'merges_file': '/home/jmperez/.cache/huggingface/transformers/cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b', 'tokenizer_file': '/home/jmperez/.cache/huggingface/transformers/d53fc0fa09b8342651efd4073d75e19617b3e51287c2a535becda5808a8db287.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730', 'special_tokens_map_file': None, 'name_or_path': 'roberta-base'}


In [64]:

# Encode a sentence pair with the underlying `tokenizers` object and inspect the
# tokens. The public `backend_tokenizer` property is used instead of the private
# `_tokenizer` attribute.
tokenizer.backend_tokenizer.encode("Oh man this is terrible", "Bullshit").tokens

['<s>',
 'Oh',
 'Ġman',
 'Ġthis',
 'Ġis',
 'Ġterrible',
 '</s>',
 '</s>',
 'Bull',
 'shit',
 '</s>']