# Pre-Tokenization Examples

In [None]:
import nltk
from nltk.tokenize import word_tokenize


nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
sentence = "Historically these two events are connected, although I still don't know how."

In [None]:
sent = word_tokenize(sentence)
print(sent)

['Historically', 'these', 'two', 'events', 'are', 'connected', ',', 'although', 'I', 'still', 'do', "n't", 'know', 'how', '.']


In [None]:
from nltk.stem import PorterStemmer


ps = PorterStemmer()
ps_stem_sent = [ps.stem(words_sent) for words_sent in sent]
print(ps_stem_sent)

['histor', 'these', 'two', 'event', 'are', 'connect', ',', 'although', 'i', 'still', 'do', "n't", 'know', 'how', '.']


In [None]:
from nltk.stem.wordnet import WordNetLemmatizer


lemmatizer = WordNetLemmatizer()
lem_sent = [lemmatizer.lemmatize(words_sent) for words_sent in sent]
print(lem_sent)

['Historically', 'these', 'two', 'event', 'are', 'connected', ',', 'although', 'I', 'still', 'do', "n't", 'know', 'how', '.']


In [None]:
import spacy


nlp = spacy.load("en_core_web_sm")

doc = nlp(sentence)
print([token.lemma_ for token in doc])

['historically', 'these', 'two', 'event', 'be', 'connect', ',', 'although', 'I', 'still', 'do', 'not', 'know', 'how', '.']


# BPE Tokenization First Iteration Example

In [14]:
corpus = 'leef low low low low lower lower widest widest widest newest newest newest newest newest'

pre_tokens = corpus.split(' ')
pre_tokens

['leef',
 'low',
 'low',
 'low',
 'low',
 'lower',
 'lower',
 'widest',
 'widest',
 'widest',
 'newest',
 'newest',
 'newest',
 'newest',
 'newest']

In [15]:
EOW = '</w>'

text = [[*[c for c in w], EOW] for w in pre_tokens]
text

[['l', 'e', 'e', 'f', '</w>'],
 ['l', 'o', 'w', '</w>'],
 ['l', 'o', 'w', '</w>'],
 ['l', 'o', 'w', '</w>'],
 ['l', 'o', 'w', '</w>'],
 ['l', 'o', 'w', 'e', 'r', '</w>'],
 ['l', 'o', 'w', 'e', 'r', '</w>'],
 ['w', 'i', 'd', 'e', 's', 't', '</w>'],
 ['w', 'i', 'd', 'e', 's', 't', '</w>'],
 ['w', 'i', 'd', 'e', 's', 't', '</w>'],
 ['n', 'e', 'w', 'e', 's', 't', '</w>'],
 ['n', 'e', 'w', 'e', 's', 't', '</w>'],
 ['n', 'e', 'w', 'e', 's', 't', '</w>'],
 ['n', 'e', 'w', 'e', 's', 't', '</w>'],
 ['n', 'e', 'w', 'e', 's', 't', '</w>']]

In [16]:
import collections
import itertools


def unpack_sublists(sublists):
    return list(itertools.chain.from_iterable(sublists))

def get_vocab(text):
    return collections.Counter(unpack_sublists(text))

vocab = get_vocab(text)
vocab

Counter({'l': 7,
         'e': 17,
         'f': 1,
         '</w>': 15,
         'o': 6,
         'w': 14,
         'r': 2,
         'i': 3,
         'd': 3,
         's': 8,
         't': 8,
         'n': 5})

In [17]:
def get_pairs(text):
    pairs = collections.defaultdict(int)
    for w in text:
        for i in range(len(w)-1):
            pairs[w[i], w[i+1]] += 1
    return pairs

get_pairs(text)

defaultdict(int,
            {('l', 'e'): 1,
             ('e', 'e'): 1,
             ('e', 'f'): 1,
             ('f', '</w>'): 1,
             ('l', 'o'): 6,
             ('o', 'w'): 6,
             ('w', '</w>'): 4,
             ('w', 'e'): 7,
             ('e', 'r'): 2,
             ('r', '</w>'): 2,
             ('w', 'i'): 3,
             ('i', 'd'): 3,
             ('d', 'e'): 3,
             ('e', 's'): 8,
             ('s', 't'): 8,
             ('t', '</w>'): 8,
             ('n', 'e'): 5,
             ('e', 'w'): 5})

In [18]:
len(get_pairs(text))

18

In [19]:
pairs = get_pairs(text)
best = max(pairs, key=pairs.get)
best

('e', 's')

In [20]:
import copy


def merge_pair_tokens(text, pair):
    # Change text
    new_text = copy.deepcopy(text)
    for w in new_text:
        for i in range(len(w)-1):
            if w[i] == pair[0] and w[i+1] == pair[1]:
                w[i] += w.pop(i+1)
    return new_text

new_text = merge_pair_tokens(text, best)
new_text

[['l', 'e', 'e', 'f', '</w>'],
 ['l', 'o', 'w', '</w>'],
 ['l', 'o', 'w', '</w>'],
 ['l', 'o', 'w', '</w>'],
 ['l', 'o', 'w', '</w>'],
 ['l', 'o', 'w', 'e', 'r', '</w>'],
 ['l', 'o', 'w', 'e', 'r', '</w>'],
 ['w', 'i', 'd', 'es', 't', '</w>'],
 ['w', 'i', 'd', 'es', 't', '</w>'],
 ['w', 'i', 'd', 'es', 't', '</w>'],
 ['n', 'e', 'w', 'es', 't', '</w>'],
 ['n', 'e', 'w', 'es', 't', '</w>'],
 ['n', 'e', 'w', 'es', 't', '</w>'],
 ['n', 'e', 'w', 'es', 't', '</w>'],
 ['n', 'e', 'w', 'es', 't', '</w>']]

In [21]:
get_vocab(new_text)

Counter({'l': 7,
         'e': 9,
         'f': 1,
         '</w>': 15,
         'o': 6,
         'w': 14,
         'r': 2,
         'i': 3,
         'd': 3,
         'es': 8,
         't': 8,
         'n': 5})

In [None]:
'|'.join(unpack_sublists(new_text))

'l|e|e|f|</w>|l|o|w|</w>|l|o|w|</w>|l|o|w|</w>|l|o|w|</w>|l|o|w|e|r|</w>|l|o|w|e|r|</w>|w|i|d|es|t|</w>|w|i|d|es|t|</w>|w|i|d|es|t|</w>|n|e|w|es|t|</w>|n|e|w|es|t|</w>|n|e|w|es|t|</w>|n|e|w|es|t|</w>|n|e|w|es|t|</w>'

# Byte-Level BPE (BBPE) Tokenizers Comparison
This work is inspired by the paper "[How Good is Your Tokenizer?
On the Monolingual Performance of Multilingual Language Models](https://aclanthology.org/2021.acl-long.243/)".

## Donwload Italian-English European Parliament Proceedings Parallel Corpus
Reference [here](https://www.statmt.org/europarl/).

In [None]:
!wget https://www.statmt.org/europarl/v7/it-en.tgz

--2023-06-20 16:32:45--  https://www.statmt.org/europarl/v7/it-en.tgz
Resolving www.statmt.org (www.statmt.org)... 129.215.197.184
Connecting to www.statmt.org (www.statmt.org)|129.215.197.184|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 196722035 (188M) [application/x-gzip]
Saving to: ‘it-en.tgz.1’


2023-06-20 16:34:11 (2.21 MB/s) - ‘it-en.tgz.1’ saved [196722035/196722035]



In [None]:
!tar zxvf it-en.tgz

europarl-v7.it-en.en
europarl-v7.it-en.it


In [None]:
!ls -lah

total 990M
drwxr-xr-x 1 root root  4.0K Jun 20 16:34 .
drwxr-xr-x 1 root root  4.0K Jun 20 15:54 ..
drwxr-xr-x 4 root root  4.0K Jun 14 18:26 .config
-rw-r--r-- 1 1026 users 285M May 15  2012 europarl-v7.it-en.en
-rw-r--r-- 1 1026 users 311M May 15  2012 europarl-v7.it-en.it
-rw-r--r-- 1 root root  188M May 16  2012 it-en.tgz
-rw-r--r-- 1 root root  188M May 16  2012 it-en.tgz.1
drwxr-xr-x 1 root root  4.0K Jun 14 18:27 sample_data
drwxr-xr-x 2 root root  4.0K Jun 20 15:58 tokenizer_it
-rw-r--r-- 1 root root  4.8M Jun 20 16:26 train_europarl-v7.it-en.en
-rw-r--r-- 1 root root  4.8M Jun 20 16:26 train_europarl-v7.it-en.it
-rw-r--r-- 1 root root  4.8M Jun 20 15:57 trunc_europarl-v7.it-en.en
-rw-r--r-- 1 root root  4.8M Jun 20 15:57 trunc_europarl-v7.it-en.it


In [None]:
!head -25 europarl-v7.it-en.it

Ripresa della sessione
Dichiaro ripresa la sessione del Parlamento europeo, interrotta venerdì 17 dicembre e rinnovo a tutti i miei migliori auguri nella speranza che abbiate trascorso delle buone vacanze.
Come avrete avuto modo di constatare il grande "baco del millennio" non si è materializzato. Invece, i cittadini di alcuni nostri paesi sono stati colpiti da catastrofi naturali di proporzioni davvero terribili.
Avete chiesto che si tenesse una discussione su tale tema nei prossimi giorni, nel corso della presente tornata.
Nel frattempo è mio desiderio, come del resto mi è stato chiesto da alcuni colleghi, osservare un minuto di silenzio in memoria di tutte le vittime delle tempeste che si sono abbattute sui diversi paesi dell' Unione europea.
Vi invito pertanto ad alzarvi in piedi per osservare appunto un minuto di silenzio.
(Il Parlamento osserva un minuto di silenzio)
Signora Presidente, intervengo per una mozione d'ordine.
Come avrà letto sui giornali o sentito alla televisione, 

In [None]:
!head -25 europarl-v7.it-en.en

Resumption of the session
I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.
Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.
You have requested a debate on this subject in the course of the next few days, during this part-session.
In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the European Union.
Please rise, then, for this minute' s silence.
(The House rose and observed a minute' s silence)
Madam President, on a point of order.
You will be aware from the press and television that there have been a number of bomb explosions and

**KEEPING ONLY PART OF THE TEXT TO SPEED UP THE NOTEBOOK**

In [None]:
from pathlib import Path


TEXT_SIZE = 5_000_000

FILE_EN_PATH = Path('europarl-v7.it-en.en')
TRAIN_FILE_EN_PATH = Path('train_europarl-v7.it-en.en')

FILE_IT_PATH = Path('europarl-v7.it-en.it')
TRAIN_FILE_IT_PATH = Path('train_europarl-v7.it-en.it')

In [None]:
# Read text [EN] (298M chars total)
with FILE_EN_PATH.open('r') as f:
    text_en_tmp = f.read(TEXT_SIZE*2)
    text_en = text_en_tmp[:TEXT_SIZE]
    text_en_test = text_en_tmp[TEXT_SIZE:]

# Read text [IT] (322M chars total)
with FILE_IT_PATH.open('r') as f:
    text_it_tmp = f.read(TEXT_SIZE*2)
    text_it = text_it_tmp[:TEXT_SIZE]
    text_it_test = text_it_tmp[TEXT_SIZE:]

In [None]:
# Write train text [EN]
with TRAIN_FILE_EN_PATH.open("a") as f:
    f.write(text_en)

# Write train text [IT]
with TRAIN_FILE_IT_PATH.open("a") as f:
    f.write(text_it)

In [None]:
text_en[:100]

'Resumption of the session\nI declare resumed the session of the European Parliament adjourned on Frid'

In [None]:
text_it[:100]

'Ripresa della sessione\nDichiaro ripresa la sessione del Parlamento europeo, interrotta venerdì 17 di'

**...some text pre-processing is needed**

## Retrive GPT-2 BBPE tokenizer
This tokenizer has been trained to treat spaces like parts of the tokens so a word will be encoded differently whether it is at the beginning of the sentence (without space) or not.

Reference [here](https://huggingface.co/docs/transformers/model_doc/gpt2#transformers.GPT2TokenizerFast).

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from transformers import GPT2TokenizerFast


pretrained_weights = 'gpt2'
tokenizer_en = GPT2TokenizerFast.from_pretrained(pretrained_weights)
# tokenizer_en.pad_token = tokenizer_en.eos_token

In [None]:
# Example of how the pre-tokenizer works
tokenizer_en.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text_en[:150])

[('Resumption', (0, 10)),
 ('Ġof', (10, 13)),
 ('Ġthe', (13, 17)),
 ('Ġsession', (17, 25)),
 ('Ċ', (25, 26)),
 ('I', (26, 27)),
 ('Ġdeclare', (27, 35)),
 ('Ġresumed', (35, 43)),
 ('Ġthe', (43, 47)),
 ('Ġsession', (47, 55)),
 ('Ġof', (55, 58)),
 ('Ġthe', (58, 62)),
 ('ĠEuropean', (62, 71)),
 ('ĠParliament', (71, 82)),
 ('Ġadjourned', (82, 92)),
 ('Ġon', (92, 95)),
 ('ĠFriday', (95, 102)),
 ('Ġ17', (102, 105)),
 ('ĠDecember', (105, 114)),
 ('Ġ1999', (114, 119)),
 (',', (119, 120)),
 ('Ġand', (120, 124)),
 ('ĠI', (124, 126)),
 ('Ġwould', (126, 132)),
 ('Ġlike', (132, 137)),
 ('Ġonce', (137, 142)),
 ('Ġagain', (142, 148)),
 ('Ġt', (148, 150))]

In [None]:
en_toks_ids = tokenizer_en.encode(text_en)
text_en_decoded = tokenizer_en.decode(en_toks_ids)

Token indices sequence length is longer than the specified maximum sequence length for this model (999130 > 1024). Running this sequence through the model will result in indexing errors


In [None]:
len(en_toks_ids)

999130

## Create custom Italian BBPE tokenizer

Reference [here](https://github.com/huggingface/tokenizers/blob/main/bindings/python/py_src/tokenizers/implementations/byte_level_bpe.py).

In [None]:
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer


TOKENIZER_IT_FILE_PATH = Path('./tokenizer_it')


tokenizer_it = ByteLevelBPETokenizer()

In [None]:
%%timeit

# Customize training with <|endoftext|> special GPT2 token and use the same vocab. size
tokenizer_it.train(
    files=str(TRAIN_FILE_IT_PATH),
    vocab_size=tokenizer_en.vocab_size,
    min_frequency=2,
    special_tokens=["<|endoftext|>"],
)

4.62 s ± 1.48 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# Save tokenizer
TOKENIZER_IT_FILE_PATH.mkdir(exist_ok=True, parents=True)
tokenizer_it.save_model(str(TOKENIZER_IT_FILE_PATH))

['tokenizer_it/vocab.json', 'tokenizer_it/merges.txt']

### Compare tokenizers on Italian text
Use the subword fertility and the proportion of continued words metrics as done in the [original paper](https://aclanthology.org/2021.acl-long.243/).

In [None]:
from spacy.lang.it import Italian


nlp = Italian()
pre_tokenizer_it = nlp.tokenizer

**Use the test segment of corpus**

In [None]:
words_it = pre_tokenizer_it(text_it_test)
num_words_it = len(words_it)

**Sub-word fertility**: how aggressively a tokenizer splits words? (*The higher the worst*)

In [None]:
tokens_en = tokenizer_en.encode(text_it)
tokens_it = tokenizer_it.encode(text_it)

In [None]:
subword_fertility_en = len(tokens_en) / num_words_it
subword_fertility_it = len(tokens_it) / num_words_it

subword_fertility_en, subword_fertility_it

(1.9703256591368599, 1.016110029448685)

**Proportion of continued words**: how often a tokenizer splits words?  (*The higher the worst*)

In [None]:
complete_tokens_en = sum([1 if len(tokenizer_en.encode(str(w))) > 1 else 0 for w in words_it])
complete_tokens_it = sum([1 if len(tokenizer_it.encode(str(w))) > 1 else 0 for w in words_it])

In [None]:
pocws_en = complete_tokens_en / num_words_it
pocws_it = complete_tokens_it / num_words_it

pocws_en, pocws_it

(0.560793901162639, 0.361889977355016)