Link: [Building WordPiece](https://towardsdatascience.com/how-to-build-a-wordpiece-tokenizer-for-bert-f505d97dddbb)

Link: [Tokenizer](https://huggingface.co/docs/tokenizers/quicktour)

In [10]:
# Shell escape: install the `datasets` package that the next cell imports.
!pip install datasets


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [11]:
import datasets

# Collect whatever datasets.list_datasets() returns and report how many
# entries it contains.
# NOTE(review): list_datasets is believed to be deprecated in newer
# `datasets` releases - confirm against the installed version.
all_ds = datasets.list_datasets()
print(f'{len(all_ds)}')

12893


In [12]:
# Open the 'train' split of the 'unshuffled_deduplicated_en' configuration of
# the 'oscar' dataset. With streaming=True the call returns an
# IterableDataset (see the repr printed by the following cell), which is
# consumed by iterating over it.
dataset = datasets.load_dataset(
    path='oscar',
    name='unshuffled_deduplicated_en',
    split='train',
    streaming=True,
)

In [13]:
# Display the object returned by load_dataset (an IterableDataset, per the
# output below).
dataset


<datasets.iterable_dataset.IterableDataset at 0x7f750d900250>

In [14]:
import os

# Create the directory that will hold the exported text shards.
# exist_ok=True keeps the cell re-runnable: a plain os.mkdir raises
# FileExistsError the second time the cell is executed.
os.makedirs('./oscar_en', exist_ok=True)

In [None]:
from tqdm.auto import tqdm  # for our loading bar

# Export the streamed dataset as plain-text shards, one sample per line.
#
# The shards are written to the directory that is created earlier in the
# notebook with os and globbed afterwards to build `paths`; writing anywhere
# else fails with FileNotFoundError and leaves the tokenizer with no files.
# NOTE(review): the loop consumes the whole streamed split; wrap `dataset`
# in itertools.islice if only a sample of the corpus is wanted.
output_dir = './oscar_en'
samples_per_file = 5_000

text_data = []
file_count = 0

for sample in tqdm(dataset):
    # Newlines are used exclusively as separators between samples, so any
    # newline inside a sample is replaced with a plain space.
    text_data.append(sample['text'].replace('\n', ' '))
    if len(text_data) == samples_per_file:
        # Once a full batch has been collected, save it to its own file.
        with open(f'{output_dir}/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
            fp.write('\n'.join(text_data))
        text_data = []
        file_count += 1

# After saving full batches there may be leftover samples; save those too.
# Nothing is written when the sample count was an exact multiple of the
# batch size, which avoids producing an empty shard.
if text_data:
    with open(f'{output_dir}/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
        fp.write('\n'.join(text_data))

0it [00:00, ?it/s]

In [None]:
from pathlib import Path
# Recursively gather every .txt file below ./oscar_en as a plain string path,
# then show the first five entries.
paths = list(map(str, Path('./oscar_en').rglob('*.txt')))
paths[:5]

In [None]:
# Number of text files found under ./oscar_en.
len(paths)


In [None]:
# !pip install tokenizers
from tokenizers import BertWordPieceTokenizer

# Special tokens handed to the trainer, in this order.
bert_special_tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]']

# Build an untrained BERT WordPiece tokenizer with text cleaning enabled and
# Chinese-character handling, accent stripping and lowercasing disabled.
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=False,
)

# Train it on the exported text files collected in `paths`.
tokenizer.train(
    files=paths,
    vocab_size=30_000,
    min_frequency=2,
    limit_alphabet=1000,
    wordpieces_prefix='##',
    special_tokens=bert_special_tokens,
)

In [None]:

# Create the output directory for the trained tokenizer. exist_ok=True keeps
# the cell re-runnable; a plain os.mkdir raises FileExistsError when the
# directory is already there from a previous run.
os.makedirs('./bert-en', exist_ok=True)

# Save the trained model into ./bert-en, using 'bert-en' as the file prefix.
tokenizer.save_model('./bert-en', 'bert-en')

In [None]:
from transformers import BertTokenizer

# Load the vocabulary that this notebook actually saved. The model is written
# with tokenizer.save_model('./bert-en', 'bert-en'), i.e. into ./bert-en with
# the 'bert-en' prefix; no './bert-it' directory is ever created here.
# do_lower_case / strip_accents mirror the lowercase=False and
# strip_accents=False settings the tokenizer was trained with, so that input
# text is normalised the same way at inference time as during training.
tokenizer = BertTokenizer(
    vocab_file='./bert-en/bert-en-vocab.txt',
    do_lower_case=False,
    strip_accents=False,
)

In [None]:
# Encode a sample sentence and display the result.
# NOTE(review): the sample is Italian, while the corpus loaded above is the
# 'unshuffled_deduplicated_en' configuration - confirm which language is
# intended.
tokenizer('ciao! come va?')  # hi! how are you?


In [None]:
# Read the saved vocabulary into a list so that token ids can be mapped back
# to their token strings by index. The path matches the location used by
# tokenizer.save_model('./bert-en', 'bert-en') earlier in the notebook.
# The encoding is given explicitly so that non-ASCII tokens decode the same
# way on every platform instead of depending on the locale default.
with open('./bert-en/bert-en-vocab.txt', 'r', encoding='utf-8') as fp:
    vocab = fp.read().split('\n')

In [None]:

# Map token ids back to their vocabulary entries.
# Presumably these ids were copied from the input_ids of the preceding
# tokenizer call (2 and 3 would be '[CLS]' and '[SEP]' given the
# special-token order used for training) - re-check after retraining.
tuple(vocab[token_id] for token_id in (2, 13884, 5, 2095, 2281, 35, 3))

In [None]:
# Encode a second sample sentence and display the result.
# NOTE(review): Italian sample against a tokenizer trained on the English
# OSCAR configuration - confirm this is intended.
tokenizer('ho capito niente')  # I understood nothing


In [None]:

# Map token ids back to their vocabulary entries.
# Presumably the ids come from the input_ids of the preceding tokenizer
# call - re-check after retraining.
tuple(vocab[token_id] for token_id in (2, 2318, 5945, 4576, 3))

In [None]:
# Encode a single word and display the result.
# NOTE(review): the input is spelled 'responsbilità' (no 'a' after 'respons');
# confirm whether the misspelling is deliberate, e.g. to show sub-word
# splitting.
tokenizer('responsbilità')  # responsibility


In [None]:

# Map token ids back to their vocabulary entries.
# Presumably the ids come from the input_ids of the preceding tokenizer
# call - re-check after retraining.
tuple(vocab[token_id] for token_id in (2, 24140, 1016, 16948, 3))