In [25]:
import os
from datasets import load_dataset
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer
)

# Retrieve a corpus

In [2]:
# Load the "train" split of the "wikitext-2-raw-v1" configuration of the
# "wikitext" dataset through the `datasets` library.
dataset = load_dataset("wikitext", name="wikitext-2-raw-v1", split="train")

Downloading builder script:   0%|          | 0.00/8.48k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/6.84k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.62k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.72M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [19]:
def get_training_corpus(batch_size=1000, data=None):
    """Yield the corpus text in consecutive batches.

    Args:
        batch_size: Maximum number of rows per yielded batch. Defaults to
            1000, the value that was previously hard-coded.
        data: Object to read from. It must support ``len()`` and slicing
            that returns a mapping with a ``"text"`` entry. Defaults to the
            notebook-level ``dataset``.

    Yields:
        The ``"text"`` entry of each consecutive slice of at most
        ``batch_size`` rows; the final batch may be shorter.
    """
    if data is None:
        # Looked up at call time so the notebook's current `dataset` is used.
        data = dataset
    for start in range(0, len(data), batch_size):
        yield data[start:start + batch_size]["text"]

In [20]:
# Plain-text file the corpus is written to further down (relative path).
filepath = "../data/wikitext-2/wikitext-2.txt"

In [21]:
# Make sure the directory that will hold `filepath` exists.
# `os.path.dirname` understands the platform's path separators, and
# `exist_ok=True` makes the cell safe to re-run without a separate
# (race-prone) existence check.
filedir = os.path.dirname(filepath)
if filedir:
    # A bare filename has no directory part; os.makedirs("") would raise.
    os.makedirs(filedir, exist_ok=True)

In [22]:
# Dump the corpus to disk: each row's text followed by a newline.
# The "text" column is read once instead of doing an indexed lookup
# (`dataset[i]`) for every single row.
with open(filepath, "w", encoding="utf-8") as f:
    f.writelines(text + "\n" for text in dataset["text"])

# WordPiece tokenizer

In [26]:
# Build a tokenizer around a WordPiece model. No vocabulary is passed here;
# "[UNK]" is configured as the unknown token.
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

## Normalization

### Replicate `bert-base-uncased` normalization

In [31]:
# Use the library's ready-made BERT normalizer with lowercasing enabled.
tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)

In [32]:
# Probe the normalizer with the single character U+0085. The output below is
# an empty string, i.e. BertNormalizer drops this character.
tokenizer.normalizer.normalize_str(u"\u0085")

''

### Build normalizer from scratch

In [33]:
# Replace the BertNormalizer set earlier with a hand-built pipeline.
# The steps are listed in the order NFD, Lowercase, StripAccents.
tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFD(), normalizers.Lowercase(), normalizers.StripAccents()]
)

In [34]:
# Sanity check: the printed result below is lowercased and has no accents.
print(tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))

hello how are u?


In [36]:
# Unlike the BertNormalizer probe earlier (which returned ''), the hand-built
# sequence returns U+0085 unchanged: the output below is '\x85'.
# NOTE: the sequence could be made to drop such characters by adding a
# couple of (complicated) regex-based steps to it.
tokenizer.normalizer.normalize_str(u"\u0085")

'\x85'

## Pre-tokenization

### Replicate `bert-base-uncased` pre-tokenization

In [38]:
# Use the library's ready-made BERT pre-tokenizer.
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

### Build pre-tokenizer from scratch

In [44]:
# Splits on whitespace and punctuation.
# This replaces the BertPreTokenizer assigned to the tokenizer earlier.
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

In [45]:
# Try the pre-tokenizer on a sample sentence. As the output below shows, the
# result is a list of (piece, (start, end)) pairs, with offsets into the input
# string; the apostrophe, hyphen and period each become their own piece.
tokenizer.pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer.")

[('Let', (0, 3)),
 ("'", (3, 4)),
 ('s', (4, 5)),
 ('test', (6, 10)),
 ('my', (11, 13)),
 ('pre', (14, 17)),
 ('-', (17, 18)),
 ('tokenizer', (18, 27)),
 ('.', (27, 28))]

In [46]:
# Splits on whitespace only.
# Stored in a standalone `pre_tokenizer` variable: `tokenizer.pre_tokenizer`
# is not modified by this cell.
pre_tokenizer = pre_tokenizers.WhitespaceSplit()

In [47]:
# Re-create the whitespace-only pre-tokenizer (same construction as in the
# previous cell) and try it on the sample sentence. As the output below
# shows, punctuation stays attached to the neighbouring word.
pre_tokenizer = pre_tokenizers.WhitespaceSplit()
pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer.")

[("Let's", (0, 5)),
 ('test', (6, 10)),
 ('my', (11, 13)),
 ('pre-tokenizer.', (14, 28))]

In [48]:
# Split on whitespace and punctuation, using Sequence: the steps are listed
# in the order WhitespaceSplit, then Punctuation. For this sample sentence
# the output below matches the one produced by `pre_tokenizers.Whitespace()`
# earlier.
pre_tokenizer = pre_tokenizers.Sequence(
    [pre_tokenizers.WhitespaceSplit(), pre_tokenizers.Punctuation()]
)
pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer.")

[('Let', (0, 3)),
 ("'", (3, 4)),
 ('s', (4, 5)),
 ('test', (6, 10)),
 ('my', (11, 13)),
 ('pre', (14, 17)),
 ('-', (17, 18)),
 ('tokenizer', (18, 27)),
 ('.', (27, 28))]