In [1]:
import glob
import os

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import BpeTrainer

Prerequisite: DataPrep has already been run. The cells below expect raw text files under `data/rawtext/`.

In [2]:
# Build an untrained BPE tokenizer. Giving the model an `unk_token` provides a
# fallback symbol for input that is not covered by the learned vocabulary; the
# trainer cell declares the same "[UNK]" token.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
# Register the special tokens up front, in the same order the trainer cell
# lists them, so that ids looked up before training (for the post-processor)
# are intended to line up with the ids in the trained vocabulary.
# NOTE(review): confirm with tokenizer.token_to_id(...) after training.
tokenizer.add_special_tokens(["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
# Split on whitespace/punctuation before BPE merges are applied.
tokenizer.pre_tokenizer = Whitespace()

In [3]:
# Resolve the special-token ids from the tokenizer rather than assuming them,
# and fail early with a clear message if any of them is not registered.
special_ids = {
    token: tokenizer.token_to_id(token) for token in ("[CLS]", "[SEP]", "[PAD]")
}
missing = [token for token, token_id in special_ids.items() if token_id is None]
if missing:
    raise ValueError(f"Special tokens not known to the tokenizer: {missing}")

# Wrap single sequences as "[CLS] A [SEP]" and pairs as
# "[CLS] A [SEP] B [SEP]", with the second segment assigned type id 1.
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", special_ids["[CLS]"]),
        ("[SEP]", special_ids["[SEP]"]),
    ],
)
# Pad every encoding to 25 tokens. Pass [PAD]'s real id explicitly: when
# pad_id is omitted the library default (0 per the tokenizers docs) is used,
# which is not the id registered for "[PAD]".
tokenizer.enable_padding(
    pad_id=special_ids["[PAD]"],
    pad_token="[PAD]",
    length=25,
)

In [4]:
# Collect the training corpus: every path with an extension under
# data/rawtext, searched recursively. Directories whose names contain a dot
# would also match "*.*", so keep regular files only, and sort so the file
# order is the same on every run and platform.
files = sorted(
    path
    for path in glob.glob("data/rawtext/**/*.*", recursive=True)
    if os.path.isfile(path)
)
print(files[:10])
print(f"Running on {len(files)} files")

['data/rawtext\\uk_geojson_reduced.txt', 'data/rawtext\\us\\il\\cook_county_us.txt', 'data/rawtext\\us\\mn\\ramsey-addresses-county.txt']
Running on 3 files


In [5]:
# Configure BPE training. "[UNK]" is listed first among the special tokens.
# The frequency threshold keyword is `min_frequency`; the earlier spelling
# `min_frequence` is not a documented BpeTrainer option.
trainer = BpeTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    vocab_size=30000,
    min_frequency=2,
)
# Learn the vocabulary and merges from the collected text files.
tokenizer.train(files=files, trainer=trainer)

In [6]:
# Persist the trained tokenizer as JSON. Create the target directory first so
# the save does not fail on a fresh checkout where "tokenizers/" is absent.
output_path = "tokenizers/uk_us.json"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
tokenizer.save(output_path)

In [7]:
# Encode a one-item batch holding a sample address string.
sample_addresses = ["rock road wadebridge united kingdom pl27 6nw"]
e = tokenizer.encode_batch(sample_addresses)

In [8]:
# Unwrap the single Encoding from the one-element batch list. The guard keeps
# this cell re-runnable: once `e` has been rebound to the Encoding itself,
# running the cell again leaves it untouched instead of indexing into it.
if isinstance(e, list):
    e = e[0]

In [9]:
# Inspect the attention mask; in the recorded output the leading 1s cover the
# real tokens and the trailing 0s cover the [PAD] positions.
e.attention_mask

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]

In [10]:
# Look up the vocabulary id of the token 'sw'. No output is recorded for this
# cell — presumably the lookup returned None (token not in the vocabulary);
# TODO confirm.
tokenizer.token_to_id('sw')

In [11]:
# Display the Encoding's repr (token count and the attributes it exposes).
e

Encoding(num_tokens=25, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [12]:
# Show the token strings: the [CLS]/[SEP] markers from the post-processor
# template wrap the sub-word pieces, followed by [PAD] fill up to length 25.
e.tokens

['[CLS]',
 'ro',
 'ck',
 'ro',
 'ad',
 'w',
 'ade',
 'bridge',
 'u',
 'ni',
 'ted',
 'king',
 'dom',
 'p',
 'l',
 '27',
 '6',
 'n',
 'w',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]']