In [None]:
# Standalone Gretel tokenizers wrap any tokenizer in a simple abstract API; these tokenizers can be used
# during training / generation. Below is an example of using two of our built-in tokenizers on some training
# data. Aside from the classes used, the API is the same, which makes swapping out tokenizers trivial.

# There are separate classes for training and loading a tokenizer. This is because when loading a tokenizer model
# back in, we aren't really loading the training params that were used for it.

from gretel_synthetics.base_config import BaseConfig

# We'll make a base config that doesn't actually train a model; it just creates
# our shell model directory ("tokenizer_demo"), which is where we'll store the tokenizer data.
# The training text is read from "poe.txt".
config = BaseConfig(input_data_path="poe.txt", checkpoint_dir="tokenizer_demo")

In [None]:
# Let's start with the basic char/idx mapping tokenizer.

from gretel_synthetics.tokenizers.char import CharTokenizerTrainer, CharTokenizer

# The trainer is driven by the shared `config` (input data path + checkpoint directory).
trainer = CharTokenizerTrainer(config=config)

# First we build our annotated training data. For the char tokenizer, this just makes a copy of the
# training data and writes it to our model directory.
trainer.create_annotated_training_data()

# This builds the char/idx mappings and saves the model to disk.
trainer.train()

In [None]:
# Now we can load our tokenizer back in from the saved model. Note that loading goes through
# the `CharTokenizer` class, not the `CharTokenizerTrainer` class used for training.

tok = CharTokenizer.load(config)

In [None]:
# Our tokenizer has a basic API: report the vocabulary size and encode a string to a sequence of IDs.
print(tok.total_vocab_size)
ids = tok.encode_to_ids("Once upon a midnight dreary, while I pondered, weak and weary,")
print(ids)

# Now decode the IDs back to the string.
print(tok.decode_from_ids(ids))

In [None]:
# Let's use the exact same interface methods, but now using SentencePiece as the underlying tokenizer.
# NOTE(review): the trainer is spelled `SentencepieceTokenizerTrainer` (lowercase "p") while the
# tokenizer is `SentencePieceTokenizer` -- confirm both names against the installed gretel_synthetics version.

from gretel_synthetics.tokenizers.sentencepiece import SentencepieceTokenizerTrainer, SentencePieceTokenizer

# Rebinds `trainer`, reusing the same `config` as the char tokenizer example.
trainer = SentencepieceTokenizerTrainer(config=config)

# Same two steps as before: build the annotated training data, then train the tokenizer.
trainer.create_annotated_training_data()

trainer.train()

In [None]:
# Loading works through the same interface for SentencePiece as for the simple char tokenizer.
# This rebinds `tok` to the SentencePiece-backed tokenizer.
tok = SentencePieceTokenizer.load(config)

In [None]:
# Our tokenizer has the same basic API: report the vocabulary size and encode a string to IDs.
# NOTE(review): the trailing "<n>" in the sample string is presumably the newline marker used in the
# SentencePiece annotated training data -- confirm against the tokenizer implementation.
print(tok.total_vocab_size)
ids = tok.encode_to_ids("Once upon a midnight dreary, while I pondered, weak and weary,<n>")
print(ids)

# Now decode the IDs back to the string.
print(tok.decode_from_ids(ids))