In [1]:
import logging
import sys

# Route INFO-and-above log records to stdout so they appear in the cell output,
# each prefixed with an HH:MM:SS timestamp.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt="%H:%M:%S",
    # basicConfig() silently does nothing when the root logger already has
    # handlers (e.g. this cell is re-run with changed settings, or an earlier
    # import configured logging). force=True replaces any existing root
    # handlers so this configuration always takes effect.
    force=True,
)

# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Load the pretrained tokenizer

A pretrained tokenizer is available in the `pretrained/tokenizer` directory. It was trained on 125,000 rows of the `fineweb-edu` dataset and has a vocabulary of 50,257 tokens, including the end-of-text token.

In [2]:
from microgpt import Tokenizer, PretrainedTokenizerConfig

tokenizer = await Tokenizer.load(
    config=PretrainedTokenizerConfig(),
)
tokenizer

21:42:32 Loading pretrained tokenizer: config=type='custom_trained' dir_path='/Users/gpahal/Developer/python/projects/microgpt/pretrained/tokenizer'
21:42:32 Loaded pretrained tokenizer: tokenizer=Tokenizer(
  split_pattern=[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+
  special_tokens={'<|endoftext|>': 50256}
  eot_id=50256
  merges_size=50000
  vocab_size=50257
)


Tokenizer(
  split_pattern=[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+
  special_tokens={'<|endoftext|>': 50256}
  eot_id=50256
  merges_size=50000
  vocab_size=50257
)

In [3]:
text = "Hello, world! I'm a generative pre-trained transformer (GPT)"
ids = tokenizer.encode(text)

# Decode every id on its own to show the piece of text behind each token.
[tokenizer.decode([token_id]) for token_id in ids]

['Hello',
 ',',
 ' world',
 '!',
 " I'm",
 ' a',
 ' gener',
 'ative',
 ' pre',
 '-trained',
 ' transformer',
 ' (',
 'G',
 'PT',
 ')']

In [4]:
# Round-trip check: decoding the full id sequence must reproduce the input text.
decoded_text = tokenizer.decode(ids=ids)
assert text == decoded_text
decoded_text

"Hello, world! I'm a generative pre-trained transformer (GPT)"

# Train a custom tokenizer

To see how to train a custom tokenizer on larger Hugging Face datasets, see the `scripts/tokenizer/train_tokenizer.py` script.

To train a very simple tokenizer on a small text sample, follow the steps below.

In [5]:
from microgpt import (
    TextDataSource,
    TokenizerTrainer,
    TokenizerTrainerConfig,
    UntrainedTokenizerConfig,
)

text = """
A generative pre-trained transformer (GPT) is a type of large language model (LLM)[1][2][3] and a prominent framework for generative artificial intelligence.[4][5] It is an artificial neural network that is used in natural language processing by machines.[6] It is based on the transformer deep learning architecture, pre-trained on large data sets of unlabeled text, and able to generate novel human-like content.[2][3] As of 2023, most LLMs had these characteristics[7] and are sometimes referred to broadly as GPTs.[8]

The first GPT was introduced in 2018 by OpenAI.[9] OpenAI has released significant GPT foundation models that have been sequentially numbered, to comprise its "GPT-n" series.[10] Each of these was significantly more capable than the previous, due to increased size (number of trainable parameters) and training. The most recent of these, GPT-4o, was released in May 2024.[11] Such models have been the basis for their more task-specific GPT systems, including models fine-tuned for instruction following—which in turn power the ChatGPT chatbot service.[1]

The term "GPT" is also used in the names and descriptions of such models developed by others. For example, other GPT foundation models include a series of models created by EleutherAI,[12] and seven models created by Cerebras in 2023.[13] Companies in different industries have developed task-specific GPTs in their respective fields, such as Salesforce's "EinsteinGPT" (for CRM)[14] and Bloomberg's "BloombergGPT" (for finance).[15]
""".strip()
tokenizer_trainer = await TokenizerTrainer.load(
    config=TokenizerTrainerConfig(
        tokenizer_config=UntrainedTokenizerConfig(
            special_tokens={"<|endoftext|>": 356},
            eot_id=356,
        ),
        output_dir_path="trained_tokenizer",
        vocab_size=357,
        data_sources=[TextDataSource(name="sample", text=text)],
    ),
)
tokenizer = await tokenizer_trainer.run()
tokenizer

21:42:32 Loading tokenizer trainer: config=tokenizer_config=UntrainedTokenizerConfig(type='untrained', split_pattern=None, special_tokens={'<|endoftext|>': 356}, eot_id=356) output_dir_path='trained_tokenizer' checkpointing_config=None vocab_size=357 data_sources=[TextDataSource(name=sample, text_len=1513)]
21:42:32 Loading untrained tokenizer: config=type='untrained' split_pattern=None special_tokens={'<|endoftext|>': 356} eot_id=356
21:42:32 Loaded untrained tokenizer: tokenizer=Tokenizer(
  split_pattern=[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+
  special_tokens={'<|endoftext|>': 356}
  eot_id=356
  merges_size=0
  vocab_size=257
)
21:42:32 Creating output directory: dir_path=trained_tokenizer
21:42:32 Loaded non-checkpointed tokenizer trainer: tokenizer

Tokenizer(
  split_pattern=[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+
  special_tokens={'<|endoftext|>': 356}
  eot_id=356
  merges_size=100
  vocab_size=357
)

In [6]:
text = "Hello, world! I'm a generative pre-trained transformer (GPT)"
ids = tokenizer.encode(text)

# Show the text piece behind each id produced by the freshly trained tokenizer.
[tokenizer.decode([token_id]) for token_id in ids]

['H',
 'el',
 'lo',
 ',',
 ' w',
 'or',
 'l',
 'd',
 '!',
 ' ',
 'I',
 "'",
 'm',
 ' a',
 ' gen',
 'er',
 'ati',
 've',
 ' pre',
 '-t',
 'rain',
 'ed',
 ' t',
 'ra',
 'n',
 's',
 'for',
 'm',
 'er',
 ' (',
 'GPT',
 ')']

In [7]:
# Round-trip check: the trained tokenizer must decode its own ids back to the input text.
decoded_text = tokenizer.decode(ids=ids)
assert text == decoded_text
decoded_text

"Hello, world! I'm a generative pre-trained transformer (GPT)"

# Load the custom tokenizer

In [8]:
from microgpt import Tokenizer, CustomTrainedTokenizerConfig

tokenizer = await Tokenizer.load(
    config=CustomTrainedTokenizerConfig(
        dir_path="trained_tokenizer",
    ),
)
tokenizer

21:42:32 Loading custom trained tokenizer: config=type='custom_trained' dir_path='trained_tokenizer'
21:42:32 Loaded custom trained tokenizer: tokenizer=Tokenizer(
  split_pattern=[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+
  special_tokens={'<|endoftext|>': 356}
  eot_id=356
  merges_size=100
  vocab_size=357
)


Tokenizer(
  split_pattern=[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+
  special_tokens={'<|endoftext|>': 356}
  eot_id=356
  merges_size=100
  vocab_size=357
)

In [9]:
# `ids` and `text` come from the earlier encode cell; the tokenizer reloaded
# from disk must decode those ids back to the same text.
decoded_text = tokenizer.decode(ids=ids)
assert text == decoded_text
decoded_text

"Hello, world! I'm a generative pre-trained transformer (GPT)"