In [1]:
import logging
import sys

# Route INFO-and-above log records to stdout so they show up in the cell
# output, prefixed with a HH:MM:SS timestamp.
#
# `force=True` removes any handlers already attached to the root logger before
# applying this configuration. Without it `basicConfig` is a silent no-op when
# the root logger is already configured (e.g. this cell is re-run with
# different settings, or an imported library configured logging first), and
# the settings below would be ignored.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt="%H:%M:%S",
    force=True,
)

# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Load the pretrained model

A pretrained model is available in the `pretrained/model` directory. It was trained on 10 billion tokens from the `fineweb-edu` dataset, and its vocabulary has 50257 tokens, including the end-of-text token.

In [2]:
from microgpt import Model, PretrainedModelConfig

model = await Model.load(
    config=PretrainedModelConfig(),
)
model

21:43:48 Pretrained model file size: 501099710 bytes
21:43:48 Loading pretrained model: config=type='custom_trained' dir_path='/Users/gpahal/Developer/python/projects/microgpt/pretrained/model'
21:43:48 Loading custom trained tokenizer: config=type='custom_trained' dir_path='/Users/gpahal/Developer/python/projects/microgpt/pretrained/model'
21:43:48 Loaded custom trained tokenizer: tokenizer=Tokenizer(
  split_pattern=[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+
  special_tokens={'<|endoftext|>': 50256}
  eot_id=50256
  merges_size=50000
  vocab_size=50257
)
21:43:48 Loading pretrained model params: file_path=/Users/gpahal/Developer/python/projects/microgpt/pretrained/model/model.json
21:43:48 Loaded pretrained model params: params={'max_seq_len': 1024, 'd_mod

Model(
  device=cpu
  params={'max_seq_len': 1024, 'd_model': 768, 'n_layers': 12, 'n_heads': 12, 'use_padded_vocab_size': True, 'use_rope': True, 'rope_theta': 10000.0, 'is_rope_full_precision': True, 'embd_dropout_p': 0.0, 'attn_dropout_p': 0.0, 'residual_dropout_p': 0.0, 'init_std': 0.02, 'init_residual_scaled_factor': 2.0}
  tokenizer=Tokenizer(
  split_pattern=[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+
  special_tokens={'<|endoftext|>': 50256}
  eot_id=50256
  merges_size=50000
  vocab_size=50257
)
  vocab_size=50257
  padded_vocab_size=50304
)

In [3]:
# Smoke-test generation: request up to 16 new tokens (`max_new_tokens`) and
# check that the result is the prompt followed by at least one extra character.
text = "Hi, I'm a language model,"
generated_text = model.generate_text(max_new_tokens=16, text=text)
assert generated_text.startswith(text)
assert len(generated_text) > len(text)
generated_text

"Hi, I'm a language model, and I'm sorry to hear that I've received an error message. Now, my aim"

# Train a custom model

For an example of training a custom model on larger Hugging Face datasets, see the `scripts/model/train_model_stage1.py` script. The pretrained model was trained in 2 stages on 8xH200 GPUs for around 4 hours:

1. Training the model on 10 billion tokens from the `fineweb-edu` dataset
2. Training the model on 3 runs of ~265 million tokens from the high-quality `cosmopedia` and `alpaca_cleaned` datasets, and then combining the model weights

To train a much simpler model, follow the steps below.

## Prepare data for training

In [4]:
%%bash
# Abort on the first failing command (and on unset variables / failed pipeline
# stages) so that a failed data-preparation run makes this cell fail instead of
# being masked by the exit status of a later command.
set -euo pipefail

# Go to the project root directory. This cell runs in its own bash subprocess,
# so the notebook kernel's working directory is not affected and there is no
# need to change back afterwards.
cd ..

# Prepare the data for training. The data is stored in shards of 10 million tokens each at `scripts/model/data/data_stage1/shards`
uv run python -m scripts.model.data.prepare_data_stage1

21:43:53 [    INFO] Saving tokenized datasets: shards_dir_path=/Users/gpahal/Developer/python/projects/microgpt/scripts/model/data/data_stage1/shards cpu_count=8 n_procs=4 (prepare_data_stage1.py:179)
21:43:53 [    INFO] Loading datasets (prepare_data_stage1.py:181)
21:43:53 [    INFO] Loading dataset fineweb-edu (prepare_data_stage1.py:185)
21:44:42 [    INFO] Loaded dataset fineweb-edu (prepare_data_stage1.py:195)
21:44:42 [    INFO] Loaded datasets (prepare_data_stage1.py:196)
21:44:42 [    INFO] Loading tokenizer (prepare_data_stage1.py:207)
21:44:42 [    INFO] Loading pretrained tokenizer: config=type='custom_trained' dir_path='/Users/gpahal/Developer/python/projects/microgpt/pretrained/tokenizer' (tokenizer.py:520)
21:44:42 [    INFO] Loaded pretrained tokenizer: tokenizer=Tokenizer(
  split_pattern=[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(

## Train a custom model

In [5]:
from microgpt import (
    ModelTrainer,
    ModelTrainerConfig,
    PretrainedTokenizerConfig,
    UntrainedModelConfig,
)

model_trainer = await ModelTrainer.load(
    config=ModelTrainerConfig(
        model_konfig=UntrainedModelConfig(
            tokenizer_config=PretrainedTokenizerConfig(),
        ),
        output_dir_path="trained_model",
        max_iterations_per_epoch=25,
        batch_size=1,
        loss_output_file_path="trained_model/loss.txt",
        eval_output_file_path="trained_model/eval.txt",
    ),
)
model = await model_trainer.run()
model

21:45:06 CUDA is not available: device=cpu
21:45:06 Master process: checkpointing disabled
21:45:06 Loading model trainer: config=model_konfig=UntrainedModelConfig(type='untrained', tokenizer_config=PretrainedTokenizerConfig(type='pretrained'), max_seq_len=512, d_model=384, n_layers=6, n_heads=6, use_padded_vocab_size=True, use_rope=True, rope_theta=10000.0, is_rope_full_precision=True, embd_dropout_p=0.0, attn_dropout_p=0.0, residual_dropout_p=0.0, init_std=None, init_residual_scaled_factor=2.0) output_dir_path='trained_model' checkpointing_config=None manual_seed=None should_compile=True data_dir_path=None epochs=1 max_iterations_per_epoch=25 batch_size=1 gradient_accumulation_iterations=1 max_learning_rate=None min_learning_rate=None learning_rate_warmup_iterations=None learning_rate_decay_iterations=None weight_decay=None betas=(0.9, 0.95) max_grad_norm=1.0 log_interval=1 eval_interval=None eval_iterations=None enable_hellaswag_eval=False hellaswag_eval_interval=None generate_text_

OptimizedModule(
  (_orig_mod): Model(
    device=cpu
    params={'max_seq_len': 512, 'd_model': 384, 'n_layers': 6, 'n_heads': 6, 'use_padded_vocab_size': True, 'use_rope': True, 'rope_theta': 10000.0, 'is_rope_full_precision': True, 'embd_dropout_p': 0.0, 'attn_dropout_p': 0.0, 'residual_dropout_p': 0.0, 'init_std': None, 'init_residual_scaled_factor': 2.0}
    tokenizer=Tokenizer(
    split_pattern=[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+
    special_tokens={'<|endoftext|>': 50256}
    eot_id=50256
    merges_size=50000
    vocab_size=50257
  )
    vocab_size=50257
    padded_vocab_size=50304
  )
)

In [6]:
# Smoke-test generation with the current `model`: request up to 16 new tokens
# (`max_new_tokens`) and check that the result is the prompt followed by at
# least one extra character.
text = "Hi, I'm a language model,"
generated_text = model.generate_text(max_new_tokens=16, text=text)
assert generated_text.startswith(text)
assert len(generated_text) > len(text)
generated_text

"Hi, I'm a language model, balanced of to appealedoceseLect: to10 is by.L for Flor that lethar"

# Load the custom model

In [7]:
from microgpt import Model, CustomTrainedModelConfig

model = await Model.load(
    config=CustomTrainedModelConfig(
        dir_path="trained_model",
    ),
)
model

21:45:20 Loading custom_trained model: config=type='custom_trained' dir_path='trained_model'
21:45:20 Loading custom trained tokenizer: config=type='custom_trained' dir_path='trained_model'
21:45:20 Loaded custom trained tokenizer: tokenizer=Tokenizer(
  split_pattern=[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+
  special_tokens={'<|endoftext|>': 50256}
  eot_id=50256
  merges_size=50000
  vocab_size=50257
)
21:45:20 Loading custom_trained model params: file_path=trained_model/model.json
21:45:20 Loaded custom_trained model params: params={'max_seq_len': 512, 'd_model': 384, 'n_layers': 6, 'n_heads': 6, 'use_padded_vocab_size': True, 'use_rope': True, 'rope_theta': 10000.0, 'is_rope_full_precision': True, 'embd_dropout_p': 0.0, 'attn_dropout_p': 0.0, 'residual

Model(
  device=cpu
  params={'max_seq_len': 512, 'd_model': 384, 'n_layers': 6, 'n_heads': 6, 'use_padded_vocab_size': True, 'use_rope': True, 'rope_theta': 10000.0, 'is_rope_full_precision': True, 'embd_dropout_p': 0.0, 'attn_dropout_p': 0.0, 'residual_dropout_p': 0.0, 'init_std': None, 'init_residual_scaled_factor': 2.0}
  tokenizer=Tokenizer(
  split_pattern=[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+
  special_tokens={'<|endoftext|>': 50256}
  eot_id=50256
  merges_size=50000
  vocab_size=50257
)
  vocab_size=50257
  padded_vocab_size=50304
)

In [8]:
# Smoke-test generation with the current `model`: request up to 16 new tokens
# (`max_new_tokens`) and check that the result is the prompt followed by at
# least one extra character.
text = "Hi, I'm a language model,"
generated_text = model.generate_text(max_new_tokens=16, text=text)
assert generated_text.startswith(text)
assert len(generated_text) > len(text)
generated_text

"Hi, I'm a language model, est and objection., propelled ability Edit use it look ofuesgram for.\n.\n"