In [1]:
import logging
import sys

# Send log records to stdout so they render inline beneath each cell, at INFO
# level, formatted as "HH:MM:SS message".
# force=True replaces any handlers already attached to the root logger; without
# it, basicConfig() silently does nothing when handlers already exist (for
# example when this cell is re-run or logging was configured earlier in the
# kernel), and the settings below would never take effect.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt="%H:%M:%S",
    force=True,
)

# Enable IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell executes, so edits to local source files are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

# Tokenizer

In [2]:
from microgpt import Tokenizer, PretrainedGPTTokenizerConfig

tokenizer = await Tokenizer.load(
    config=PretrainedGPTTokenizerConfig(encoding_or_model_name="gpt-2"),
)
tokenizer

21:43:16 Loading pretrained gpt tokenizer: config=type='pretrained_gpt' encoding_or_model_name='gpt-2'
21:43:16 Pretrained gpt tokenizer encoding: encoding=gpt-2
21:43:22 Loaded mergeable ranks
21:43:23 Loaded pretrained gpt tokenizer: tokenizer=GPTTokenizer(
  encoding=gpt-2
  special_tokens={'<|endoftext|>': 50256}
  eot_id=50256
  mergeable_ranks_size=50256
  merges_size=50000
  vocab_size=50257
)


GPTTokenizer(
  encoding=gpt-2
  special_tokens={'<|endoftext|>': 50256}
  eot_id=50256
  mergeable_ranks_size=50256
  merges_size=50000
  vocab_size=50257
)

In [3]:
# Sample input that begins with the tokenizer's special end-of-text token.
text = "<|endoftext|>Hello, world!"
# Special tokens are explicitly allowed for this encode call.
ids = tokenizer.encode(text, allowed_special_tokens="all")
# Decode each id on its own to show how the text was split into tokens.
[tokenizer.decode([token_id]) for token_id in ids]

['<|endoftext|>', 'Hello', ',', ' world', '!']

In [4]:
# Round-trip check: decoding the full id sequence must reproduce the original
# input text exactly.
decoded_text = tokenizer.decode(ids)
assert text == decoded_text
# Bare last expression: display the decoded string.
decoded_text

'<|endoftext|>Hello, world!'

# Model

In [5]:
from microgpt import Model, PretrainedGPT2ModelConfig, PretrainedGPT2ModelType

model = await Model.load(
    config=PretrainedGPT2ModelConfig(model_type=PretrainedGPT2ModelType.GPT_2),
)
model

21:43:23 Loading pretrained GPT-2 model: config=type='pretrained_gpt_2' model_type=<PretrainedGPT2ModelType.GPT_2: 'gpt-2'> embd_dropout_p=None attn_dropout_p=None residual_dropout_p=None


  from .autonotebook import tqdm as notebook_tqdm


21:43:24 Loading pretrained gpt tokenizer: config=type='pretrained_gpt' encoding_or_model_name='gpt-2'
21:43:24 Pretrained gpt tokenizer encoding: encoding=gpt-2
21:43:24 Loaded mergeable ranks
21:43:24 Loaded pretrained gpt tokenizer: tokenizer=GPTTokenizer(
  encoding=gpt-2
  special_tokens={'<|endoftext|>': 50256}
  eot_id=50256
  mergeable_ranks_size=50256
  merges_size=50000
  vocab_size=50257
)
21:43:24 Loaded model params: params={'max_seq_len': 1024, 'd_model': 768, 'n_layers': 12, 'n_heads': 12, 'use_padded_vocab_size': False, 'use_rope': False, 'rope_theta': 10000.0, 'is_rope_full_precision': True, 'embd_dropout_p': 0.1, 'attn_dropout_p': 0.1, 'residual_dropout_p': 0.1, 'init_std': None, 'init_residual_scaled_factor': 2.0}
21:43:27 No. of parameters: 124.44M
21:43:27 Loading Huggingface pretrained GPT-2 model: huggingface_model_name=gpt2
21:43:28 Loaded Huggingface pretrained GPT-2 model
21:43:28 Loaded pretrained GPT-2 model: model=Model(
  device=cpu
  params={'max_seq_len'

Model(
  device=cpu
  params={'max_seq_len': 1024, 'd_model': 768, 'n_layers': 12, 'n_heads': 12, 'use_padded_vocab_size': False, 'use_rope': False, 'rope_theta': 10000.0, 'is_rope_full_precision': True, 'embd_dropout_p': 0.1, 'attn_dropout_p': 0.1, 'residual_dropout_p': 0.1, 'init_std': None, 'init_residual_scaled_factor': 2.0}
  tokenizer=GPTTokenizer(
  encoding=gpt-2
  special_tokens={'<|endoftext|>': 50256}
  eot_id=50256
  mergeable_ranks_size=50256
  merges_size=50000
  vocab_size=50257
)
  vocab_size=50257
  padded_vocab_size=50257
)

In [6]:
# Prompt for the model to continue.
text = "Hi, I'm a language model,"
generated_text = model.generate_text(
    text=text,
    max_new_tokens=16,
)
# Sanity checks: the result must be strictly longer than the prompt and must
# begin with the prompt itself.
assert len(text) < len(generated_text)
assert generated_text.startswith(text)
# Bare last expression: display the generated string.
generated_text

'Hi, I\'m a language model, because of nest things for sure!"\n\nhe said.\n\n"You'