Shows how one can generate text given a prompt and some hyperparameters, using either minGPT or huggingface/transformers

In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from mingpt.model import GPT
from mingpt.utils import set_seed
from mingpt.bpe import BPETokenizer
set_seed(3407)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
use_mingpt = True # use minGPT or huggingface/transformers model?
model_type = 'gpt2'
device = 'cpu'

In [3]:
if use_mingpt:
    model = GPT.from_pretrained(model_type,vanilla=False)
    model_vanilla = GPT.from_pretrained(model_type, True)
else:
    model = GPT2LMHeadModel.from_pretrained(model_type)
    model.config.pad_token_id = model.config.eos_token_id # suppress a warning

# ship model to device and set to eval mode
model.to(device)
model.eval();
model_vanilla.to(device)
model_vanilla.eval()

number of parameters: 152.79M
MinGPT model has: 221 parameters
HuggingFace checkpoint has: 149 parameters
Keys to copy: 149
number of parameters: 152.79M
MinGPT model has: 221 parameters
HuggingFace checkpoint has: 149 parameters
Keys to copy: 149


GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttentionVanilla(
          (c_attn): Linear(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear(in_features=768, out_features=768, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (attn_vanilla): CausalSelfAttentionVanilla(
          (c_attn): Linear(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear(in_features=768, out_features=768, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): ModuleDict(
         

In [4]:

def generate(prompt='', num_samples=10, steps=20, do_sample=True):
        
    # tokenize the input prompt into integer input sequence
    if use_mingpt:
        tokenizer = BPETokenizer()
        if prompt == '':
            # to create unconditional samples...
            # manually create a tensor with only the special <|endoftext|> token
            # similar to what openai's code does here https://github.com/openai/gpt-2/blob/master/src/generate_unconditional_samples.py
            x = torch.tensor([[tokenizer.encoder.encoder['<|endoftext|>']]], dtype=torch.long)
        else:
            x = tokenizer(prompt).to(device)
    else:
        tokenizer = GPT2Tokenizer.from_pretrained(model_type)
        if prompt == '': 
            # to create unconditional samples...
            # huggingface/transformers tokenizer special cases these strings
            prompt = '<|endoftext|>'
        encoded_input = tokenizer(prompt, return_tensors='pt').to(device)
        x = encoded_input['input_ids']
    
    # we'll process all desired num_samples in a batch, so expand out the batch dim
    x = x.expand(num_samples, -1)
    model.reset_kv_cache()
    # forward the model `steps` times to get samples, in a batch
    y = model.generate(x, max_new_tokens=steps, do_sample=do_sample, top_k=40)
    
    for i in range(num_samples):
        
        out = tokenizer.decode(y[i].cpu().squeeze())
        print('-'*80)
        print(out)
        

In [5]:
generate(prompt='Michael Jordan is', num_samples=10, steps=20, do_sample=False)

--------------------------------------------------------------------------------
Michael Jordan is a, and the "I was a, and the "I was a, and the "I
--------------------------------------------------------------------------------
Michael Jordan is a, and the "I was a, and the "I was a, and the "I
--------------------------------------------------------------------------------
Michael Jordan is a, and the "I was a, and the "I was a, and the "I
--------------------------------------------------------------------------------
Michael Jordan is a, and the "I was a, and the "I was a, and the "I
--------------------------------------------------------------------------------
Michael Jordan is a, and the "I was a, and the "I was a, and the "I
--------------------------------------------------------------------------------
Michael Jordan is a, and the "I was a, and the "I was a, and the "I
--------------------------------------------------------------------------------
Michael Jordan is a, and 

In [6]:

def generate_vanilla(prompt='', num_samples=10, steps=20, do_sample=False):
        
    # tokenize the input prompt into integer input sequence
    if use_mingpt:
        tokenizer = BPETokenizer()
        if prompt == '':
            # to create unconditional samples...
            # manually create a tensor with only the special <|endoftext|> token
            # similar to what openai's code does here https://github.com/openai/gpt-2/blob/master/src/generate_unconditional_samples.py
            x = torch.tensor([[tokenizer.encoder.encoder['<|endoftext|>']]], dtype=torch.long)
        else:
            x = tokenizer(prompt).to(device)
    else:
        tokenizer = GPT2Tokenizer.from_pretrained(model_type)
        if prompt == '': 
            # to create unconditional samples...
            # huggingface/transformers tokenizer special cases these strings
            prompt = '<|endoftext|>'
        encoded_input = tokenizer(prompt, return_tensors='pt').to(device)
        x = encoded_input['input_ids']
    
    # we'll process all desired num_samples in a batch, so expand out the batch dim
    x = x.expand(num_samples, -1)
    #model.reset_kv_cache()
    # forward the model `steps` times to get samples, in a batch
    y = model_vanilla.generate(x, max_new_tokens=steps, do_sample=do_sample, top_k=40)
    
    for i in range(num_samples):
        
        out = tokenizer.decode(y[i].cpu().squeeze())
        print('-'*80)
        print(out)
        

In [7]:
generate_vanilla(prompt='Michael Jordan is', num_samples=10, steps=20)

--------------------------------------------------------------------------------
Michael Jordan is a senior writer for ESPN The Magazine. Follow him on Twitter @JordanWWC.<|endoftext|>The U
--------------------------------------------------------------------------------
Michael Jordan is a senior writer for ESPN The Magazine. Follow him on Twitter @JordanWWC.<|endoftext|>The U
--------------------------------------------------------------------------------
Michael Jordan is a senior writer for ESPN The Magazine. Follow him on Twitter @JordanWWC.<|endoftext|>The U
--------------------------------------------------------------------------------
Michael Jordan is a senior writer for ESPN The Magazine. Follow him on Twitter @JordanWWC.<|endoftext|>The U
--------------------------------------------------------------------------------
Michael Jordan is a senior writer for ESPN The Magazine. Follow him on Twitter @JordanWWC.<|endoftext|>The U
--------------------------------------------------

In [8]:
def compare(prompt='', num_samples=1, steps=20, do_sample=False):
# tokenize the input prompt into integer input sequence
    model = GPT.from_pretrained(model_type,vanilla=False)
    model_vanilla = GPT.from_pretrained(model_type, True)
    model.to(device)
    model.eval();
    model_vanilla.to(device)
    model_vanilla.eval();
    if use_mingpt:
        tokenizer = BPETokenizer()
        if prompt == '':
            # to create unconditional samples...
            # manually create a tensor with only the special <|endoftext|> token
            # similar to what openai's code does here https://github.com/openai/gpt-2/blob/master/src/generate_unconditional_samples.py
            x = torch.tensor([[tokenizer.encoder.encoder['<|endoftext|>']]], dtype=torch.long)
        else:
            x = tokenizer(prompt).to(device)
    else:
        tokenizer = GPT2Tokenizer.from_pretrained(model_type)
        if prompt == '': 
            # to create unconditional samples...
            # huggingface/transformers tokenizer special cases these strings
            prompt = '<|endoftext|>'
        encoded_input = tokenizer(prompt, return_tensors='pt').to(device)
        x = encoded_input['input_ids']
    
    # we'll process all desired num_samples in a batch, so expand out the batch dim
    x = x.expand(num_samples, -1)

    '''b, t = x.size()
    num_tokens = t
    kv_cached = False
    assert t <= model.block_size, f"Cannot forward sequence of length {t}, block size is only {self.block_size}"
    if not kv_cached:
        pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t)
    else:
        seq = num_tokens  # broj tokena do sada, od 1..N
        pos_index = (seq - 1)   # da ne prelazi limit
        pos = torch.tensor([[pos_index]], dtype=torch.long, device=device)
    # forward the GPT model itself

    tok_emb = model.transformer.wte(x) # token embeddings of shape (b, t, n_embd)
    pos_emb = model.transformer.wpe(pos) # position embeddings of shape (1, t, n_embd)
    x = model.transformer.drop(tok_emb + pos_emb)'''
    '''x1 = model.transformer.h[0](x)
    x2 = model_vanilla.transformer.h[0](x)'''
    x1 = model.generate(x, max_new_tokens=5, do_sample=False, top_k=40)
    x2 = model_vanilla.generate(x, max_new_tokens=5, do_sample=False, top_k=40)
    return torch.allclose(x1,x2)
    

In [10]:
print(compare(prompt='Michael Jordan is'))

number of parameters: 152.79M
MinGPT model has: 221 parameters
HuggingFace checkpoint has: 149 parameters
Keys to copy: 149
number of parameters: 152.79M
MinGPT model has: 221 parameters
HuggingFace checkpoint has: 149 parameters
Keys to copy: 149
False
