In [67]:
import numpy as np
import torch
from transformers import pipeline
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
checkpoint = "EleutherAI/gpt-neo-125M"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, )

Some top models and their sizes.

**open-source**
- GPT-Neo 125 mil
- GPT-Neo 1.3 bil (same as GPT-3 Babbage)
- GPT-2 1.5 bil
- GPT-Neo 2.7 bil
- GPT-J 6 bil
- GPT-NeoX 20 bil
- Bloom: ranges from 350m to 176 bil

**closed-source**
- GPT-3: 175 bil at biggest

# Try to get gradients
Link to understand GPT models better: [minGPT](https://github.com/karpathy/minGPT/blob/master/mingpt/model.py). Word embeddings are summed with positional embeddings then passed on.

In [68]:
# prepare inputs
raw_inputs = [
    "Life is what happens when you're busy",
]
inputs = tokenizer(raw_inputs, return_tensors="pt")
# inputs['input_ids'] = inputs['input_ids'].float()
# inputs['input_ids'].requires_grad = True
print(inputs)

model = AutoModelForCausalLM.from_pretrained(checkpoint, output_hidden_states=True)
outputs = model(**inputs)

# loss = outputs['logits'].sum()
# outputs['logits'].retain_grad()
# loss.backward(retain_graph=True)
# outputs['logits'].grad

# go through the model
trans = model._modules['transformer']
lm_head = model._modules['lm_head']
out = trans(inputs['input_ids'])
for k in out:
    print(k)
h = out['hidden_states'] # tuple of (layer x (batch_size, seq_len, hidden_size))
logits = lm_head(h[-1])  # select logits using last layer

# we got the same logits by going through the model
assert logits.shape == outputs['logits'].shape # tensor (batch_size, seq_len, vocab_size)
assert logits.sum() == outputs['logits'].sum()
assert logits.max() == outputs['logits'].max()

{'input_ids': tensor([[14662,   318,   644,  4325,   618,   345,   821,  8179]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}
last_hidden_state
past_key_values
hidden_states


In [62]:
top_word_embeddings = logits.argmax(axis=2)

In [63]:
print(top_word_embeddings.shape, top_word_embeddings)

torch.Size([1, 11]) tensor([[286, 257, 356, 618, 345, 423, 407,  13, 257, 661,  13]])


In [64]:
for seq_pos in range(logits.shape[1]):
    logits_seq_pos = logits[0, seq_pos] 
    top_word_idx = logits_seq_pos.argmax()
    top_word_idxs = logits_seq_pos[logits_seq_pos[np.argsort(a[ind])]}
    print(logits_seq_pos.shape, top_word_idx)
    print(seq_pos, tokenizer.decode(top_word_idx))

torch.Size([50257])
0  of
torch.Size([50257])
1  a
torch.Size([50257])
2  we
torch.Size([50257])
3  when
torch.Size([50257])
4  you
torch.Size([50257])
5  have
torch.Size([50257])
6  not
torch.Size([50257])
7 .
torch.Size([50257])
8  a
torch.Size([50257])
9  people
torch.Size([50257])
10 .


In [69]:
encoded_input = tokenizer([
    "Do not meddle in the affairs of wizards, for they are subtle and quick to anger."
])

In [77]:
encoded_input.tokens()

['Do',
 'Ġnot',
 'Ġmedd',
 'le',
 'Ġin',
 'Ġthe',
 'Ġaffairs',
 'Ġof',
 'Ġwizards',
 ',',
 'Ġfor',
 'Ġthey',
 'Ġare',
 'Ġsubtle',
 'Ġand',
 'Ġquick',
 'Ġto',
 'Ġanger',
 '.']

In [73]:
decoded = tokenizer.decode(encoded_input['input_ids'][0])

In [74]:
decoded

'Do not meddle in the affairs of wizards, for they are subtle and quick to anger.'

In [23]:
outputs['logits'].sum()

tensor(-2390532.5000, grad_fn=<SumBackward0>)

In [20]:
outputs['hidden_states'][0].shape # tuple of (layer x (batch_size, seq_len, hidden_size))

torch.Size([1, 3, 768])

In [18]:
len(outputs['hidden_states'])

13

In [36]:
vars(model)

{'training': False,
 '_parameters': OrderedDict(),
 '_buffers': OrderedDict(),
 '_non_persistent_buffers_set': set(),
 '_backward_hooks': OrderedDict(),
 '_is_full_backward_hook': None,
 '_forward_hooks': OrderedDict(),
 '_forward_pre_hooks': OrderedDict(),
 '_state_dict_hooks': OrderedDict(),
 '_load_state_dict_pre_hooks': OrderedDict(),
 '_modules': OrderedDict([('transformer',
               GPTNeoModel(
                 (wte): Embedding(50257, 768)
                 (wpe): Embedding(2048, 768)
                 (drop): Dropout(p=0.0, inplace=False)
                 (h): ModuleList(
                   (0): GPTNeoBlock(
                     (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
                     (attn): GPTNeoAttention(
                       (attention): GPTNeoSelfAttention(
                         (attn_dropout): Dropout(p=0.0, inplace=False)
                         (resid_dropout): Dropout(p=0.0, inplace=False)
                         (k_proj): Linear(i

In [41]:
for k in model._modules:
    print(k)

transformer
lm_head


last_hidden_state
past_key_values
hidden_states


In [25]:
logits.sum()

tensor(-2390532.5000, grad_fn=<SumBackward0>)

# API ref
https://huggingface.co/docs/transformers/internal/generation_utils

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

inputs = tokenizer("Hello, my dog is cute and ", return_tensors="pt")
generation_output = model.generate(**inputs,
                                   return_dict_in_generate=True,
                                   output_scores=True)


Downloading vocab.json:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/523M [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
