In [75]:
import numpy as np
import torch
from torch import nn
import matplotlib.pyplot as plt
from transformers import pipeline
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
from copy import deepcopy
checkpoint = "EleutherAI/gpt-neo-2.7B"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Some top models and their sizes.

**open-source**
- GPT-Neo 125 mil
- GPT-Neo 1.3 bil (same as GPT-3 Babbage)
- GPT-2 1.5 bil
- GPT-Neo 2.7 bil
- GPT-J 6 bil
- GPT-NeoX 20 bil
- Bloom: ranges from 350m to 176 bil

**closed-source**
- GPT-3: 175 bil at biggest

# Try to get gradients
Link to understand GPT models better: [minGPT](https://github.com/karpathy/minGPT/blob/master/mingpt/model.py). Word embeddings are summed with positional embeddings then passed on.

Architecture for [GPT-Neo](https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neo/modeling_gpt_neo.py).

In [25]:
# prepare inputs
raw_inputs = [
    "1+3=4",
]
inputs = tokenizer(raw_inputs, return_tensors="pt")
# inputs['input_ids'] = inputs['input_ids'].float()
# inputs['input_ids'].requires_grad = True
print(inputs)

model = AutoModelForCausalLM.from_pretrained(checkpoint, output_hidden_states=True)
outputs = model(**inputs)

# loss = outputs['logits'].sum()
# outputs['logits'].retain_grad()
# loss.backward(retain_graph=True)
# outputs['logits'].grad

# go through the model
trans = model._modules['transformer']
lm_head = model._modules['lm_head']
out = trans(inputs['input_ids'])
for k in out:
    print(k)
h = out['hidden_states'] # tuple of (layer x (batch_size, seq_len, hidden_size))
logits = lm_head(h[-1])  # select logits using last layer

# we got the same logits by going through the model
assert logits.shape == outputs['logits'].shape # tensor (batch_size, seq_len, vocab_size)
assert logits.sum() == outputs['logits'].sum()
assert logits.max() == outputs['logits'].max()

{'input_ids': tensor([[16, 10, 18, 28, 19]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}
last_hidden_state
past_key_values
hidden_states


In [26]:
# naive check
print('input text:', tokenizer.decode(inputs['input_ids'][0]))

# top word embeddings
decoded_toks = tokenizer.decode(logits[0].argmax(axis=-1))
print('decoded text from hidden states', decoded_toks)

# dissect token-by-token
for i, tok in enumerate(inputs.tokens()):
    print(f'{i}: ___{tok}___ --> ___{decoded_toks[i]}___')
# top_word_embeddings = logits.argmax(axis=2)
# print(top_word_embeddings.shape, top_word_embeddings)

input text: 1+3=4
decoded text from hidden states .\\4$
0: ___1___ --> ___.___
1: ___+___ --> ___\___
2: ___3___ --> ___\___
3: ___=___ --> ___4___
4: ___4___ --> ___$___


In [118]:
# alter the model
# trans.wte.forward(inputs['input_ids'])
w_embed = trans.wte.weight # vocab_size, embed_dim
vocab_size = w_embed.shape[0]
embed_size = w_embed.shape[1]

"""
emb_linear = nn.Linear(in_features=vocab_size, out_features=embed_size, bias=False)
print(emb_linear.weight.shape, w_embed.shape)
emb_linear.weight = nn.Parameter(w_embed.T)
"""

unemb_linear = nn.Linear(in_features=embed_size, out_features=vocab_size, bias=False)
pinv = torch.linalg.pinv(w_embed)
unemb_linear.weight = nn.Parameter(pinv.T)

# make sure unembedding works
ids = torch.Tensor([[16, 2, 3]]).int()
embs = trans.wte.forward(ids)

unembedded_onehot = unemb_linear(embs)
unembedded_ids = unembedded_onehot.argmax(axis=-1)
assert torch.all(unembedded_ids == ids)

# Do forward pass with embeddings

In [130]:
embeds = trans.wte.forward(ids)
model = AutoModelForCausalLM.from_pretrained(checkpoint, output_hidden_states=True)
outputs_using_embeds = model(inputs_embeds=embeds)
outputs = model(input_ids=ids)

assert outputs['logits'].sum() == outputs_using_embeds['logits'].sum()

In [137]:
embeds = trans.wte.forward(ids)
embeds.retain_grad()
outputs = model(inputs_embeds=embeds)
loss = outputs['logits'].sum()
loss.backward()

In [138]:
embeds.grad

tensor([[[  9575.5391,  -1355.6121,  -2623.6182,  ...,  12424.7637,
           -5506.7495,  -9292.2803],
         [ 12695.8623,  10005.6162,  17801.9453,  ...,   7830.0645,
           24521.3984,   4288.2812],
         [ -9870.7490,   -223.3108,  -3130.4512,  ...,  15149.9629,
          -23681.2461,  -5486.9058]]])

# API ref
https://huggingface.co/docs/transformers/internal/generation_utils

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

inputs = tokenizer("Hello, my dog is cute and ", return_tensors="pt")
generation_output = model.generate(**inputs,
                                   return_dict_in_generate=True,
                                   output_scores=True)


Downloading vocab.json:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/523M [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
