Generate text with the pre-trained DiffuGPT model and print every sequence returned by `generate`, showing mask tokens as `<mask>`.

![dump/diffugpt_tokenizer.png](dump/diffugpt_tokenizer.png)

In [33]:
import tiktoken
import torch
from utils import get_annealing_mask
import model
from importlib import reload

# Re-import `model` so edits to model.py are picked up without restarting the kernel.
reload(model)

# Special values used by this cell, named once instead of scattered as literals.
BOS_TOKEN_ID = 50256    # prepended to the prompt; <|endoftext|> in the GPT-2 vocabulary
MASK_TOKEN_ID = 10541   # token id that this notebook renders as "<mask>"
MASK_LABEL = "<mask>"
MAX_NEW_TOKENS = 10

# Load model
gpt = model.GPT2("local_config.json")

# Load Tokenizer
tokenizer = tiktoken.get_encoding("gpt2")

# Prompt
prompt = "Then, the man writes over the snow covering the window of a car, and a woman wearing winter clothes smiles. then"

# Text form of the mask token (kept for any later cell that refers to it).
mask_token = tokenizer.decode([MASK_TOKEN_ID])


def decode_with_mask(token_ids):
    """Decode `token_ids` to text, rendering each mask token as MASK_LABEL.

    The substitution is done on token ids rather than on the decoded string,
    so ordinary text that happens to spell the same characters as the mask
    token is left untouched. Runs of non-mask tokens are decoded together so
    characters that span several tokens are still decoded correctly.

    Args:
        token_ids: flat list of integer token ids.

    Returns:
        The decoded string with MASK_LABEL at every mask position.
    """
    pieces = []
    run = []
    for token_id in token_ids:
        if token_id == MASK_TOKEN_ID:
            if run:
                pieces.append(tokenizer.decode(run))
                run = []
            pieces.append(MASK_LABEL)
        else:
            run.append(token_id)
    if run:
        pieces.append(tokenizer.decode(run))
    return "".join(pieces)


# Batch of one sequence: BOS token followed by the encoded prompt.
input_ids = torch.tensor(
    [BOS_TOKEN_ID] + tokenizer.encode(prompt)
).unsqueeze(0)

xs = gpt.generate(input_ids, max_new_tokens=MAX_NEW_TOKENS)

# `generate` returns an iterable of batched token tensors; print the first
# (only) row of each one.
for x in xs:
    print(decode_with_mask(x[0].tolist()))

Modifying HF GPT2Model to accept custom attention mask...
Loading pre-trained DiffuGPT...
<All keys matched successfully>
tensor([[False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False,  True,  True, False, False, False,
         False, False, False, False, False]])
tensor([[False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False,  True,
          True, False, False, False, False]])
tensor([[False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False,  True, False,  True]])
tensor([[False, False, False,

## Test training pipeline

In [11]:
from datamodule import MemmapDataModule
from importlib import reload
import model

# Refresh the `model` module so the current on-disk definition of GPT2 is used.
reload(model)

# The model and the data module are both constructed from the same config file.
config_path = "local_config.json"
gpt = model.GPT2(config_path)
dm = MemmapDataModule(config_path)

# Run the data module's setup step, then unpack the first training example
# into its three components.
dm.setup()
first_example = dm.train_dataset[0]
X, y, msk = first_example

# Show the first component as the cell output.
X

Modifying HF GPT2Model to accept custom attention mask...
Loading pre-trained DiffuGPT...
<All keys matched successfully>


tensor([25119, 19966,  4846,    13,   785,   532,  2691, 19966,  4846, 13864,
          290,  6188,    13,   198, 11828,    25,  2691, 19966,  4846,    13,
          785,   318,   534,   717,   290,  1266,  2723,   329,  1321,   546,
         2691, 19966,  4846,   764,  3423,   345,   481,   635,  1064, 10233,
        11270,   284,  2428,   286,  2276,  1393,    13,   775,  2911,   345,
         1064,   644,   345,   389,  2045,   329,     0,  7383, 10879,    25,
          198, 32945,  1830,    11, 25360,  5776,    11,  2691,  1830,    11,
         2691,  1830,   284, 34722,    11,   512,  1122,  3281, 50256, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257])

In [8]:
# Shape of the token-embedding (`wte`) weight matrix.
gpt.gpt2.transformer.wte.weight.size()

torch.Size([50258, 768])

In [9]:
# Shape of the output head (`lm_head`) weight matrix.
gpt.gpt2.lm_head.weight.size()

torch.Size([50258, 768])

In [12]:
# Pull one batch from the training dataloader and run a single training step
# on it with batch index 0; the returned value is shown as the cell output.
train_loader = dm.train_dataloader()
first_batch = next(iter(train_loader))
gpt.training_step(first_batch, 0)

tensor(9.4722, grad_fn=<NllLossBackward0>)

In [14]:
import torch.nn as nn
import torch

# Small stand-in embedding table: 10 entries of width 128.
wte = nn.Embedding(10, 128)
print(wte.weight.size())

# Allocate a table with one extra row; every row is overwritten below.
old_rows, width = wte.weight.shape
new_wte = nn.Embedding(old_rows + 1, width)

# Write the weights in place without recording the writes in autograd.
with torch.no_grad():
    new_wte.weight[:-1].copy_(wte.weight)             # keep all existing rows
    new_wte.weight[-1].copy_(wte.weight.mean(dim=0))  # new row = mean of the existing rows
print(new_wte.weight.size())

torch.Size([10, 128])
torch.Size([11, 128])
