In [2]:
# !nvidia-smi

In [1]:
%load_ext autoreload
%autoreload 2

# %cd PROTES

%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=0


In [2]:
# !pip install -r requirements.txt
# !pip install transformers

In [9]:
import math

import jax
jax.config.update('jax_enable_x64', True)
import numpy as np
import torch
from torch.nn.functional import softmax, log_softmax
from torch.optim import AdamW
from transformers import AutoModelForCausalLM, AutoTokenizer


from protes import protes_gpt, protes

device = "cuda" if torch.cuda.is_available() else "cpu"

# Creating something

In [6]:
softmax = torch.nn.Softmax(dim=-1)
logsoftmax = torch.nn.LogSoftmax(dim=-1)

In [7]:
def custom_nll_loss(logits, I):
    """
    logits: 1 x (1 + d) x n
    I:      1 x (1 + d)
    where 1 is added becaus of SOS token in the beggining
    how to deal with bs > 1 i don't understand so far
    """
    loss = 0
    bs, seq_len = I.shape
    
    P = logsoftmax(logits)
    
    for i in range(1, seq_len):
        loss += P[:, i, I[0, i]]
    loss = torch.mean(loss, dim=0)
    return -loss



#     scores = model.compute_transition_scores(
#                 sequences=outputs.sequences,
#                 scores=outputs.scores,
#             )
#     loss = scores[idx][:, -1].mean()

In [27]:
def trainer(model, func, d, m, k, k_top, is_max):
    best_func_value = -torch.inf if is_max else torch.inf
    best_idx = None
    
#     prompt = torch.tensor([tokenizer.encode("<|endoftext|>")]).to(device)
    prompt = torch.tensor([[0]]).to(device)
    
    for i in range(math.ceil(m/k)):
        outputs =  model.generate(
            prompt,
            attention_mask=torch.ones_like(prompt).to(device),
            max_new_tokens=d,

            do_sample=True,
            num_beams=1,
            num_return_sequences=k,
            top_k=0,
            temperature=0.6,
            length_penalty=0,

            output_scores=True,
            return_dict_in_generate=True, 
            renormalize_logits=True, 
            output_hidden_states=True,
            
            pad_token_id=n+1   # MIGHT BE A PROBLEM
#             pad_token_id=tokenizer.eos_token_id
        )
        I = outputs.sequences

        val, idx = torch.topk(func(I[:, 1:]), largest=is_max, k=1)
        

        if is_max and (val > best_func_value):
            best_func_value = val
            best_idx = I[idx, 1:]
        
        if not is_max and (val < best_func_value):
            best_func_value = val
            best_idx = I[idx, 1:]
        
        _, idxes = torch.topk(func(I), largest=is_max, k=k_top)
        batch_of_best_I = I[idxes]
        
        optimizer.zero_grad()
        logits = model.forward(batch_of_best_I, attention_mask=torch.ones_like(batch_of_best_I).to(device)).logits
        
        loss = criterion(logits, batch_of_best_I)
        loss.backward()

        optimizer.step()

        if i % 10 == 0:
            print('batch {} loss: {} best_value {}'.format(i, loss.item(), val.item()))
    

    return best_func_value, best_idx, model

### Rosenbrock

In [18]:
a = -3
b = 3

n = 1002 #50257 #len(tokenizer)
d = 2
k = 32
m = 5000

k_top = 8
is_max = False
log = True

def f_rosenbrock(I):
    I = I / (n-1) * (b-a) + a
    f = (1 - I[:, 0]) ** 2 + 100 * (I[:, 1] - I[:, 0] ** 2) ** 2 
    return f

func = f_rosenbrock

In [56]:
# tokenizer = AutoTokenizer.from_pretrained("gpt2", add_special_tokens=True)
# add the EOS token as PAD token to avoid warnings
# tokenizer.pad_token_id = tokenizer.eos_token_id
# tokenizer.encode("<|endoftext|>")

model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)  # pad_token_id=0, bos_token_id=0
model.resize_token_embeddings(1 + n)
model.train()
2+2

4

In [58]:
idx = torch.tensor([
    [0, 1, 102],
    [0, 234, 88],
    [0, 11, 22]
])

q = model.forward(idx.to(device), labels=None, attention_mask=torch.ones_like(idx).to(device))

print(q.keys())
print(q.logits.shape)
logits = q.logits
logsoftmax(logits[0][0])

P = logsoftmax(logits)
P.shape

odict_keys(['logits', 'past_key_values'])
torch.Size([3, 3, 1003])


torch.Size([3, 3, 1003])

In [59]:
q.loss

In [60]:
logsoftmax(logits[0][0]).sum()

tensor(-11234.4492, device='cuda:0', grad_fn=<SumBackward0>)

In [61]:
q.past_key_values[0][0].shape

torch.Size([3, 12, 3, 64])

In [62]:
custom_nll_loss(logits, idx)

tensor(19.9962, device='cuda:0', grad_fn=<NegBackward0>)

In [63]:
optimizer = AdamW(model.parameters(), lr=1e-6)
criterion = custom_nll_loss  # torch.nn.NLLLoss(reduction="mean")

In [65]:
best_func_value, best_idx, model = trainer(model=model, func=func, d=d, m=m, k=k, k_top=k_top, is_max=is_max)

batch 0 loss: 16.352632522583008
batch 10 loss: 17.02643585205078
batch 20 loss: 20.18927001953125
batch 30 loss: 12.503631591796875
batch 40 loss: 18.658870697021484
batch 50 loss: 19.49532699584961
batch 60 loss: 16.496782302856445
batch 70 loss: 17.27944564819336
batch 80 loss: 11.55685043334961
batch 90 loss: 14.735280990600586
batch 100 loss: 15.888700485229492
batch 110 loss: 14.796897888183594
batch 120 loss: 18.978843688964844
batch 130 loss: 19.349014282226562
batch 140 loss: 15.496293067932129
batch 150 loss: 12.349271774291992
batch 160 loss: 14.377363204956055
batch 170 loss: 15.557156562805176
batch 180 loss: 13.888874053955078
batch 190 loss: 14.775052070617676
batch 200 loss: 8.580580711364746
batch 210 loss: 14.346628189086914
batch 220 loss: 12.121770858764648
batch 230 loss: 7.829305648803711
batch 240 loss: 6.710381031036377
batch 250 loss: 14.929302215576172
batch 260 loss: 4.754758358001709
batch 270 loss: 6.4517621994018555
batch 280 loss: 3.529346227645874
batch 

In [66]:
best_func_value

tensor([0.0529], device='cuda:0')

In [67]:
best_idx

tensor([[635, 611]], device='cuda:0')

In [18]:
best_X = best_idx / (n-1) * (b-a) + a
best_X

tensor([[-0.0989,  0.0450]], device='cuda:0')

In [77]:
# Original PROTES

i_opt, y_opt, ll_list = protes(f=func,
                          d=d, n=n, k=k, m=m, log=log, is_max=is_max,
                        k_top=k_top, k_gd=1, lr=1e-3)

print(f"i_opt = {i_opt}, x_opt = {i_opt / (n-1) * (b-a) + a} f_opt = {y_opt}")

protes > m 3.2e+01 | t 2.096e+00 | y  2.2653e+01
protes > m 6.4e+01 | t 2.116e+00 | y  9.0660e-01
protes > m 3.5e+02 | t 2.292e+00 | y  7.3126e-01
protes > m 4.2e+02 | t 2.332e+00 | y  1.2610e-01
protes > m 1.0e+03 | t 2.677e+00 | y  1.2610e-01 <<< DONE
i_opt = [613 574], x_opt = [0.67432567 0.44055944] f_opt = 0.12610207844521365


In [78]:
# Original PROTES lr smaller

i_opt, y_opt, ll_list = protes(f=func,
                          d=d, n=n, k=k, m=m, log=log, is_max=is_max,
                        k_top=k_top, k_gd=1, lr=1e-5)

print(f"i_opt = {i_opt}, x_opt = {i_opt / (n-1) * (b-a) + a} f_opt = {y_opt}")

protes > m 3.2e+01 | t 2.103e+00 | y  2.2653e+01
protes > m 6.4e+01 | t 2.123e+00 | y  9.4959e-01
protes > m 4.2e+02 | t 2.336e+00 | y  5.1479e-01
protes > m 4.8e+02 | t 2.375e+00 | y  4.4099e-02
protes > m 1.0e+03 | t 2.681e+00 | y  4.4099e-02 <<< DONE
i_opt = [693 725], x_opt = [1.15384615 1.34565435] f_opt = 0.04409876428981927


### Simple 3D

In [19]:
a = -6
b = 6

m = 50_000
# n0 = 3439

n = 1001
d = 3
k = 256

k_top = 64

is_max = False
log = True

def f_3d_squares(I):
    I = I / (n-1) * (b-a) + a
    x = I[:, 0]
    y = I[:, 1]
    z = I[:, 2]
    f = (x - 5) ** 2 + (y - 2) ** 2 + (z + 1) ** 2
    return f


func = f_3d_squares

In [20]:
model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)  # pad_token_id=0, bos_token_id=0
model.resize_token_embeddings(1 + n)
model.train()
optimizer = AdamW(model.parameters(), lr=1e-5)
criterion = custom_nll_loss

best_func_value, best_idx, model = trainer(model=model, func=func, d=d, m=m, k=k, k_top=k_top, is_max=is_max)

print(f"PROTES_GPT: i_opt = {best_idx}, x_opt = {best_idx / (n-1) * (b-a) + a} f_opt = {best_func_value}")

batch 0 loss: 28.244178771972656 best_value 11.118382453918457
batch 10 loss: 19.149003982543945 best_value 26.896944046020508
batch 20 loss: 15.415884017944336 best_value 9.147357940673828
batch 30 loss: 10.0076322555542 best_value 8.065051078796387
batch 40 loss: 5.951223373413086 best_value 6.316319942474365
batch 50 loss: 2.754258155822754 best_value 0.9665740728378296
batch 60 loss: 1.1546285152435303 best_value 8.319019317626953
batch 70 loss: 0.853102445602417 best_value 0.18782415986061096
batch 80 loss: 0.44864097237586975 best_value 3.0751194953918457
batch 90 loss: 5.502062797546387 best_value 0.18782415986061096
batch 100 loss: 0.14961010217666626 best_value 0.18782415986061096
batch 110 loss: 0.10203267633914948 best_value 0.9665740728378296
batch 120 loss: 0.2703596353530884 best_value 0.18782415986061096
batch 130 loss: 4.0289716720581055 best_value 0.3118560314178467
batch 140 loss: 4.659944534301758 best_value 0.18782415986061096
batch 150 loss: 6.2732744216918945 best

In [25]:
# Original PROTES

i_opt, y_opt, ll_list = protes(f=func,
                          d=d, n=n, k=k, m=m, log=log, is_max=is_max,
                        k_top=k_top, k_gd=1, lr=1e-5)

print(f"i_opt = {i_opt}, x_opt = {i_opt / (n-1) * (b-a) + a} f_opt = {y_opt}")

protes > m 2.6e+02 | t 3.801e+00 | y  7.4813e-01
protes > m 1.0e+03 | t 3.871e+00 | y  2.1806e-01
protes > m 3.3e+03 | t 4.085e+00 | y  2.5296e-02
protes > m 5.0e+04 | t 8.649e+00 | y  2.5296e-02 <<< DONE
i_opt = [918 663 404], x_opt = [ 5.016  1.956 -1.152] f_opt = 0.025295999999999735


In [26]:
i_opt, y_opt, ll_list = protes(f=func,
                          d=d, n=n, k=k, m=m, log=log, is_max=is_max,
                        k_top=k_top, k_gd=1, lr=1e-1)

print(f"i_opt = {i_opt}, x_opt = {i_opt / (n-1) * (b-a) + a} f_opt = {y_opt}")

protes > m 2.6e+02 | t 3.493e+00 | y  7.4813e-01
protes > m 1.0e+03 | t 3.567e+00 | y  1.2398e-01
protes > m 2.0e+03 | t 3.664e+00 | y  3.1776e-02
protes > m 1.0e+04 | t 4.443e+00 | y  2.4096e-02
protes > m 1.5e+04 | t 4.906e+00 | y  7.5840e-03
protes > m 2.7e+04 | t 6.087e+00 | y  3.9360e-03
protes > m 2.8e+04 | t 6.214e+00 | y  1.8720e-03
protes > m 3.6e+04 | t 7.028e+00 | y  1.1040e-03
protes > m 4.2e+04 | t 7.608e+00 | y  8.6400e-04
protes > m 4.6e+04 | t 7.974e+00 | y  3.3600e-04
protes > m 5.0e+04 | t 8.383e+00 | y  3.3600e-04 <<< DONE
i_opt = [918 667 416], x_opt = [ 5.016  2.004 -1.008] f_opt = 0.0003360000000000113


### nd

In [55]:
a = 0
b = 6


n = 101
d = 5
m = 100_000

k = 256

k_top = 64

is_max = False
log = True

def f_nd_squares(I):
    I = I / (n-1) * (b-a) + a
    bias = torch.arange(I[0].shape[0]) + 1
    I = I - bias.to(I.device)
    f = I[:, None, : ] @ I[:, :, None]
    f = f.squeeze(-1).squeeze(-1)
    return f


func = f_nd_squares


In [56]:
model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)  # pad_token_id=0, bos_token_id=0
model.resize_token_embeddings(1 + n)
model.train()
optimizer = AdamW(model.parameters(), lr=1e-5)
criterion = custom_nll_loss

In [57]:
idx = torch.tensor([
    [0, 1, 102],
    [0, 234, 88],
    [0, 11, 22]
])

In [58]:
func(idx).shape

torch.Size([3])

In [59]:
best_func_value, best_idx, model = trainer(model=model, func=func, d=d, m=m, k=k, k_top=k_top, is_max=is_max)

print(f"PROTES_GPT: i_opt = {best_idx}, x_opt = {best_idx / (n-1) * (b-a) + a} f_opt = {best_func_value}")

batch 0 loss: 29.457683563232422 best_value 3.702399253845215
batch 10 loss: 21.567256927490234 best_value 1.3803989887237549
batch 20 loss: 18.127717971801758 best_value 0.9340003728866577
batch 30 loss: 19.55776596069336 best_value 1.255600094795227
batch 40 loss: 15.982407569885254 best_value 0.7864000797271729
batch 50 loss: 19.291400909423828 best_value 0.4083998203277588
batch 60 loss: 20.750009536743164 best_value 0.23199968039989471
batch 70 loss: 10.374383926391602 best_value 0.24279993772506714
batch 80 loss: 10.523709297180176 best_value 0.15040001273155212
batch 90 loss: 8.642633438110352 best_value 0.44319969415664673
batch 100 loss: 7.258492469787598 best_value 0.30519968271255493
batch 110 loss: 6.422944068908691 best_value 0.493600070476532
batch 120 loss: 2.018096446990967 best_value 0.23799967765808105
batch 130 loss: 1.141192078590393 best_value 0.5979998707771301
batch 140 loss: 0.5364847779273987 best_value 0.5595998167991638
batch 150 loss: 8.27044677734375 best_v

In [60]:
# Original PROTES

def f_nd_squares_jax(I):
    I = I / (n-1) * (b-a) + a
    bias = np.arange(I[0].shape[0]) + 1
    I = I - bias
    f = I[:, None, : ] @ I[:, :, None]
    f = f.squeeze(-1).squeeze(-1)
    return f

func_jax = f_nd_squares_jax

i_opt, y_opt, ll_list = protes(f=func_jax,
                          d=d, n=n, k=k, m=m, log=log, is_max=is_max,
                        k_top=k_top, k_gd=1, lr=1e-3)

print(f"i_opt = {i_opt}, x_opt = {i_opt / (n-1) * (b-a) + a} f_opt = {y_opt}")

protes > m 2.6e+02 | t 3.453e+00 | y  1.2364e+00
protes > m 1.3e+03 | t 3.490e+00 | y  1.1380e+00
protes > m 2.6e+03 | t 3.529e+00 | y  8.1520e-01
protes > m 3.1e+03 | t 3.544e+00 | y  6.4960e-01
protes > m 1.1e+04 | t 3.792e+00 | y  5.6080e-01
protes > m 1.4e+04 | t 3.883e+00 | y  5.2000e-01
protes > m 4.3e+04 | t 4.782e+00 | y  5.1640e-01
protes > m 4.4e+04 | t 4.830e+00 | y  4.5760e-01
protes > m 4.8e+04 | t 4.940e+00 | y  2.9800e-01
protes > m 5.2e+04 | t 5.067e+00 | y  2.2840e-01
protes > m 7.3e+04 | t 5.730e+00 | y  1.9480e-01
protes > m 7.5e+04 | t 5.769e+00 | y  1.9120e-01
protes > m 1.0e+05 | t 6.554e+00 | y  1.9120e-01 <<< DONE
i_opt = [19 30 52 66 89], x_opt = [1.14 1.8  3.12 3.96 5.34] f_opt = 0.19120000000000004


### some drafts

In [20]:
idx = torch.tensor([
    [0, 1, 102],
#     [0, 234, 88]
])

idx = torch.tensor([[0]])
q = model.forward(idx.to(device), labels=None, attention_mask=torch.ones_like(idx).to(device))

print(q.keys())
print(q.logits.shape)
logits = q.logits
logsoftmax(logits[0][0])

P = logsoftmax(logits)
P.shape

odict_keys(['logits', 'past_key_values'])
torch.Size([1, 1, 1003])


torch.Size([1, 1, 1003])

In [6]:
# !pip show transformers

In [None]:
n = 10

# tokenizer = AutoTokenizer.from_pretrained("gpt2", add_special_tokens=True)
# # add the EOS token as PAD token to avoid warnings
# tokenizer.pad_token_id = tokenizer.eos_token_id
model = AutoModelForCausalLM.from_pretrained("gpt2", pad_token_id=0).to(device)
model.resize_token_embeddings(1 + n)
model.eval()
2-2

In [8]:
# # encode context the generation is conditioned on
# model_inputs = tokenizer('1 + 2', return_tensors='pt').to(device)
# print(model_inputs)

# # generate 40 new tokens
# greedy_output = model.generate(**model_inputs, max_new_tokens=40)
# print(f"Greedy output {greedy_output}")

# print("Output:\n" + 100 * '-')
# print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))

In [9]:
q = torch.tensor([[0]]).to(device)
# inp = {"input_ids": q, "attention_mask"}
# multinomial sampling
greedy_output = model.generate(q, max_new_tokens=2, num_beams=1,
                               output_scores=True,
                               return_dict_in_generate=True, 
                               renormalize_logits=True, 
                            num_return_sequences=5, do_sample=True, 
                                output_hidden_states=True,
                                top_k=0,
                               temperature=0.6,
                               length_penalty=0,
                              )
#scores = torch.cat(greedy_output.scores)
idx = greedy_output.sequences

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


In [10]:
idx

tensor([[0, 0, 1],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]], device='cuda:0')

In [11]:
softmax = torch.nn.Softmax(dim=-1)

In [12]:
z = model.forward(q)
print(z.keys())

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


odict_keys(['logits', 'past_key_values'])


In [13]:
z.logits.shape

torch.Size([1, 1, 11])

In [14]:
scores = model.compute_transition_scores(
    sequences=greedy_output.sequences,
    scores=greedy_output.scores,

#     beam_indices=greedy_output.beam_indices,
)

In [15]:
softmax(scores)

tensor([[0.8508, 0.1492],
        [0.7090, 0.2910],
        [0.7090, 0.2910],
        [0.7090, 0.2910],
        [0.7090, 0.2910]], device='cuda:0')

In [16]:
greedy_output.keys()

odict_keys(['sequences', 'scores', 'hidden_states'])

In [17]:
greedy_output.sequences[-1]

tensor([0, 0, 0], device='cuda:0')

In [18]:
greedy_output.scores

(tensor([[  0.0000, -21.3255, -32.1786, -33.3579, -22.9971, -26.4076, -17.0534,
          -24.9093, -21.2638, -22.4059, -21.0662],
         [  0.0000, -21.3255, -32.1786, -33.3579, -22.9971, -26.4076, -17.0534,
          -24.9093, -21.2638, -22.4059, -21.0662],
         [  0.0000, -21.3255, -32.1786, -33.3579, -22.9971, -26.4076, -17.0534,
          -24.9093, -21.2638, -22.4059, -21.0662],
         [  0.0000, -21.3255, -32.1786, -33.3579, -22.9971, -26.4076, -17.0534,
          -24.9093, -21.2638, -22.4059, -21.0662],
         [  0.0000, -21.3255, -32.1786, -33.3579, -22.9971, -26.4076, -17.0534,
          -24.9093, -21.2638, -22.4059, -21.0662]], device='cuda:0'),
 tensor([[-0.8904, -1.7407, -5.4421, -5.5557, -4.5824, -5.7360, -2.2392, -3.1334,
          -1.5789, -3.7138, -4.4431],
         [-0.8904, -1.7407, -5.4421, -5.5557, -4.5824, -5.7360, -2.2392, -3.1334,
          -1.5789, -3.7138, -4.4431],
         [-0.8904, -1.7407, -5.4421, -5.5557, -4.5824, -5.7360, -2.2392, -3.1334,
    

In [20]:
greedy_output.scores[1].shape

torch.Size([5, 11])

In [21]:
greedy_output.hidden_states[0][0].shape

torch.Size([5, 1, 768])

In [22]:
torch.exp(greedy_output.scores[0][0]).sum()

tensor(1., device='cuda:0')

In [23]:
greedy_output.scores[0].shape

torch.Size([5, 11])

In [24]:
greedy_output.hidden_states[0][0].shape

torch.Size([5, 1, 768])

In [25]:
probs = greedy_output.sequences_scores()

AttributeError: 'SampleDecoderOnlyOutput' object has no attribute 'sequences_scores'

In [26]:
P = greedy_output.scores

In [27]:
P[0].shape

torch.Size([5, 11])

In [28]:
probs

NameError: name 'probs' is not defined

In [40]:
model.eval()
2

2

In [41]:
q = torch.tensor([[0]]).to(device)

outputs =  model.generate(
            q, 
            max_new_tokens=d,
#             trace_log_probs=True,
            do_sample=True,
            num_beams=1,
            num_return_sequences=k,
            top_k=0,
            temperature=0.6,
            length_penalty=0,

            output_scores=True,
            return_dict_in_generate=True, 
            renormalize_logits=True, 
            output_hidden_states=True,
        )

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [42]:
with torch.no_grad():
    model.eval()
    model = model.to(device)
    q = q.to(device)
    outputs =  model.greedy_search(
                q, 
                max_new_tokens=d,
    #             trace_log_probs=True,
                do_sample=True,
                num_beams=1,
                num_return_sequences=k,
                top_k=0,
                temperature=0.6,
                length_penalty=0,

                output_scores=True,
                return_dict_in_generate=True, 
                renormalize_logits=True, 
                output_hidden_states=True,
            )

ValueError: If `eos_token_id` is defined, make sure that `pad_token_id` is defined.

In [40]:
outputs.keys()

odict_keys(['sequences', 'scores', 'hidden_states'])

In [41]:
p = outputs.scores[0]
idx = outputs.sequences
print(p.shape, idx.shape)

torch.Size([5, 50257]) torch.Size([5, 3])


In [42]:
scores = model.compute_transition_scores(
    sequences=outputs.sequences,
    scores=outputs.scores,

#     beam_indices=greedy_output.beam_indices,
)
scores[[1, 2]][:, -1]

tensor([0., 0.], device='cuda:0')

In [43]:
q

tensor([[0]], device='cuda:0')

In [19]:
# outputs.sequences
# I = outputs.sequences[:, 1:]
# func(I)

# _, idx = torch.topk(func(I), largest=is_max, k=k_top)

# idx

# func(I).argmin(2)

In [63]:
2

2

# Sources
- https://huggingface.co/blog/how-to-generate
- https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration
- https://github.com/huggingface/transformers/issues/3720
- https://discuss.huggingface.co/t/how-to-output-loss-from-model-generate/16999/7
- https://github.com/huggingface/transformers/issues/15552 **try to read**
- https://github.com/Vision-CAIR/MiniGPT-4/issues/129
- https://stackoverflow.com/questions/45196631/how-to-upload-a-cloned-git-repository-to-an-own-git-repository-on-github

## Drafts, thrash

In [None]:


# generate_with_grad = undecorated(model.generate)
# model.generate_with_grad = MethodType(generate_with_grad, model)

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

model = AutoModelForSeq2SeqLM.from_pretrained('t5-small')
tokenizer = AutoTokenizer.from_pretrained('t5-small')
# 
input_ids = tokenizer("propose new indexes the previous were 30, 40", return_tensors="pt").input_ids
# input_ids = torch.tensor([[0]])
encoder_outputs = model.encoder(input_ids)

decoder_input_ids = torch.ones_like(input_ids)[:, :1] * model.config.decoder_start_token_id
model_kwargs = {"encoder_outputs": encoder_outputs}

outputs = model.greedy_search(decoder_input_ids,
                                encoder_outputs=encoder_outputs,
                                max_new_tokens=d,
                                do_sample=True,
                                num_beams=1,
                                num_return_sequences=k,
                                top_k=0,
                                temperature=0.6,
                                length_penalty=0,

                                output_scores=True,
                                return_dict_in_generate=True, 
                                renormalize_logits=True, 
                                output_hidden_states=True,
                               )

print("Output:", tokenizer.batch_decode(outputs.sequences))
# => prints `['<pad> Heute ist ein schöner Tag.</s>']



In [None]:
from undecorated import undecorated
from types import MethodType

generate_with_grad = undecorated(model.generate)
model.generate_with_grad = MethodType(generate_with_grad, model)


sequences = model.greedy_search(decoder_input_ids, encoder_outputs=encoder_outputs, 
            max_new_tokens=d,

            do_sample=True,
            num_beams=1,
            num_return_sequences=k,
            top_k=0,
            temperature=0.6,
            length_penalty=0,

            output_scores=True,
            return_dict_in_generate=True, 
            renormalize_logits=True, 
            output_hidden_states=True,)

# print("Output:", tokenizer.batch_decode(sequences))
# => prints `['<pad> Heute ist ein schöner Tag.</s>']


encoder_outputs