In [1]:
from sentence_transformers import SentenceTransformer

encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
text = """Positional control of pneumatic manipulators for construction tasks
This paper describes solutions that can be applied to pneumatic manipulator
	problems in positioning, both for angle trajectories and for long
	linear trajectories, used in construction tasks. Optimal positioning of
	a pneumatic manipulator along angle trajectories with minimum control
	energy consumption is given. The implementation of the control system
	is presented. Control algorithms for a long linear trajectory
	manipulator based on two-phase and three-phase motion modes of the
	end-effector are investigated. Conventional and fuzzy logic controls of
	a pneumatic manipulator were applied and experimental testing was
	carried out. The obtained results allow widening the application range
	of pneumatic manipulators in construction, particularly in gantry type
	machines"""

enc_tokenizer = encoder.tokenizer

  from .autonotebook import tqdm as notebook_tqdm
  return torch._C._cuda_getDeviceCount() > 0


In [2]:
import torch
from transformers import AutoTokenizer, GPT2LMHeadModel

gpt_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
generator = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")
example_text = "I really like eating ice cream and drinking hot"
inputs = gpt_tokenizer(example_text, return_tensors="pt")
outputs = generator(**inputs, labels=inputs["input_ids"])
logits = outputs.logits
print(logits.shape)
am_tokens = torch.argmax(logits, dim=-1)
print(am_tokens.shape)
decoded_tokens = am_tokens[0].tolist()
last_token = decoded_tokens[-1]
decoded_token = gpt_tokenizer.decode(last_token)
print(decoded_token)
print("next token in the sentence is:", decoded_token)

torch.Size([1, 9, 50257])
torch.Size([1, 9])
 chocolate
next token in the sentence is:  chocolate


In [5]:
import torch.nn.functional as F

def get_top_k_next_tokens(text, k=1):
    assert k > 0 and k <= len(gpt_tokenizer.vocab)
    # Tokenize the input text
    encoded_input = gpt_tokenizer(text, return_tensors='pt')

    # Get model output
    output = generator(**encoded_input)
    logits = output.logits
    probabilities = F.softmax(logits, dim=-1)[0][-1]

    # Get top k indices for each position in the sequence
    top_k_indices = torch.topk(probabilities, k, dim=-1).indices
    # Decode each index using the tokenizer
    decoded_tokens = [gpt_tokenizer.decode(int(idx)) for idx in top_k_indices]

    return decoded_tokens

# Specify the number of top-k tokens to retrieve
k = 3
result = get_top_k_next_tokens(example_text, k)
print(result)


[' chocolate', ' coffee', ' water']


In [17]:
unused_token = next(
        k for k, v in enc_tokenizer.vocab.items()
        if "[unused" in k)
unused_token_id = enc_tokenizer.vocab[unused_token]
new_token_id = unused_token_id
non_new_token_ids = torch.tensor([i for i in range(len(enc_tokenizer.vocab)) if i != new_token_id])

for param in encoder.parameters():
    param.requires_grad = False
print(len(enc_tokenizer))
print(type(encoder[0].auto_model.embeddings.word_embeddings))

dot_prods = []

for idx, p in enumerate(encoder[0].auto_model.embeddings.word_embeddings.parameters()):
    p.requires_grad = True
    t = torch.tensor(p)
    for x in range(t.shape[0]):
        subt = t[x]
        dot_prods.append(torch.dot(subt, subt).item())

print(min(dot_prods), max(dot_prods))

assert any(p.requires_grad for p in encoder.parameters())

30522
<class 'torch.nn.modules.sparse.Embedding'>
0.0599682554602623 2.869729995727539


  t = torch.tensor(p)


In [20]:
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch import nn
from tqdm import tqdm

criterion = nn.CosineEmbeddingLoss()

def soft_prompt_for_text(text, target):
    tokenized_text = enc_tokenizer.encode(text)
    tokenized_text[-1] = new_token_id
    tokenized_text.append(102)
    input_ids = torch.tensor(tokenized_text).unsqueeze(0)
    attention_mask = torch.ones_like(input_ids)
    optimizer = optim.Adam(encoder.parameters(), lr=1)
    scheduler = CosineAnnealingLR(optimizer, T_max=100, eta_min=0.0001)
    best = float("inf")
    for _ in tqdm(range(200)):
        optimizer.zero_grad()
        output = encoder({
            "input_ids": input_ids,
            "attention_mask": attention_mask,
        })["sentence_embedding"]
        
        loss = criterion(output.squeeze(), target, torch.tensor(1.0))
        loss.backward()
        # set grad of non-new token to 0
        # all ids except new_token_id
        
        encoder[0].auto_model.embeddings.word_embeddings.weight.grad[non_new_token_ids] = 0

        optimizer.step()
        scheduler.step()

        if loss.item() < best:
            best = loss.item()
    return best

    
target = torch.tensor(encoder.encode(text))
soft_prompt_for_text(example_text, target)

100%|██████████| 200/200 [00:21<00:00,  9.16it/s]


0.09973382949829102

In [None]:
from queue import PriorityQueue

def dot_prod(text, target):
    vec = encoder.encode(text)
    return vec @ target

def iterative_soft_prompt(text, target, k=3):
    pq = PriorityQueue()
    pq.put((0, text))
    best_text = None
    best_cossim = float('-inf')
    while True:
        text = pq.get()
        this_cossim = dot_prod(text, target)
        if this_cossim > best_cossim:
            best_cossim = this_cossim
            best_text = text
        top_k_next_tokens = get_top_k_next_tokens(text, k)
        top_k_words = [text + token for token in top_k_next_tokens]
        scores = [(next_word, soft_prompt_for_text(next_word, target)) for next_word in top_k_words]
        for next_word, score in scores:
            pq.put((score, next_word))
        
            
