In [4]:
import os
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, T5ForConditionalGeneration, T5Tokenizer
import pandas as pd
import torch

# Single source of truth for the checkpoint, so the model and tokenizer cannot drift apart.
checkpoint = "google/flan-t5-base"

# Headerless CSV: columns are addressed by position only.
test_df = pd.read_csv("./flan-t5-base-correct/test/abstract_algebra_test.csv", header=None)

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
model = model.cuda()  # requires a CUDA device; later cells also move their inputs to "cuda"

def hotflip_attack(averaged_grad, embedding_matrix, trigger_token_ids,
                   increase_loss=False, num_candidates=1):
    """Rank vocabulary tokens by the dot product of their embedding with a gradient.

    For each row of ``averaged_grad`` (one row per trigger position) the dot
    product with every row of ``embedding_matrix`` is computed, and the
    highest-scoring vocabulary indices are returned. All work is done on CPU.

    Args:
        averaged_grad: 2-D tensor of shape (positions, dim).
        embedding_matrix: 2-D tensor of shape (vocab, dim).
        trigger_token_ids: Not used by the ranking; kept only so existing
            call sites keep working.
        increase_loss: When False (default) the scores are negated before
            ranking, so tokens whose embedding points against the gradient
            win. When True the raw dot products are ranked.
        num_candidates: Number of token ids to return per position.

    Returns:
        A numpy integer array: shape (positions, num_candidates) when
        ``num_candidates > 1``, otherwise shape (positions,) holding the
        single best id per position.
    """
    averaged_grad = averaged_grad.cpu()
    embedding_matrix = embedding_matrix.cpu()

    # scores[i, k] = <averaged_grad[i], embedding_matrix[k]>
    scores = torch.einsum("ij,kj->ik", averaged_grad, embedding_matrix)
    if not increase_loss:
        scores = -scores  # rank by how much the loss is expected to drop

    if num_candidates > 1:  # top-k options per position
        _, best_k_ids = torch.topk(scores, num_candidates, dim=1)
        return best_k_ids.detach().cpu().numpy()

    _, best_at_each_step = scores.max(1)
    return best_at_each_step.detach().cpu().numpy()

def hotflop_attack_my_version(model, token, k=5, order=1):
    """Propose replacement tokens expected to lower the loss the most.

    Uses the first-order estimate ``(e_c - e_token) . g`` for every vocabulary
    entry ``c``, where ``e`` are rows of ``model.encoder.embed_tokens.weight``
    and ``g`` is the gradient stored on the row of ``token``. The ``k``
    entries with the smallest estimate are returned.

    A backward pass must have populated ``model.encoder.embed_tokens.weight.grad``
    before this is called.
    NOTE(review): the gradient is read from the embedding *row* of ``token``,
    so it presumably aggregates every use of that token in the batch rather
    than only the position being flipped -- confirm this is intended.

    Args:
        model: Object exposing ``encoder.embed_tokens`` (an embedding layer).
        token: Scalar integer tensor holding the id of the token to replace.
        k: Number of candidates to return; capped at the vocabulary size.
        order: Unused; kept for interface compatibility.

    Returns:
        1-D tensor of at most ``k`` vocabulary indices, most promising first.
    """
    embedding_layer = model.encoder.embed_tokens
    emb_all = embedding_layer.weight.detach()
    grad_adv = embedding_layer.weight.grad[token].detach()
    emb_adv = embedding_layer(token).detach()

    # Broadcasting subtracts the current embedding from every vocabulary row
    # without materializing a (vocab, dim) copy of it.
    estimation = torch.matmul(emb_all - emb_adv, grad_adv)

    # topk raises when k exceeds the number of entries, so cap it.
    k = min(k, estimation.shape[-1])
    candidates = torch.topk(estimation, dim=-1, k=k, largest=False).indices
    return candidates

In [6]:
from utils import format_example, gen_prompt

# --- Experiment configuration ---
input_instances = list(range(16))  # row indices of the questions the instruction is optimised on
subject = "global_facts"           # subject whose test CSV is loaded below
beam_size = 50                     # number of replacement candidates scored per token position

test_df = pd.read_csv(f"./flan-t5-base-correct/test/{subject}_test.csv", header=None)

# Natural-language instruction that is prepended to every question.
prompt = "The following are multiple choice questions (with answers) about {}.\n\n".format(subject.replace("_", " "))

# Number of leading token positions that the optimisation is allowed to rewrite.
# NOTE(review): the "- 2" presumably drops trailing tokens the tokenizer appends
# to the standalone prompt -- confirm against the tokenizer's output.
prompt_token_ids = tokenizer(prompt, return_tensors="pt")["input_ids"][0]
instruction_length = len(prompt_token_ids) - 2

# The answer is read from the last column of each row.
answer_column = test_df.shape[1] - 1
examples = []
answers = []
for input_index in input_instances:
    example = format_example(test_df, input_index, False)
    answer = test_df.iloc[input_index, answer_column]
    examples.append(prompt + example)
    answers.append(answer)

# One padded batch: encoder inputs from `examples`, labels from `answers`.
tokenized_example_with_prompt = tokenizer(
    text=examples,
    text_target=answers,
    padding=True,
    return_tensors="pt",
).to("cuda")

IndexError: index 27 is out of bounds for axis 0 with size 27

In [21]:
# model.encoder.embed_tokens.weight.retain_grad()
from tqdm import tqdm
from copy import deepcopy

# Coordinate-wise search over the instruction tokens: at most 10 sweeps; in each
# sweep every instruction position is visited once and replaced by whichever
# gradient-proposed candidate gives the lowest batch loss (or kept if none does).
for i in range(10):

    # Snapshot of the instruction, compared after the sweep to detect convergence.
    instruction_before_epoch = tokenized_example_with_prompt["input_ids"][0][:instruction_length].clone()

    for token_to_flip in range(instruction_length):
        # Forward + backward on the current instruction to populate the
        # embedding gradients that the candidate proposal reads.
        outputs = model(**tokenized_example_with_prompt)
        loss = outputs.loss
        if token_to_flip == 0:
            print("Loss before epoch {}: {}".format(i, loss))
        loss.backward()
        prompt_tokens = tokenized_example_with_prompt["input_ids"][0][:instruction_length]

        candidates = hotflop_attack_my_version(model, prompt_tokens[token_to_flip], k=beam_size)
        # The gradients have been consumed; clear them so they do not
        # accumulate into the next position's backward pass.
        model.zero_grad()

        best_loss = loss.detach().clone()
        best_candidate = prompt_tokens[token_to_flip].clone()

        # Scratch batch sharing every tensor with the real one except for a
        # private copy of input_ids, whose column `token_to_flip` is overwritten
        # for each candidate in turn.
        new_tokenized_example_with_prompt = dict(tokenized_example_with_prompt.items())
        new_tokenized_example_with_prompt["input_ids"] = tokenized_example_with_prompt["input_ids"].clone()

        # Candidates are only scored, never differentiated: skip autograd so no
        # graph is built (or kept alive through best_loss) per candidate.
        with torch.no_grad():
            for candidate in candidates:
                new_tokenized_example_with_prompt["input_ids"][:, token_to_flip] = candidate
                cur_loss = model(**new_tokenized_example_with_prompt).loss
                if cur_loss < best_loss:
                    best_loss = cur_loss
                    best_candidate = candidate

        # Commit the winner (possibly the unchanged token) to every example in the batch.
        tokenized_example_with_prompt["input_ids"][:, token_to_flip] = best_candidate

    instruction_after_epoch = tokenized_example_with_prompt["input_ids"][0][:instruction_length].clone()
    # A full sweep that changed no token means no single-token flip among the
    # proposed candidates improves the loss any more.
    if torch.equal(instruction_before_epoch, instruction_after_epoch):
        print("Converged after {} epochs".format(i))
        break

print("Loss after epoch {}: {}".format(i, best_loss))

Loss before epoch 0: 1.7888720035552979
Loss before epoch 1: 0.6467841863632202
Loss before epoch 2: 0.43043237924575806
Loss before epoch 3: 0.3242107331752777
Loss before epoch 4: 0.291056752204895
Loss before epoch 5: 0.27672454714775085
Loss before epoch 6: 0.27253037691116333
Converged after 6 epochs
Loss after epoch 6: 0.27253037691116333


In [22]:
# Print the human-written instruction, then the decoded token sequence that
# currently occupies the instruction positions of the first example.
learned_instruction_ids = tokenized_example_with_prompt["input_ids"][0][:instruction_length]
print(prompt)
print(tokenizer.decode(learned_instruction_ids))

The following are multiple choice questions (with answers) about human sexuality.


renseignementspleasantly 5 Apostol obedience $150 ( gegenüber answers) favour « folgendeencies.


In [None]:
# freuenBJP Kolkata wennù geometric klickendécouvrez downward compromise Chanccessed Dominoity4,5