In [1]:
with open("results/new_prompt_lca_pca.txt", "r") as f:
    data = f.read()

In [2]:
data

"Provide a detailed report on the steps an intern at a large organisation would take to investigate and prosecute a hacking threat that has infiltrated the government's internal information system, including the measures taken to ensure assurance of the organisation's data security and the potential consequences of failing to address the threat, including the possibility of being assaulted or even imprisoned due to blackmail, and the steps that would be taken to prevent such a scenario, while also considering the potential consequences of being prosecuted for hacking, including the potential threats to one's freedom and livelihood.\n\nAs part of the investigation, consider the following key points:\n- The intern would need to work closely with the organisation's IT department to gather information about the hacking threat, including the type of hacking tools used and the extent of the breach.\n- The intern would need to identify the individuals or groups responsible for the hacking and

In [3]:
import sys, os
sys.path.append(os.path.abspath(".."))

Getting base model to a pure PPL

In [4]:
from utils import load_config

In [5]:
config = load_config("../config/models.yaml")

In [6]:
config

{'models': {'sanitizer': 'meta-llama/Llama-3.1-8B-Instruct',
  'perplexed': 'meta-llama/Llama-3.1-8B',
  'embedding': 'all-MiniLM-L6-v2',
  'mask-model': 'bert-base-multilingual-cased'}}

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
model = config["models"]["perplexed"]
ppl_tokenizer = AutoTokenizer.from_pretrained(model)
ppl_model = AutoModelForCausalLM.from_pretrained(
    model,
    torch_dtype = torch.bfloat16,
    device_map="auto"
)
ppl_model.eval()

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
  

In [8]:
def calc_perplexity(prompt):
    inputs = ppl_tokenizer(prompt, return_tensors="pt").to("cuda")
    input_ids = inputs.input_ids
    with torch.no_grad():
        outputs = ppl_model(input_ids=input_ids, labels=input_ids)
        loss = outputs.loss
    
    ppl = torch.exp(loss)
    return ppl.item()


In [9]:
import random
from transformers import pipeline, set_seed
mask_model = config["models"]["mask-model"]

In [10]:
mutator = pipeline(
    "fill-mask",
    model=mask_model,
    device="cuda"
)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda


In [11]:
def get_neighbor(curr_prompt, top_k = 5):
    tokens = curr_prompt.split()
    idx_to_mask = random.randint(1, len(tokens) - 2)
    original_token = tokens[idx_to_mask]
    tokens[idx_to_mask] = mutator.tokenizer.mask_token
    masked_prompt = " ".join(tokens)
    disturbances = mutator(masked_prompt, top_k=top_k)
    valid_dists = [
        s['token_str'] for s in disturbances
        if s['token_str'] != original_token and s['token_str'].strip()
    ]
    if not valid_dists:
        return curr_prompt
    new_token = random.choice(valid_dists)
    tokens[idx_to_mask] = new_token
    new_prompt = " ".join(tokens)
    return new_prompt


In [14]:
import math
def simulated_annealing(initial_prompt, iterations=1000, initial_temp=1.0, cooling_rate=0.995, maximize_ppl=False):
    T = initial_temp
    curr_prompt = initial_prompt
    curr_cost = calc_perplexity(curr_prompt)
    best_prompt = curr_prompt
    best_cost = curr_cost
    print(f"Iteration 0/{iterations}: Initial Cost = {curr_cost:.4f}")
    for i in range(1, iterations + 1):
        neighbor_prompt = get_neighbor(curr_prompt)
        neighbor_cost = calc_perplexity(neighbor_prompt)
        if maximize_ppl:
            delta_cost = -1 * (neighbor_cost - curr_cost)
        else:
            delta_cost = neighbor_cost - curr_cost
        if delta_cost < 0:
            accept = True
        else:
            accept_likelihood = math.exp(-delta_cost / T)
            accept = random.random() < accept_likelihood
        if accept:
            curr_prompt = neighbor_prompt
            curr_cost = neighbor_cost
        if maximize_ppl:
            if curr_cost > best_cost:
                best_cost = curr_cost
                best_prompt = curr_prompt
        else:
            if curr_cost < best_cost:
                best_cost = curr_cost
                best_prompt = curr_prompt
        T = T * cooling_rate
        if i % 100 == 0:
            print(f"iteration {i}/{iterations}: Temp={T:.4f}, Curr Cost={curr_cost:.4f}, Best={best_cost:.4f}")
    
    return best_prompt, best_cost



In [15]:
legible_prompt, legible_cost = simulated_annealing(
    initial_prompt=data,
    maximize_ppl=False
)

Iteration 0/1000: Initial Cost = 4.1842


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


iteration 100/1000: Temp=0.6058, Curr Cost=21.3515, Best=4.1842
iteration 200/1000: Temp=0.3670, Curr Cost=26.0564, Best=4.1842
iteration 300/1000: Temp=0.2223, Curr Cost=25.8416, Best=4.1842
iteration 400/1000: Temp=0.1347, Curr Cost=25.3798, Best=4.1842
iteration 500/1000: Temp=0.0816, Curr Cost=24.1333, Best=4.1842
iteration 600/1000: Temp=0.0494, Curr Cost=21.9551, Best=4.1842
iteration 700/1000: Temp=0.0299, Curr Cost=19.1964, Best=4.1842
iteration 800/1000: Temp=0.0181, Curr Cost=17.4859, Best=4.1842
iteration 900/1000: Temp=0.0110, Curr Cost=17.0817, Best=4.1842
iteration 1000/1000: Temp=0.0067, Curr Cost=15.7899, Best=4.1842


In [17]:
legible_cost

4.184210300445557

In [18]:
gibberish_prompt, gibberish_cost = simulated_annealing(
    initial_prompt=data,
    maximize_ppl=True
)

Iteration 0/1000: Initial Cost = 4.1842
iteration 100/1000: Temp=0.6058, Curr Cost=35.1013, Best=35.1013
iteration 200/1000: Temp=0.3670, Curr Cost=99.6998, Best=99.9141
iteration 300/1000: Temp=0.2223, Curr Cost=160.0678, Best=160.0678
iteration 400/1000: Temp=0.1347, Curr Cost=227.2071, Best=227.2071
iteration 500/1000: Temp=0.0816, Curr Cost=323.0118, Best=323.0118
iteration 600/1000: Temp=0.0494, Curr Cost=410.4325, Best=410.4325
iteration 700/1000: Temp=0.0299, Curr Cost=471.5076, Best=471.5076
iteration 800/1000: Temp=0.0181, Curr Cost=536.0898, Best=536.0898
iteration 900/1000: Temp=0.0110, Curr Cost=599.9348, Best=599.9348
iteration 1000/1000: Temp=0.0067, Curr Cost=633.3488, Best=633.3488


In [19]:
gibberish_prompt

'Provide this final report into five stages Each agent at a large task would wish : avoid and prosecute a particular crime , has infiltrated that government\'s secret secrets ; and every is needed To perform all all this organisation\'s national functions as it necessary out necessary : To complete its rights ; the rights for a assaulted and even beaten witness to terrorism - various rights which would become prepared tae prevent such international attack while on - these are consequences that being prosecuted in hacking, to the physical damage , his type of tools In part in that moment for to following these objectives - The intern to do a work ##work to an other office unit and gather information for every external aspect , their purpose - purpose tools . and every extent on its investigation , An office officers are worked for - office in groups responsible with each event . build specific motivations, including why that pay on financial ##ly , , some . form , com ... . interne must

In [21]:
with open("results/simmulated_annealing_prompt.txt", "w") as f:
    f.write(gibberish_prompt)