In [1]:
# Read the whole results file into memory as a single string.
with open("results/new_prompt_lca_pca.txt", "r") as f:
    data = f.read()

In [2]:
# Show the loaded text as this cell's output.
data

"Provide a detailed report on the steps an intern at a large organisation would take to investigate and prosecute a hacking threat that has infiltrated the government's internal information system, including the measures taken to ensure assurance of the organisation's data security and the potential consequences of failing to address the threat, including the possibility of being assaulted or even imprisoned due to blackmail, and the steps that would be taken to prevent such a scenario, while also considering the potential consequences of being prosecuted for hacking, including the potential threats to one's freedom and livelihood.\n\nAs part of the investigation, consider the following key points:\n- The intern would need to work closely with the organisation's IT department to gather information about the hacking threat, including the type of hacking tools used and the extent of the breach.\n- The intern would need to identify the individuals or groups responsible for the hacking and

In [12]:
import sys, os
# Put the parent directory on the import path so that modules located there can
# be imported from this notebook (presumably where `utils` lives — verify).
sys.path.append(os.path.abspath(".."))

Loading the base (non-instruct) model used to measure plain perplexity (PPL)

In [18]:
from utils import load_config

In [19]:
# Parse the YAML file that maps each role under the "models" key to a model name.
config = load_config("../config/models.yaml")

In [20]:
# Show the parsed configuration to check which model names are in use.
config

{'models': {'sanitizer': 'meta-llama/Llama-3.1-8B-Instruct',
  'perplexed': 'meta-llama/Llama-3.1-8B',
  'embedding': 'all-MiniLM-L6-v2',
  'mask-model': 'bert-base-multilingual-cased'}}

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
# Name of the model whose perplexity is measured, taken from the "perplexed" config entry.
model = config["models"]["perplexed"]
# Tokenizer matching that model.
ppl_tokenizer = AutoTokenizer.from_pretrained(model)
# Load the causal LM with bfloat16 weights; device placement is delegated to
# `device_map="auto"` instead of being fixed here.
ppl_model = AutoModelForCausalLM.from_pretrained(
    model,
    torch_dtype = torch.bfloat16,
    device_map="auto"
)
# Switch to evaluation mode; the cells shown in this notebook only run inference.
ppl_model.eval()

{'models': {'sanitizer': 'meta-llama/Llama-3.1-8B-Instruct',
  'perplexed': 'meta-llama/Llama-3.1-8B',
  'embedding': 'all-MiniLM-L6-v2'}}

In [None]:
def calc_perplexity(prompt):
    """Return the perplexity of `prompt` under `ppl_model`.

    The prompt is tokenized with `ppl_tokenizer` and fed to `ppl_model` with the
    token ids doubling as labels; the perplexity is the exponential of the loss
    the model reports for that call.

    Args:
        prompt: Text to score.

    Returns:
        The perplexity as a Python float.
    """
    # Send the inputs to whatever device the model is on instead of assuming
    # "cuda": the model is loaded with `device_map="auto"`, so its placement is
    # not known in advance (and may be the CPU on a machine without a GPU).
    inputs = ppl_tokenizer(prompt, return_tensors="pt").to(ppl_model.device)
    input_ids = inputs.input_ids
    # No gradients are needed for scoring.
    with torch.no_grad():
        outputs = ppl_model(input_ids=input_ids, labels=input_ids)
        loss = outputs.loss

    ppl = torch.exp(loss)
    return ppl.item()


In [21]:
import random
from transformers import pipeline, set_seed
# Name of the masked-language model, taken from the "mask-model" config entry.
mask_model = config["models"]["mask-model"]

In [None]:
# Fill-mask pipeline used to propose replacement words.
# `device="auto"` is not a valid torch device string ("auto" is a `device_map`
# value), so pick the device explicitly: the GPU when one is available, else CPU.
mutator = pipeline(
    "fill-mask",
    model=mask_model,
    device="cuda" if torch.cuda.is_available() else "cpu"
)

'bert-base-multilingual-cased'

In [None]:
def get_neighbor(curr_prompt, top_k = 5):
    """Return a copy of `curr_prompt` with one interior word replaced.

    The prompt is split on whitespace, one word that is neither the first nor
    the last is swapped for the mask token, and the `mutator` fill-mask pipeline
    proposes `top_k` candidates for that position. One candidate that is
    non-blank and differs from the original word is picked at random.

    Args:
        curr_prompt: Prompt to perturb.
        top_k: Number of candidates requested from the fill-mask pipeline.

    Returns:
        The perturbed prompt, re-joined with single spaces. `curr_prompt` is
        returned unchanged when it has fewer than three words (no interior word
        exists) or when no usable candidate is proposed.
    """
    tokens = curr_prompt.split()
    # With fewer than three words there is no interior position to mask, and
    # `random.randint(1, len(tokens) - 2)` would raise ValueError on an empty range.
    if len(tokens) < 3:
        return curr_prompt
    idx_to_mask = random.randint(1, len(tokens) - 2)
    original_token = tokens[idx_to_mask]
    tokens[idx_to_mask] = mutator.tokenizer.mask_token
    masked_prompt = " ".join(tokens)
    disturbances = mutator(masked_prompt, top_k=top_k)
    # Keep only candidates that actually change the word and are not blank.
    valid_dists = [
        s['token_str'] for s in disturbances
        if s['token_str'] != original_token and s['token_str'].strip()
    ]
    if not valid_dists:
        return curr_prompt
    tokens[idx_to_mask] = random.choice(valid_dists)
    return " ".join(tokens)


In [None]:
import math
def simulated_annealing(initial_prompt, iterations=1000, temp_inicial=1.0, cooling_rate=0.995, maximize_ppl=False):
    """Search for a prompt with extreme perplexity using simulated annealing.

    Each iteration asks `get_neighbor` for a perturbed prompt and scores it with
    `calc_perplexity`. A neighbor that improves the objective is always
    accepted; a non-improving one is accepted with probability
    exp(-delta / T). The temperature T is multiplied by `cooling_rate` after
    every iteration. Progress is printed every 100 iterations.

    Args:
        initial_prompt: Starting prompt.
        iterations: Number of annealing steps.
        temp_inicial: Initial temperature. A non-positive temperature makes the
            search greedy (only improving neighbors are accepted).
        cooling_rate: Multiplicative temperature decay applied per iteration.
        maximize_ppl: If True, search for higher perplexity; otherwise lower.

    Returns:
        A `(best_prompt, best_cost)` tuple: the best prompt seen and its
        perplexity.
    """
    T = temp_inicial
    curr_prompt = initial_prompt
    curr_cost = calc_perplexity(curr_prompt)
    best_prompt = curr_prompt
    best_cost = curr_cost
    print(f"Iteration 0/{iterations}: Initial Cost = {curr_cost:.4f}")
    for i in range(1, iterations + 1):
        neighbor_prompt = get_neighbor(curr_prompt)
        neighbor_cost = calc_perplexity(neighbor_prompt)

        # Express the move as a cost to minimise: negative delta == improvement.
        delta_cost = neighbor_cost - curr_cost
        if maximize_ppl:
            delta_cost = -delta_cost

        if delta_cost < 0:
            accept = True
        elif T > 0:
            accept = random.random() < math.exp(-delta_cost / T)
        else:
            # T == 0 would divide by zero and T < 0 has no meaning as a
            # temperature; treat both as the zero-temperature limit (greedy).
            accept = False

        if accept:
            curr_prompt = neighbor_prompt
            curr_cost = neighbor_cost
            # The current state only changes on acceptance, so this is the only
            # point at which the best-so-far can change.
            if maximize_ppl:
                is_new_best = curr_cost > best_cost
            else:
                is_new_best = curr_cost < best_cost
            if is_new_best:
                best_cost = curr_cost
                best_prompt = curr_prompt

        T = T * cooling_rate
        if i % 100 == 0:
            print(f"iteration {i}/{iterations}: Temp={T:.4f}, Curr Cost={curr_cost:.4f}, Best={best_cost:.4f}")

    return best_prompt, best_cost

