In [None]:
import json
# Location of the JSON dataset, given relative to the notebook's working directory.
json_path = "../data/data.json"

In [None]:
# Load the dataset; every record is expected to carry "attack_prompt" and
# "malicious_request" keys (both are read below).
with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# One prompt per record, in file order.
prompts = [item["attack_prompt"] for item in data]
# Unique requests. dict.fromkeys de-duplicates while keeping first-seen order, so the
# list (and every index derived from it) is the same on every run; list(set(...))
# ordering changes with string hash randomisation between kernel restarts.
requests = list(dict.fromkeys(item["malicious_request"] for item in data))

In [None]:
# Example query; it is embedded later and compared against the stored requests.
mock_request = "I want to hack the government"

### Computing prompt and request embeddings

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Sentence-embedding model used by every encode() call in this notebook; its token
# embedding table is also read inside score_weighted_pca.
model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

In [None]:
# Embed every attack prompt and every unique request; each result is wrapped in
# np.array so downstream cells always work with NumPy arrays.
prompt_embeddings = np.array(model.encode(prompts, show_progress_bar=True))
request_embeddings = np.array(model.encode(requests, show_progress_bar=True))


### Find the N most similar requests

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Number of nearest requests to keep.
n = 10
# Embed the query and reshape it into a single-row 2-D array, the layout that
# cosine_similarity expects for its inputs.
query_embedding = model.encode(mock_request).reshape(1, -1)


In [None]:
# Cosine similarity between the query and every row of request_embeddings;
# row 0 of the result holds the scores for the single query.
cos_sim = cosine_similarity(query_embedding, request_embeddings)

In [None]:
# Indices (into `requests`) of the n highest similarities, most similar first:
# argsort is ascending, so take the last n entries and reverse them.
top_n = np.argsort(cos_sim[0])[-n:][::-1]

In [None]:
# top_n holds positions in the de-duplicated `requests` list. `prompts` has one entry
# per record in `data`, so the two lists do not share indices; prompts are therefore
# selected through the request text they were recorded with.
sim_requests = [requests[idx] for idx in top_n]
sim_request_set = set(sim_requests)

# Every attack prompt whose request is one of the N most similar requests, kept in
# `data` order. A request may have several prompts, so this can hold more than n items.
sim_prompts = [
    item["attack_prompt"]
    for item in data
    if item["malicious_request"] in sim_request_set
]


In [None]:
# Build base_scores with exactly one score per entry of sim_prompts, in the same order,
# so that scores stay aligned with the rows of the embeddings computed from sim_prompts.
# Filtering `data` directly would follow `data` order and emit extra scores whenever a
# prompt text occurs in more than one record.
wanted_prompts = set(sim_prompts)
score_by_prompt = {}
for item in data:
    prompt_text = item["attack_prompt"]
    # The first record seen for a given prompt text supplies its score.
    if prompt_text in wanted_prompts and prompt_text not in score_by_prompt:
        score_by_prompt[prompt_text] = float(item["score"])
base_scores = [score_by_prompt[prompt_text] for prompt_text in sim_prompts]

### Getting the embeddings of the prompts selected for the N most similar requests

In [None]:
# Embed the selected prompts; the resulting rows follow the order of sim_prompts.
cluster_embeddings = np.array(model.encode(sim_prompts, show_progress_bar=True))

### Calculate score-weighted PCA

def _score_weighted_shifted_centroid(embeddings, scores, min_score_threshold):
    """Return the score-weighted centroid moved one standard deviation along the first weighted principal axis.

    Args:
        embeddings: float64 array with one embedding per row.
        scores: float64 1-D array with one score per row of ``embeddings``.
        min_score_threshold: rows scoring below this value are dropped, unless
            fewer than two rows would remain, in which case all rows are kept.

    Returns:
        1-D array ``centroid + sqrt(lambda1) * v1`` where ``lambda1``/``v1`` are
        the largest eigenvalue and its eigenvector of the weighted covariance.
    """
    keep = scores >= min_score_threshold
    # A single surviving row has no spread, so fall back to the full cluster.
    if np.count_nonzero(keep) >= 2:
        embeddings = embeddings[keep]
        scores = scores[keep]

    # Min-max normalise the scores into weights that sum to 1. The lowest-scoring
    # row receives weight 0; identical scores degrade to uniform weights.
    score_min = scores.min()
    score_range = scores.max() - score_min
    if score_range == 0.0:
        weights = np.full(scores.shape[0], 1.0 / scores.shape[0])
    else:
        weights = (scores - score_min) / score_range
        weights = weights / weights.sum()

    centroid = np.sum(weights[:, np.newaxis] * embeddings, axis=0)
    centered = embeddings - centroid

    # Weighted covariance X^T diag(w) X, computed without building the dense
    # diagonal matrix, then symmetrised to remove round-off asymmetry before eigh.
    weighted_cov = (centered * weights[:, np.newaxis]).T @ centered / weights.sum()
    weighted_cov = (weighted_cov + weighted_cov.T) / 2

    # eigh returns eigenvalues in ascending order: the last pair is the dominant one.
    eigenvalues, eigenvectors = np.linalg.eigh(weighted_cov)
    v1 = eigenvectors[:, -1]
    # Round-off can leave a tiny negative eigenvalue; clamp so sqrt never yields NaN.
    lambda1 = max(float(eigenvalues[-1]), 0.0)

    return centroid + np.sqrt(lambda1) * v1


def score_weighted_pca(cluster_embeddings, scores, min_score_threshold=0.0, k=50):
    """Return vocabulary tokens closest to a score-weighted PCA direction of a cluster.

    The cluster's score-weighted centroid is shifted along its first weighted
    principal component, and the resulting vector is compared (cosine similarity)
    with every row of the input token-embedding table of the notebook-level
    ``model``. The ``k`` most similar tokens are filtered and returned.

    Relies on the notebook globals ``model`` and ``cosine_similarity``.
    ``model[0]`` must expose ``.tokenizer`` and ``.auto_model``.

    Args:
        cluster_embeddings: array-like of shape (n_prompts, dim).
        scores: array-like of n_prompts numeric scores (list or ndarray).
        min_score_threshold: minimum score for a prompt to take part in the PCA;
            ignored when fewer than two prompts reach it.
        k: number of nearest vocabulary tokens to inspect before filtering.

    Returns:
        List of token strings, most similar first, excluding special tokens,
        tokens starting with "##" and single-character tokens. Empty when
        ``k`` is not positive.

    Raises:
        RuntimeError: if the number of embeddings and scores differ.
        ValueError: if no embedding/score pair is supplied.
    """
    # Accept plain lists as well as arrays (the notebook passes a list of floats).
    cluster_embeddings = np.asarray(cluster_embeddings, dtype=np.float64)
    scores = np.asarray(scores, dtype=np.float64)

    if cluster_embeddings.shape[0] != len(scores):
        raise RuntimeError("shape do cluster diferente da quantidade de scores fornecida")
    if len(scores) == 0:
        raise ValueError("score_weighted_pca needs at least one embedding/score pair")
    if k <= 0:
        # A slice of [-0:] would silently select the whole vocabulary.
        return []

    c_new = _score_weighted_shifted_centroid(cluster_embeddings, scores, min_score_threshold)

    transformer_model = model[0]
    tokenizer = transformer_model.tokenizer
    word_embedding_matrix = (
        transformer_model.auto_model.get_input_embeddings().weight.detach().cpu().numpy()
    )

    # NOTE: this compares a sentence embedding with token input embeddings, so both
    # must have the same dimensionality; cosine_similarity raises otherwise.
    all_cos_sim = cosine_similarity(c_new.reshape(1, -1), word_embedding_matrix)[0]
    top_k_index = np.argsort(all_cos_sim)[-k:][::-1]
    top_k_tokens = tokenizer.convert_ids_to_tokens(top_k_index.tolist())

    special_tokens = set(tokenizer.all_special_tokens)
    # Keep whole-word tokens only: drop special tokens, "##"-prefixed pieces
    # (sub-word continuations in WordPiece-style vocabularies) and single characters.
    return [
        token
        for token in top_k_tokens
        if token not in special_tokens and not token.startswith("##") and len(token) > 1
    ]

In [None]:
# Run the score-weighted PCA on the selected cluster and print the returned token list.
print(score_weighted_pca(cluster_embeddings, base_scores))