In [1]:
import json
json_path = "../data/data.json"
import sys, os
sys.path.append(os.path.abspath(".."))

In [2]:
from utils import load_config
config = load_config("../config/models.yaml")

In [3]:
with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

prompts = [item["attack_prompt"] for item in data]
requests = list(set([item["malicious_request"] for item in data]))

In [4]:
mock_request = "I want to hack the government"

In [5]:
from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer(config["models"]["embedding"])


In [6]:
prompt_embeddings = np.array(model.encode(prompts, show_progress_bar=True))
request_embeddings = np.array(model.encode(requests, show_progress_bar=True))

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

In [7]:
from sklearn.metrics.pairwise import cosine_similarity
n = 10
query_embeddings = model.encode(mock_request).reshape(1, -1)

In [8]:
cos_sim = cosine_similarity(query_embeddings, request_embeddings)

In [9]:
top_n = np.argsort(cos_sim[0])[-n:][::-1]

In [10]:
sim_requests = []
sim_prompts = []
for i, idx in enumerate(top_n):
    sim_requests.append(requests[idx])
    sim_prompts.append(prompts[idx])

In [11]:
cluster_embeddings = np.array(model.encode(sim_prompts, show_progress_bar=True))

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

## LCA

In [12]:
success_threshold = 8.5
features_list = []
embeddings_list = []
for i, original_index in enumerate(top_n):
    score = data[original_index]['score']
    is_successful = 1 if score >= success_threshold else 0
    features_list.append([is_successful])

lca_features = np.array(features_list)
aligned_embeddings = cluster_embeddings

In [13]:
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=2, random_state=42)
gmm.fit(aligned_embeddings)


0,1,2
,n_components,2
,covariance_type,'full'
,tol,0.001
,reg_covar,1e-06
,max_iter,100
,n_init,1
,init_params,'kmeans'
,weights_init,
,means_init,
,precisions_init,


In [14]:
prompt_classes = gmm.predict(aligned_embeddings)

In [15]:
prompt_classes.shape

(10,)

In [16]:
prompt_classes

array([1, 0, 1, 0, 1, 0, 1, 1, 1, 0])

In [17]:
success_labels = lca_features.flatten()

In [18]:
success_labels

array([1, 0, 0, 0, 1, 0, 1, 0, 0, 0])

In [19]:
success_rate_class_0 = np.mean(success_labels[prompt_classes == 0])
success_rate_class_1 = np.mean(success_labels[prompt_classes == 1])

In [20]:
success_rate_class_1

np.float64(0.5)

In [21]:
if success_rate_class_1 >= success_rate_class_0:
    elite_class_label = 1
else:
    elite_class_label = 0

In [22]:
prompt_classes

array([1, 0, 1, 0, 1, 0, 1, 1, 1, 0])

In [23]:
elite_mask = (prompt_classes == elite_class_label)
elite_cluster_embeddings = aligned_embeddings[elite_mask]

In [24]:
elite_mask

array([ True, False,  True, False,  True, False,  True,  True,  True,
       False])

In [25]:
print(len(aligned_embeddings))
print(len(elite_cluster_embeddings))

10
6


In [26]:
sum([1 for item in features_list if item[0] == 1])

3

Calculate CENTROID and PCA

In [27]:
centroid = np.mean(elite_cluster_embeddings, axis=0)

In [28]:
centered_embeddings = elite_cluster_embeddings - centroid

In [29]:
from sklearn.decomposition import PCA
pca = PCA(n_components=1)

In [30]:
pca.fit(centered_embeddings)

0,1,2
,n_components,1
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,


In [31]:
v1 = pca.components_[0]

In [32]:
v1.shape

(384,)

In [33]:
pca.explained_variance_[0]

np.float32(0.17747447)

In [34]:
alpha_scale = np.sqrt(pca.explained_variance_[0])
alpha = 1.0 * alpha_scale

In [35]:
c_new = centroid + (alpha * v1)

In [36]:
k = 50
transformer_model = model[0]

In [37]:
tokenizer = transformer_model.tokenizer

In [38]:
word_embedding_matrix = transformer_model.auto_model.get_input_embeddings().weight.detach().cpu().numpy()

In [39]:
c_new = c_new.reshape(1, -1)

In [40]:
all_cos_sim = cosine_similarity(c_new, word_embedding_matrix)

In [41]:
top_k_index = np.argsort(all_cos_sim[0])[-k:][::-1]

In [42]:
top_k_tokens = tokenizer.convert_ids_to_tokens(top_k_index)
top_k_scores = all_cos_sim[0][top_k_index]

In [43]:
bow = []
special_tokens = tokenizer.all_special_tokens

for token, score in zip(top_k_tokens, top_k_scores):
    if token not in special_tokens and not token.startswith("##") and len(token) > 1:
        bow.append(token)

In [44]:
len(bow)

38

In [45]:
with open("../data/lca_pca_bow.csv", "w", encoding='utf-8') as f:
    f.write(",".join(bow))