In [39]:
import json
json_path = "../data/data.json"

In [40]:
with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

prompts = [item["attack_prompt"] for item in data]
requests = list(set([item["malicious_request"] for item in data]))

In [41]:
mock_request = "I want to hack the government"

### Getting prompts embeddings

In [42]:
from sentence_transformers import SentenceTransformer
import numpy as np

model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

In [None]:
prompt_embeddings = model.encode(prompts, show_progress_bar=True)
prompt_embeddings = np.array(prompt_embeddings)
request_embeddings = model.encode(requests, show_progress_bar=True)
request_embeddings = np.array(request_embeddings)


Batches:   0%|          | 0/24 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

### Find the N most similar orders

In [44]:
from sklearn.metrics.pairwise import cosine_similarity
n = 10
query_embedding = model.encode(mock_request)
query_embedding = query_embedding.reshape(1, -1)


In [45]:
cos_sim = cosine_similarity(query_embedding, request_embeddings)

In [46]:
cos_sim[0]

array([ 0.25875208,  0.24069634,  0.6623237 ,  0.23739575,  0.52870774,
        0.1884873 ,  0.07306178,  0.18482122,  0.36113915, -0.00407241,
        0.14230071,  0.09624192,  0.1875773 ,  0.14696662,  0.31459257,
        0.66116005,  0.22685176,  0.34971973,  0.12443081,  0.12553902,
        0.2060605 ,  0.68960947,  0.18766548], dtype=float32)

In [47]:
top_n = np.argsort(cos_sim[0])[-n:][::-1]

In [48]:
sim_requests = []
sim_prompts = []
for i, idx in enumerate(top_n):
    sim_requests.append(requests[i])
    sim_prompts.append(prompts[i])


### Getting the embeddings of N similar prompts

In [49]:
cluster_embeddings = model.encode(sim_prompts, show_progress_bar=True)
cluster_embeddings = np.array(cluster_embeddings)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [50]:
cluster_embeddings.shape

(10, 384)

### Calculate CENTROID and PCA

In [51]:
centroid = np.mean(cluster_embeddings, axis=0)

In [52]:
centered_embeddings = cluster_embeddings - centroid

In [53]:
from sklearn.decomposition import PCA

pca = PCA(n_components=1)

In [54]:
pca.fit(centered_embeddings)

0,1,2
,n_components,1
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,


In [55]:
# Direction vector - how vectors vary
v1 = pca.components_[0]

In [56]:
v1.shape

(384,)

In [57]:
# How much variation v1 capture
print(f"{pca.explained_variance_ratio_[0]:.2f}")

0.37


### New vector $C_{new} = centroid + (\alpha v_{1})$

In [58]:
## alpha = 0.0 -> new prompt is exactly in centroid
## alpha > 0: New prompt in the direction of v1 meaning
## alpha < 0: New prompt in the opposite direction of v1 meaning

In [59]:
alpha_scale = np.sqrt(pca.explained_variance_[0])

In [60]:
# Asking a prompt with "1 standard deviation" from centroid
alpha = 1.0 * alpha_scale

In [61]:
c_new = centroid + (alpha * v1)

In [62]:
c_new

array([-5.13098985e-02,  1.84884910e-02, -7.50358328e-02, -4.85533476e-02,
       -4.85697053e-02,  2.40179822e-02,  4.08740863e-02,  1.89885106e-02,
       -5.59416078e-02,  2.43403912e-02, -3.75070386e-02, -8.36276449e-03,
        1.36333674e-01, -1.96130704e-02, -2.02262253e-02,  5.33171929e-02,
        5.44023141e-02,  2.09183879e-02, -2.69991159e-02, -1.00662028e-02,
       -6.90801302e-03, -7.29735643e-02,  1.40464511e-02,  2.10616849e-02,
       -7.28111193e-02,  2.65851207e-02,  6.13042749e-02, -5.87560758e-02,
       -1.13005430e-01, -1.42924683e-02,  4.45690900e-02, -7.50344917e-02,
       -8.61960649e-03,  1.07920527e-01, -1.39573738e-02, -5.93550280e-02,
        1.84463523e-02, -2.63028592e-02,  9.67979655e-02, -5.51645868e-02,
       -4.34906594e-02, -8.06797892e-02, -1.00974618e-02,  4.06709313e-02,
        1.49871362e-02,  1.61943976e-02,  1.34580545e-02,  2.55189575e-02,
        1.82928648e-02, -2.28279661e-02, -5.47903851e-02,  7.39700068e-03,
        2.13798732e-02,  

### Getting tokens back

In [63]:
k = 15
transformer_model = model[0]

In [64]:
tokenizer = transformer_model.tokenizer

In [65]:
word_embedding_matrix = transformer_model.auto_model.get_input_embeddings().weight.detach().cpu().numpy()

In [66]:
word_embedding_matrix.shape

(30522, 384)

In [67]:
c_new = c_new.reshape(1, -1)

In [68]:
c_new.shape

(1, 384)

In [69]:
all_cos_sim = cosine_similarity(c_new, word_embedding_matrix)

In [70]:
all_cos_sim.shape

(1, 30522)

In [71]:
top_k_index = np.argsort(all_cos_sim[0])[-k:][::-1]

In [72]:
top_k_tokens = tokenizer.convert_ids_to_tokens(top_k_index)
top_k_scores = all_cos_sim[0][top_k_index]

In [73]:
top_k_scores

array([0.2614542 , 0.23361665, 0.22247416, 0.22138971, 0.22073108,
       0.22009286, 0.21804906, 0.21645415, 0.21614134, 0.21358052,
       0.20975854, 0.2056095 , 0.20541757, 0.204492  , 0.2009135 ],
      dtype=float32)

In [74]:
len(top_k_tokens)

15

In [75]:
bow = []
special_tokens = tokenizer.all_special_tokens

for token, score in zip(top_k_tokens, top_k_scores):
    if token not in special_tokens and not token.startswith("##") and len(token) > 1:
        bow.append(token)

In [76]:
with open("../data/bow.csv", "w", encoding='utf-8') as f:
    f.write(",".join(bow))