In [1]:
import json
json_path = "../data/data.json"

In [2]:
with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

prompts = [item["attack_prompt"] for item in data]
requests = list(set([item["malicious_request"] for item in data]))

In [3]:
mock_request = "I want to hack the government"

### Getting prompts embeddings

In [4]:
from sentence_transformers import SentenceTransformer
import numpy as np

model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

In [5]:
prompt_embeddings = model.encode(prompts, show_progress_bar=True)
prompt_embeddings = np.array(prompt_embeddings)
request_embeddings = model.encode(requests, show_progress_bar=True)
request_embeddings = np.array(request_embeddings)


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

### Find the N most similar orders

In [6]:
from sklearn.metrics.pairwise import cosine_similarity
n = 10
query_embedding = model.encode(mock_request)
query_embedding = query_embedding.reshape(1, -1)


In [7]:
cos_sim = cosine_similarity(query_embedding, request_embeddings)

In [8]:
cos_sim[0]

array([ 0.2236521 ,  0.23712796,  0.14400016,  0.13565092,  0.0525021 ,
        0.6623237 ,  0.03738408,  0.6896094 ,  0.05401505,  0.26421708,
        0.15962079,  0.29919407,  0.14504853,  0.07635607,  0.12443075,
        0.05538188,  0.20536715,  0.11757397,  0.04122981,  0.18344179,
        0.27532572,  0.1884873 ,  0.04955281, -0.00251104,  0.20606045,
        0.24069633,  0.02865533,  0.23889892,  0.2373957 ,  0.16736773,
        0.08505081,  0.20872638,  0.09542519,  0.17155191,  0.1082782 ,
        0.1867886 , -0.00407241,  0.11268825,  0.1423006 ,  0.09333459,
        0.14696664,  0.40351325,  0.18129826,  0.11867732,  0.1875773 ,
        0.19078173,  0.1178743 ,  0.15742163,  0.05368173,  0.09073763,
        0.04254581,  0.6224539 ,  0.3702045 ,  0.23695925,  0.04208414,
        0.17226928,  0.15177892,  0.2988723 ,  0.05538028,  0.25144094,
        0.18335208,  0.07306182,  0.33377057,  0.07840528,  0.19805844,
        0.25875205,  0.06642501,  0.07643627,  0.25116467,  0.05

In [9]:
top_n = np.argsort(cos_sim[0])[-n:][::-1]

In [10]:
sim_requests = []
sim_prompts = []
for i, idx in enumerate(top_n):
    sim_requests.append(requests[idx])
    sim_prompts.append(prompts[idx])


### Getting the embeddings of N similar prompts

In [11]:
cluster_embeddings = model.encode(sim_prompts, show_progress_bar=True)
cluster_embeddings = np.array(cluster_embeddings)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [12]:
cluster_embeddings.shape

(10, 384)

### Calculate CENTROID and PCA

In [13]:
centroid = np.mean(cluster_embeddings, axis=0)

In [14]:
centered_embeddings = cluster_embeddings - centroid

In [15]:
from sklearn.decomposition import PCA

pca = PCA(n_components=1)

In [16]:
pca.fit(centered_embeddings)

0,1,2
,n_components,1
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,


In [17]:
# Direction vector - how vectors vary
v1 = pca.components_[0]

In [18]:
v1.shape

(384,)

In [19]:
# How much variation v1 capture
print(f"{pca.explained_variance_ratio_[0]:.2f}")

0.31


### New vector $C_{new} = centroid + (\alpha v_{1})$

In [20]:
## alpha = 0.0 -> new prompt is exactly in centroid
## alpha > 0: New prompt in the direction of v1 meaning
## alpha < 0: New prompt in the opposite direction of v1 meaning

In [21]:
alpha_scale = np.sqrt(pca.explained_variance_[0])

In [22]:
# Asking a prompt with "1 standard deviation" from centroid
alpha = 1.0 * alpha_scale

In [23]:
c_new = centroid + (alpha * v1)

In [24]:
c_new

array([-1.84913334e-02,  2.14567427e-02, -2.88896319e-02,  1.56890787e-03,
       -3.18576545e-02, -6.70454558e-03,  1.34995449e-02,  8.56002048e-03,
       -4.07054722e-02,  3.26062217e-02, -1.56464931e-02, -8.51815008e-03,
        7.58021325e-02, -2.27498263e-02, -2.44718380e-02,  2.60420255e-02,
        3.01397741e-02,  8.20064545e-03, -2.90321056e-02,  9.78346635e-03,
       -2.72690132e-03, -5.55092208e-02, -1.19962916e-03,  7.22236233e-03,
       -5.50920218e-02,  1.47020929e-02,  3.60031836e-02, -2.55424902e-03,
       -9.10776481e-02, -1.28926532e-02,  4.00828570e-02, -2.64363326e-02,
       -2.10337676e-02,  5.30707389e-02,  1.41100474e-02, -4.00132537e-02,
        1.74440183e-02, -1.20398775e-03,  7.04220831e-02, -1.64606236e-02,
       -4.69635911e-02, -7.58298114e-02, -1.85057893e-03,  4.26415205e-02,
        3.22931586e-03,  1.38291810e-03,  9.18115955e-03,  6.78004324e-03,
       -3.65306064e-02, -1.06706908e-02, -6.54118955e-02, -8.57128948e-03,
        5.31667285e-02,  

### Getting tokens back

In [25]:
k = 50
transformer_model = model[0]

In [26]:
tokenizer = transformer_model.tokenizer

In [27]:
word_embedding_matrix = transformer_model.auto_model.get_input_embeddings().weight.detach().cpu().numpy()

In [28]:
word_embedding_matrix.shape

(30522, 384)

In [29]:
c_new = c_new.reshape(1, -1)

In [30]:
c_new.shape

(1, 384)

In [31]:
all_cos_sim = cosine_similarity(c_new, word_embedding_matrix)

In [32]:
all_cos_sim.shape

(1, 30522)

In [33]:
top_k_index = np.argsort(all_cos_sim[0])[-k:][::-1]

In [34]:
top_k_tokens = tokenizer.convert_ids_to_tokens(top_k_index)
top_k_scores = all_cos_sim[0][top_k_index]

In [35]:
top_k_scores

array([0.24214211, 0.23599057, 0.2291814 , 0.22805488, 0.22616194,
       0.22416623, 0.2212547 , 0.22018361, 0.22004911, 0.21809083,
       0.21756268, 0.21493313, 0.2139633 , 0.21382324, 0.21231413,
       0.21231112, 0.20982464, 0.20925522, 0.20918602, 0.20734563,
       0.20705473, 0.20639251, 0.20409584, 0.20209444, 0.19855121,
       0.19700101, 0.19278523, 0.19080755, 0.19046852, 0.19024783,
       0.18957755, 0.1889416 , 0.18842581, 0.18737501, 0.18633582,
       0.18577129, 0.18505897, 0.18295555, 0.1828519 , 0.18283296,
       0.18269484, 0.18187088, 0.18161294, 0.18144576, 0.18077075,
       0.18063459, 0.18047503, 0.18037513, 0.1800844 , 0.17961496],
      dtype=float32)

In [36]:
len(top_k_tokens)

50

In [37]:
bow = []
special_tokens = tokenizer.all_special_tokens

for token, score in zip(top_k_tokens, top_k_scores):
    if token not in special_tokens and not token.startswith("##") and len(token) > 1:
        bow.append(token)

In [38]:
with open("../data/pca_bow.csv", "w", encoding='utf-8') as f:
    f.write(",".join(bow))