In [1]:
import json
json_path = "../data/data.json"

In [2]:
with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

prompts = [item["attack_prompt"] for item in data]
requests = list(set([item["malicious_request"] for item in data]))

In [3]:
mock_request = "I want to hack the government"

### Getting prompts embeddings

In [4]:
from sentence_transformers import SentenceTransformer
import numpy as np

model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

In [5]:
prompt_embeddings = model.encode(prompts, show_progress_bar=True)
prompt_embeddings = np.array(prompt_embeddings)
request_embeddings = model.encode(requests, show_progress_bar=True)
request_embeddings = np.array(request_embeddings)


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

### Find the N most similar orders

In [6]:
from sklearn.metrics.pairwise import cosine_similarity
n = 10
query_embedding = model.encode(mock_request)
query_embedding = query_embedding.reshape(1, -1)


In [7]:
cos_sim = cosine_similarity(query_embedding, request_embeddings)

In [8]:
cos_sim[0]

array([ 0.1434813 ,  0.15588304,  0.18129826,  0.34971973,  0.20872638,
        0.20536715,  0.08505081,  0.15742163,  0.07840528,  0.18482122,
        0.11268825,  0.05538188,  0.05538028,  0.52870774,  0.15962079,
        0.23712796,  0.16773164,  0.1178743 ,  0.22685175,  0.24069633,
        0.16736773,  0.24686386,  0.14696664,  0.18335208,  0.1884873 ,
        0.36113915,  0.09155187,  0.31007522,  0.04122981,  0.07635607,
        0.05587318,  0.12189902,  0.23695925,  0.1867886 ,  0.23889892,
        0.15411736,  0.14400016,  0.31459257,  0.03738408,  0.23589191,
        0.6611601 ,  0.17155191,  0.18766548,  0.0525021 ,  0.04254581,
        0.07488019,  0.2886979 ,  0.3702045 ,  0.25144094,  0.05368173,
        0.09073763,  0.11867732,  0.19078173,  0.03894866,  0.6623237 ,
        0.27532572,  0.0962419 ,  0.04955281,  0.11757397,  0.14504853,
        0.1423006 ,  0.09542519, -0.00407241,  0.05401505,  0.02865533,
        0.15177892,  0.07643627,  0.25116467,  0.13058805,  0.22

In [9]:
top_n = np.argsort(cos_sim[0])[-n:][::-1]

In [10]:
sim_requests = []
sim_prompts = []
for i, idx in enumerate(top_n):
    sim_requests.append(requests[idx])
    sim_prompts.append(prompts[idx])


### Getting the embeddings of N similar prompts

In [11]:
cluster_embeddings = model.encode(sim_prompts, show_progress_bar=True)
cluster_embeddings = np.array(cluster_embeddings)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [12]:
cluster_embeddings.shape

(10, 384)

### Calculate CENTROID and PCA

In [13]:
centroid = np.mean(cluster_embeddings, axis=0)

In [14]:
centered_embeddings = cluster_embeddings - centroid

In [15]:
from sklearn.decomposition import PCA

pca = PCA(n_components=1)

In [16]:
pca.fit(centered_embeddings)

0,1,2
,n_components,1
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,


In [17]:
# Direction vector - how vectors vary
v1 = pca.components_[0]

In [18]:
v1.shape

(384,)

In [19]:
# How much variation v1 capture
print(f"{pca.explained_variance_ratio_[0]:.2f}")

0.29


### New vector $C_{new} = centroid + (\alpha v_{1})$

In [20]:
## alpha = 0.0 -> new prompt is exactly in centroid
## alpha > 0: New prompt in the direction of v1 meaning
## alpha < 0: New prompt in the opposite direction of v1 meaning

In [21]:
alpha_scale = np.sqrt(pca.explained_variance_[0])

In [22]:
# Asking a prompt with "1 standard deviation" from centroid
alpha = 1.0 * alpha_scale

In [23]:
c_new = centroid + (alpha * v1)

In [24]:
c_new

array([-2.10427772e-02,  3.95858176e-02, -5.25624640e-02, -1.93917826e-02,
       -2.75731273e-02,  2.21763216e-02,  3.23482752e-02,  2.81516593e-02,
       -4.88765426e-02,  1.09860711e-02, -3.13603394e-02, -2.89490633e-02,
        1.08694449e-01, -3.74915153e-02, -1.72163192e-02,  4.69383299e-02,
        5.20753078e-02,  2.57770736e-02, -4.04421240e-02, -2.58807419e-03,
       -5.73205203e-03, -3.11116949e-02, -8.59290361e-04,  2.12393776e-02,
       -8.28415900e-02,  3.09029985e-02,  4.64664660e-02, -3.55563387e-02,
       -8.31983387e-02, -1.25267422e-02,  4.30646427e-02, -3.34435739e-02,
        2.30993442e-02,  9.27898437e-02, -1.51947094e-02, -1.84393078e-02,
        6.53180294e-03, -7.24066235e-03,  7.07202703e-02, -6.40839562e-02,
       -3.69689576e-02, -6.19075373e-02, -1.69508569e-02,  3.55683081e-02,
        1.17789172e-02, -1.01345703e-02,  2.77447049e-04,  3.23115960e-02,
       -3.20880562e-02, -5.04429676e-02, -6.63091317e-02, -2.63587832e-02,
        2.34407578e-02,  

### Getting tokens back

In [25]:
def iterative_orthogonal_decoding(target_vector, model, k=50,decay=0.5):
    transformer_model = model[0]
    tokenizer = transformer_model.tokenizer
    word_embeddings = transformer_model.auto_model.get_input_embeddings().weight.detach().cpu().numpy()
    target_vector = target_vector.reshape(1, -1)
    found_tokens = []
    current_vector = target_vector.copy()
    special_tokens = set(tokenizer.all_special_tokens)
    found_tokens_count = 0
    while found_tokens_count < 50:
        sims = cosine_similarity(current_vector, word_embeddings)
        best_idx = np.argmax(sims[0]).item()
        token = tokenizer.convert_ids_to_tokens(best_idx)
        print(token)
        if token not in found_tokens and len(token) > 2 and token not in special_tokens and not token.startswith("##"):
            found_tokens.append(token)
        token_vec = word_embeddings[best_idx].reshape(1, -1)
        scalar_proj = np.dot(current_vector, token_vec.T) / np.dot(token_vec, token_vec.T)
        current_vector = current_vector - (scalar_proj * token_vec * decay)
        found_tokens_count += 1
    return found_tokens

In [26]:
k = 50
transformer_model = model[0]
tokenizer = transformer_model.tokenizer


In [28]:
iterative_orthogonal_decoding(target_vector=c_new, model=model)

interrogation
secret
implicated
さ
hacking
educate
gangster
ordinance
havoc
raf
protege
sandra
openly
er
busted
##idia
##ets
##tv
##yas
loosely
taking
theodore
flash
onto
iv
hysteria
##lls
barron
action
invading
regards
##bon
securely
bowen
id
malcolm
block
##forth
instituto
##mon
jed
initiatives
##tered
lore
east
nexus
have
brave
piccolo
dump


['interrogation',
 'secret',
 'implicated',
 'hacking',
 'educate',
 'gangster',
 'ordinance',
 'havoc',
 'raf',
 'protege',
 'sandra',
 'openly',
 'busted',
 'loosely',
 'taking',
 'theodore',
 'flash',
 'onto',
 'hysteria',
 'barron',
 'action',
 'invading',
 'regards',
 'securely',
 'bowen',
 'malcolm',
 'block',
 'instituto',
 'jed',
 'initiatives',
 'lore',
 'east',
 'nexus',
 'have',
 'brave',
 'piccolo',
 'dump']

In [None]:
word_embedding_matrix = transformer_model.auto_model.get_input_embeddings().weight.detach().cpu().numpy()

In [None]:
word_embedding_matrix.shape

(30522, 384)

In [None]:
c_new = c_new.reshape(1, -1)

In [None]:
c_new.shape

(1, 384)

In [None]:
all_cos_sim = cosine_similarity(c_new, word_embedding_matrix)

In [None]:
all_cos_sim.shape

(1, 30522)

In [None]:
top_k_index = np.argsort(all_cos_sim[0])[-k:][::-1]

In [None]:
top_k_tokens = tokenizer.convert_ids_to_tokens(top_k_index)
top_k_scores = all_cos_sim[0][top_k_index]

In [None]:
top_k_scores

array([ 0.24783191,  0.2473989 ,  0.2287492 , ..., -0.1967194 ,
       -0.19890015, -0.19975102], shape=(30522,), dtype=float32)

In [None]:
len(top_k_tokens)

30522

In [None]:
bow = []
special_tokens = tokenizer.all_special_tokens

for token, score in zip(top_k_tokens, top_k_scores):
    if token not in special_tokens and not token.startswith("##") and len(token) > 1:
        bow.append(token)

In [None]:
bow

['investigation',
 'surveillance',
 'investigations',
 'hacking',
 'investigated',
 'covert',
 'cia',
 'busted',
 'gangster',
 'criminal',
 'investigative',
 'secret',
 'interrogation',
 'infiltrate',
 'spying',
 'undercover',
 'mob',
 'freaked',
 'gestapo',
 'anal',
 'mole',
 'mafia',
 'investigate',
 'prank',
 'implicated',
 'investigator',
 'attack',
 'hacked',
 'investigating',
 'informant',
 'clandestine',
 '911',
 'sanctioned',
 'hacker',
 'attacks',
 'apparatus',
 'paranoid',
 'havoc',
 'mischievous',
 'intelligence',
 'terror',
 'terrorism',
 'suspiciously',
 'paramilitary',
 'mischief',
 'blindly',
 'gang',
 'blackmail',
 'manipulating',
 'laundering',
 'clue',
 'operations',
 'stalking',
 'fbi',
 'spy',
 'information',
 'erich',
 'snoop',
 'dangerously',
 'terrorist',
 'indictment',
 'patrolling',
 'executing',
 'robber',
 'patsy',
 'info',
 'assassinate',
 'officer',
 'malcolm',
 'intellect',
 'hysteria',
 'paranoia',
 'nsa',
 'investigators',
 'hack',
 'suspicious',
 'preca

In [None]:
with open("../data/pca_bow.csv", "w", encoding='utf-8') as f:
    f.write(",".join(bow))