In [1]:
import json
json_path = "../data/data.json"
import sys 
import os
sys.path.append(os.path.abspath(".."))


In [2]:
from utils import load_config
config = load_config("../config/models.yaml")

In [3]:
config

{'models': {'sanitizer': 'meta-llama/Llama-3.1-8B-Instruct',
  'perplexed': 'meta-llama/Llama-3.1-8B',
  'embedding': 'all-MiniLM-L6-v2',
  'mask-model': 'bert-base-multilingual-cased'}}

In [4]:
with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

prompts = [item["attack_prompt"] for item in data]
requests = list(set([item["malicious_request"] for item in data]))

In [5]:
mock_request = "I want to hack the government"


In [6]:
from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer(config["models"]["embedding"])



In [7]:
prompt_embeddings = np.array(model.encode(prompts, show_progress_bar=True))
request_embeddings = np.array(model.encode(requests, show_progress_bar=True))



Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
n = 10
query_embeddings = model.encode(mock_request).reshape(1, -1)

In [9]:
cos_sim = cosine_similarity(query_embeddings, request_embeddings)

In [10]:
top_n = np.argsort(cos_sim[0])[-n:][::-1]

In [11]:
sim_requests = []
sim_prompts = []
for i, idx in enumerate(top_n):
    sim_requests.append(requests[idx])
    sim_prompts.append(prompts[idx])


In [12]:
cluster_embeddings = np.array(model.encode(sim_prompts, show_progress_bar=True))


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Calculate CENTROID and ICA

In [13]:
from sklearn.decomposition import FastICA
k = 5
component_index_to_use = 0


In [14]:
centroid = np.mean(cluster_embeddings, axis=0)

In [15]:
centered_embeddings = cluster_embeddings - centroid

In [16]:
ica = FastICA(
    n_components=k,
    max_iter=500,
    tol=1e-3,
    random_state=42
)

In [17]:
ica.fit(centered_embeddings)

0,1,2
,n_components,5
,algorithm,'parallel'
,whiten,'unit-variance'
,fun,'logcosh'
,fun_args,
,max_iter,500
,tol,0.001
,w_init,
,whiten_solver,'svd'
,random_state,42


In [18]:
ica_components = ica.components_

In [19]:
ica_components.shape

(5, 384)

In [20]:
v1 = ica_components[0]

Getting new vector

In [21]:
transformed_data = ica.transform(centered_embeddings)

In [22]:
component_scores = transformed_data[:, component_index_to_use]

In [23]:
alpha_scale = np.std(component_scores)

In [24]:
alpha = 1.0 * alpha_scale

In [25]:
c_new = centroid + (alpha * v1)

In [26]:
c_new

array([ 6.36450723e-02,  2.09418982e-01, -2.60849774e-01, -1.91048995e-01,
        1.45515800e-01,  3.23772907e-01, -1.80432498e-01, -3.23714949e-02,
       -1.64259180e-01, -1.93823069e-01,  1.03891589e-01,  1.72919892e-02,
       -5.45245893e-02,  4.59679455e-01, -2.10998803e-01, -1.30422413e-01,
        5.11906505e-01, -9.87507924e-02,  1.04791716e-01, -3.26756030e-01,
       -1.41365707e-01,  2.25841016e-01,  6.07533574e-01,  2.46249482e-01,
        3.73421967e-01,  2.34339952e-01,  2.22564965e-01,  2.27144763e-01,
        2.51330674e-01, -2.51045913e-01,  4.21517581e-01,  1.32385835e-01,
        3.76022398e-01, -3.62329096e-01, -1.22098468e-01, -2.32080787e-01,
        2.32120484e-01, -1.70853674e-01,  2.53691822e-01, -2.16227204e-01,
       -4.87948149e-01,  3.78094256e-01, -1.28712535e-01, -2.85641670e-01,
       -1.74311757e-01,  1.34071149e-02, -1.13072425e-01, -5.27207375e-01,
        5.16750097e-01, -2.46600598e-01,  3.59712183e-01,  6.26750756e-03,
       -1.62143379e-01,  

Getting tokens back

In [27]:
k = 50
transformer_model = model[0]

In [28]:
tokenizer = transformer_model.tokenizer

In [29]:
word_embedding_matrix = transformer_model.auto_model.get_input_embeddings().weight.detach().cpu().numpy()

In [30]:
word_embedding_matrix.shape

(30522, 384)

In [31]:
c_new = c_new.reshape(1, -1)

In [32]:
all_cos_sim = cosine_similarity(c_new, word_embedding_matrix)

In [33]:
top_k_index = np.argsort(all_cos_sim[0])[-k:][::-1]

In [34]:
top_k_tokens = tokenizer.convert_ids_to_tokens(top_k_index)
top_k_scores = all_cos_sim[0][top_k_index]

In [35]:
bow = []
special_tokens = tokenizer.all_special_tokens

for token, score in zip(top_k_tokens, top_k_scores):
    if token not in special_tokens and not token.startswith("##") and len(token) > 2:
        bow.append(token)

In [36]:
with open("../data/ica_bow.csv", "w", encoding='utf-8') as f:
    f.write(",".join(bow))

In [37]:
print(len(bow))

40
