In [1]:
import json
json_path = "../data/data.json"
import sys 
import os
sys.path.append(os.path.abspath(".."))


In [2]:
from utils import load_config
config = load_config("../config/models.yaml")

In [3]:
config

{'models': {'sanitizer': 'meta-llama/Llama-3.1-8B-Instruct',
  'perplexed': 'meta-llama/Llama-3.1-8B',
  'embedding': 'all-MiniLM-L6-v2'}}

In [4]:
with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

prompts = [item["attack_prompt"] for item in data]
requests = list(set([item["malicious_request"] for item in data]))

In [5]:
mock_request = "I want to hack the government"


In [7]:
from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer(config["models"]["embedding"])



In [8]:
prompt_embeddings = np.array(model.encode(prompts, show_progress_bar=True))
request_embeddings = np.array(model.encode(requests, show_progress_bar=True))



Batches:   0%|          | 0/24 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [9]:
from sklearn.metrics.pairwise import cosine_similarity
n = 10
query_embeddings = model.encode(mock_request).reshape(1, -1)

In [11]:
cos_sim = cosine_similarity(query_embeddings, request_embeddings)

In [None]:
top_n = np.argsort(cos_sim[0])[-n:][::-1]

In [15]:
sim_requests = []
sim_prompts = []
for i, idx in enumerate(top_n):
    sim_requests.append(requests[i])
    sim_prompts.append(prompts[i])


In [17]:
cluster_embeddings = np.array(model.encode(sim_prompts, show_progress_bar=True))


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Calculate CENTRODI and ICA

In [18]:
from sklearn.decomposition import FastICA
k = 5
component_index_to_use = 0


In [19]:
centroid = np.mean(cluster_embeddings, axis=0)

In [21]:
centered_embeddings = cluster_embeddings - centroid

In [22]:
ica = FastICA(
    n_components=k,
    max_iter=500,
    tol=1e-3,
    random_state=42
)

In [23]:
ica.fit(centered_embeddings)

0,1,2
,n_components,5
,algorithm,'parallel'
,whiten,'unit-variance'
,fun,'logcosh'
,fun_args,
,max_iter,500
,tol,0.001
,w_init,
,whiten_solver,'svd'
,random_state,42


In [24]:
ica_components = ica.components_

In [26]:
ica_components.shape

(5, 384)

In [27]:
v1 = ica_components[0]

Getting new vector

In [31]:
transformed_data = ica.transform(centered_embeddings)

In [32]:
component_scores = transformed_data[:, component_index_to_use]

In [33]:
alpha_scale = np.std(component_scores)

In [35]:
alpha = 1.0 * alpha_scale

In [37]:
c_new = centroid + (alpha * v1)

In [38]:
c_new

array([ 4.06655073e-01, -1.63179219e-01,  4.05958503e-01,  6.73668534e-02,
        2.93068975e-01,  1.69410452e-01,  2.26264775e-01, -2.76660472e-01,
       -2.60712691e-02,  5.99602982e-02, -3.06095839e-01, -1.29740804e-01,
        4.81915206e-01,  4.65953171e-01,  9.48285460e-02,  1.89967364e-01,
       -2.21231617e-02, -2.90399849e-01,  3.46581005e-02,  1.43983841e-01,
        3.02829109e-02,  2.15463802e-01,  5.19029260e-01,  4.01171893e-01,
       -3.70134920e-01, -1.31330088e-01,  1.63265280e-02, -5.75492345e-03,
        5.74355274e-02,  6.77354217e-01, -4.56582457e-01,  2.24777192e-01,
       -4.07063991e-01,  1.84932977e-01, -2.85444975e-01, -2.13404268e-01,
        9.23739821e-02,  2.58879989e-01, -5.25176078e-02,  1.16268389e-01,
       -1.68865070e-01, -1.65259838e-01,  6.29751533e-02,  1.90210924e-01,
        3.70545179e-01, -1.08679168e-01, -2.20230848e-01, -7.64778793e-01,
       -2.57421564e-02,  4.63415943e-02, -2.19614506e-01,  1.20605668e-02,
        5.85702620e-02,  

Getting tokens back

In [39]:
k = 15
transformer_model = model[0]

In [41]:
tokenizer = transformer_model.tokenizer

In [42]:
word_embedding_matrix = transformer_model.auto_model.get_input_embeddings().weight.detach().cpu().numpy()

In [44]:
word_embedding_matrix.shape

(30522, 384)

In [45]:
c_new = c_new.reshape(1, -1)

In [46]:
all_cos_sim = cosine_similarity(c_new, word_embedding_matrix)

In [47]:
top_k_index = np.argsort(all_cos_sim[0])[-k:][::-1]

In [48]:
top_k_tokens = tokenizer.convert_ids_to_tokens(top_k_index)
top_k_scores = all_cos_sim[0][top_k_index]

In [49]:
bow = []
special_tokens = tokenizer.all_special_tokens

for token, score in zip(top_k_tokens, top_k_scores):
    if token not in special_tokens and not token.startswith("##") and len(token) > 1:
        bow.append(token)

In [50]:
with open("../data/ica_bow.csv", "w", encoding='utf-8') as f:
    f.write(",".join(bow))