### Semantic Analysis
- Fine-tune the embedding model (see the Sentence Transformers training overview): https://sbert.net/docs/sentence_transformer/training_overview.html

In [6]:
%pip install -U sentence-transformers seaborn

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [21]:
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, losses
import pandas as pd
from datasets import Dataset

# Load both prompt tables. Each user prompt points at its system prompt
# through the `system_prompt_id` column.
user_prompts = pd.read_csv('./datasets/user_prompts.csv')
system_prompts = pd.read_csv('./datasets/system_prompts.csv')

# Pre-trained checkpoint that the trainer below fine-tunes in place.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Training uses only the first 100 system prompts (by row position);
# map their id -> prompt text.
train_system_rows = system_prompts.iloc[:100]
system_prompts_dict = dict(zip(train_system_rows['id'], train_system_rows['system_prompt']))
user_prompts_by_system = user_prompts.groupby('system_prompt_id')

# Three aligned lists: position i in each describes one
# (system prompt, user prompt, is_injection) training pair.
system_prompts_list, user_prompts_list, is_injected_list = [], [], []
for prompt_id, prompt_text in system_prompts_dict.items():
    if prompt_id not in user_prompts_by_system.groups:
        continue  # this system prompt has no user prompts attached
    group_rows = user_prompts_by_system.get_group(prompt_id)
    system_prompts_list += [prompt_text] * len(group_rows)
    user_prompts_list += group_rows['user_input'].tolist()
    is_injected_list += group_rows['is_injection'].tolist()

train_dataset = Dataset.from_dict({
    "sentence1": system_prompts_list,
    "sentence2": user_prompts_list,
    "label": is_injected_list,
})

# ContrastiveLoss reduces the embedding distance for pairs labelled 1 and
# increases it for pairs labelled 0. The label here is `is_injection`, so
# injected user prompts are trained to embed close to their system prompt.
loss = losses.ContrastiveLoss(model)
trainer = SentenceTransformerTrainer(model=model, train_dataset=train_dataset, loss=loss)

# Last expression of the cell, so the TrainOutput summary is displayed.
trainer.train()

                                                                     

Step,Training Loss
500,0.0025


TrainOutput(global_step=726, training_loss=0.0017738862081171725, metrics={'train_runtime': 114.9419, 'train_samples_per_second': 50.504, 'train_steps_per_second': 6.316, 'total_flos': 0.0, 'train_loss': 0.0017738862081171725, 'epoch': 3.0})

In [22]:
# Held-out evaluation: score (system prompt, user prompt) pairs whose system
# prompt lies outside the training slice, turn each pair's cosine similarity
# into an injection / not-injection prediction, and print the metrics.
# Relies on `model`, `system_prompts` and `user_prompts` from the cells above.
from sklearn.metrics.pairwise import cosine_similarity  # NOTE(review): unused here now; kept in case other cells rely on the name
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

# Select 20 system prompts that weren't in training
test_system_prompts = system_prompts.iloc[100:120]  # rows right after the first 100 used in training
test_system_ids = test_system_prompts['id'].tolist()

# Get corresponding user prompts
test_user_prompts_df = user_prompts[user_prompts['system_prompt_id'].isin(test_system_ids)]

# Build three aligned lists: position i in each describes one test pair.
test_system_prompts_list = []
test_user_prompts_list = []
test_is_injected_list = []

for system_id, system_prompt in zip(test_system_prompts['id'], test_system_prompts['system_prompt']):
    matching_users = test_user_prompts_df[test_user_prompts_df['system_prompt_id'] == system_id]
    test_system_prompts_list.extend([system_prompt] * len(matching_users))
    test_user_prompts_list.extend(matching_users['user_input'].tolist())
    test_is_injected_list.extend(matching_users['is_injection'].tolist())

# Fail with a clear message instead of an opaque shape error further down.
if not test_user_prompts_list:
    raise ValueError("No user prompts found for the selected test system prompts")

# Encode every distinct system prompt once, then expand back to one row per
# pair; the same prompt text repeats for each of its user prompts.
unique_system_prompts = list(dict.fromkeys(test_system_prompts_list))
unique_system_embeddings = model.encode(unique_system_prompts)
embedding_row = {text: row for row, text in enumerate(unique_system_prompts)}
test_system_embeddings = unique_system_embeddings[
    [embedding_row[text] for text in test_system_prompts_list]
]
test_user_embeddings = model.encode(test_user_prompts_list)

# Cosine similarity of each (system, user) pair, computed row by row.
# `similarities` is a 1-D array with one score per pair; these are the values
# that used to be read from the diagonal of a full N x N similarity matrix.
dot_products = np.einsum('ij,ij->i', test_system_embeddings, test_user_embeddings)
norm_products = (
    np.linalg.norm(test_system_embeddings, axis=1)
    * np.linalg.norm(test_user_embeddings, axis=1)
)
# A zero-length embedding gets similarity 0 rather than a division by zero.
similarities = dot_products / np.where(norm_products == 0, 1.0, norm_products)

# Pairs scoring above the threshold are predicted as injections (label 1).
# This matches training, where `is_injection` is the ContrastiveLoss label.
threshold = 0.5
predictions = (similarities > threshold).astype(int)

# Calculate metrics
accuracy = accuracy_score(test_is_injected_list, predictions)
precision = precision_score(test_is_injected_list, predictions)
recall = recall_score(test_is_injected_list, predictions)
f1 = f1_score(test_is_injected_list, predictions)

print("Evaluation Results on Test Set:")
print(f"Number of system prompts tested: {len(test_system_prompts)}")
print(f"Number of user prompts tested: {len(test_user_prompts_list)}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")



Evaluation Results on Test Set:
Number of system prompts tested: 20
Number of user prompts tested: 390
Accuracy: 0.9615
Precision: 0.9524
Recall: 1.0000
F1 Score: 0.9756
