### Semantic Analysis
- Fine-tune the embedding model, following the Sentence Transformers training overview: https://sbert.net/docs/sentence_transformer/training_overview.html

In [21]:
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, losses
import pandas as pd
from datasets import Dataset

# Load the raw prompt tables from disk.
user_prompts = pd.read_csv('./datasets/user_prompts.csv')
system_prompts = pd.read_csv('./datasets/system_prompts.csv')

# Base sentence-embedding checkpoint that gets fine-tuned below.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Map id -> system prompt text for the first 100 system prompts (the training slice).
training_slice = system_prompts.iloc[:100]
system_prompts_dict = dict(zip(training_slice['id'], training_slice['system_prompt']))
user_prompts_by_system = user_prompts.groupby('system_prompt_id')

# Parallel lists: one entry per (system prompt, user prompt) pair.
system_prompts_list, user_prompts_list, is_injected_list = [], [], []

for system_id, system_prompt in system_prompts_dict.items():
    # System prompts without any user prompts contribute no pairs.
    if system_id not in user_prompts_by_system.groups:
        continue
    group = user_prompts_by_system.get_group(system_id)
    # Repeat the system prompt once per matching user prompt so the lists stay aligned.
    system_prompts_list += [system_prompt] * len(group)
    user_prompts_list += group['user_input'].tolist()
    is_injected_list += group['is_injection'].tolist()

# NOTE(review): per the Sentence Transformers docs, ContrastiveLoss reduces the
# distance for label == 1 pairs, so injected user prompts are trained to embed
# *close* to their system prompt — confirm this is the intended direction.
train_dataset = Dataset.from_dict(
    {
        "sentence1": system_prompts_list,
        "sentence2": user_prompts_list,
        "label": is_injected_list,
    }
)

loss = losses.ContrastiveLoss(model)

trainer = SentenceTransformerTrainer(model=model, train_dataset=train_dataset, loss=loss)

# Last expression of the cell: runs training and displays the TrainOutput.
trainer.train()

                                                                     

Step,Training Loss
500,0.0025


TrainOutput(global_step=726, training_loss=0.0017738862081171725, metrics={'train_runtime': 114.9419, 'train_samples_per_second': 50.504, 'train_steps_per_second': 6.316, 'total_flos': 0.0, 'train_loss': 0.0017738862081171725, 'epoch': 3.0})

In [22]:
# Imports for this cell, grouped up front so its dependencies are visible at a glance.
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# No longer called in this cell; retained so any other cell relying on the name keeps working.
from sklearn.metrics.pairwise import cosine_similarity

# Select 20 system prompts that weren't in training
# (training used rows 0-99 of `system_prompts`, so rows 100-119 are held out).
test_system_prompts = system_prompts.iloc[100:120]
test_system_ids = test_system_prompts['id'].tolist()

# Get corresponding user prompts
test_user_prompts_df = user_prompts[user_prompts['system_prompt_id'].isin(test_system_ids)]

# Parallel lists for evaluation: one entry per (system prompt, user prompt) pair.
test_system_prompts_list = []
test_user_prompts_list = []
test_is_injected_list = []

# Group user prompts by system prompt
for _, system_row in test_system_prompts.iterrows():
    matching_users = test_user_prompts_df[test_user_prompts_df['system_prompt_id'] == system_row['id']]
    test_system_prompts_list.extend([system_row['system_prompt']] * len(matching_users))
    test_user_prompts_list.extend(matching_users['user_input'].tolist())
    test_is_injected_list.extend(matching_users['is_injection'].tolist())

# Fail with a clear message instead of an obscure shape error further down.
if not test_user_prompts_list:
    raise ValueError("No user prompts found for the held-out system prompts")

# Encode test prompts
test_system_embeddings = model.encode(test_system_prompts_list)
test_user_embeddings = model.encode(test_user_prompts_list)

# Row-wise cosine similarity between each system prompt and its paired user prompt.
# The previous code built the full N x N matrix with cosine_similarity() and kept
# only its diagonal; computing just the paired values avoids the quadratic work.
# `similarities` is therefore a 1-D array with one value per pair.
pair_dot_products = (test_system_embeddings * test_user_embeddings).sum(axis=1)
norm_products = (
    np.linalg.norm(test_system_embeddings, axis=1)
    * np.linalg.norm(test_user_embeddings, axis=1)
)
# Match sklearn's handling of zero vectors (similarity 0 rather than NaN).
norm_products[norm_products == 0] = 1.0
similarities = pair_dot_products / norm_products

# Convert similarities to binary predictions: above the threshold => predicted injection.
threshold = 0.5
predictions = (similarities > threshold).astype(int)

# Calculate metrics
accuracy = accuracy_score(test_is_injected_list, predictions)
precision = precision_score(test_is_injected_list, predictions)
recall = recall_score(test_is_injected_list, predictions)
f1 = f1_score(test_is_injected_list, predictions)

print("Evaluation Results on Test Set:")
print(f"Number of system prompts tested: {len(test_system_prompts)}")
print(f"Number of user prompts tested: {len(test_user_prompts_list)}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")



Evaluation Results on Test Set:
Number of system prompts tested: 20
Number of user prompts tested: 390
Accuracy: 0.9615
Precision: 0.9524
Recall: 1.0000
F1 Score: 0.9756


In [16]:
from datasets import Dataset
from models.utils import get_training_and_validation_splits
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from setfit import SetFitModel, TrainingArguments, Trainer
import torch

class SetFitPromptEmbeddingTrainer:
    """Fine-tune and evaluate a SetFit classifier on system/user prompt pairs.

    Each example is the text "System: <system prompt> \\n User: <user prompt>"
    labelled with the user prompt's ``is_injection`` value.
    """

    def __init__(self, model_name):
        """Load the pretrained SetFit model and move it to the best available device.

        Args:
            model_name: Model identifier or path passed to ``SetFitModel.from_pretrained``.
        """
        # Prefer CUDA, then Apple MPS, then fall back to CPU.
        if torch.cuda.is_available():
            self.device = "cuda"
        elif torch.backends.mps.is_available():
            self.device = "mps"
        else:
            self.device = "cpu"
        self.model = SetFitModel.from_pretrained(model_name).to(self.device)
        self.trainer = None  # set by train(); evaluate() refuses to run until then

    def prepare_dataset(self, system_prompts_df, user_prompts_df):
        """Build a ``Dataset`` with ``text`` and ``label`` columns from two row-aligned frames.

        Args:
            system_prompts_df: Frame with a ``system_prompt`` column; row *i* is
                paired with row *i* of ``user_prompts_df``.
            user_prompts_df: Frame with ``user_input`` and ``is_injection`` columns.

        Returns:
            A ``Dataset`` whose ``text`` column holds the combined prompt strings
            and whose ``label`` column holds the ``is_injection`` values.

        Raises:
            ValueError: If the two frames do not have the same number of rows.
        """
        system_prompts_list = system_prompts_df["system_prompt"].tolist()
        user_prompts_list = user_prompts_df["user_input"].tolist()
        is_injected_list = user_prompts_df["is_injection"].tolist()

        # zip() would silently drop trailing rows on a length mismatch, hiding
        # misaligned inputs; fail loudly instead.
        if len(system_prompts_list) != len(user_prompts_list):
            raise ValueError(
                "system_prompts_df and user_prompts_df must have the same number of rows "
                f"(got {len(system_prompts_list)} and {len(user_prompts_list)})"
            )

        text_pairs = [
            f"System: {system_prompt} \n User: {user_prompt}"
            for system_prompt, user_prompt in zip(system_prompts_list, user_prompts_list)
        ]

        return Dataset.from_dict({
            "text": text_pairs,
            "label": is_injected_list,
        })

    def train(self, epochs=3, iterations=20, output_dir="./saved_models/setfit_model"):
        """Fine-tune the model on a small split and save it to disk.

        Args:
            epochs: Passed to ``TrainingArguments`` as ``num_epochs``.
            iterations: Passed to ``TrainingArguments`` as ``num_iterations``.
            output_dir: Directory the fine-tuned model is saved to.
        """
        # NOTE(review): training requests total_size=10 while evaluate() uses the
        # helper's default size, so the two validation splits differ — confirm the
        # larger evaluation split cannot contain rows that were trained on.
        (train_system_prompts, train_user_prompts), (val_system_prompts, val_user_prompts) = (
            get_training_and_validation_splits(total_size=10)
        )

        train_dataset = self.prepare_dataset(train_system_prompts, train_user_prompts)
        val_dataset = self.prepare_dataset(val_system_prompts, val_user_prompts)

        args = TrainingArguments(
            num_epochs=epochs,
            num_iterations=iterations,
        )

        self.trainer = Trainer(
            model=self.model,
            args=args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            metric="f1"
        )

        self.trainer.train()
        self.model.save_pretrained(output_dir)

    def evaluate(self):
        """Score the trained model on the validation split.

        Returns:
            Dict with ``num_system_prompts``, ``num_user_prompts``, ``accuracy``,
            ``precision``, ``recall`` and ``f1``.

        Raises:
            ValueError: If ``train()`` has not been called yet.
        """
        if self.trainer is None:
            raise ValueError("Model must be trained before evaluation")

        (_, _), (val_system_prompts, val_user_prompts) = get_training_and_validation_splits()
        val_dataset = self.prepare_dataset(val_system_prompts, val_user_prompts)

        predictions = self.model.predict(val_dataset["text"])

        # Build the ground-truth label list once and reuse it for every metric.
        labels = val_user_prompts["is_injection"].tolist()

        return {
            "num_system_prompts": len(val_system_prompts),
            "num_user_prompts": len(val_user_prompts),
            "accuracy": accuracy_score(labels, predictions),
            "precision": precision_score(labels, predictions),
            "recall": recall_score(labels, predictions),
            "f1": f1_score(labels, predictions),
        }

    def get_model(self):
        """Return the underlying SetFit model."""
        return self.model


In [17]:
# Build the SetFit trainer on top of the MiniLM sentence-transformer checkpoint.
trainer = SetFitPromptEmbeddingTrainer(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Fine-tune using train()'s defaults (epochs=3, iterations=20).
trainer.train()

# Score the fine-tuned model, then report each metric on its own line.
eval_results = trainer.evaluate()
print("\nEvaluation Results:")
for name in eval_results:
    print(f"{name}: {eval_results[name]}")


model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Map: 100%|██████████| 160/160 [00:00<00:00, 26041.47 examples/s]
***** Running training *****
  Num unique pairs = 6400
  Batch size = 16
  Num epochs = 3


Step,Training Loss
1,0.2934
50,0.2681
100,0.2459
150,0.1913
200,0.1725
250,0.1351
300,0.1587
350,0.1462
400,0.1486
450,0.1486



Evaluation Results:
num_system_prompts: 3254
num_user_prompts: 3254
accuracy: 0.8976644130301168
precision: 0.9066220238095238
recall: 0.9674473997618103
f1: 0.9360476281928174
