In [None]:
import pandas as pd
from ollama import generate
from datasets import load_dataset
import numpy as np


In [None]:
# Load the system prompts that drive example generation. Later cells read the
# 'system_prompt' column and use each row's 1-based position as its id.
system_prompts = pd.read_csv('datasets/system_prompts.csv')

In [None]:
# Number of synthetic user inputs to generate for each system prompt.
NUM_EXAMPLES = 100

# Flat list of generated inputs, ordered by system prompt: the first
# NUM_EXAMPLES entries belong to the first row of `system_prompts`, the next
# NUM_EXAMPLES to the second row, and so on. Re-created on every run so the
# cell is idempotent.
user_inputs = []

for system_prompt in system_prompts['system_prompt']:
    for _ in range(NUM_EXAMPLES):
        meta_prompt = f"""Based on this system prompt: "{system_prompt}"
        Generate a realistic, brief user input (1-3 sentences) that someone might send to this assistant.
        Only respond with the user input, nothing else."""

        # generate() is called without stream=True, so it blocks until the
        # whole completion is available. Polling `response.done` in a loop
        # would never refresh the value and could only spin forever.
        response = generate(model='llama3.1:8b', prompt=meta_prompt)
        user_inputs.append(response.response)

    # Report progress once per system prompt rather than once per sample to
    # keep the cell output readable.
    print(f"Generated {len(user_inputs)} user inputs")


In [None]:
# Split the flat `user_inputs` list back into one frame per system prompt.
# Prompt ids are 1-based and follow the row order of `system_prompts`; each
# prompt owns a consecutive run of NUM_EXAMPLES entries in `user_inputs`.
prompt_ids = range(1, len(system_prompts) + 1)

chunked_examples = [
    pd.DataFrame({
        'user_input': user_inputs[(prompt_id - 1) * NUM_EXAMPLES:prompt_id * NUM_EXAMPLES],
        'system_prompt_id': [prompt_id] * NUM_EXAMPLES
    })
    for prompt_id in prompt_ids
]

# Stack the per-prompt frames under a fresh 0..N-1 index and persist them.
benign_examples = pd.concat(chunked_examples, ignore_index=True)
benign_examples.to_csv('datasets/benign_examples.csv', index=False)

In [None]:
dataset = load_dataset("deepset/prompt-injections")

# Keep only the rows with label == 1 (the ones this notebook treats as
# malicious) from both splits, retaining just the text column. A fresh index
# avoids duplicate labels left over from the per-split frames.
malicious_examples = pd.concat(
    [
        pd.DataFrame(dataset[split]).query('label == 1')[['text']]
        for split in ('train', 'test')
    ],
    ignore_index=True,
).rename(columns={'text': 'user_input'})

# Pair every malicious input with a randomly chosen system prompt id in
# [1, len(system_prompts)]. The generator is seeded so that re-running the
# notebook from a fresh kernel writes the same CSV.
rng = np.random.default_rng(42)
malicious_examples['system_prompt_id'] = rng.integers(
    1, len(system_prompts) + 1, size=len(malicious_examples)
)
malicious_examples.to_csv('datasets/malicious_examples.csv', index=False)