### Benchmark Prompts for ChatGPT, Claude, and internal chatbot

In [9]:
# Packages needed
#!pip install openai anthropic sentence-transformers pandas --quiet
#!pip install anthropic 
#!pip install openai --upgrade


In [None]:

import os
import pandas as pd
import openai
import anthropic
from sentence_transformers import SentenceTransformer, util

# Load embedding model for scoring.
# Created once at module level; similarity_score() below reuses this single
# instance to embed every prompt and response instead of reloading the model.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Load prompts from CSV
def load_prompts(file_path):
    """Read benchmark prompts from a CSV file.

    Parameters
    ----------
    file_path : str or path-like
        Location of a CSV file that contains a 'Prompt' column.

    Returns
    -------
    list of str
        The non-missing values of the 'Prompt' column, in file order,
        each converted to a string.

    Raises
    ------
    KeyError
        If the CSV has no 'Prompt' column.
    """
    df = pd.read_csv(file_path)
    # Drop missing cells *before* casting to str: astype(str) turns NaN into
    # the literal text "nan", which dropna() can no longer recognise, so empty
    # rows would otherwise be returned as bogus "nan" prompts.
    return df['Prompt'].dropna().astype(str).tolist()
    

# Compute cosine similarity between prompt and response
def similarity_score(prompt, response):
    """Return the cosine similarity between a prompt and a response.

    Both texts are embedded (prompt first, then response) with the
    module-level `embedding_model`; the similarity of the two embeddings is
    returned as a plain Python number via `.item()`.
    """
    prompt_vec, response_vec = (
        embedding_model.encode(text, convert_to_tensor=True)
        for text in (prompt, response)
    )
    cosine = util.pytorch_cos_sim(prompt_vec, response_vec)
    return cosine.item()

# Save results to CSV
def save_results(data, filename):
    """Write benchmark rows to a CSV file and print a confirmation.

    Each row in `data` holds three values, written under the headers
    Prompt, Response and SimilarityScore. The DataFrame index is not written.
    """
    header = ["Prompt", "Response", "SimilarityScore"]
    pd.DataFrame(data, columns=header).to_csv(filename, index=False)
    print(f"✅ Saved to {filename}")




In [8]:
# OpenAI GPT test
from openai import OpenAI

# Benchmark settings for this run, kept in one place so they are easy to tune.
OPENAI_MODEL = "gpt-4"
OPENAI_MAX_TOKENS = 1024

# 🔑 Get OpenAI API key from environment variable.
# Validate the key BEFORE building the client: the OpenAI constructor raises its
# own error when no key is available, so a check placed after construction
# never gets the chance to show the message below.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise ValueError("❌ OPENAI_API_KEY environment variable not found.")

client = OpenAI(api_key=openai_api_key)

# Load prompts (code-generation set first, then research-assistance set)
code_prompts = load_prompts("benchmark-prompts-top10_code-generation.csv")
research_prompts = load_prompts("benchmark-prompts-top10_research-assistance.csv")
all_prompts = code_prompts + research_prompts

# One (prompt, response, score) row per prompt, in the order the prompts were loaded.
chatgpt_results = []

for prompt in all_prompts:
    try:
        # Send just the prompt text to the API; temperature=0 requests the
        # least random sampling so repeated runs are as comparable as possible.
        response = client.chat.completions.create(
            model=OPENAI_MODEL,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=OPENAI_MAX_TOKENS,
            temperature=0
        )
        content = response.choices[0].message.content
        score = similarity_score(prompt, content)
        # Only the first 200 characters of the response are echoed to keep the
        # cell output readable; the full text is what gets saved.
        print(f"🟢 Prompt: {prompt}\n🔁 Response: {content[:200]}...\n🧠 Score: {score:.4f}\n{'-'*60}")
        chatgpt_results.append((prompt, content, score))
    except Exception as e:
        # Deliberate best-effort: a failure on one prompt is logged and recorded
        # as an "[ERROR]" row with score 0.0 so the remaining prompts still run.
        print(f"[OpenAI ERROR] {e}")
        chatgpt_results.append((prompt, "[ERROR]", 0.0))

save_results(chatgpt_results, "benchmark-prompts_chatgpt_results.csv")


🟢 Prompt: Write a Python script to process log files and identify anomalies based on time gaps (Prompt 1).
🔁 Response: Here is a simple Python script that processes a log file and identifies anomalies based on time gaps. This script assumes that the log file contains timestamps in the format "YYYY-MM-DD HH:MM:SS".

``...
🧠 Score: 0.8756
------------------------------------------------------------
🟢 Prompt: Create an SPL query to detect potential data exfiltration via large outbound transfers (Prompt 2).
🔁 Response: index=network_traffic sourcetype=stream:tcp dest_port!=80 AND dest_port!=443 
| stats sum(bytes_out) as total_bytes_out by dest_ip 
| where total_bytes_out > 1000000000 
| sort - total_bytes_out...
🧠 Score: 0.2670
------------------------------------------------------------
🟢 Prompt: Create an SPL query to detect potential data exfiltration via large outbound transfers (Prompt 2).
🔁 Response: index=network_traffic sourcetype=stream:tcp dest_port!=80 AND dest_port!=443 
| sta