# 🚀 Automated Evaluation of AH-GPT Responses using PyRIT

## **📌 Overview**
This notebook evaluates the responses of **AH-GPT** using the **PyRIT** framework. It sends predefined prompts to AH-GPT, evaluates the responses, and generates a report.

## **🛠️ Steps in this Notebook**
- **⚙️ Configuration** - Set up API endpoints and authentication.
- **📋 Load QA Dataset** - Define test questions and expected answers.
- **🚀 Initialize PyRIT** - Configure the testing environment.
- **🔄 Create Chat Threads** - Set up conversation threads.
- **📡 Send Prompts & Evaluate Responses** - Run the main test loop.
- **📊 Generate Report** - Save the results for analysis.

In [1]:
import uuid
import asyncio
import time
from pathlib import Path
import requests
from dotenv import load_dotenv
import os

# PyRIT Imports
from pyrit.common import IN_MEMORY, initialize_pyrit
from pyrit.prompt_target import OpenAIChatTarget, AHGPTHttpTarget
from pyrit.score.evaluator import Evaluator
from pyrit.orchestrator import AHGPTPromptSendingOrchestrator
from pyrit.common.text_helper import save_html_report, generate_single_turn_html_report
from pyrit.prompt_target import AHGPTResponseParser


In [2]:
# Load environment variables (load_dotenv also picks up a local .env file, if any)
load_dotenv()

BASE_URL = os.getenv("AH_GPT_NONPRD_ENDPOINT")
TOKEN = os.getenv("AH_GPT_NONPRD_TOKEN")

# Fail fast on missing configuration: os.getenv returns None for unset
# variables, which would otherwise be interpolated as the literal text "None"
# into the HTTP request template and passed as the URL for thread creation.
_missing_env_vars = [
    name
    for name, value in (
        ("AH_GPT_NONPRD_ENDPOINT", BASE_URL),
        ("AH_GPT_NONPRD_TOKEN", TOKEN),
    )
    if not value
]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env_vars)
        + ". Set them in the environment or in a .env file before running this notebook."
    )

# Headers for API Requests (used when creating chat threads)
HEADERS = {
    "Content-Type": "application/json",
    "X-Authorization": TOKEN,
    "Accept": "*/*"
}

# Payload for creating a chat thread
thread_init_payload = {
    "message": "Hello, start this chat!",
    "model": "gpt-4o-mini"
}


In [3]:
# Evaluation dataset: every entry holds one question and its expected outcome.
# main() splits these into the prompt list and the expected-output list.
qa_pairs = [
    dict(
        question="Where can I find my working hours",
        expected_outcomes="text_message: Under SAM timesheet",
    ),
    dict(
        question="Give me a nice recipe",
        expected_outcomes="text_message: [a nice recipe definition]",
    ),
]


In [4]:
# Initialize PyRIT with the IN_MEMORY memory database type before any target,
# scorer, or orchestrator is constructed in the cells below.
initialize_pyrit(memory_db_type=IN_MEMORY)


In [5]:
# Raw HTTP request template for posting one message to an existing chat thread.
# The doubled braces survive f-string formatting as the literal placeholders
# {CHAT_ID} and {PROMPT}; BASE_URL and TOKEN are interpolated immediately.
# NOTE(review): presumably the target fills in both placeholders per request - confirm.
ah_gpt_request_template = f"""
        POST {BASE_URL}/{{CHAT_ID}}/messages/stream
        Content-Type: application/json
        X-Authorization: {TOKEN}
        Accept: */*

        {{
            "message": "{{PROMPT}}"
        }}
    """

# Target under test: sends the template above and hands the response to
# AHGPTResponseParser.parse_response.
http_prompt_target = AHGPTHttpTarget(
    http_request=ah_gpt_request_template,
    prompt_regex_string="{PROMPT}",
    timeout=60.0,
    callback_function=AHGPTResponseParser.parse_response,
)

# Scorer: an Evaluator backed by an OpenAI chat target and configured from the
# dataset-evaluator YAML file (path is relative to the working directory).
judge_chat_target = OpenAIChatTarget()
evaluator_config_path = Path("assets/AH_Evaluators/ah_gpt/ah_gpt_dataset_evaluator.yaml")
scorer = Evaluator(
    chat_target=judge_chat_target,
    evaluator_yaml_path=evaluator_config_path,
    scorer_type="float_scale",
)

# Orchestrator that ties the target under test to the scorer.
orchestrator = AHGPTPromptSendingOrchestrator(
    objective_target=http_prompt_target,
    scorers=[scorer],
)


In [6]:
def create_chat_threads(
    required_count: int,
    delay_seconds: float = 0.5,
    max_attempts: "int | None" = None,
    request_timeout: float = 60.0,
):
    """Create chat threads until ``required_count`` thread ids are collected.

    Each attempt POSTs ``thread_init_payload`` to ``BASE_URL`` with ``HEADERS``
    and reads the ``chatId`` field from the JSON response. Failed attempts are
    reported on stdout and retried.

    Args:
        required_count: Number of thread ids to collect.
        delay_seconds: Pause between consecutive attempts, in seconds.
        max_attempts: Upper bound on the total number of POST attempts.
            Defaults to ``5 * required_count`` so that a permanently failing
            endpoint cannot keep the notebook retrying forever.
        request_timeout: Timeout in seconds passed to ``requests.post`` so a
            stalled connection cannot hang the cell indefinitely.

    Returns:
        A list with ``required_count`` thread ids, in creation order.

    Raises:
        RuntimeError: If the attempt budget is exhausted before enough
            threads have been created.
    """
    if max_attempts is None:
        max_attempts = max(required_count, 0) * 5

    thread_ids = []
    attempt = 0

    while len(thread_ids) < required_count:
        if attempt >= max_attempts:
            raise RuntimeError(
                f"Created only {len(thread_ids)} of {required_count} chat threads "
                f"after {attempt} attempts."
            )
        if attempt:
            # Pause only between attempts: not before the first one and not
            # after the last one.
            time.sleep(delay_seconds)
        attempt += 1

        # Reset per attempt so the failure branch never reports the body of a
        # response that belongs to an earlier iteration.
        response = None
        try:
            response = requests.post(
                BASE_URL,
                headers=HEADERS,
                json=thread_init_payload,
                timeout=request_timeout,
            )
            response.raise_for_status()

            thread_id = response.json().get("chatId")
            if thread_id:
                thread_ids.append(thread_id)
            else:
                print(f"[Attempt {attempt}] No chatId in response.")

        except Exception as e:
            # Deliberately broad: transport errors, HTTP error statuses and
            # malformed JSON bodies are all treated as a retryable failure.
            print(f"[Attempt {attempt}] Failed to create thread: {e}")
            if response is not None:
                print("Raw response:", response.text)

    print("✅ All threads created.\n")
    return thread_ids


In [7]:
async def main():
    """Run the dataset evaluation end to end and save an HTML report.

    Steps:
        1. Split ``qa_pairs`` into a prompt list and an expected-outcome list.
        2. Create as many chat threads as there are QA pairs.
        3. Send the prompts through ``orchestrator`` together with the
           expected outcomes and the thread ids.
        4. Collect the chat results and write the report under
           ``tests/E2E/reports/DataSet`` (resolved against the current working
           directory; created if missing).

    The reported execution time covers thread creation, prompt sending and
    result collection, but not the report generation itself.
    """
    questions = [pair["question"] for pair in qa_pairs]
    expected_outcomes = [pair["expected_outcomes"] for pair in qa_pairs]

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way a time.time() difference can.
    start_time = time.perf_counter()

    # Create chat threads
    # NOTE: this call is synchronous (requests + time.sleep) and blocks the
    # event loop until every thread has been created.
    thread_ids = create_chat_threads(required_count=len(qa_pairs))

    # Send prompts and evaluate responses
    await orchestrator.send_prompts_async(
        prompt_list=questions,
        expected_output_list=expected_outcomes,
        thread_ids=thread_ids
    )

    # Collect results and execution time
    results = orchestrator.get_chat_results()
    execution_time = time.perf_counter() - start_time

    # Save HTML report
    report_dir = Path("tests/E2E/reports/DataSet").resolve()
    report_dir.mkdir(parents=True, exist_ok=True)

    save_html_report(
        results=results,
        directory=str(report_dir),
        report_generator=generate_single_turn_html_report,
        is_chat_evaluation=False,
        threshold=0.7,
        file_name="ah_gpt_dataset",
        description="Evaluation of inputs vs. expected/actual outputs with scoring.",
        execution_time=execution_time
    )


In [8]:
# Run the full evaluation. Top-level await relies on the notebook kernel's
# already-running event loop, so no asyncio.run() wrapper is used here.
await main()


✅ All threads created.

✅ Report saved at: /Users/denizdalkilic/Documents/Forks/PyRIT/tests/E2E/reports/DataSet/ah_gpt_dataset_20250328_151011.html
