## Embedding Set for RAG Model

In [1]:
import os
from flask import Flask, request, jsonify
from werkzeug.serving import run_simple
from threading import Thread

# LangChain imports
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import SystemMessage, HumanMessage

# ----------------------------------------------------------------------------
# 1) Environment Variables for Javelin + GPT
# ----------------------------------------------------------------------------
llm_api_key = os.environ["OPENAI_API_KEY"] = ""

javelin_api_key = os.environ["JAVELIN_API_KEY"] = ""


# ----------------------------------------------------------------------------
# 2) Javelin Chat Model (GPT-3.5-turbo)
# ----------------------------------------------------------------------------
class JavelinOpenAI(ChatOpenAI):
    def __init__(self, temperature=0.7, route="testing"):
        javelin_headers = {"x-api-key": os.environ["JAVELIN_API_KEY"]}
        super().__init__(
            openai_api_base=f"https://api-dev.javelin.live/v1/query/{route}",
            openai_api_key=os.environ["OPENAI_API_KEY"],
            model_name="gpt-3.5-turbo",
            temperature=temperature,
            default_headers=javelin_headers
        )

# ----------------------------------------------------------------------------
# 3) Javelin Embeddings Model (If Supported)
# ----------------------------------------------------------------------------
class JavelinOpenAIEmbeddings(OpenAIEmbeddings):
    def __init__(self, model="text-embedding-ada-002", route="embeddings_route"):
        javelin_headers = {"x-api-key": os.environ["JAVELIN_API_KEY"]}
        super().__init__(
            openai_api_base=f"https://api-dev.javelin.live/v1/query/{route}",
            openai_api_key=os.environ["OPENAI_API_KEY"],
            model=model,
            default_headers=javelin_headers
        )

# ----------------------------------------------------------------------------
# 4) Flask App Setup
# ----------------------------------------------------------------------------
app = Flask(__name__)

# ----------------------------------------------------------------------------
# 5) Initialize Javelin LLM & Embeddings
# ----------------------------------------------------------------------------
llm = JavelinOpenAI(temperature=0.7, route="testing")

# If Javelin supports embeddings, use this:
embedding_model_custom = JavelinOpenAIEmbeddings(
    model="text-embedding-ada-002",
    route="embeddings_route"
)

# If Javelin doesn't support embeddings, fallback to OpenAI:
# from langchain.embeddings.openai import OpenAIEmbeddings
# embedding_model_custom = OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"])

vector_store = Chroma(
    persist_directory="chroma_store_custom",
    embedding_function=embedding_model_custom
)

# ----------------------------------------------------------------------------
# 6) Sample Text
# ----------------------------------------------------------------------------
sample_text = """
Artificial Intelligence (AI) is a rapidly advancing technology ...
... enabling early intervention.
"""

# ----------------------------------------------------------------------------
# 7) /store Endpoint: Split & Store
# ----------------------------------------------------------------------------
@app.route('/store', methods=['POST'])
def store_document():
    documents = [Document(page_content=sample_text)]
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
    split_docs = text_splitter.split_documents(documents)
    vector_store.add_documents(split_docs)
    return jsonify({"message": "Document stored successfully!"})

# ----------------------------------------------------------------------------
# 8) /query Endpoint: Similarity Search + GPT
# ----------------------------------------------------------------------------
@app.route('/query', methods=['POST'])
def get_final_response():
    data = request.get_json()
    query = data.get('query', '')

    # 1) Retrieve relevant docs
    relevant_docs = vector_store.similarity_search(query, k=2)
    combined_docs = "\n".join(doc.page_content for doc in relevant_docs)

    # 2) Build chat messages
    messages = [
        SystemMessage(content="You are a helpful scientific assistant."),
        HumanMessage(content=f"Question: {query}\nDocuments:\n{combined_docs}\nAnswer:")
    ]

    # 3) Invoke GPT-3.5-turbo via Javelin
    response = llm.invoke(messages)

    # 4) Ensure response is safe to access
    response_text = response.content if hasattr(response, "content") else str(response)

    return jsonify({"response": response_text})

# ----------------------------------------------------------------------------
# 9) Run App in Background Thread
# ----------------------------------------------------------------------------
def run_app():
    run_simple('localhost', 5001, app, use_reloader=False, use_debugger=False)

thread = Thread(target=run_app)
thread.start()


 * Running on http://localhost:5001
[33mPress CTRL+C to quit[0m
127.0.0.1 - - [05/Feb/2025 13:47:23] "POST /store HTTP/1.1" 200 -
127.0.0.1 - - [05/Feb/2025 13:47:27] "POST /query HTTP/1.1" 200 -
127.0.0.1 - - [05/Feb/2025 13:56:15] "POST /query HTTP/1.1" 200 -
127.0.0.1 - - [05/Feb/2025 13:56:17] "POST /query HTTP/1.1" 200 -
127.0.0.1 - - [05/Feb/2025 13:56:18] "POST /query HTTP/1.1" 200 -
127.0.0.1 - - [05/Feb/2025 13:56:20] "POST /query HTTP/1.1" 200 -
127.0.0.1 - - [05/Feb/2025 13:56:22] "POST /query HTTP/1.1" 200 -
127.0.0.1 - - [05/Feb/2025 14:01:05] "POST /query HTTP/1.1" 200 -
127.0.0.1 - - [05/Feb/2025 14:01:07] "POST /query HTTP/1.1" 200 -
127.0.0.1 - - [05/Feb/2025 14:01:08] "POST /query HTTP/1.1" 200 -
127.0.0.1 - - [05/Feb/2025 14:01:09] "POST /query HTTP/1.1" 200 -
127.0.0.1 - - [05/Feb/2025 14:01:11] "POST /query HTTP/1.1" 200 -


## testing the storing

In [2]:
import requests
base_url = "http://localhost:5001"
store_resp = requests.post(f"{base_url}/store")
print("Store Response:", store_resp.text)


Store Response: {"message":"Document stored successfully!"}



## testing the retrieving

In [3]:
query_data = {"query": "What impact does AI have on healthcare?"}
resp = requests.post(f"{base_url}/query", json=query_data)
print("Status Code:", resp.status_code)
print("Response Text:", resp.text)
print("Query Response:", resp.json())


Status Code: 200
Response Text: {"response":"AI has a significant impact on healthcare by enabling early intervention through the rapid advancement of technology. It helps in early detection of diseases, personalized treatment plans, and improved patient outcomes. The ability of AI to analyze large amounts of data quickly and accurately can lead to more precise diagnoses and treatment options, ultimately benefiting both patients and healthcare providers."}

Query Response: {'response': 'AI has a significant impact on healthcare by enabling early intervention through the rapid advancement of technology. It helps in early detection of diseases, personalized treatment plans, and improved patient outcomes. The ability of AI to analyze large amounts of data quickly and accurately can lead to more precise diagnoses and treatment options, ultimately benefiting both patients and healthcare providers.'}


In [3]:
import os 

## Langgraph for Rag using Re-act agent code using OPen Ai

In [8]:
import os
import json
import requests
import pandas as pd
from pydantic import BaseModel

from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

from langgraph.prebuilt import create_react_agent
from langgraph.checkpoint.memory import MemorySaver
from langchain_core.tools import tool

# Import DeepEval modules
from deepeval import login_with_confident_api_key, evaluate  # evaluate helper
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase


# -----------------------------
# Configure API Keys and Log In
# -----------------------------
login_with_confident_api_key(os.environ["DEEPEVAL_API_KEY"])

# -----------------------------
# Define AppParams model
# -----------------------------
class AppParams(BaseModel):
    app_name: str
    description: str
    system_prompt: str
    endpoint: str
    extra_definition: str
    k: int

# -----------------------------
# Utility: Custom Parsing Function
# -----------------------------
def custom_parsing_for_rag(response_text: str) -> str:
    """
    Parse the response from the RAG endpoint (expected as JSON).
    Return the value associated with "answer" if available; if not, then "response".
    Otherwise, return the original text.
    """
    try:
        data = json.loads(response_text)
        return data.get("answer") or data.get("response") or response_text
    except Exception:
        return response_text

# -----------------------------
# Tool: Generate Test Cases
# -----------------------------
@tool
def generate_test_cases(app_name: str, description: str, system_prompt: str,
                        endpoint: str, extra_definition: str, k: int) -> dict:
    """
    Name: generate_test_cases
    Description: Generate exactly k plain test queries for the specified app details using an LLM.
    Input Arguments:
        - app_name (str): The name of the application.
        - description (str): A brief description of the app.
        - system_prompt (str): The system prompt or instructions for generating queries.
        - endpoint (str): The API endpoint to be tested.
        - extra_definition (str): Any extra definitions or clarifications needed.
        - k (int): The number of test queries to generate.
    Output:
        - A dictionary with {"result": "<success or error message>"}.

    How it works:
    1) Calls an LLM to generate exactly k queries in JSON array format.
    2) Saves queries in CSV with columns "Test_Cases" (the query) and "Response" (empty).
    """
    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.7)

    prompt_template = """
    Application Name: {app_name}
    Description: {description}
    System Prompt: {system_prompt}
    Extra Definitions: {extra_definition}
    Endpoint: {endpoint}
    Number of Test Queries: {k}

    Based on the above details, generate exactly {k} test queries for automated testing.
    Each test query should be a plain text string that represents a query to test the endpoint.
    Do not include any extra information, explanations, or expected output.
    Return the result strictly as a JSON array of strings, for example:
    ["Query 1", "Query 2", ...]
    """
    prompt = PromptTemplate(
        template=prompt_template,
        input_variables=["app_name", "description", "system_prompt", "extra_definition", "endpoint", "k"]
    )

    llm_chain = LLMChain(llm=llm, prompt=prompt)
    print("[DEBUG] Generating test queries...")
    generated = llm_chain.run({
        "app_name": app_name,
        "description": description,
        "system_prompt": system_prompt,
        "extra_definition": extra_definition,
        "endpoint": endpoint,
        "k": k
    })
    print("[DEBUG] Raw LLM output:", generated)

    # Attempt to parse as JSON
    try:
        queries = json.loads(generated)
        if not isinstance(queries, list):
            raise ValueError("JSON output is not a list.")
    except Exception as e:
        print("[ERROR] Parsing JSON failed. Splitting by newlines. Error:", e)
        queries = [q.strip() for q in generated.split("\n") if q.strip()]

    print("[DEBUG] Parsed queries:", queries)
    df = pd.DataFrame({"Test_Cases": queries, "Response": [""] * len(queries)})
    csv_file = f"{app_name}_test_cases.csv"
    df.to_csv(csv_file, index=False)
    return {"result": f"Test queries saved in {csv_file}."}

# -----------------------------
# Tool: Execute Test Cases
# -----------------------------
@tool
def execute_test_cases(app_name: str, endpoint: str, extra_headers: dict = None) -> dict:
    """
    Name: execute_test_cases
    Description: Execute existing test queries for the specified app by sending them to the given endpoint.
    Input Arguments:
        - app_name (str): The name of the application.
        - endpoint (str): The API endpoint to be tested.
        - extra_headers (dict, optional): Additional HTTP headers for requests.
    Output:
        - A dictionary with {"result": "<success or error message>"}.

    How it works:
    1) Reads the CSV file named "<app_name>_test_cases.csv".
    2) POSTs each query (from "Test_Cases" column) to 'endpoint' (JSON payload: {"query": <test_query>}).
    3) Parses the JSON response if possible, or returns the raw text otherwise.
    4) Saves the response to the same CSV (under "Response" column).
    5) Calls evaluate_test_cases to measure quality via DeepEval and prints the evaluation result.
    """
    csv_file = f"{app_name}_test_cases.csv"
    print(f"[DEBUG] Reading test queries from {csv_file}...")
    try:
        df = pd.read_csv(csv_file)
    except Exception as e:
        return {"result": f"Could not read CSV {csv_file}: {str(e)}"}
    
    headers = extra_headers if extra_headers is not None else {}
    responses = []
    for query in df["Test_Cases"]:
        try:
            res = requests.post(endpoint, json={"query": query}, headers=headers)
            if res.status_code == 200:
                text = res.text
            else:
                text = f"Error {res.status_code}: {res.text}"
        except Exception as e:
            text = f"Request failed: {str(e)}"
        parsed = custom_parsing_for_rag(text)
        responses.append(parsed)
        print(f"[DEBUG] Query: {query} -> Response: {parsed}")

    df["Response"] = responses
    df.to_csv(csv_file, index=False)

    # Evaluate after execution
    eval_result = evaluate_test_cases(app_name)
    print("Evaluation Result:")
    print(eval_result["result"])

    return {"result": f"Executed queries; responses saved in {csv_file}."}

# -----------------------------
# Function: Evaluate Test Cases (Direct Function Call)
# -----------------------------
def evaluate_test_cases(app_name: str) -> dict:
    """
    For each test case in the CSV, generate a reference answer using a reference LLM,
    evaluate the actual output against the reference using DeepEval,
    and save the per-test score and reason in new CSV columns.

    This function is called directly (not as a tool) to avoid recursion issues.
    """
    csv_file = f"{app_name}_test_cases.csv"
    marker_file = f"{app_name}_evaluation.marker"

    if os.path.exists(marker_file):
        return {"result": "Evaluation already completed; skipping re-evaluation."}
    
    try:
        df = pd.read_csv(csv_file)
    except Exception as e:
        return {"result": f"Could not read CSV {csv_file}: {str(e)}"}
    
    scores = []
    reasons = []
    reference_llm = ChatOpenAI(
        openai_api_key=os.environ["OPENAI_API_KEY"],
        model="gpt-3.5-turbo",
        temperature=0
    )
    
    for idx, row in df.iterrows():
        query = row["Test_Cases"]
        rag_answer = row["Response"]
        ref_prompt = f"Answer the following query in detail: {query}"
        reference_answer = str(reference_llm(ref_prompt))
        print("[DEBUG] Reference answer:", reference_answer)

        test_case = LLMTestCase(
            input=query,
            actual_output=rag_answer,
            retrieval_context=[reference_answer]
        )
        metric = AnswerRelevancyMetric(threshold=0.7)
        metric.measure(test_case)
        scores.append(metric.score)
        reasons.append(metric.reason)
    
    df["Answer_Score"] = scores
    df["Answer_Reason"] = reasons
    df.to_csv(csv_file, index=False)
    with open(marker_file, "w") as f:
        f.write("Evaluation complete.")

    avg_score = sum(scores) / len(scores) if scores else 0
    return {"result": f"Evaluation complete. Avg Score: {avg_score:.2f}. Results stored in {csv_file}."}

# -----------------------------
# Agent Setup
# -----------------------------
react_instructions = """
You are a test-case generation and execution agent. 
You have the following tools available:

1) generate_test_cases
   - Accepts parameters: app_name (str), description (str), system_prompt (str), endpoint (str), extra_definition (str), k (int)
   - Use it to generate test queries and save them in a CSV.

2) execute_test_cases
   - Accepts parameters: app_name (str), endpoint (str), extra_headers (dict, optional)
   - Use it to execute queries from the CSV and evaluate them.

Guidance:
- If the user wants you to create new test queries, call generate_test_cases.
- If the user wants you to run or execute queries, call execute_test_cases.
- Return short, direct answers once the tool is done.
- Only call the tools if relevant.
- Provide the final result to the user.
"""

tools = [generate_test_cases, execute_test_cases]
model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
checkpointer = MemorySaver()

# Instead of .run(...) we do .invoke(...)
agent = create_react_agent(
    model=model,
    tools=tools,
    prompt=react_instructions,
    checkpointer=checkpointer
)

# -----------------------------
# Main
# -----------------------------
if __name__ == "__main__":
    params = AppParams(
        app_name="RAG_FOR_AI",
        description="My app provides information about documents in a RAG model.",
        system_prompt=(
            "Hello, you are a helpful scientific assistant. Based on the provided documents, answer the user's query. "
            "Document: Artificial Intelligence (AI) is a rapidly advancing technology that is transforming industries "
            "and societies across the globe. In recent years, AI models have revolutionized sectors such as healthcare, "
            "automotive, finance, and entertainment. These technologies enable machines to simulate human-like cognitive "
            "functions with unprecedented accuracy."
        ),
        endpoint="http://127.0.0.1:5001/query",
        extra_definition="",
        k=5
    )

    # Single user instruction: generate & execute
    single_message = (
        f"Please generate exactly {params.k} test queries for my app, and then immediately execute them. "
        f"App Name: {params.app_name}, Description: {params.description}, "
        f"System Prompt: {params.system_prompt}, Endpoint: {params.endpoint}, "
        f"Extra Definitions: {params.extra_definition}."
    )

    conversation = [{"role": "user", "content": single_message}]

    print("=== Agent Input ===")
    print(single_message)
    print("=== Agent Output ===")

    # Use .invoke(...) to process the conversation
    final_state = agent.invoke({"messages": conversation}, config={"configurable": {"thread_id": 1}})

    # The final reply from the agent
    print("Final State of Agent Returns >>>",final_state["messages"][-1].content)


=== Agent Input ===
Please generate exactly 5 test queries for my app, and then immediately execute them. App Name: RAG_FOR_AI, Description: My app provides information about documents in a RAG model., System Prompt: Hello, you are a helpful scientific assistant. Based on the provided documents, answer the user's query. Document: Artificial Intelligence (AI) is a rapidly advancing technology that is transforming industries and societies across the globe. In recent years, AI models have revolutionized sectors such as healthcare, automotive, finance, and entertainment. These technologies enable machines to simulate human-like cognitive functions with unprecedented accuracy., Endpoint: http://127.0.0.1:5001/query, Extra Definitions: .
=== Agent Output ===


  llm_chain = LLMChain(llm=llm, prompt=prompt)
  generated = llm_chain.run({


[DEBUG] Generating test queries...
[DEBUG] Raw LLM output: [
    "What is Artificial Intelligence?",
    "How is AI transforming industries?",
    "Which sectors have been revolutionized by AI models?",
    "What functions can machines simulate with AI technologies?",
    "Can AI accurately simulate human-like cognitive functions?"
]
[DEBUG] Parsed queries: ['What is Artificial Intelligence?', 'How is AI transforming industries?', 'Which sectors have been revolutionized by AI models?', 'What functions can machines simulate with AI technologies?', 'Can AI accurately simulate human-like cognitive functions?']
[DEBUG] Reading test queries from RAG_FOR_AI_test_cases.csv...
[DEBUG] Query: What is Artificial Intelligence? -> Response:  Artificial Intelligence (AI) is a technology that allows machines to imitate human-like cognitive functions, such as problem-solving, learning, and decision-making, with high accuracy and efficiency. It involves the use of data and advanced algorithms to enabl

Query >>> What is Artificial Intelligence?
Relevant docs (raw): [Document(id='b97b4ec4-7673-4774-9a35-150c277f3f71', metadata={}, page_content='Artificial Intelligence (AI) is a rapidly advancing technology that is transforming industries and societies across the globe. In recent years, the implementation of AI models has revolutionized'), Document(id='3ff65d1b-bf05-41fd-99e6-7815344307ca', metadata={}, page_content='enable machines to simulate human-like cognitive functions, such as problem-solving, learning, and decision-making, with unprecedented accuracy and efficiency. AI models leverage vast amounts of data')]
Relevant docs are >>> Artificial Intelligence (AI) is a rapidly advancing technology that is transforming industries and societies across the globe. In recent years, the implementation of AI models has revolutionized
enable machines to simulate human-like cognitive functions, such as problem-solving, learning, and decision-making, with unprecedented accuracy and efficiency.

127.0.0.1 - - [05/Feb/2025 13:05:48] "POST /query HTTP/1.1" 200 -


Query >>> How is AI transforming industries?
Relevant docs (raw): [Document(id='5b0b7d97-e28b-4101-b0a5-5990e55887d4', metadata={}, page_content='unimaginable. As AI continues to evolve, it is reshaping the workforce, influencing economic trends, and even altering the way people interact with technology on a day-to-day basis.'), Document(id='b97b4ec4-7673-4774-9a35-150c277f3f71', metadata={}, page_content='Artificial Intelligence (AI) is a rapidly advancing technology that is transforming industries and societies across the globe. In recent years, the implementation of AI models has revolutionized')]
Relevant docs are >>> unimaginable. As AI continues to evolve, it is reshaping the workforce, influencing economic trends, and even altering the way people interact with technology on a day-to-day basis.
Artificial Intelligence (AI) is a rapidly advancing technology that is transforming industries and societies across the globe. In recent years, the implementation of AI models has revoluti

127.0.0.1 - - [05/Feb/2025 13:05:51] "POST /query HTTP/1.1" 200 -


Query >>> Which sectors have been revolutionized by AI models?
Relevant docs (raw): [Document(id='b516b2d1-083c-4db4-853c-550e5ceb57fb', metadata={}, page_content='implementation of AI models has revolutionized sectors such as healthcare, automotive, finance, and entertainment, among many others. These technologies enable machines to simulate human-like'), Document(id='b97b4ec4-7673-4774-9a35-150c277f3f71', metadata={}, page_content='Artificial Intelligence (AI) is a rapidly advancing technology that is transforming industries and societies across the globe. In recent years, the implementation of AI models has revolutionized')]
Relevant docs are >>> implementation of AI models has revolutionized sectors such as healthcare, automotive, finance, and entertainment, among many others. These technologies enable machines to simulate human-like
Artificial Intelligence (AI) is a rapidly advancing technology that is transforming industries and societies across the globe. In recent years, the im

127.0.0.1 - - [05/Feb/2025 13:05:53] "POST /query HTTP/1.1" 200 -


Query >>> What functions can machines simulate with AI technologies?
Relevant docs (raw): [Document(id='3ff65d1b-bf05-41fd-99e6-7815344307ca', metadata={}, page_content='enable machines to simulate human-like cognitive functions, such as problem-solving, learning, and decision-making, with unprecedented accuracy and efficiency. AI models leverage vast amounts of data'), Document(id='b516b2d1-083c-4db4-853c-550e5ceb57fb', metadata={}, page_content='implementation of AI models has revolutionized sectors such as healthcare, automotive, finance, and entertainment, among many others. These technologies enable machines to simulate human-like')]
Relevant docs are >>> enable machines to simulate human-like cognitive functions, such as problem-solving, learning, and decision-making, with unprecedented accuracy and efficiency. AI models leverage vast amounts of data
implementation of AI models has revolutionized sectors such as healthcare, automotive, finance, and entertainment, among many other

127.0.0.1 - - [05/Feb/2025 13:05:54] "POST /query HTTP/1.1" 200 -


Query >>> Can AI accurately simulate human-like cognitive functions?
Relevant docs (raw): [Document(id='3ff65d1b-bf05-41fd-99e6-7815344307ca', metadata={}, page_content='enable machines to simulate human-like cognitive functions, such as problem-solving, learning, and decision-making, with unprecedented accuracy and efficiency. AI models leverage vast amounts of data'), Document(id='b516b2d1-083c-4db4-853c-550e5ceb57fb', metadata={}, page_content='implementation of AI models has revolutionized sectors such as healthcare, automotive, finance, and entertainment, among many others. These technologies enable machines to simulate human-like')]
Relevant docs are >>> enable machines to simulate human-like cognitive functions, such as problem-solving, learning, and decision-making, with unprecedented accuracy and efficiency. AI models leverage vast amounts of data
implementation of AI models has revolutionized sectors such as healthcare, automotive, finance, and entertainment, among many other

127.0.0.1 - - [05/Feb/2025 13:05:55] "POST /query HTTP/1.1" 200 -


  reference_answer = str(reference_llm(ref_prompt))


[DEBUG] Query: Can AI accurately simulate human-like cognitive functions? -> Response:  Yes, AI has the ability to accurately simulate human-like cognitive functions with advanced algorithms and access to vast amounts of data. This has led to significant advancements in various industries and has the potential to continue improving and expanding in the future. However, there are still limitations and challenges in fully replicating the complexity and nuance of human cognition.


[DEBUG] Reference answer: content='Artificial Intelligence (AI) is a branch of computer science that focuses on creating intelligent machines that can perform tasks that typically require human intelligence. These tasks include learning, reasoning, problem-solving, perception, language understanding, and decision-making.\n\nAI systems are designed to mimic human cognitive functions such as learning from experience, adapting to new situations, and understanding natural language. AI technologies can be classified into two main categories: Narrow AI and General AI. Narrow AI, also known as Weak AI, is designed to perform specific tasks, such as speech recognition or image recognition. General AI, also known as Strong AI, is a hypothetical form of AI that can understand, learn, and apply knowledge in a wide range of tasks, similar to human intelligence.\n\nAI technologies are used in a variety of applications, including virtual assistants like Siri and Alexa, self-driving cars, medical dia

[DEBUG] Reference answer: content='Artificial Intelligence (AI) is transforming industries in a multitude of ways, revolutionizing the way businesses operate and making processes more efficient and effective. Some of the key ways in which AI is transforming industries include:\n\n1. Automation: AI is enabling automation of repetitive tasks and processes, freeing up human workers to focus on more strategic and creative tasks. This is particularly evident in industries such as manufacturing, where AI-powered robots are being used to perform tasks that are dangerous or monotonous for humans.\n\n2. Data analysis: AI is able to analyze vast amounts of data at a speed and accuracy that is beyond human capability. This is enabling businesses to gain valuable insights from their data, leading to better decision-making and improved efficiency. Industries such as finance, healthcare, and marketing are using AI to analyze data and make predictions that were previously impossible.\n\n3. Personaliz

[DEBUG] Reference answer: content='AI models have revolutionized a wide range of sectors across industries, transforming the way businesses operate and improving efficiency, productivity, and decision-making processes. Some of the sectors that have been significantly impacted by AI models include:\n\n1. Healthcare: AI models have been used to analyze medical images, diagnose diseases, predict patient outcomes, and personalize treatment plans. AI-powered tools have also been developed to improve patient care, streamline administrative tasks, and enhance drug discovery processes.\n\n2. Finance: AI models have been used in the finance sector to detect fraud, automate trading, predict market trends, and personalize customer experiences. AI-powered chatbots and virtual assistants have also been deployed to provide customer support and financial advice.\n\n3. Retail: AI models have transformed the retail sector by enabling personalized recommendations, optimizing pricing strategies, forecast

[DEBUG] Reference answer: content='Machines can simulate a wide range of functions using AI technologies. Some of the key functions that machines can simulate with AI include:\n\n1. Pattern recognition: Machines can be trained to recognize patterns in data, images, and text. This can be used in various applications such as facial recognition, handwriting recognition, and speech recognition.\n\n2. Natural language processing: Machines can be programmed to understand and generate human language. This can be used in applications such as chatbots, language translation, and sentiment analysis.\n\n3. Predictive analytics: Machines can analyze large amounts of data to make predictions about future events. This can be used in applications such as forecasting sales, predicting customer behavior, and identifying potential risks.\n\n4. Image and video analysis: Machines can analyze images and videos to identify objects, people, and activities. This can be used in applications such as surveillance

[DEBUG] Reference answer: content="AI has made significant advancements in simulating human-like cognitive functions, but it is still not able to accurately replicate all aspects of human cognition. \n\nAI systems are able to perform tasks such as natural language processing, image recognition, and decision-making with a high level of accuracy. These systems use algorithms and machine learning techniques to analyze data and make predictions or decisions based on that data. They can learn from experience and improve their performance over time.\n\nHowever, there are still limitations to AI's ability to simulate human-like cognitive functions. One of the main challenges is in understanding and replicating the complex and nuanced ways in which humans think and reason. Human cognition is influenced by emotions, intuition, creativity, and social interactions, which are difficult for AI systems to fully grasp.\n\nAdditionally, AI systems lack the ability to truly understand context, make con

Evaluation Result:
Evaluation complete. Avg Score: 1.00. Results stored in RAG_FOR_AI_test_cases.csv.
Final State of Agent Returns >>> Test queries have been generated and executed successfully for the app "RAG_FOR_AI".


## Langgraph agent using Rea-Act --> Javelin Rout

In [7]:
from langchain.chat_models import ChatOpenAI

def get_javelin_llm(
    model_name: str = "gpt-3.5-turbo",
    temperature: float = 0.7
) -> ChatOpenAI:
    """
    Returns a ChatOpenAI instance that routes requests to the Javelin endpoint.
    """

    # Javelin headers
    javelin_headers = {
        "x-api-key": javelin_api_key,
    }

    # Put headers under model_kwargs to avoid the warning
    return ChatOpenAI(
        model_name=model_name,
        temperature=temperature,
        openai_api_key=llm_api_key,
        openai_api_base="https://api-dev.javelin.live/v1/query/testing",
        request_timeout=180, 
        default_headers = javelin_headers
    )


In [8]:
import os
import json
import requests
import pandas as pd
from pydantic import BaseModel

from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

from langgraph.prebuilt import create_react_agent
from langgraph.checkpoint.memory import MemorySaver
from langchain_core.tools import tool

# Import DeepEval modules
from deepeval import login_with_confident_api_key, evaluate  # evaluate helper
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase


# -----------------------------
# Configure API Keys and Log In
# -----------------------------
login_with_confident_api_key(os.environ["DEEPEVAL_API_KEY"])

# -----------------------------
# Define AppParams model
# -----------------------------
class AppParams(BaseModel):
    app_name: str
    description: str
    system_prompt: str
    endpoint: str
    extra_definition: str
    k: int

# -----------------------------
# Utility: Custom Parsing Function
# -----------------------------
def custom_parsing_for_rag(response_text: str) -> str:
    """
    Parse the response from the RAG endpoint (expected as JSON).
    Return the value associated with "answer" if available; if not, then "response".
    Otherwise, return the original text.
    """
    try:
        data = json.loads(response_text)
        return data.get("answer") or data.get("response") or response_text
    except Exception:
        return response_text

# -----------------------------
# Tool: Generate Test Cases
# -----------------------------
@tool
def generate_test_cases(app_name: str, description: str, system_prompt: str,
                        endpoint: str, extra_definition: str, k: int) -> dict:
    """
    Name: generate_test_cases
    Description: Generate exactly k plain test queries for the specified app details using an LLM.
    Input Arguments:
        - app_name (str): The name of the application.
        - description (str): A brief description of the app.
        - system_prompt (str): The system prompt or instructions for generating queries.
        - endpoint (str): The API endpoint to be tested.
        - extra_definition (str): Any extra definitions or clarifications needed.
        - k (int): The number of test queries to generate.
    Output:
        - A dictionary with {"result": "<success or error message>"}.

    How it works:
    1) Calls an LLM to generate exactly k queries in JSON array format.
    2) Saves queries in CSV with columns "Test_Cases" (the query) and "Response" (empty).
    """
    llm = get_javelin_llm(model_name="gpt-3.5-turbo", temperature=0.7)

    prompt_template = """
    Application Name: {app_name}
    Description: {description}
    System Prompt: {system_prompt}
    Extra Definitions: {extra_definition}
    Endpoint: {endpoint}
    Number of Test Queries: {k}

    Based on the above details, generate exactly {k} test queries for automated testing.
    Each test query should be a plain text string that represents a query to test the endpoint.
    Do not include any extra information, explanations, or expected output.
    Return the result strictly as a JSON array of strings, for example:
    ["Query 1", "Query 2", ...]
    """
    prompt = PromptTemplate(
        template=prompt_template,
        input_variables=["app_name", "description", "system_prompt", "extra_definition", "endpoint", "k"]
    )

    llm_chain = LLMChain(llm=llm, prompt=prompt)
    print("[DEBUG] Generating test queries...")
    generated = llm_chain.run({
        "app_name": app_name,
        "description": description,
        "system_prompt": system_prompt,
        "extra_definition": extra_definition,
        "endpoint": endpoint,
        "k": k
    })
    print("[DEBUG] Raw LLM output:", generated)

    # Attempt to parse as JSON
    try:
        queries = json.loads(generated)
        if not isinstance(queries, list):
            raise ValueError("JSON output is not a list.")
    except Exception as e:
        print("[ERROR] Parsing JSON failed. Splitting by newlines. Error:", e)
        queries = [q.strip() for q in generated.split("\n") if q.strip()]

    print("[DEBUG] Parsed queries:", queries)
    df = pd.DataFrame({"Test_Cases": queries, "Response": [""] * len(queries)})
    csv_file = f"{app_name}_test_cases.csv"
    df.to_csv(csv_file, index=False)
    return {"result": f"Test queries saved in {csv_file}."}

# -----------------------------
# Tool: Execute Test Cases
# -----------------------------
@tool
def execute_test_cases(app_name: str, endpoint: str, extra_headers: dict = None) -> dict:
    """
    Name: execute_test_cases
    Description: Execute existing test queries for the specified app by sending them to the given endpoint.
    Input Arguments:
        - app_name (str): The name of the application.
        - endpoint (str): The API endpoint to be tested.
        - extra_headers (dict, optional): Additional HTTP headers for requests.
    Output:
        - A dictionary with {"result": "<success or error message>"}.

    How it works:
    1) Reads the CSV file named "<app_name>_test_cases.csv".
    2) POSTs each query (from "Test_Cases" column) to 'endpoint' (JSON payload: {"query": <test_query>}).
    3) Parses the JSON response if possible, or returns the raw text otherwise.
    4) Saves the response to the same CSV (under "Response" column).
    5) Calls evaluate_test_cases to measure quality via DeepEval and prints the evaluation result.
    """
    csv_file = f"{app_name}_test_cases.csv"
    print(f"[DEBUG] Reading test queries from {csv_file}...")
    try:
        df = pd.read_csv(csv_file)
    except Exception as e:
        return {"result": f"Could not read CSV {csv_file}: {str(e)}"}
    
    headers = extra_headers if extra_headers is not None else {}
    responses = []
    for query in df["Test_Cases"]:
        try:
            res = requests.post(endpoint, json={"query": query}, headers=headers)
            if res.status_code == 200:
                text = res.text
            else:
                text = f"Error {res.status_code}: {res.text}"
        except Exception as e:
            text = f"Request failed: {str(e)}"
        parsed = custom_parsing_for_rag(text)
        responses.append(parsed)
        print(f"[DEBUG] Query: {query} -> Response: {parsed}")

    df["Response"] = responses
    df.to_csv(csv_file, index=False)

    # Evaluate after execution
    eval_result = evaluate_test_cases(app_name)
    print("Evaluation Result:")
    print(eval_result["result"])

    return {"result": f"Executed queries; responses saved in {csv_file}."}

# -----------------------------
# Function: Evaluate Test Cases (Direct Function Call)
# -----------------------------
def evaluate_test_cases(app_name: str) -> dict:
    """
    For each test case in the CSV, generate a reference answer using a reference LLM,
    evaluate the actual output against the reference using DeepEval,
    and save the per-test score and reason in new CSV columns.

    This function is called directly (not as a tool) to avoid recursion issues.
    """
    csv_file = f"{app_name}_test_cases.csv"
    marker_file = f"{app_name}_evaluation.marker"

    if os.path.exists(marker_file):
        return {"result": "Evaluation already completed; skipping re-evaluation."}
    
    try:
        df = pd.read_csv(csv_file)
    except Exception as e:
        return {"result": f"Could not read CSV {csv_file}: {str(e)}"}
    
    scores = []
    reasons = []
    reference_llm = get_javelin_llm(model_name="gpt-3.5-turbo", temperature=0.7)
    
    for idx, row in df.iterrows():
        query = row["Test_Cases"]
        rag_answer = row["Response"]
        ref_prompt = f"Answer the following query in detail: {query}"
        reference_answer = str(reference_llm(ref_prompt))
        print("[DEBUG] Reference answer:", reference_answer)

        test_case = LLMTestCase(
            input=query,
            actual_output=rag_answer,
            retrieval_context=[reference_answer]
        )
        metric = AnswerRelevancyMetric(threshold=0.7)
        metric.measure(test_case)
        scores.append(metric.score)
        reasons.append(metric.reason)
    
    df["Answer_Score"] = scores
    df["Answer_Reason"] = reasons
    df.to_csv(csv_file, index=False)
    with open(marker_file, "w") as f:
        f.write("Evaluation complete.")

    avg_score = sum(scores) / len(scores) if scores else 0
    return {"result": f"Evaluation complete. Avg Score: {avg_score:.2f}. Results stored in {csv_file}."}

# -----------------------------
# Agent Setup
# -----------------------------
react_instructions = """
You are a test-case generation and execution agent. 
You have the following tools available:

1) generate_test_cases
   - Accepts parameters: app_name (str), description (str), system_prompt (str), endpoint (str), extra_definition (str), k (int)
   - Use it to generate test queries and save them in a CSV.

2) execute_test_cases
   - Accepts parameters: app_name (str), endpoint (str), extra_headers (dict, optional)
   - Use it to execute queries from the CSV and evaluate them.

Guidance:
- If the user wants you to create new test queries, call generate_test_cases.
- If the user wants you to run or execute queries, call execute_test_cases.
- Return short, direct answers once the tool is done.
- Only call the tools if relevant.
- Provide the final result to the user.
"""

tools = [generate_test_cases, execute_test_cases]
model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
checkpointer = MemorySaver()

# Instead of .run(...) we do .invoke(...)
agent = create_react_agent(
    model=model,
    tools=tools,
    prompt=react_instructions,
    checkpointer=checkpointer,
)

# -----------------------------
# Main
# -----------------------------
if __name__ == "__main__":
    params = AppParams(
        app_name="RAG_FOR_AI",
        description="My app provides information about documents in a RAG model.",
        system_prompt=(
            "Hello, you are a helpful scientific assistant. Based on the provided documents, answer the user's query. "
            "Document: Artificial Intelligence (AI) is a rapidly advancing technology that is transforming industries "
            "and societies across the globe. In recent years, AI models have revolutionized sectors such as healthcare, "
            "automotive, finance, and entertainment. These technologies enable machines to simulate human-like cognitive "
            "functions with unprecedented accuracy."
        ),
        endpoint="http://127.0.0.1:5001/query",
        extra_definition="",
        k=5
    )

    # Single user instruction: generate & execute
    single_message = (
        f"Please generate exactly {params.k} test queries for my app, and then immediately execute them. "
        f"App Name: {params.app_name}, Description: {params.description}, "
        f"System Prompt: {params.system_prompt}, Endpoint: {params.endpoint}, "
        f"Extra Definitions: {params.extra_definition}."
    )

    conversation = [{"role": "user", "content": single_message}]

    print("=== Agent Input ===")
    print(single_message)
    print("=== Agent Output ===")

    # Use .invoke(...) to process the conversation
    final_state = agent.invoke({"messages": conversation}, config={"configurable": {"thread_id": 1}})

    # The final reply from the agent
    print("Final State of Agent Returns >>>",final_state["messages"][-1].content)


=== Agent Input ===
Please generate exactly 5 test queries for my app, and then immediately execute them. App Name: RAG_FOR_AI, Description: My app provides information about documents in a RAG model., System Prompt: Hello, you are a helpful scientific assistant. Based on the provided documents, answer the user's query. Document: Artificial Intelligence (AI) is a rapidly advancing technology that is transforming industries and societies across the globe. In recent years, AI models have revolutionized sectors such as healthcare, automotive, finance, and entertainment. These technologies enable machines to simulate human-like cognitive functions with unprecedented accuracy., Endpoint: http://127.0.0.1:5001/query, Extra Definitions: .
=== Agent Output ===


  llm_chain = LLMChain(llm=llm, prompt=prompt)
  generated = llm_chain.run({


[DEBUG] Generating test queries...
[DEBUG] Raw LLM output: [
    "What technology is rapidly advancing and transforming industries and societies globally?",
    "Which sectors have been revolutionized by AI models in recent years?",
    "What do AI technologies enable machines to do with unprecedented accuracy?",
    "What is the main focus of Artificial Intelligence (AI)?",
    "How have AI models impacted the healthcare sector?"
]
[DEBUG] Parsed queries: ['What technology is rapidly advancing and transforming industries and societies globally?', 'Which sectors have been revolutionized by AI models in recent years?', 'What do AI technologies enable machines to do with unprecedented accuracy?', 'What is the main focus of Artificial Intelligence (AI)?', 'How have AI models impacted the healthcare sector?']
[DEBUG] Reading test queries from RAG_FOR_AI_test_cases.csv...
[DEBUG] Query: What technology is rapidly advancing and transforming industries and societies globally? -> Response: Art

  reference_answer = str(reference_llm(ref_prompt))


[DEBUG] Query: How have AI models impacted the healthcare sector? -> Response: AI models have had a significant impact on the healthcare sector by revolutionizing patient care, diagnosis, and treatment. Machine learning algorithms are being utilized to analyze vast amounts of medical data, leading to more accurate and timely diagnoses. Additionally, AI models are helping healthcare professionals in making informed decisions, personalizing treatment plans, and improving overall patient outcomes. This technology has the potential to enhance efficiency, reduce errors, and ultimately transform the way healthcare is delivered.


[DEBUG] Reference answer: content='One technology that is rapidly advancing and transforming industries and societies globally is artificial intelligence (AI). AI refers to the development of computer systems that can perform tasks that typically require human intelligence, such as visual perception, speech recognition, decision-making, and language translation.\n\nAI is being integrated into a wide range of industries, including healthcare, finance, transportation, manufacturing, and retail. In healthcare, AI is being used to analyze medical images, predict patient outcomes, and improve diagnostics. In finance, AI is being used to detect fraud, automate trading, and personalize customer experiences. In transportation, AI is being used to optimize routes, improve safety, and develop autonomous vehicles. In manufacturing, AI is being used to optimize production processes, improve quality control, and predict equipment failures. In retail, AI is being used to personalize marketing, optim

[DEBUG] Reference answer: content='Artificial Intelligence (AI) has revolutionized numerous sectors in recent years, transforming the way businesses operate and improving efficiency and productivity. Some of the key sectors that have been significantly impacted by AI models include:\n\n1. Healthcare: AI has revolutionized the healthcare sector by enabling the development of advanced diagnostic tools, personalized treatment plans, and predictive analytics for disease prevention. AI-powered applications can analyze medical images, genetic data, and patient records to assist healthcare professionals in making accurate diagnoses and treatment decisions.\n\n2. Finance: AI has transformed the finance sector by automating tasks such as fraud detection, risk assessment, and investment analysis. AI models can process vast amounts of financial data in real-time, enabling financial institutions to make faster and more informed decisions. Robo-advisors powered by AI algorithms have also become inc

[DEBUG] Reference answer: content='AI technologies enable machines to perform a wide range of complex tasks with unprecedented accuracy. Some of the key capabilities that AI technologies enable machines to do include:\n\n1. Pattern recognition: AI technologies allow machines to analyze and identify patterns in large amounts of data with great accuracy. This can be used in various fields such as healthcare, finance, and marketing to detect trends and make predictions.\n\n2. Natural language processing: AI technologies enable machines to understand and generate human language with a high level of accuracy. This can be used in applications such as chatbots, virtual assistants, and language translation services.\n\n3. Image and video recognition: AI technologies enable machines to accurately analyze and interpret images and videos. This can be used in applications such as facial recognition, object detection, and autonomous driving.\n\n4. Predictive analytics: AI technologies enable machin

[DEBUG] Reference answer: content='The main focus of Artificial Intelligence (AI) is to create intelligent machines that can simulate human intelligence and perform tasks that typically require human cognition. This includes tasks such as learning, reasoning, problem solving, understanding natural language, and perception.\n\nAI aims to develop machines that can think, learn, and adapt like humans, and ultimately surpass human intelligence in certain areas. The goal is to create machines that can automate complex tasks, make decisions based on data and algorithms, and improve their performance over time through learning.\n\nThere are several subfields within AI that focus on different aspects of intelligence, such as machine learning, natural language processing, computer vision, robotics, and expert systems. These subfields work together to create intelligent systems that can perform a wide range of tasks, from playing chess to driving a car to diagnosing medical conditions.\n\nOveral

[DEBUG] Reference answer: content='AI models have had a significant impact on the healthcare sector in a variety of ways. Some of the key impacts include:\n\n1. Improved diagnosis and treatment: AI models have been developed to analyze medical images, such as X-rays and MRIs, to detect patterns and abnormalities that may be missed by human radiologists. This can lead to earlier and more accurate diagnosis of conditions such as cancer and other diseases, improving patient outcomes.\n\n2. Personalized medicine: AI models can analyze large amounts of data, such as genetic information and patient records, to identify personalized treatment plans for individual patients. This can lead to more effective and targeted treatments, reducing the likelihood of adverse reactions and improving patient outcomes.\n\n3. Predictive analytics: AI models can analyze data from electronic health records and other sources to predict patient outcomes and identify individuals at risk of developing certain cond

Evaluation Result:
Evaluation complete. Avg Score: 0.87. Results stored in RAG_FOR_AI_test_cases.csv.
Final State of Agent Returns >>> Test queries have been generated and executed successfully for the app "RAG_FOR_AI".
