In [4]:
import os
import getpass
import openai
from flask import Flask, request, jsonify
from werkzeug.serving import run_simple
from threading import Thread
from langchain_openai import OpenAIEmbeddings, OpenAI
from langchain_chroma import Chroma
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Set OpenAI API key and organization
os.environ["OPENAI_API_KEY"] = ""

openai.api_key = os.environ["OPENAI_API_KEY"]

# Initialize Flask app
app = Flask(__name__)

# Instantiate the LLM once (using the recommended invocation method)
llm = OpenAI()

# Initialize embeddings and vector store
embedding_model_custom = OpenAIEmbeddings()
vector_store = Chroma(persist_directory="chroma_store_custom", embedding_function=embedding_model_custom)

# Sample text to demonstrate document storage and querying
sample_text = """
Artificial Intelligence (AI) is a rapidly advancing technology that is transforming industries and societies across the globe. In recent years, the implementation of AI models has revolutionized sectors such as healthcare, automotive, finance, and entertainment, among many others. These technologies enable machines to simulate human-like cognitive functions, such as problem-solving, learning, and decision-making, with unprecedented accuracy and efficiency. AI models leverage vast amounts of data to provide insights, automate complex tasks, and enhance productivity in ways that were once considered unimaginable. As AI continues to evolve, it is reshaping the workforce, influencing economic trends, and even altering the way people interact with technology on a day-to-day basis.

In the healthcare industry, AI applications have the potential to revolutionize patient care, diagnosis, and treatment. Machine learning algorithms are being used to analyze medical data, such as medical images, patient records, and clinical trial results, to assist doctors in making more accurate and timely diagnoses. For example, AI-powered tools can detect early signs of diseases such as cancer, heart conditions, and neurological disorders, improving patient outcomes by enabling early intervention.
"""

@app.route('/store', methods=['POST'])
def store_document():
    # Split the sample text into chunks and add them to the vector store
    documents = [Document(page_content=sample_text)]
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
    split_docs = text_splitter.split_documents(documents)
    vector_store.add_documents(split_docs)
    return jsonify({"message": "Document stored successfully!"})

def retrieve_relevant_docs(query):
    # Retrieve a list of Document objects using similarity search
    relevant_docs = vector_store.similarity_search(query, k=2)
    print("Relevant docs (raw):", relevant_docs)
    # Combine the retrieved document texts into a single string
    documents = "\n".join([doc.page_content for doc in relevant_docs])
    return documents

@app.route('/query', methods=['POST'])
def get_final_response():
    data = request.get_json()
    query = data.get('query')
    print("Query >>>", query)
    relevant_documents = retrieve_relevant_docs(query)
    print("Relevant docs are >>>", relevant_documents)
    
    # Create a combined prompt from the query and the retrieved documents
    prompt = f"Question: {query}\nDocuments:\n{relevant_documents}\nAnswer:"
    
    # Invoke the LLM using the recommended interface
    response = llm.invoke(prompt)
    
    return jsonify({"response": response})

def run_app():
    run_simple('localhost', 5001, app, use_reloader=False, use_debugger=False)

# Run the Flask server in a background thread so that you can use the notebook concurrently
thread = Thread(target=run_app)
thread.start()


 * Running on http://localhost:5001
[33mPress CTRL+C to quit[0m


In [6]:
import requests

# URL setup for the Flask app
base_url = "http://localhost:5001"

# Triggering the document storage
store_response = requests.post(f"{base_url}/store")
print("Store Response:", store_response.text)

# Querying the stored documents
query_url = f"{base_url}/query"
query_data = {
    "query": "What impact does AI have on healthcare?"
}
headers = {'Content-Type': 'application/json'}
query_response = requests.post(query_url, json=query_data, headers=headers)
print("Query Response:", query_response)


Store Response: {"message":"Document stored successfully!"}

Query Response: <Response [200]>


Query >>> What impact does AI have on healthcare?
Relevant docs (raw): [Document(id='bdbc442d-d3d5-4e2d-b917-d006566d88d9', metadata={}, page_content='In the healthcare industry, AI applications have the potential to revolutionize patient care, diagnosis, and treatment. Machine learning algorithms are being used to analyze medical data, such as'), Document(id='35a7c708-9758-42fd-bc24-eff4499505ce', metadata={}, page_content='implementation of AI models has revolutionized sectors such as healthcare, automotive, finance, and entertainment, among many others. These technologies enable machines to simulate human-like')]
Relevant docs are >>> In the healthcare industry, AI applications have the potential to revolutionize patient care, diagnosis, and treatment. Machine learning algorithms are being used to analyze medical data, such as
implementation of AI models has revolutionized sectors such as healthcare, automotive, finance, and entertainment, among many others. These technologies enabl

127.0.0.1 - - [04/Feb/2025 17:28:11] "POST /query HTTP/1.1" 200 -


In [3]:
import os 

In [4]:
os.environ["OPENAI_API_KEY"] = ""
# Set your DeepEval (Confident AI) API key
os.environ["DEEPEVAL_API_KEY"] = ""

In [10]:
import os
import json
import requests
import pandas as pd
from pydantic import BaseModel

from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

from langgraph.prebuilt import create_react_agent
from langgraph.checkpoint.memory import MemorySaver
from langchain_core.tools import tool

# Import DeepEval modules
from deepeval import login_with_confident_api_key, evaluate  # evaluate helper
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

# -----------------------------
# Configure API Keys and Log In
# -----------------------------

login_with_confident_api_key(os.environ["DEEPEVAL_API_KEY"])

# -----------------------------
# Define AppParams model
# -----------------------------
class AppParams(BaseModel):
    app_name: str
    description: str
    system_prompt: str
    endpoint: str
    extra_definition: str
    k: int

# -----------------------------
# Utility: Custom Parsing Function
# -----------------------------
def custom_parsing_for_rag(response_text: str) -> str:
    """
    Parse the response from the RAG endpoint (expected as JSON).
    Return the value associated with "answer" if available; if not, then "response".
    Otherwise, return the original text.
    """
    try:
        data = json.loads(response_text)
        # Try "answer" key; if missing try "response"
        return data.get("answer") or data.get("response") or response_text
    except Exception:
        return response_text

# -----------------------------
# Tool: Generate Test Cases
# -----------------------------
@tool
def generate_test_cases(app_name: str, description: str, system_prompt: str,
                        endpoint: str, extra_definition: str, k: int) -> dict:
    """
    Generate exactly k plain test queries for the app details using an LLM.
    Save the queries (and an empty "Response" column) to a CSV.
    """
    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.7)
    prompt_template = """
    Application Name: {app_name}
    Description: {description}
    System Prompt: {system_prompt}
    Extra Definitions: {extra_definition}
    Endpoint: {endpoint}
    Number of Test Queries: {k}

    Based on the above details, generate exactly {k} test queries for automated testing.
    Each test query should be a plain text string that represents a query to test the endpoint.
    Do not include any extra information, explanations, or expected output.
    Return the result as a JSON array of strings.
    """
    prompt = PromptTemplate(
        template=prompt_template,
        input_variables=["app_name", "description", "system_prompt", "extra_definition", "endpoint", "k"]
    )
    llm_chain = LLMChain(llm=llm, prompt=prompt)
    print("[DEBUG] Generating test queries...")
    generated = llm_chain.run({
        "app_name": app_name,
        "description": description,
        "system_prompt": system_prompt,
        "extra_definition": extra_definition,
        "endpoint": endpoint,
        "k": k
    })
    print("[DEBUG] Raw LLM output:", generated)
    try:
        queries = json.loads(generated)
        if not isinstance(queries, list):
            raise ValueError("JSON output is not a list.")
    except Exception as e:
        print("[ERROR] Parsing JSON failed. Splitting by newlines. Error:", e)
        queries = [q.strip() for q in generated.split("\n") if q.strip()]
    print("[DEBUG] Parsed queries:", queries)
    df = pd.DataFrame({"Test_Cases": queries, "Response": [""] * len(queries)})
    csv_file = f"{app_name}_test_cases.csv"
    df.to_csv(csv_file, index=False)
    return {"result": f"Test queries saved in {csv_file}."}

# -----------------------------
# Tool: Execute Test Cases
# -----------------------------
@tool
def execute_test_cases(app_name: str, endpoint: str, extra_headers: dict = None) -> dict:
    """
    For each test query in the CSV, POST to the endpoint and parse the response.
    Save the responses back to the CSV.
    """
    csv_file = f"{app_name}_test_cases.csv"
    print(f"[DEBUG] Reading test queries from {csv_file}...")
    try:
        df = pd.read_csv(csv_file)
    except Exception as e:
        return {"result": f"Could not read CSV {csv_file}: {str(e)}"}
    
    headers = extra_headers if extra_headers is not None else {}
    responses = []
    for query in df["Test_Cases"]:
        try:
            res = requests.post(endpoint, json={"query": query}, headers=headers)
            if res.status_code == 200:
                text = res.text
            else:
                text = f"Error {res.status_code}: {res.text}"
        except Exception as e:
            text = f"Request failed: {str(e)}"
        parsed = custom_parsing_for_rag(text)
        responses.append(parsed)
        print(f"[DEBUG] Query: {query} -> Response: {parsed}")
    df["Response"] = responses
    df.to_csv(csv_file, index=False)
    eval_result = evaluate_test_cases(params.app_name)
    print("Evaluation Result:")
    print(eval_result["result"])

    return {"result": f"Executed queries; responses saved in {csv_file}."}

# -----------------------------
# Function: Evaluate Test Cases (Direct Function Call)
# -----------------------------
def evaluate_test_cases(app_name: str) -> dict:
    """
    For each test case in the CSV, generate a reference answer using a reference LLM,
    evaluate the actual output against the reference using DeepEval,
    and save the per-test score and reason in new CSV columns.
    This function is called directly (not as a tool) to avoid recursion issues.
    """
    csv_file = f"{app_name}_test_cases.csv"
    marker_file = f"{app_name}_evaluation.marker"
    if os.path.exists(marker_file):
        return {"result": "Evaluation already completed; skipping re-evaluation."}
    
    try:
        df = pd.read_csv(csv_file)
    except Exception as e:
        return {"result": f"Could not read CSV {csv_file}: {str(e)}"}
    
    scores = []
    reasons = []
    reference_llm = ChatOpenAI(
        openai_api_key=os.environ["OPENAI_API_KEY"],
        model="gpt-3.5-turbo",
        temperature=0
    )
    
    for idx, row in df.iterrows():
        query = row["Test_Cases"]
        rag_answer = row["Response"]
        ref_prompt = f"Answer the following query in detail: {query}"
        # Force the reference LLM to return a string.
        reference_answer = str(reference_llm(ref_prompt))
        print("[DEBUG] Reference answer:", reference_answer)
        test_case = LLMTestCase(
            input=query,
            actual_output=rag_answer,
            retrieval_context=[reference_answer]
        )
        metric = AnswerRelevancyMetric(threshold=0.7)
        metric.measure(test_case)
        scores.append(metric.score)
        reasons.append(metric.reason)
    
    df["Answer_Score"] = scores
    df["Answer_Reason"] = reasons
    df.to_csv(csv_file, index=False)
    with open(marker_file, "w") as f:
        f.write("Evaluation complete.")
    avg_score = sum(scores) / len(scores) if scores else 0
    return {"result": f"Evaluation complete. Avg Score: {avg_score:.2f}. Results stored in {csv_file}."}

# -----------------------------
# Agent Setup: For Generation and Execution Only
# -----------------------------
tools = [generate_test_cases, execute_test_cases]
model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
checkpointer = MemorySaver()
agent = create_react_agent(model, tools, checkpointer=checkpointer)

# -----------------------------
# Main: Single Agent for Generation & Execution, then Direct Evaluation
# -----------------------------
if __name__ == "__main__":
    params = AppParams(
        app_name="RAG_FOR_AI",
        description="My app provides information about documents in a RAG model.",
        system_prompt=(
            "Hello, you are a helpful scientific assistant. Based on the provided documents, answer the user's query. "
            "Document: Artificial Intelligence (AI) is a rapidly advancing technology that is transforming industries and societies across the globe. "
            "In recent years, AI models have revolutionized sectors such as healthcare, automotive, finance, and entertainment. "
            "These technologies enable machines to simulate human-like cognitive functions with unprecedented accuracy."
        ),
        endpoint="http://127.0.0.1:5001/query",
        extra_definition="",
        k=5
    )
    
    # 1) Generate test queries using the agent.
    input_message_generate = {
        "messages": [
            {
                "role": "user",
                "content": (
                    f"Generate custom test queries for my app. "
                    f"App Name: {params.app_name}. "
                    f"Description: {params.description}. "
                    f"System Prompt: {params.system_prompt}. "
                    f"Endpoint: {params.endpoint}. "
                    f"Extra Definitions: {params.extra_definition}. "
                    f"Number of Test Queries: {params.k}."
                )
            }
        ]
    }
    gen_state = agent.invoke(input_message_generate, config={"configurable": {"thread_id": 1}})
    print("Generate Tool Response:")
    print(gen_state["messages"][-1].content)
    
    # 2) Execute test queries using the agent.
    input_message_execute = {
        "messages": [
            {
                "role": "user",
                "content": (
                    f"Execute test queries for my app. "
                    f"App Name: {params.app_name}. "
                    f"Endpoint: {params.endpoint}."
                )
            }
        ]
    }
    exe_state = agent.invoke(input_message_execute, config={"configurable": {"thread_id": 2}})
    print("Execute Tool Response:")
    print(exe_state["messages"][-1].content)


[DEBUG] Generating test queries...
[DEBUG] Raw LLM output: [
    "What is the RAG model?",
    "How has AI transformed industries?",
    "What sectors have AI models revolutionized?",
    "What functions can machines simulate with AI?",
    "What is the impact of AI on societies?"
]
[DEBUG] Parsed queries: ['What is the RAG model?', 'How has AI transformed industries?', 'What sectors have AI models revolutionized?', 'What functions can machines simulate with AI?', 'What is the impact of AI on societies?']
Generate Tool Response:
Custom test queries have been generated for the app "RAG_FOR_AI". You can download the test queries from the following link: [Download Test Queries](sandbox:/content/RAG_FOR_AI_test_cases.csv)
[DEBUG] Reading test queries from RAG_FOR_AI_test_cases.csv...
[DEBUG] Query: What is the RAG model? -> Response:  The RAG model, or Resource-Automated Generator model, is a type of artificial intelligence (AI) model that aims to mimic human cognitive functions by using l

Query >>> What is the RAG model?
Relevant docs (raw): [Document(id='ba283cc7-fc48-412e-9595-cb94db4e91ab', metadata={}, page_content='enable machines to simulate human-like cognitive functions, such as problem-solving, learning, and decision-making, with unprecedented accuracy and efficiency. AI models leverage vast amounts of data'), Document(id='1297de00-0d86-4daa-b6b7-529dabbbc869', metadata={}, page_content='improving patient outcomes by enabling early intervention.')]
Relevant docs are >>> enable machines to simulate human-like cognitive functions, such as problem-solving, learning, and decision-making, with unprecedented accuracy and efficiency. AI models leverage vast amounts of data
improving patient outcomes by enabling early intervention.


127.0.0.1 - - [04/Feb/2025 17:38:07] "POST /query HTTP/1.1" 200 -


Query >>> How has AI transformed industries?
Relevant docs (raw): [Document(id='0c3c7a78-eefa-4cc8-aaed-5e303a668797', metadata={}, page_content='unimaginable. As AI continues to evolve, it is reshaping the workforce, influencing economic trends, and even altering the way people interact with technology on a day-to-day basis.'), Document(id='808a58b8-8788-4ace-bef9-23e6ff60fdf3', metadata={}, page_content='Artificial Intelligence (AI) is a rapidly advancing technology that is transforming industries and societies across the globe. In recent years, the implementation of AI models has revolutionized')]
Relevant docs are >>> unimaginable. As AI continues to evolve, it is reshaping the workforce, influencing economic trends, and even altering the way people interact with technology on a day-to-day basis.
Artificial Intelligence (AI) is a rapidly advancing technology that is transforming industries and societies across the globe. In recent years, the implementation of AI models has revoluti

127.0.0.1 - - [04/Feb/2025 17:38:11] "POST /query HTTP/1.1" 200 -


Query >>> What sectors have AI models revolutionized?
Relevant docs (raw): [Document(id='35a7c708-9758-42fd-bc24-eff4499505ce', metadata={}, page_content='implementation of AI models has revolutionized sectors such as healthcare, automotive, finance, and entertainment, among many others. These technologies enable machines to simulate human-like'), Document(id='808a58b8-8788-4ace-bef9-23e6ff60fdf3', metadata={}, page_content='Artificial Intelligence (AI) is a rapidly advancing technology that is transforming industries and societies across the globe. In recent years, the implementation of AI models has revolutionized')]
Relevant docs are >>> implementation of AI models has revolutionized sectors such as healthcare, automotive, finance, and entertainment, among many others. These technologies enable machines to simulate human-like
Artificial Intelligence (AI) is a rapidly advancing technology that is transforming industries and societies across the globe. In recent years, the implementat

127.0.0.1 - - [04/Feb/2025 17:38:15] "POST /query HTTP/1.1" 200 -


Query >>> What functions can machines simulate with AI?
Relevant docs (raw): [Document(id='ba283cc7-fc48-412e-9595-cb94db4e91ab', metadata={}, page_content='enable machines to simulate human-like cognitive functions, such as problem-solving, learning, and decision-making, with unprecedented accuracy and efficiency. AI models leverage vast amounts of data'), Document(id='35a7c708-9758-42fd-bc24-eff4499505ce', metadata={}, page_content='implementation of AI models has revolutionized sectors such as healthcare, automotive, finance, and entertainment, among many others. These technologies enable machines to simulate human-like')]
Relevant docs are >>> enable machines to simulate human-like cognitive functions, such as problem-solving, learning, and decision-making, with unprecedented accuracy and efficiency. AI models leverage vast amounts of data
implementation of AI models has revolutionized sectors such as healthcare, automotive, finance, and entertainment, among many others. These tech

127.0.0.1 - - [04/Feb/2025 17:38:16] "POST /query HTTP/1.1" 200 -


Query >>> What is the impact of AI on societies?
Relevant docs (raw): [Document(id='0c3c7a78-eefa-4cc8-aaed-5e303a668797', metadata={}, page_content='unimaginable. As AI continues to evolve, it is reshaping the workforce, influencing economic trends, and even altering the way people interact with technology on a day-to-day basis.'), Document(id='808a58b8-8788-4ace-bef9-23e6ff60fdf3', metadata={}, page_content='Artificial Intelligence (AI) is a rapidly advancing technology that is transforming industries and societies across the globe. In recent years, the implementation of AI models has revolutionized')]
Relevant docs are >>> unimaginable. As AI continues to evolve, it is reshaping the workforce, influencing economic trends, and even altering the way people interact with technology on a day-to-day basis.
Artificial Intelligence (AI) is a rapidly advancing technology that is transforming industries and societies across the globe. In recent years, the implementation of AI models has revo

127.0.0.1 - - [04/Feb/2025 17:38:19] "POST /query HTTP/1.1" 200 -


[DEBUG] Reference answer: content='The RAG model is a project management tool used to assess and communicate the status of a project or task. RAG stands for Red, Amber, and Green, which are the three colors used to indicate the status of a project or task.\n\n- Red: This indicates that the project or task is at risk and requires immediate attention. It means that the project is behind schedule, over budget, or facing other significant issues that need to be addressed urgently.\n\n- Amber: This indicates that the project or task is at risk of becoming red if action is not taken. It means that there are some issues or concerns that need to be addressed to prevent the project from falling behind schedule or going over budget.\n\n- Green: This indicates that the project or task is on track and progressing as planned. It means that everything is going well and there are no major issues or concerns that need to be addressed.\n\nThe RAG model is often used in project management to provide a q

[DEBUG] Reference answer: content='Artificial Intelligence (AI) has transformed industries in numerous ways, revolutionizing the way businesses operate and making processes more efficient and effective. Some of the key ways in which AI has transformed industries include:\n\n1. Automation: AI has enabled automation of repetitive tasks, allowing businesses to streamline their operations and reduce the need for manual labor. This has led to increased productivity and cost savings for many industries.\n\n2. Data analysis: AI has the ability to analyze large amounts of data quickly and accurately, providing businesses with valuable insights that can be used to make informed decisions. This has revolutionized industries such as finance, healthcare, and marketing, where data analysis is crucial for success.\n\n3. Personalization: AI has enabled businesses to personalize their products and services to meet the specific needs of individual customers. This has led to improved customer satisfacti

[DEBUG] Reference answer: content='AI models have revolutionized a wide range of sectors across industries, transforming the way businesses operate and improving efficiency, productivity, and decision-making processes. Some of the sectors that have been significantly impacted by AI models include:\n\n1. Healthcare: AI models have revolutionized the healthcare industry by enabling more accurate diagnosis and treatment of diseases, personalized medicine, and predictive analytics for patient outcomes. AI-powered tools such as medical imaging analysis, virtual health assistants, and predictive analytics have helped healthcare providers deliver better care to patients.\n\n2. Finance: AI models have transformed the finance industry by automating processes, detecting fraud, and providing personalized financial services. AI-powered tools such as algorithmic trading, risk management systems, and chatbots have helped financial institutions streamline operations and improve customer service.\n\n3

[DEBUG] Reference answer: content='Machines with artificial intelligence (AI) have the capability to simulate a wide range of functions across various industries and applications. Some of the key functions that machines can simulate with AI include:\n\n1. Data analysis and prediction: AI-powered machines can analyze large volumes of data to identify patterns, trends, and insights that can help businesses make informed decisions. They can also predict future outcomes based on historical data, enabling organizations to anticipate market trends, customer behavior, and other variables.\n\n2. Natural language processing: AI-powered machines can understand and generate human language, enabling them to interact with users through speech or text. This capability is used in chatbots, virtual assistants, and other applications that require communication with users in a natural and intuitive way.\n\n3. Image and video recognition: AI algorithms can analyze images and videos to identify objects, p

[DEBUG] Reference answer: content='Artificial Intelligence (AI) has had a significant impact on societies in various ways. Some of the key impacts of AI on societies include:\n\n1. Automation of tasks: AI has the ability to automate repetitive and mundane tasks, which has led to increased efficiency and productivity in various industries. This has resulted in job displacement in some sectors, but has also created new job opportunities in AI-related fields.\n\n2. Improved decision-making: AI algorithms can analyze large amounts of data and provide insights that can help businesses and governments make better decisions. This has led to improved efficiency and effectiveness in various sectors, such as healthcare, finance, and transportation.\n\n3. Personalization: AI technologies have enabled personalized experiences for consumers, such as personalized recommendations on streaming platforms, personalized healthcare treatments, and personalized shopping experiences. This has improved custo

Evaluation Result:
Evaluation complete. Avg Score: 0.91. Results stored in RAG_FOR_AI_test_cases.csv.
Execute Tool Response:
Test queries have been executed for the app "RAG_FOR_AI". The responses have been saved in the file "RAG_FOR_AI_test_cases.csv".
