# Evaluate RAG with LlamaIndex, Arize AI

In [None]:
!pip install llama-index
!pip install azure-identity
!pip install python-dotenv
!pip install llama-index-vector-stores-azureaisearch
!pip install llama-index-embeddings-azure-openai
!pip install llama-index-llms-azure-openai
!pip install arize-phoenix
!pip install nest-asyncio
!pip install "openinference-instrumentation-llama-index>=2.0.0"
!pip install -U llama-index-callbacks-arize-phoenix
!pip install azure-search-documents==11.4.0

## Initial Setup
Load environment variables and initialize the necessary clients and models.

In [4]:
import os
from dotenv import load_dotenv
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.vector_stores.azureaisearch import AzureAISearchVectorStore, IndexManagement

# Load environment variables (python-dotenv reads them from a local .env file)
load_dotenv()

# Environment variables
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_CHAT_COMPLETION_DEPLOYED_MODEL_NAME = os.getenv("AZURE_OPENAI_CHAT_COMPLETION_DEPLOYED_MODEL_NAME")  # I'm using GPT-3.5-turbo
AZURE_OPENAI_EMBEDDING_DEPLOYED_MODEL_NAME = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYED_MODEL_NAME")  # I'm using text-embedding-ada-002
SEARCH_SERVICE_ENDPOINT = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")
SEARCH_SERVICE_API_KEY = os.getenv("AZURE_SEARCH_ADMIN_KEY")
INDEX_NAME = "contoso-hr-docs"
# Single source of truth for the Azure OpenAI API version used by both models
AZURE_OPENAI_API_VERSION = "2024-02-01"
# PHOENIX_API_KEY = os.getenv("PHOENIX_API_KEY")
# OTEL_EXPORTER_OTLP_HEADERS = f"api_key={PHOENIX_API_KEY}"
# PHOENIX_CLIENT_HEADERS = f"api_key={PHOENIX_API_KEY}"

# Fail fast with a readable message: os.getenv returns None for unset
# variables, which would otherwise surface later as an opaque client error.
_required_env = {
    "AZURE_OPENAI_ENDPOINT": AZURE_OPENAI_ENDPOINT,
    "AZURE_OPENAI_API_KEY": AZURE_OPENAI_API_KEY,
    "AZURE_OPENAI_CHAT_COMPLETION_DEPLOYED_MODEL_NAME": AZURE_OPENAI_CHAT_COMPLETION_DEPLOYED_MODEL_NAME,
    "AZURE_OPENAI_EMBEDDING_DEPLOYED_MODEL_NAME": AZURE_OPENAI_EMBEDDING_DEPLOYED_MODEL_NAME,
    "AZURE_SEARCH_SERVICE_ENDPOINT": SEARCH_SERVICE_ENDPOINT,
    "AZURE_SEARCH_ADMIN_KEY": SEARCH_SERVICE_API_KEY,
}
_missing_env = [name for name, value in _required_env.items() if not value]
if _missing_env:
    raise EnvironmentError(
        "Missing required environment variables: " + ", ".join(_missing_env)
    )

# Initialize Azure OpenAI and embedding models.
# The deployment name is reused as the model name for both clients.
llm = AzureOpenAI(
    model=AZURE_OPENAI_CHAT_COMPLETION_DEPLOYED_MODEL_NAME,
    deployment_name=AZURE_OPENAI_CHAT_COMPLETION_DEPLOYED_MODEL_NAME,
    api_key=AZURE_OPENAI_API_KEY,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_version=AZURE_OPENAI_API_VERSION,
)

embed_model = AzureOpenAIEmbedding(
    model=AZURE_OPENAI_EMBEDDING_DEPLOYED_MODEL_NAME,
    deployment_name=AZURE_OPENAI_EMBEDDING_DEPLOYED_MODEL_NAME,
    api_key=AZURE_OPENAI_API_KEY,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_version=AZURE_OPENAI_API_VERSION,
)

# Initialize search clients: one scoped to the service (index management),
# one scoped to the specific index named by INDEX_NAME.
credential = AzureKeyCredential(SEARCH_SERVICE_API_KEY)
index_client = SearchIndexClient(endpoint=SEARCH_SERVICE_ENDPOINT, credential=credential)
search_client = SearchClient(endpoint=SEARCH_SERVICE_ENDPOINT, index_name=INDEX_NAME, credential=credential)


## Launch Phoenix

In [5]:
# Imports for tracing and evaluation, grouped by package
import llama_index.core
import nest_asyncio
import phoenix as px
from llama_index.core import VectorStoreIndex
from llama_index.core.evaluation import AnswerRelevancyEvaluator, ContextRelevancyEvaluator
from llama_index.core.llama_dataset import download_llama_dataset
from llama_index.llms.openai import OpenAI
from openinference.instrumentation.llama_index import LlamaIndexInstrumentor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from phoenix.experiments import evaluate_experiment, run_experiment
from phoenix.experiments.types import Explanation, Score

# Patch asyncio so nested event loops work inside the notebook kernel
nest_asyncio.apply()

# Start the local Phoenix app; as the cell's last expression, the returned
# session object is displayed below the cell.
px.launch_app()

WARNI [phoenix.session.session] Existing running Phoenix instance detected! Shutting it down and starting a new instance...


🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


<phoenix.session.session.ThreadSession at 0x1b9c5336a50>

## Instrument LlamaIndex

In [6]:
from llama_index.core import set_global_handler

# Register the "arize_phoenix" handler as LlamaIndex's global handler
set_global_handler("arize_phoenix")

## Vector Store Initialization
Set up the vector store using Azure AI Search.

In [7]:
from llama_index.core.settings import Settings

# Make the Azure OpenAI chat and embedding models the global LlamaIndex defaults
Settings.llm, Settings.embed_model = llm, embed_model

# Field mapping and search options for the Azure AI Search vector store.
# The index is validated (IndexManagement.VALIDATE_INDEX) rather than created here.
vector_store_options = dict(
    search_or_index_client=index_client,
    index_name=INDEX_NAME,
    index_management=IndexManagement.VALIDATE_INDEX,
    id_field_key="id",
    chunk_field_key="text",
    embedding_field_key="embedding",
    embedding_dimensionality=1536,
    metadata_string_field_key="metadata",
    doc_id_field_key="doc_id",
    language_analyzer="en.lucene",
    vector_algorithm_type="exhaustiveKnn",
)

# Initialize the vector store
vector_store = AzureAISearchVectorStore(**vector_store_options)


## Use Existing Index
This notebook reuses my existing "contoso-hr-docs" index. To learn how to create an index and load documents from scratch, see [this notebook](https://github.com/farzad528/azure-ai-search-python-playground/blob/addb1a29e70ee9dbf1bb9a39bbe367aa15e4cf5f/azure-ai-search-rag-eval-trulens.ipynb#L145).

In [8]:
# Point a storage context at the Azure AI Search vector store, then build the
# index object over it with an empty document list (no new documents are passed in).
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(documents=[], storage_context=storage_context)

## Query Execution
Execute a query to test the setup.

In [18]:
# Query execution
# display_response is imported here because this cell is the first to use it;
# relying on a later cell's import raises NameError on Restart & Run All.
from llama_index.core.response.notebook_utils import display_response
from llama_index.core.schema import MetadataMode

query = "Does my health plan cover scuba diving?"
query_engine = index.as_query_engine(llm=llm, similarity_top_k=3)
response = query_engine.query(query)

# Render the response in the notebook
display_response(response)
print("\n")

# Print what the LLM sees: each source node's content in LLM metadata mode
for node in response.source_nodes:
    print(node.get_content(metadata_mode=MetadataMode.LLM))

**`Final Response:`** It is important to review the plan's evidence of coverage to determine if scuba diving is covered under the health plan. Additionally, discussing this with your healthcare provider and reviewing the list of excluded services and prescriptions will help clarify whether scuba diving is covered. If scuba diving is not covered under the plan, it is advisable to explore alternative coverage options or discuss payment options with your healthcare provider.



page_label: 90
file_path: c:\Dev\azure-ai-search-python-playground\data\pdf\Northwind_Health_Plus_Benefits_Details.pdf

benefits for mental health and 
substance abuse services as it does for medical and surgical benefits. This includes covering 
services that are medically necessary, suc h as inpatient and outpatient services, medication 
management, and psychological and psychosocial therapies.  
It is important to note that the plan may not provide coverage or impose any limits or 
exclusions that are not in compliance with applicable laws a nd regulations. Additionally, the 
plan may not discriminate against individuals based on their medical condition or health 
status. Individuals who feel they have been discriminated against should contact the 
Department of Labor, who can investigate the iss ue. 
Finally, it is important to note that the plan may not provide coverage or impose any limits 
or exclusions that are not in compliance with applicable laws and regulations. Additiona

## Compare Different Query Engines
Evaluate and compare the responses from different query engines.

In [54]:

from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.vector_stores.types import VectorStoreQueryMode
from llama_index.core.response.notebook_utils import display_response
from llama_index.core import get_response_synthesizer
import pprint

# One response synthesizer, shared by all three query engines
response_synthesizer = get_response_synthesizer()

# Number of results each retriever requests
RETRIEVER_TOP_K = 10


def _retriever_for(mode):
    """Return a retriever over ``index`` that uses the given vector-store query mode."""
    return index.as_retriever(vector_store_query_mode=mode, similarity_top_k=RETRIEVER_TOP_K)


def _engine_for(retriever):
    """Wrap ``retriever`` in a query engine that uses the shared response synthesizer."""
    return RetrieverQueryEngine(retriever=retriever, response_synthesizer=response_synthesizer)


# Retrievers: sparse (keyword), hybrid, and semantic hybrid
keyword_retriever = _retriever_for(VectorStoreQueryMode.SPARSE)
hybrid_retriever = _retriever_for(VectorStoreQueryMode.HYBRID)
semantic_hybrid_retriever = _retriever_for(VectorStoreQueryMode.SEMANTIC_HYBRID)

# Query engines, one per retriever
keyword_query_engine = _engine_for(keyword_retriever)
hybrid_query_engine = _engine_for(hybrid_retriever)
semantic_hybrid_query_engine = _engine_for(semantic_hybrid_retriever)

# Evaluate RAG with Arize AI

## Run Your Query Engine and View Your Traces in Phoenix

In [55]:
from tqdm import tqdm
import json
from openinference.instrumentation import using_metadata
from phoenix.trace import using_project

# Load all evaluation questions from queries.jsonl (one JSON object per line)
eval_questions = []
# Explicit UTF-8 so the file is decoded the same way on every platform
with open("eval/queries.jsonl", "r", encoding="utf-8") as file:
    for line in file:
        line = line.strip()
        if not line:
            # Tolerate blank or trailing empty lines instead of failing in json.loads
            continue
        eval_questions.append(json.loads(line))

# List of query engines and their respective project names
query_engines = [
    (keyword_query_engine, "Keyword"),
    (hybrid_query_engine, "Hybrid"),
    (semantic_hybrid_query_engine, "Semantic_Hybrid"),
]

# Loop through each question and query it against each engine
for query_data in tqdm(eval_questions):
    query = query_data["query"]
    query_classification = query_data.get("query_classification", "undefined")  # Default to 'undefined' if not present
    # using_metadata takes a mapping, so wrap the classification under a named key
    metadata = {"query_classification": query_classification}

    for engine, project_name in query_engines:
        try:
            # Attach the project name and metadata to everything run inside this block
            with using_project(project_name), using_metadata(metadata):
                # The result is discarded on purpose; the call is made for its side effects
                engine.query(query)
        except Exception as e:
            # Best effort: report the failure and continue with the remaining engines/questions
            print(f"Error querying {project_name} for query '{query}': {e}")


  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [11:19<00:00,  6.80s/it]


## Export and Evaluate Your Trace Data

In [56]:
from phoenix.evals import (
    HallucinationEvaluator,
    OpenAIModel,
    QAEvaluator,
    RelevanceEvaluator,
    run_evals,
)
from phoenix.session.evaluation import get_qa_with_reference, get_retrieved_documents
from phoenix.trace import DocumentEvaluations, SpanEvaluations
from tqdm import tqdm

# Phoenix projects to export, in the order the result variables are unpacked
_trace_projects = ("Keyword", "Hybrid", "Semantic_Hybrid")

# Question/answer/reference DataFrames, one per project
keyword_queries_df, hybrid_queries_df, semantic_hybrid_queries_df = [
    get_qa_with_reference(px.Client(), project_name=name) for name in _trace_projects
]

# Retrieved-document DataFrames, one per project
(
    keyword_retrieved_documents_df,
    hybrid_retrieved_documents_df,
    semantic_hybrid_retrieved_documents_df,
) = [get_retrieved_documents(px.Client(), project_name=name) for name in _trace_projects]

In [57]:
from phoenix.evals import (
    HallucinationEvaluator,
    OpenAIModel,
    QAEvaluator,
    RelevanceEvaluator,
    run_evals,
)
from phoenix.session.evaluation import get_qa_with_reference, get_retrieved_documents
from phoenix.trace import DocumentEvaluations, SpanEvaluations
from tqdm import tqdm

# Define the evaluation model
# NOTE(review): `azure_deployment` is hard-coded to "gpt-4o" while `model` is the
# chat deployment name read from the environment — confirm the two are meant to differ.
eval_model = OpenAIModel(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    azure_deployment="gpt-4o",  # I'm using gpt-4o for evaluation
    model=AZURE_OPENAI_CHAT_COMPLETION_DEPLOYED_MODEL_NAME,
    api_key=AZURE_OPENAI_API_KEY,
    api_version="2024-02-01"
)

# Define evaluators (all backed by the same evaluation model)
hallucination_evaluator = HallucinationEvaluator(eval_model)
qa_correctness_evaluator = QAEvaluator(eval_model)
relevance_evaluator = RelevanceEvaluator(eval_model)

# List of project names
projects = ["Keyword", "Hybrid", "Semantic_Hybrid"]

# One client is enough for every export and log call below
phoenix_client = px.Client()

# Loop through each project and perform evaluations
for project in projects:
    # Create queries and retrieved documents DataFrames for the project
    queries_df = get_qa_with_reference(phoenix_client, project_name=project)
    retrieved_documents_df = get_retrieved_documents(phoenix_client, project_name=project)

    # Skip projects without trace data instead of failing inside run_evals
    if (
        queries_df is None
        or queries_df.empty
        or retrieved_documents_df is None
        or retrieved_documents_df.empty
    ):
        print(f"No trace data found for project '{project}'; skipping evaluation.")
        continue

    # Run evaluations: run_evals returns one DataFrame per evaluator, in order
    hallucination_eval_df, qa_correctness_eval_df = run_evals(
        dataframe=queries_df,
        evaluators=[hallucination_evaluator, qa_correctness_evaluator],
        provide_explanation=True,
    )
    relevance_eval_df = run_evals(
        dataframe=retrieved_documents_df,
        evaluators=[relevance_evaluator],
        provide_explanation=True,
    )[0]

    # Log evaluations back to Phoenix under project-specific names
    phoenix_client.log_evaluations(
        SpanEvaluations(eval_name=f"Hallucination_{project}", dataframe=hallucination_eval_df),
        SpanEvaluations(eval_name=f"QA Correctness_{project}", dataframe=qa_correctness_eval_df),
        DocumentEvaluations(eval_name=f"Relevance_{project}", dataframe=relevance_eval_df),
    )

run_evals |████████▏ | 161/198 (81.3%) | ⏳ 05:12<01:31 |  2.49s/it 

Worker timeout, requeuing


run_evals |████████▏ | 162/198 (81.8%) | ⏳ 05:14<01:09 |  1.93s/it 

Worker timeout, requeuing


run_evals |██████████| 198/198 (100.0%) | ⏳ 06:24<00:00 |  1.94s/it
run_evals |██████████| 990/990 (100.0%) | ⏳ 06:50<00:00 |  2.41it/s 
run_evals |████▏     | 83/200 (41.5%) | ⏳ 02:31<03:22 |  1.73s/it 

Worker timeout, requeuing


run_evals |████▏     | 83/200 (41.5%) | ⏳ 02:32<03:22 |  1.73s/it 

Worker timeout, requeuing


run_evals |████▏     | 83/200 (41.5%) | ⏳ 02:33<03:22 |  1.73s/it 

Worker timeout, requeuing


run_evals |████▎     | 85/200 (42.5%) | ⏳ 02:34<09:41 |  5.06s/it 

Worker timeout, requeuing


run_evals |████▎     | 87/200 (43.5%) | ⏳ 02:35<05:56 |  3.15s/it 

Worker timeout, requeuing


run_evals |█████▋    | 113/200 (56.5%) | ⏳ 03:33<01:33 |  1.07s/it 

Worker timeout, requeuing


run_evals |██████████| 200/200 (100.0%) | ⏳ 05:57<00:00 |  1.79s/it
run_evals |██████████| 1000/1000 (100.0%) | ⏳ 06:32<00:00 |  2.55it/s
run_evals |██▊       | 56/200 (28.0%) | ⏳ 02:02<11:03 |  4.61s/it 

Worker timeout, requeuing


run_evals |██▊       | 56/200 (28.0%) | ⏳ 02:02<11:03 |  4.61s/it 

Worker timeout, requeuing


run_evals |██▊       | 56/200 (28.0%) | ⏳ 02:03<11:03 |  4.61s/it 

Worker timeout, requeuing
Worker timeout, requeuing


run_evals |██▉       | 59/200 (29.5%) | ⏳ 02:05<11:02 |  4.70s/it 

Worker timeout, requeuing


run_evals |███       | 60/200 (30.0%) | ⏳ 02:06<08:40 |  3.72s/it 

Worker timeout, requeuing


run_evals |███       | 60/200 (30.0%) | ⏳ 02:06<08:40 |  3.72s/it 

Worker timeout, requeuing


run_evals |████▉     | 99/200 (49.5%) | ⏳ 03:28<01:31 |  1.11it/s 

Worker timeout, requeuing


run_evals |█████▎    | 106/200 (53.0%) | ⏳ 03:38<03:39 |  2.34s/it 

Worker timeout, requeuing


run_evals |█████▍    | 108/200 (54.0%) | ⏳ 04:02<03:34 |  2.34s/it 

Worker timeout, requeuing


run_evals |█████▍    | 108/200 (54.0%) | ⏳ 04:04<03:34 |  2.34s/it 

Worker timeout, requeuing
Worker timeout, requeuing
Worker timeout, requeuing


run_evals |█████▌    | 112/200 (56.0%) | ⏳ 04:05<05:32 |  3.78s/it 

Worker timeout, requeuing


run_evals |█████▌    | 112/200 (56.0%) | ⏳ 04:06<05:32 |  3.78s/it 

Worker timeout, requeuing
Worker timeout, requeuing


run_evals |████████  | 162/200 (81.0%) | ⏳ 06:03<00:52 |  1.37s/it 

Worker timeout, requeuing


run_evals |█████████▍| 189/200 (94.5%) | ⏳ 07:05<00:11 |  1.08s/it 

Worker timeout, requeuing


run_evals |█████████▌| 191/200 (95.5%) | ⏳ 07:07<00:49 |  5.50s/it 

Worker timeout, requeuing


run_evals |██████████| 200/200 (100.0%) | ⏳ 07:39<00:00 |  2.30s/it
run_evals |██████████| 1000/1000 (100.0%) | ⏳ 07:02<00:00 |  2.37it/s
