# Weaviate RAG with Arize Hallucination Evaluation

This notebook demonstrates:
1. Loading E-Commerce FAQ data from HuggingFace into Weaviate
2. Creating a RAG system to answer user questions
3. Using the Arize Phoenix `llm_classify` evaluator to check each answer for hallucinations
4. Implementing a feedback loop that retries with an anti-hallucination prompt when a hallucination is detected


In [None]:

%pip install -q arize-phoenix-evals openai weaviate-client datasets huggingface-hub nest-asyncio

In [None]:
# Import required libraries
import os
import pandas as pd
import weaviate
import nest_asyncio
from datasets import load_dataset
from dotenv import load_dotenv
from phoenix.evals import llm_classify, OpenAIModel
from openai import OpenAI
import time
from typing import Dict, List, Tuple

# Load credentials from a local .env file into os.environ.
load_dotenv()

# Patch the notebook's already-running event loop so phoenix.evals can submit
# evaluation requests asynchronously from inside Jupyter. nest_asyncio was
# imported above but never applied.
nest_asyncio.apply()

In [None]:

# Connection settings are read from the environment (populated by load_dotenv()).
url = os.environ.get("WEAVIATE_URL")
api_key = os.environ.get("WEAVIATE_API_KEY")
openai_key = os.environ.get("OPENAI_API_KEY")
aws_access_key = os.environ.get("AWS_ACCESS_KEY")
aws_secret_key = os.environ.get("AWS_SECRET_KEY")

# Fail fast with a readable message: os.environ.get() returns None for unset
# variables, and passing None as the cluster URL, API key or a header value
# only surfaces later as an obscure connection error.
_required_settings = {
    "WEAVIATE_URL": url,
    "WEAVIATE_API_KEY": api_key,
    "OPENAI_API_KEY": openai_key,
    "AWS_ACCESS_KEY": aws_access_key,
    "AWS_SECRET_KEY": aws_secret_key,
}
_missing_settings = [name for name, value in _required_settings.items() if not value]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_settings)
    )

# The OpenAI key is forwarded for the text2vec-openai vectorizer and the AWS
# keys for the Bedrock generative module configured on the collection.
weaviate_client = weaviate.connect_to_weaviate_cloud(
    cluster_url=url,
    auth_credentials=weaviate.auth.AuthApiKey(api_key),
    headers={
        "X-OpenAI-Api-Key": openai_key,
        "X-AWS-Access-Key": aws_access_key,
        "X-AWS-Secret-Key": aws_secret_key,
    }
)

In [None]:
from weaviate.classes.config import Configure, Property, DataType

collection_name = "EcommerceFAQ"

# Start from a clean slate so this cell can be re-run without create() failing
# on an existing collection. The existence check is explicit: a bare `except:`
# around delete() would also swallow connection/auth errors (and even
# KeyboardInterrupt) and misreport them as "collection doesn't exist".
if weaviate_client.collections.exists(collection_name):
    weaviate_client.collections.delete(collection_name)
    print(f"Deleted existing collection: {collection_name}")
else:
    print(f"Collection {collection_name} doesn't exist, creating new one")

# Every FAQ field is stored as text. Objects are vectorized with OpenAI
# embeddings and answers are generated through AWS Bedrock.
faq_collection = weaviate_client.collections.create(
    name=collection_name,
    properties=[
        Property(name="question", data_type=DataType.TEXT),
        Property(name="answer", data_type=DataType.TEXT),
        Property(name="category", data_type=DataType.TEXT),
        Property(name="parent_category", data_type=DataType.TEXT),
        Property(name="question_id", data_type=DataType.TEXT),
        Property(name="category_id", data_type=DataType.TEXT),
    ],
    vectorizer_config=Configure.Vectorizer.text2vec_openai(
        model="text-embedding-3-small"
    ),
    generative_config=Configure.Generative.aws(
        region="eu-west-2",
        service="bedrock",
        model="amazon.titan-text-express-v1"
    )
)
print(f"Created collection: {collection_name}")

In [None]:
# Download the FAQ dataset from the Hugging Face Hub and keep its "train"
# split as a pandas DataFrame for the import step.
print(" Loading E-Commerce FAQ dataset...")
dataset = load_dataset("NebulaByte/E-Commerce_FAQs")
train_split = dataset["train"]
faq_data = train_split.to_pandas()

# Quick sanity summary: row count and the available columns.
faq_entry_count = len(faq_data)
faq_column_names = faq_data.columns.tolist()
print(f"Dataset loaded: {faq_entry_count} FAQ entries")
print("Dataset columns:", faq_column_names)

In [None]:
print("Importing FAQ data...")
with weaviate_client.batch.dynamic() as batch:
    # The DataFrame index is not needed; only the row values are stored.
    for _, row in faq_data.iterrows():
        batch.add_object(
            collection=collection_name,
            properties={
                "question": row["question"],
                "answer": row["answer"],
                "category": row["category"],
                "parent_category": row["parent_category"],
                # IDs are declared as TEXT properties, so store them as strings.
                "question_id": str(row["question_id"]),
                "category_id": str(row["category_id"])
            }
        )

# Batch imports collect per-object failures on the client rather than raising,
# so inspect them before reporting how many entries were imported.
failed_objects = weaviate_client.batch.failed_objects
if failed_objects:
    print(f"Failed to import {len(failed_objects)} of {len(faq_data)} FAQ entries")
    print(f"First failure: {failed_objects[0]}")

print(f"Imported {len(faq_data) - len(failed_objects)} FAQ entries")

### `weaviate_rag`: core Weaviate RAG function

Runs retrieval and generation in Weaviate and returns only the essential data needed for evaluation.

**Args:**
- `question`: User's question, used as the search query
- `generative_prompt`: The complete prompt template for generation
- `limit`: Number of search results to retrieve

**Returns:**
- Dictionary with the answer and the context used for evaluation

In [None]:
from weaviate.classes.config import Configure
from weaviate.classes.generate import GenerativeConfig

def weaviate_rag(question: str, generative_prompt: str, limit: int = 3) -> Dict:
    """Run a hybrid search plus grouped generation against the FAQ collection.

    Args:
        question: User's question, used as the hybrid search query.
        generative_prompt: Prompt passed as the grouped generation task.
        limit: Number of search results to retrieve.

    Returns:
        Dictionary with two string entries:
        "answer" is the generated text (empty string when nothing was
        generated) and "context" is the retrieved FAQ entries joined into one
        block. Failures are not raised: the error text is returned in
        "answer" and "context" is left empty.
    """
    try:
        rag_response = faq_collection.generate.hybrid(
            query=question,
            limit=limit,
            grouped_task=generative_prompt,
            return_metadata=["score"],
        )

        # One formatted snippet per retrieved FAQ entry, separated by a rule so
        # the evaluator can tell the entries apart.
        context = "\n\n---\n\n".join(
            f"Q: {hit.properties['question']}\n"
            f"A: {hit.properties['answer']}\n"
            f"Category: {hit.properties['category']}"
            for hit in rag_response.objects
        )

        return {
            "answer": rag_response.generated or "",
            "context": context,
        }
    except Exception as err:
        return {"answer": f"Error occurred: {str(err)}", "context": ""}

### `evaluate_hallucination_in_RAG_response`: hallucination check

Evaluates whether the answer contains hallucinated information, using `llm_classify`.

**Args:**
- `question`: User's question
- `answer`: RAG system's answer
- `context`: Context used to generate the answer

**Returns:**
- Dictionary with the evaluation results from `llm_classify`

In [None]:
# Prompt for the LLM judge. The {question}, {context} and {answer} placeholders
# match the column names of the DataFrame handed to llm_classify.
RAG_HALLUCINATION_TEMPLATE = """
You are evaluating whether an AI assistant's response contains hallucinated information when answering a question based on provided context.

[BEGIN DATA]
************
[Question]: {question}
************
[Context]: {context}
************
[Response]: {answer}
[END DATA]

Evaluate if the response contains information that is NOT supported by the provided context.

Hallucination occurs when:
- The response includes facts, numbers, or details not found in the context
- The response makes claims that contradict the context
- The response invents specific information not mentioned in the context

NOT hallucination when:
- The response says it doesn't have enough information
- The response only uses information from the context
- The response makes reasonable inferences clearly based on the context

Your answer must be a single word: "hallucinated" or "factual".
"""

# Judge model used for every evaluation; temperature 0 is requested for the
# classification calls.
eval_judge_model = OpenAIModel(
    model="gpt-4o",
    temperature=0,
    api_key=openai_key,
)

def evaluate_hallucination_in_RAG_response(question: str, answer: str, context: str) -> Dict:
    """Classify a single RAG answer as "hallucinated" or "factual".

    Args:
        question: User's question.
        answer: RAG system's answer to judge.
        context: Context the answer was generated from.

    Returns:
        The first (and only) row of the llm_classify result converted to a
        dictionary. When that row has an "explanation" entry it is also
        printed.
    """
    # llm_classify evaluates DataFrames, so wrap the single example in a
    # one-row frame whose columns match the template placeholders.
    single_example = pd.DataFrame(
        [{"question": question, "answer": answer, "context": context}]
    )

    classification = llm_classify(
        data=single_example,
        template=RAG_HALLUCINATION_TEMPLATE,
        model=eval_judge_model,
        rails=["hallucinated", "factual"],
        provide_explanation=True,
    )

    verdict = classification.iloc[0].to_dict()

    if "explanation" in verdict:
        print(f"  Explanation: {verdict['explanation']}")

    return verdict


print("Hallucination evaluator defined (using llm_classify)")

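### `complete_rag_with_hallucination_check`: main workflow

Main RAG workflow with hallucination detection and retry.

**Args:**
- `question`: User's question
- `max_retries`: Maximum number of retries if a hallucination is detected

**Returns:**
- Dictionary with the final answer and the details of every attempt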

In [None]:
def _run_rag_attempt(question: str, instructions: str, attempt_number: int) -> Dict:
    """Generate one answer with the given instructions and evaluate it."""
    is_retry = attempt_number > 1

    # Interpolate the real user question. The previous f-string used
    # `{{{{ question }}}}`, which renders as the literal text `{{ question }}`,
    # so the question itself never appeared in the generation prompt.
    generative_prompt = (
        f"{instructions}\n\nUser Question: {question}\n\n"
        "Based on the context provided, please answer the question:"
    )
    weaviate_result = weaviate_rag(question, generative_prompt)

    rag_result = {
        "question": question,
        "answer": weaviate_result["answer"],
        "context": weaviate_result["context"]
    }
    print(f"{'New answer' if is_retry else 'Generated answer'}: {rag_result['answer']}")

    print("Re-evaluating for hallucination..." if is_retry else "Evaluating for hallucination...")
    eval_result = evaluate_hallucination_in_RAG_response(
        question=rag_result["question"],
        answer=rag_result["answer"],
        context=rag_result["context"]
    )
    print(f"{'Re-evaluation' if is_retry else 'Evaluation'} result: {eval_result['label']}")

    return {
        "attempt": attempt_number,
        "rag_result": rag_result,
        "eval_result": eval_result
    }


def complete_rag_with_hallucination_check(question: str, max_retries: int = 1) -> Dict:
    """Answer a question with RAG, retrying when the answer is judged hallucinated.

    The first attempt uses the base customer-service prompt. While the
    evaluator labels the answer "hallucinated" and retries remain, the answer
    is regenerated with an additional anti-hallucination warning.

    Args:
        question: User's question.
        max_retries: Maximum number of retries if a hallucination is detected.

    Returns:
        Dictionary with the keys "question", "final_answer",
        "final_evaluation", "total_attempts", "hallucination_resolved" and
        "all_attempts". "hallucination_resolved" is True only when the final
        label is "factual".
    """
    base_prompt = """
    You are a helpful e-commerce customer service assistant. Answer the user's question based ONLY on the provided context from the FAQ database.
    
    IMPORTANT RULES:
    - Only use information from the provided FAQ context
    - Be concise and helpful
    - Don't make up information that's not in the context
    """

    # Base prompt plus the anti-hallucination warning used on every retry.
    # It does not change between retries, so it is built once.
    retry_prompt = base_prompt + """
        
        CRITICAL WARNING:
        Your previous response was flagged as potentially containing hallucinated information.
        Be EXTREMELY careful to only use information explicitly stated in the provided context.
        If you cannot find the answer in the context, clearly state that you don't have enough information.
        Do NOT add any information that is not directly mentioned in the context.
        """

    print("Attempt 1:")
    attempts = [_run_rag_attempt(question, base_prompt, 1)]

    # Retry loop if hallucination detected
    retry_count = 0
    while attempts[-1]["eval_result"]["label"] == "hallucinated" and retry_count < max_retries:
        retry_count += 1
        attempt_number = retry_count + 1
        print(f"\nHallucination detected! Retrying (Attempt {attempt_number})...")
        attempts.append(_run_rag_attempt(question, retry_prompt, attempt_number))

    final_attempt = attempts[-1]
    eval_result = final_attempt["eval_result"]
    final_label = eval_result["label"]

    # Only an explicit "factual" verdict counts as resolved. Any other label
    # (for example when the judge's output could not be mapped to a rail) must
    # not be reported as a factual answer.
    is_factual = final_label == "factual"

    final_result = {
        "question": question,
        "final_answer": final_attempt["rag_result"]["answer"],
        "final_evaluation": eval_result,
        "total_attempts": len(attempts),
        "hallucination_resolved": is_factual,
        "all_attempts": attempts
    }

    if final_label == "hallucinated":
        print(f"\nFinal result: Hallucination still detected after {retry_count} retries")
    elif is_factual:
        print(f"\nFinal result: Answer is factual (resolved in {len(attempts)} attempt{'s' if len(attempts) > 1 else ''})")
    else:
        print(f"\nFinal result: Evaluation inconclusive (label: {final_label})")

    return final_result


print("Complete RAG workflow with hallucination check defined")

In [None]:
# Sample questions covering typical FAQ topics plus one off-topic probe.
test_questions = [
    "What should I do if I missed my delivery?",
    "Can I use my saved cards on mobile app?",
    "How do I return a product?",
    "What is your refund policy for electronics?",
    "Can you tell me about your space program?",  # Edge case: completely irrelevant
]

print("Testing RAG workflow with sample questions\n")

divider = "=" * 60
question_total = len(test_questions)

test_results = []
for position, question in enumerate(test_questions, start=1):
    print(f"\n{divider}")
    print(f"Question {position}/{question_total}: {question}")
    print(divider)

    result = complete_rag_with_hallucination_check(question)
    test_results.append(result)

    is_factual = result["final_evaluation"]["label"] == "factual"
    print(f"\nFinal Answer: {result['final_answer']}")
    print(f"Factual: {is_factual}")
    print(f"Total Attempts: {result['total_attempts']}")

    # Short pause between questions.
    time.sleep(1)

print(f"\n\nCompleted {len(test_results)} tests")

In [None]:
# Cleanup: release the Weaviate connection opened earlier in the notebook.
try:
    weaviate_client.close()
    print("Weaviate connection closed")
except Exception as exc:
    # Report the real failure instead of assuming the connection was already
    # closed; a bare `except:` would also trap KeyboardInterrupt/SystemExit.
    print(f"Could not close Weaviate connection: {exc}")