In [1]:
import os
import re

import weaviate
from dotenv import load_dotenv
from langsmith import Client
from langsmith import traceable, evaluate
from langsmith.wrappers import wrap_openai
from openai import OpenAI
from weaviate.classes.init import Auth
from weaviate.classes.query import MetadataQuery

In [3]:
def get_answer(query: str,
               weaviate_client,
               openai_client,
               collection_name: str = "TeslaCybertruck",
               num_chunks: int = 3,
               embedding_model: str = "text-embedding-3-large",
               chat_model: str = "gpt-4") -> str:
    """
    Get an answer to a question using RAG (Retrieval Augmented Generation).

    The query is embedded, the nearest chunks are fetched from Weaviate, and a
    chat model is asked to answer using only those chunks as context.

    Args:
        query (str): The user's question
        weaviate_client: Initialized Weaviate client
        openai_client: Initialized OpenAI client
        collection_name (str): Name of the Weaviate collection to query
        num_chunks (int): Number of relevant chunks to retrieve
        embedding_model (str): OpenAI embedding model used for the query vector.
            NOTE(review): presumably this must be the model the collection was
            embedded with -- confirm before overriding.
        chat_model (str): OpenAI chat model used to generate the answer

    Returns:
        str: The generated answer
    """
    # 1. Generate embedding for the query
    embedding_response = openai_client.embeddings.create(
        model=embedding_model,
        input=query
    )
    query_embedding = embedding_response.data[0].embedding

    # 2. Retrieve relevant chunks from Weaviate
    collection = weaviate_client.collections.get(collection_name)
    similar_texts = collection.query.near_vector(
        near_vector=query_embedding,
        limit=num_chunks,
        return_properties=["text"],
        return_metadata=MetadataQuery(distance=True)
    )

    # 3. Combine retrieved contexts.
    # Objects whose "text" property is missing, None or empty are skipped;
    # a None entry would otherwise make str.join raise TypeError.
    retrieved_texts = (doc.properties.get("text") for doc in similar_texts.objects)
    context_str = "\n\n---\n\n".join(
        text for text in retrieved_texts if isinstance(text, str) and text
    )

    # 4. Create prompt for the chat model.
    # Built from explicit pieces so the exact whitespace sent to the model
    # (including the trailing space on the first line) is visible in the code.
    prompt = (
        "Answer the question using ONLY the information provided in the context below. \n"
        "    Do not add any general knowledge or information not contained in the context.\n"
        "\n"
        "    Context:\n"
        f"    {context_str}\n"
        "\n"
        f"    Question: {query}\n"
        "\n"
        "    Answer:"
    )

    # 5. Generate answer using the configured chat model
    chat_response = openai_client.chat.completions.create(
        model=chat_model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that answers questions based on the provided context."},
            {"role": "user", "content": prompt}
        ],
        temperature=0
    )

    return chat_response.choices[0].message.content


In [4]:
@traceable
def rag_agent(inputs: dict, weaviate_client, openai_client) -> dict:
    """
    Answer the most recent message of a conversation through the RAG pipeline.

    Args:
        inputs (dict): Payload with a "messages" list; only the last entry's
            "content" is used as the question
        weaviate_client: Initialized Weaviate client, forwarded to get_answer
        openai_client: Initialized OpenAI client, forwarded to get_answer

    Returns:
        dict: {"message": {"role": "assistant", "content": <answer>}}
    """
    # Earlier messages (if any) are ignored; the final one is the question.
    latest_message = inputs["messages"][-1]

    reply_text = get_answer(
        query=latest_message["content"],
        weaviate_client=weaviate_client,
        openai_client=openai_client,
    )

    assistant_message = {"role": "assistant", "content": reply_text}
    return {"message": assistant_message}

In [5]:
def evaluate_rag_system(dataset_name: str = "rag_evaluation_dataset", 
                       experiment_prefix: str = "RAG Test Dataset Evaluation"):
    """
    Evaluate the RAG system using questions from a LangSmith dataset.
    
    Args:
        dataset_name (str): Name of the LangSmith dataset
        experiment_prefix (str): Prefix for the experiment name
    """
    # Setup
    load_dotenv()
    
    # Initialize clients
    wcd_url = os.environ["WCD_URL"]
    wcd_api_key = os.environ["WCD_API_KEY"]
    
    weaviate_client = weaviate.connect_to_weaviate_cloud(
        cluster_url=wcd_url,
        auth_credentials=Auth.api_key(wcd_api_key),
        skip_init_checks=True
    )
    
    base_openai_client = OpenAI()
    openai_client = wrap_openai(base_openai_client)
    
    # Define the evaluator
    def answer_relevance_evaluator(run, example) -> dict:
        """
        Evaluates the relevance and accuracy of the RAG system's answer
        """

        question = run.inputs["inputs"]["messages"][-1]["content"]
        generated_answer = run.outputs["message"]["content"]
        reference_answer = example.outputs["message"]["content"]
        
        print(question)
        print(generated_answer)
        print(reference_answer)
        
        evaluation_prompt = f"""
        Question: {question}
        
        Generated Answer: {generated_answer}
        
        Reference Answer: {reference_answer}
        
        Score the generated answer from 0-5:
        5 = Perfect match with reference, complete and accurate
        4 = Very good, minor differences from reference
        3 = Acceptable, but missing some details or slightly inaccurate
        2 = Partially correct but significant omissions or inaccuracies
        1 = Mostly incorrect or irrelevant
        0 = Completely wrong or unrelated
        
        Return only the number (0-5).
        """
        
        response = openai_client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are an evaluation assistant. Respond only with a number 0-5."},
                {"role": "user", "content": evaluation_prompt}
            ],
            temperature=0
        )
        
        try:
            score = int(response.choices[0].message.content.strip())
            return {
                "key": "answer_relevance",
                "score": score / 5,  # Normalize to 0-1
                "explanation": f"Answer relevance score: {score}/5"
            }
        except ValueError:
            return {
                "key": "answer_relevance",
                "score": 0,
                "explanation": "Failed to parse score"
            }
    
    # Create a wrapped version of rag_agent that includes the clients
    def agent(inputs: dict) -> dict:
        return rag_agent(inputs, weaviate_client, openai_client)
    
    # Run the evaluation
    results = evaluate(
        agent,
        data=dataset_name,
        evaluators=[answer_relevance_evaluator],
        experiment_prefix=experiment_prefix
    )
    
    # Clean up
    weaviate_client.close()
    
    return results


In [None]:
# Run the evaluation with the default dataset name and experiment prefix,
# then print the returned results object.
# NOTE(review): inside a notebook kernel __name__ is normally "__main__", so
# this guard is expected to pass every time the cell is executed -- confirm.
if __name__ == "__main__":
    results = evaluate_rag_system()
    print("Evaluation Results:", results)