In [22]:
import sys
import os
sys.path.append("..") 
from rag_query import query_rag
from langchain_chroma import Chroma
from ingestion import load_documents, split_documents, get_embeddings
from vector_store import build_or_update_chroma_db
from langchain_community.llms import Ollama


In [23]:
# Prompt template for the judge LLM. It is filled in with str.format using two
# fields, {expected_response} and {actual_response}, and asks the model to
# reply with a single word: "true" or "false".
EVAL_PROMPT = """
You are an evaluator.

Expected Answer:
{expected_response}

Generated Answer:
{actual_response}

Question:
Does the generated answer EXPRESS THE SAME FACTUAL INFORMATION as the expected answer,
even if the wording is different?

Answer ONLY with one word: true or false.
"""

def _extract_verdict(judgment: str):
    """Return the judge's verdict as a bool, or None when there is no verdict.

    Only standalone words count: the text is split on every non-letter
    character and the first word equal to "true" or "false" decides. This
    avoids the substring trap where a reply such as "false, it is not true"
    or "untrue" would be read as a positive verdict.
    """
    letters_only = "".join(
        ch if ch.isalpha() else " " for ch in judgment.lower()
    )
    for word in letters_only.split():
        if word == "true":
            return True
        if word == "false":
            return False
    return None


def query_and_validate(db, question: str, expected_response: str):
    """Ask the RAG pipeline a question and let a judge LLM grade the answer.

    Args:
        db: Accepted for backward compatibility but not used by this
            function; ``query_rag`` is called with the question only.
        question: Question forwarded to ``query_rag``.
        expected_response: Reference answer the generated answer is
            compared against by the judge.

    Returns:
        True if the judge's verdict is "true", False if it is "false".

    Raises:
        ValueError: If the judge's reply contains neither "true" nor
            "false" as a standalone word.
    """
    # Generate the answer through the RAG pipeline.
    generated_answer = query_rag(question)

    # Build the prompt for the judge.
    prompt = EVAL_PROMPT.format(
        expected_response=expected_response,
        actual_response=generated_answer,
    )

    # Judge LLM. It is meant to be a different model from the generator;
    # which model query_rag uses is not visible here -- TODO confirm.
    judge_model = Ollama(model="phi")

    judgment = judge_model.invoke(prompt).strip().lower()

    print("QUESTION:", question)
    print("EXPECTED:", expected_response)
    print("GENERATED:", generated_answer)
    print("JUDGE:", judgment)
    print("-" * 50)

    # Small models often append punctuation or an explanation ("false.",
    # "false, because ..."), so use the first whole-word verdict instead of
    # a substring test.
    verdict = _extract_verdict(judgment)
    if verdict is None:
        # Include the raw reply so a misbehaving judge can be diagnosed.
        raise ValueError(f"Judge returned invalid output: {judgment!r}")
    return verdict


In [24]:
# Load the source documents from ../data and split them into chunks.
# NOTE(review): `documents` and `chunks` are not referenced again in the
# visible cells (the Chroma store below is opened from disk, not built from
# them) -- confirm whether this loading step is still needed here.
documents = load_documents("../data")
chunks = split_documents(documents)
embeddings = get_embeddings()


# Open the Chroma store persisted under ../chroma_db, using the object
# returned by get_embeddings() as its embedding function.
db = Chroma(
    persist_directory="../chroma_db",
    embedding_function=embeddings )

# Evaluation case: Monopoly starting money.
# NOTE(review): query_and_validate accepts `db` but calls query_rag(question)
# without it, so the store opened above does not influence retrieval here.
result = query_and_validate(
    db=db,
    question="How much total money does a player start with in Monopoly?",
    expected_response="$1,500"
)

print("TEST RESULT:", result)


QUESTION: How much total money does a player start with in Monopoly?
EXPECTED: $1,500
GENERATED: No relevant information found in the documents.
JUDGE: false.
--------------------------------------------------
TEST RESULT: False


In [25]:
# Evaluation case: Ticket to Ride longest-route bonus. Reuses the `db` object
# created in an earlier cell and overwrites `result`.
result = query_and_validate(
    db=db,
    question="How many points does the longest continuous train get in Ticket to Ride? ",
    expected_response="10 points")

print("TEST RESULT:", result)

QUESTION: How many points does the longest continuous train get in Ticket to Ride? 
EXPECTED: 10 points
GENERATED: No relevant information found in the documents.
JUDGE: false
--------------------------------------------------
TEST RESULT: False
