# In [None]:
# 03_rag_core_evaluation.ipynb

# 1. Setup and Imports
import os
import sys
import pandas as pd
from datasets import Dataset # For Ragas/DeepEval
from tqdm.notebook import tqdm # For progress bars in notebooks

# Add the project root to the system path to import from src.
# The notebook is expected to run from a subfolder such as 'notebooks/', so the
# project root is taken to be the parent of the current working directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

# Verify path for debugging
print(f"Project root added to path: {project_root}")
print(f"Current working directory: {os.getcwd()}")

# Listing src/ is purely a debugging aid, so a missing or unreadable directory
# is reported instead of aborting the cell with an OSError from os.listdir.
src_dir = os.path.join(project_root, 'src')
try:
    print(f"Contents of src/: {os.listdir(src_dir)}")
except OSError as exc:
    print(f"Warning: could not list {src_dir}: {exc}")

from src.rag_pipeline import RAGPipeline

# For Ragas evaluation
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision
)
# Note: For Ragas metrics, you'll need an LLM to evaluate.
# It can be a small local model or even a HuggingFace API endpoint if you choose.

# For DeepEval evaluation
from deepeval import evaluate as deepeval_evaluate
from deepeval.metrics import (
    FaithfulnessMetric,
    AnswerRelevancyMetric,
    ContextRecallMetric,
    # TEMPORARILY COMMENTED OUT DUE TO ImportError:
    # ContextRelevancyMetric,
    # BiasMetric,
    # ToxicityMetric
)
from deepeval.test_case import LLMTestCase

# --- Configuration ---
# Both paths are relative; adjust them to match your project layout.
VECTOR_STORE_PATH = 'vector_store/'  # Relative to notebooks/
DATA_PATH = '../data/processed/processed_data.parquet'  # Or your CSV file

# Initialize RAG Pipeline
# model_name selects the generator LLM. If Gemma is too slow/large, try
# "microsoft/phi-2" or "distilbert-base-uncased" instead.
rag_pipeline = RAGPipeline(
    vector_store_path=VECTOR_STORE_PATH,
    embedding_model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_name="google/gemma-2b-it",
)

# 2. Load Models and FAISS Index
# The steps run in a fixed order: embedding model, FAISS index, LLM, RAG chain.
print("Loading embedding model...")
rag_pipeline.load_embedding_model()

print("Loading FAISS index and metadata...")
try:
    rag_pipeline.load_faiss_index()
except FileNotFoundError as missing_files:
    # Explain how to create the missing files, then let the error propagate
    # so the notebook stops here.
    print(
        f"Error: {missing_files}",
        "Please ensure you've run Task 2 (02_embedding_chunking.ipynb) locally to create these files.",
        sep="\n",
    )
    raise

print("Loading LLM (this will take time and use significant RAM)...")
rag_pipeline.load_llm()

# A pipeline whose llm attribute is still None after load_llm() cannot be used;
# stop here (or retry with a smaller model).
if rag_pipeline.llm is None:
    print("Failed to load LLM. Cannot proceed with RAG chain or evaluation.")
    raise ValueError("LLM failed to load.")

print("Setting up RAG chain...")
rag_pipeline.setup_rag_chain()

# 3. Define Test Questions and Ground Truths
# Each pair is (question, expected answer). Replace these with questions and
# answers relevant to your credit trust data; 10-20 pairs is a good target.
_QA_PAIRS = (
    (
        "What is a credit score and why is it important?",
        "A credit score is a numerical representation of your creditworthiness, used by lenders to assess risk. It's important because it influences your ability to get loans, credit cards, mortgages, and even apartment rentals.",
    ),
    (
        "How can I improve my credit score?",
        "Improving your credit score involves paying bills on time, keeping credit utilization low, avoiding new credit applications too frequently, and regularly checking your credit report for errors.",
    ),
    (
        "What factors affect my credit report?",
        "Factors affecting your credit report include payment history, amounts owed, length of credit history, new credit, and credit mix.",
    ),
    (
        "Can I dispute errors on my credit report?",
        "Yes, you can dispute errors on your credit report by contacting the credit bureaus (Experian, Equifax, TransUnion) and providing evidence of the inaccuracy.",
    ),
    (
        "What is credit utilization?",
        "Credit utilization is the amount of credit you're using compared to your total available credit. Keeping it low (ideally below 30%) is good for your credit score.",
    ),
    # Add more relevant test questions based on your dataset
)

# Every record starts with its own empty "contexts" list as a placeholder.
test_data = [
    {"question": question, "ground_truth": ground_truth, "contexts": []}
    for question, ground_truth in _QA_PAIRS
]

# 4. Generate RAG Responses and Collect Data for Evaluation
def _collect_response(sample):
    """Run one test question through the pipeline and build its evaluation record.

    `sample` is one entry of `test_data`. The returned dict carries the
    question, the generated answer, the expected answer, and the list of
    retrieved chunk texts under "contexts".
    """
    prompt = sample["question"]

    # query() is unpacked as (answer, context string, per-document metadata);
    # the context string is not needed here.
    reply, _context_str, docs_metadata = rag_pipeline.query(prompt)

    # Keep only the chunk text of each retrieved document.
    chunk_texts = [entry['text_content'] for entry in docs_metadata]

    return {
        "question": prompt,
        "answer": reply,
        "ground_truth": sample["ground_truth"],
        "contexts": chunk_texts,
    }


print("\nGenerating RAG responses for evaluation dataset...")
rag_responses = []
for sample in tqdm(test_data, desc="Generating responses"):
    rag_responses.append(_collect_response(sample))

# Ragas/DeepEval consume a Hugging Face Dataset, built here via a DataFrame.
rag_df = pd.DataFrame(rag_responses)
eval_dataset = Dataset.from_pandas(rag_df)

print("\nRAG responses generated and dataset prepared.")
print(rag_df.head())

# 5. Ragas Evaluation

print("\n--- Starting Ragas Evaluation ---")

# Wrap your local LLM for Ragas compatibility
from langchain_core.outputs import Generation, LLMResult
from langchain_core.messages import HumanMessage, AIMessage


class RagasLocalLLM:
    """Adapter that feeds a list of chat messages to a text-in/text-out LLM.

    The wrapped object only needs an `invoke(prompt)` method.

    NOTE(review): this class only duck-types a `generate` method; confirm that
    the installed Ragas version actually accepts such an object as `llm=`.
    """

    def __init__(self, llm):
        # The underlying model (a HuggingFacePipeline, per the original notes).
        self.llm = llm

    def generate(self, messages, **kwargs):
        """Flatten `messages` into one prompt, run the LLM, and wrap the reply.

        Human messages are prefixed with "Human: ", AI messages with "AI: ";
        any other message type contributes its bare content. Extra keyword
        arguments are accepted and ignored.

        Returns an LLMResult holding a single Generation with the reply.
        """
        rendered_lines = []
        for message in messages:
            if isinstance(message, HumanMessage):
                speaker = "Human: "
            elif isinstance(message, AIMessage):
                speaker = "AI: "
            else:
                speaker = ""  # Fallback for other message types
            rendered_lines.append(f"{speaker}{message.content}\n")
        prompt_text = "".join(rendered_lines)

        # Invoke the underlying HuggingFacePipeline
        completion = self.llm.invoke(prompt_text)

        # One prompt, one candidate: a 1x1 nested list of Generation objects.
        return LLMResult(generations=[[Generation(text=completion)]])

# Hand Ragas the pipeline's LangChain LLM and embeddings directly.
# evaluate() wraps LangChain models itself, whereas a plain object that only
# offers a `generate` method does not implement the interface the metrics call
# into, so the run would fail and be reported by the handler below.
# NOTE(review): assumes rag_pipeline.llm / rag_pipeline.embeddings are LangChain
# objects (the LLM is described above as a HuggingFacePipeline) - confirm.
ragas_llm_for_metrics = rag_pipeline.llm
ragas_embeddings_for_metrics = rag_pipeline.embeddings

# Note: Evaluation can take a long time on CPU, especially with many test cases.
print("Running Ragas evaluation... This may take a very long time on CPU.")
try:
    # `show_progress` is deliberately not passed: Ragas releases that know the
    # keyword default it to True, and releases that do not would reject it.
    result = evaluate(
        eval_dataset,
        metrics=[
            faithfulness,
            answer_relevancy,
            context_recall,
            context_precision,
        ],
        llm=ragas_llm_for_metrics,
        embeddings=ragas_embeddings_for_metrics,
    )
    ragas_metrics_df = result.to_pandas()
    print("\nRagas Evaluation Results:")
    print(ragas_metrics_df)
    print("\nOverall Ragas Scores:")
    print(ragas_metrics_df.mean(numeric_only=True))  # Mean of each numeric metric column

except Exception as e:
    # Best-effort cell: report the failure and let the notebook continue to
    # the DeepEval section instead of stopping here.
    print(f"Error during Ragas evaluation: {e}")
    print("Ragas often requires specific LLM integrations. Check Ragas documentation or try a smaller subset of data.")

# 6. DeepEval Evaluation

print("\n--- Starting DeepEval Evaluation ---")

# Create DeepEval test cases from your RAG responses
deepeval_test_cases = [
    LLMTestCase(
        input=item["question"],
        actual_output=item["answer"],
        expected_output=item["ground_truth"],
        # retrieval_context must be a list; wrap a lone value if needed.
        retrieval_context=(
            item["contexts"] if isinstance(item["contexts"], list) else [item["contexts"]]
        ),
    )
    for item in rag_responses
]

# Define DeepEval metrics and run them
try:
    # Imported inside the try so that a DeepEval build without this class is
    # reported by the handler below rather than stopping the notebook.
    from deepeval.models import DeepEvalBaseLLM

    class _LocalDeepEvalLLM(DeepEvalBaseLLM):
        """Expose the pipeline's already-loaded LLM as a DeepEval judge model.

        A string passed as `model=` is treated by DeepEval as the name of a
        hosted model, so a Hugging Face id such as "google/gemma-2b-it" cannot
        be used that way; a DeepEvalBaseLLM subclass is the supported route for
        local models.
        """

        def __init__(self, llm, name):
            self.model = llm
            self.model_name = name

        def load_model(self):
            return self.model

        def generate(self, prompt: str) -> str:
            reply = self.load_model().invoke(prompt)
            # Chat models answer with a message object, plain LLMs with a str.
            return getattr(reply, "content", reply)

        async def a_generate(self, prompt: str) -> str:
            # The local model has no async API; reuse the blocking call.
            return self.generate(prompt)

        def get_model_name(self):
            return self.model_name

    deepeval_judge = _LocalDeepEvalLLM(rag_pipeline.llm, rag_pipeline.model_name)

    # IMPORTANT: Ensure that the metrics you *do* import above are the only ones used here.
    deepeval_metrics = [
        FaithfulnessMetric(threshold=0.7, model=deepeval_judge),
        AnswerRelevancyMetric(threshold=0.7, model=deepeval_judge),
        ContextRecallMetric(threshold=0.7, model=deepeval_judge),
        # If ContextRelevancyMetric etc. are still needed, we'd need to find a compatible deepeval version or an alternative.
    ]

    print("Running DeepEval evaluation... This may also take a very long time on CPU.")
    # No `show_progress` keyword: DeepEval's evaluate() does not define it and
    # already shows its progress indicator by default.
    deepeval_results = deepeval_evaluate(deepeval_test_cases, metrics=deepeval_metrics)

    # Depending on the DeepEval release, evaluate() returns either a list of
    # test results or an object exposing them as `.test_results`; accept both.
    test_results = getattr(deepeval_results, "test_results", deepeval_results) or []

    print("\nDeepEval Evaluation Results:")
    # `test_result` (not `result`) so the Ragas `result` object stays available.
    for test_result in test_results:
        print(f"Question: {test_result.input}")
        print(f"Answer: {test_result.actual_output[:100]}...")  # Truncate for cleaner output
        print(f"Expected: {test_result.expected_output[:100]}...")  # Truncate for cleaner output
        print("Metrics:")
        # Per-metric entries and their name attribute differ between releases,
        # so look them up defensively.
        metric_entries = (
            getattr(test_result, "metrics_data", None)
            or getattr(test_result, "metrics", None)
            or []
        )
        for metric in metric_entries:
            metric_label = (
                getattr(metric, "name", None)
                or getattr(metric, "metric_name", None)
                or getattr(metric, "__name__", type(metric).__name__)
            )
            metric_score = getattr(metric, "score", None)
            metric_reason = getattr(metric, "reason", None)
            print(f"  - {metric_label}: {metric_score} (Reason: {metric_reason})")
        print("-" * 30)

except Exception as e:
    # Best-effort cell: report the failure together with troubleshooting hints.
    print(f"Error during DeepEval evaluation: {e}")
    print("DeepEval often requires specific LLM configurations for metrics. If errors persist, consider:")
    print("1. Setting `os.environ[\"OPENAI_API_KEY\"]` and using an OpenAI model for DeepEval's internal LLM (`model=\"gpt-3.5-turbo\"`).")
    print("2. Setting up a local Ollama server and using `deepeval.llm_pipeline.OllamaLLM` for metrics.")
    print("3. Checking DeepEval's documentation for local LLM integration.")


print("\n--- Task 3 (RAG Core & Evaluation) Complete ---")