In [1]:
#!pip install ragas

In [None]:
import os
import pandas as pd
import numpy as np
from datetime import datetime
import json
import time
import warnings
#warnings.filterwarnings("ignore")

from langchain_classic.chains import ConversationalRetrievalChain
from langchain_classic.memory import ConversationBufferWindowMemory
from langchain_classic.vectorstores import Chroma
from langchain_classic.schema import Document
from langchain_classic.text_splitter import CharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI

from ragas import evaluate, RunConfig
from ragas.metrics.collections import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall
)
from datasets import Dataset

import google.genai as genai
from sympy.physics.units import temperature

print("Libraries imported successfully!")

  from .autonotebook import tqdm as notebook_tqdm


#### 1. Preparing test data

In [3]:
knowledge_docs = [
    """Intelig√™ncia Artificial (IA) √© um campo da ci√™ncia da computa√ß√£o que se concentra
    na cria√ß√£o de sistemas capazes de realizar tarefas que normalmente requerem intelig√™ncia humana.
    Isso inclui aprendizado, racioc√≠nio, percep√ß√£o e tomada de decis√µes. A IA pode ser classificada
    em IA fraca (espec√≠fica para tarefas) e IA forte (intelig√™ncia geral).""",

    """Machine Learning √© uma sub√°rea da IA que permite que computadores aprendam e melhorem
    automaticamente atrav√©s da experi√™ncia, sem serem explicitamente programados.
    Os algoritmos de ML identificam padr√µes em dados e fazem previs√µes. Existem tr√™s tipos principais:
    aprendizado supervisionado, n√£o supervisionado e por refor√ßo.""",

    """Deep Learning √© uma t√©cnica de machine learning baseada em redes neurais artificiais
    com m√∫ltiplas camadas. √â especialmente eficaz para tarefas como reconhecimento de imagem,
    processamento de linguagem natural e reconhecimento de voz. As redes neurais profundas
    podem ter centenas de camadas e milh√µes de par√¢metros.""",

    """RAG (Retrieval-Augmented Generation) √© uma t√©cnica que combina recupera√ß√£o de informa√ß√µes
    com gera√ß√£o de texto. Permite que modelos de linguagem acessem conhecimento externo
    para gerar respostas mais precisas e atualizadas. O processo envolve buscar documentos
    relevantes e usar essas informa√ß√µes para gerar a resposta final.""",

    """Google Gemini √© um modelo de linguagem multimodal desenvolvido pelo Google,
    capaz de processar texto, imagens e c√≥digo. Oferece capacidades avan√ßadas de
    racioc√≠nio e compreens√£o contextual. O Gemini vem em diferentes vers√µes:
    Nano, Pro e Ultra, cada uma otimizada para diferentes casos de uso.""",

    """LangChain √© um framework para desenvolvimento de aplica√ß√µes com modelos de linguagem.
    Facilita a cria√ß√£o de cadeias complexas, gerenciamento de mem√≥ria e integra√ß√£o
    com diferentes fontes de dados. Oferece componentes modulares para construir
    aplica√ß√µes robustas de IA conversacional."""
]

# Convers√£o para objetos Document
docs = [Document(page_content=doc) for doc in knowledge_docs]

print(f"{len(docs)} knowledge documents created.")

6 knowledge documents created.


In [4]:
# Cria√ß√£o de dataset de teste para avalia√ß√£o RAGAS
test_data = {
    'question': [
        "O que √© Intelig√™ncia Artificial?",
        "Como funciona o Machine Learning?",
        "Quais s√£o as aplica√ß√µes do Deep Learning?",
        "O que √© RAG e como funciona?",
        "Quais s√£o as caracter√≠sticas do Google Gemini?"
    ],
    'ground_truth': [
        "Intelig√™ncia Artificial √© um campo da ci√™ncia da computa√ß√£o focado na cria√ß√£o de sistemas que realizam tarefas que requerem intelig√™ncia humana, incluindo aprendizado, racioc√≠nio e tomada de decis√µes.",
        "Machine Learning permite que computadores aprendam automaticamente atrav√©s da experi√™ncia, identificando padr√µes em dados para fazer previs√µes, sem programa√ß√£o expl√≠cita.",
        "Deep Learning √© eficaz para reconhecimento de imagem, processamento de linguagem natural e reconhecimento de voz, usando redes neurais com m√∫ltiplas camadas.",
        "RAG combina recupera√ß√£o de informa√ß√µes com gera√ß√£o de texto, permitindo que modelos acessem conhecimento externo para respostas mais precisas.",
        "Google Gemini √© um modelo multimodal que processa texto, imagens e c√≥digo, oferecendo capacidades avan√ßadas de racioc√≠nio em vers√µes Nano, Pro e Ultra."
    ]
}

print("‚úÖ Dataset created!")
print(f"üìä {len(test_data['question'])} test questions created.")

‚úÖ Dataset created!
üìä 5 test questions created.


#### 2. RAG system creation

In [5]:
embeddings = GoogleGenerativeAIEmbeddings(model="gemini-embedding-001")
vectorstore = Chroma.from_documents(documents=docs, embedding=embeddings, persist_directory="./chroma_db_evaluation")
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.3, convert_system_message_to_human=True)
memory = ConversationBufferWindowMemory(memory_key="chat_history", k=3, return_messages=True, output_key="answer")
rag_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=vectorstore.as_retriever(search_kwargs={"k":3}),
    memory=memory,
    return_source_documents=True
)

  memory = ConversationBufferWindowMemory(memory_key="chat_history", k=3, return_messages=True, output_key="answer")


### Understanding the Cascade Effect
In **RAG (Retrieval-Augmented Generation)** systems, every component‚Äîfrom retrieval to response generation‚Äîcan be thought of as part of a chain where an error in one stage can decisively impact the rest. This phenomenon, known as the **cascade effect**, means that an initial failure, such as retrieving irrelevant or incomplete documents, can lead to incorrect or even confusing responses, compromising the reliability of the entire system.

When the input for the language model is poorly grounded, even a robust LLM tends to produce responses that do not align with the expected context. Thus, the system exemplifies the **"garbage in, garbage out"** principle, reinforcing the need for a meticulous evaluation of every stage.



### Error Propagation: From Retrieval to Generation
Imagine a scenario where the retrieval layer fails to fetch essential documents for the user's query. The lack of relevant information results in a fragile context base. When this context is passed to the generation layer, the model may produce a response with low factuality, as it lacks the correct data to validate its textual output.

This interdependence makes the evaluation of RAG systems more complex; it is necessary not only to measure the quality of each component in isolation but also how they integrate and contribute to the system's overall performance.

### Example of Error Propagation
Consider the following illustrative pseudocode:

```python
# Simulation of a simplified chain in a RAG system

# Step 1: Document Retrieval
documents = retrieve_documents(query)

# Simplified document quality validation
if not documents or relevant_documents_count(documents) < 0.5:
    log_error('Retrieval failed: insufficient context')
    context = 'generic content'
else:
    context = combine_documents(documents)

# Step 2: Response generation based on context
response = model_generate_response(query, context)
print(response)
```

In this example, the check during the retrieval stage allows for the identification of a potential failure before the error propagates to the generation stage. Although simplified, it illustrates the impact that a compromised stage can have on the entire processing chain.

### Mitigation Strategies
To minimize the impact of the cascade effect, it is fundamental to adopt monitoring and validation mechanisms at each stage. Possible strategies include:

* **Retrieval Redundancy:** Using multiple sources or search methods to ensure the context is complete, reducing the chance of isolated failures compromising the system.
* **Intermediate Validation:** Implementing checkpoints that validate context quality before proceeding to response generation. This can include using automated metrics to detect discrepancies and inconsistencies.
* **Feedback and Dynamic Adjustments:** Creating a feedback loop that allows the system to self-evaluate and adjust parameters based on both quantitative metrics and qualitative evaluations from practical testing.



These approaches contribute to a more resilient system, capable of detecting and correcting errors before they turn into cascading failures, thus ensuring more accurate and consistent responses for users.

#### 3. Collecting data for evaluation

In [6]:
def run_rag_collect_data(questions):
    """Runs the RAG chain on the provided questions and returns the results."""

    results = {
        "question": [],
        "answer": [],
        "contexts": [],
        "ground_truth": [],
    }

    for i, question in enumerate(questions):
        print(f"Processing question {i+1}/{len(questions)}...")
        try:
            result = rag_chain.invoke({"question": question})

            contexts = [doc.page_content for doc in result["source_documents"]]

            results["question"].append(question)
            results["answer"].append(result["answer"])
            results["contexts"].append(contexts)
            results["ground_truth"].append(test_data["ground_truth"][i]) # it was better pulling it from the input variables

            print(f"Generated answer: {result['answer'][:100]}")

        except Exception as e:
            print(f"Error processing question {i+1}: {e}")
            continue

    return results

print("Starting data collection for evaluation...")
evaluation_data = run_rag_collect_data(test_data["question"])

print(f"\nSuccessfully collected {len(evaluation_data['question'])} questions for evaluation.")

Starting data collection for evaluation...
Processing question 1/5...
Generated answer: Intelig√™ncia Artificial (IA) √© um campo da ci√™ncia da computa√ß√£o que se concentra na cria√ß√£o de sist
Processing question 2/5...
Error processing question 2: Error calling model 'gemini-2.5-flash' (RESOURCE_EXHAUSTED): 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/rate-limit. \n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 20, model: gemini-2.5-flash\nPlease retry in 17.827091795s.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}

#### 4. RAGAS evaluation

In [16]:
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)

print("Starting RAGAS evaluation...")
ragas_dataset = Dataset.from_dict(evaluation_data)

ragas_metrics = [
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall
]

print("\nRAGAS metrics ready:")
for metric in ragas_metrics:
    print(f"-   {metric.name}")

Starting RAGAS evaluation...

RAGAS metrics ready:
-   faithfulness
-   answer_relevancy
-   context_precision
-   context_recall


  from ragas.metrics import (
  from ragas.metrics import (
  from ragas.metrics import (
  from ragas.metrics import (


In [19]:
print("\nEvaluating RAGAS metrics...")

# config = RunConfig( # para evitar timeout
#     timeout=180,      # Aumenta para 3 minutos por requisi√ß√£o
#     max_retries=10,   # Tenta de novo se falhar
#     max_workers=2,    # Reduz o paralelismo (o padr√£o costuma ser 16)
#     max_wait=60       # Espera at√© 60s entre as tentativas
# )

ragas_results = evaluate(
    dataset=ragas_dataset,
    metrics=ragas_metrics,
    llm=llm,
    embeddings=embeddings,
    #run_config=config
)
print("\nRAGAS evaluation results:")
print(f"Faithfulness: {ragas_results['faithfulness'][0]:.4f}")
print(f"Answer relevancy: {ragas_results['answer_relevancy'][0]:.4f}")
print(f"Context precision: {ragas_results['context_precision'][0]:.4f}")
print(f"Context recall: {ragas_results['context_recall'][0]:.4f}")

simulated_ragas_results = {
    "faithfulness": 0.85,
    "answer_relevancy": 0.78,
    "context_precision": 0.82,
    "context_recall": 0.75
}
for metric, value in simulated_ragas_results.items():
    print(f"{metric}: {value:.4f}")


Evaluating RAGAS metrics...


Evaluating:   0%|          | 0/4 [00:10<?, ?it/s]


KeyboardInterrupt: 

#### 5. Detailed metrics analysis

In [22]:
def analyze_metric(results):
    """ Analyzes the provided results and prints a detailed analysis of the metrics."""

    # 1. Convert EvaluationResult to dict
    if hasattr(results, "to_dict"):
        results = results.to_dict()
    elif hasattr(results, "_scores_dict"):
        results = results._scores_dict
    elif not isinstance(results, dict):
        results = dict(results)

    # 2. Secure helper to get score as scalar
    def get_score(key: str, default: float = 0.0) -> float:
        result_value = results.get(key, default)
        if isinstance(result_value, (list, tuple)):
            return float(result_value[0])
        return float(result_value)

    print("Detailed metric analysis:")

    # 3. Expected metrics
    metrics = {
        "faithfulness": "faithfulness",
        "answer_relevancy": "answer relevancy",
        "context_relevancy": "context relevancy",
        "context_precision": "context precision",
        "context_recall": "context recall"
    }
    valid_scores = []
    for key, legible_name in metrics.items():
        if key not in results:
            continue
        score = get_score(key)
        valid_scores.append(score)

        print(f"- {legible_name}: {score:.4f}")

        if score >= 0.8:
            print(" Excellent")
        elif score >= 0.6:
            print(" Good")
        else:
            print(" Poor")

    # 4. Overall score
    if valid_scores:
        overall_score = float(np.mean(valid_scores))
        print(f"\nOverall score: {overall_score:.4f}")

        if overall_score >= 0.8:
            print(" Excellent")
        elif overall_score >= 0.6:
            print(" Good")
        else:
            print(" Poor")
    else:
        print("No metrics available")

In [23]:
analyze_metric(ragas_results)

Detailed metric analysis:
- faithfulness: nan
 Poor
- answer relevancy: 0.8078
 Excellent
- context precision: nan
 Poor
- context recall: nan
 Poor

Overall score: nan
 Poor


In this lesson, we saw how to evaluate a **RAG system** using **LangSmith** and **Ragas**, covering everything from initial setup to executing metrics.

Now it's your chance to review and practice the concepts from this lesson, if you haven't already. To do so:

* **Install and import** the necessary libraries and tools (`LangChain`, `ChromaDB`, `Google Generative AI`, etc.).
* **Configure access keys** and import project dependencies, including auxiliary functions.
* **Prepare the test data** by creating knowledge documents and a Q&A dataset (questions and reference answers).
* **Configure the RAG system**: initialize embeddings, create the vector store, instantiate the LLM, and define the memory and query chain.
* **Implement data collection**, iterating through questions to store generated answers, contexts, and reference ground truths.
* **Configure and execute the evaluation with Ragas**, using metrics such as **faithfulness**, **relevance**, **precision**, and **recall**, and analyze the final results.


In this lesson, we learned:

* **The importance of evaluation** in RAG systems and the challenges associated with its complexity.
* **Using LangSmith** as an observability platform to track and monitor RAG systems.
* **The utilization of the Ragas library** to provide specific metrics focused on faithfulness and relevance.
* **The implementation of a complete RAG system** using Google Generative AI Embeddings, ChromaDB, and LangChain.
* **The configuration of specific metrics**, such as Faithfulness, Answer Relevance, Context Precision, and Context Recall, for RAG evaluation.
* **The development of a test dataset** with questions and answers to demonstrate the metrics.
* **Data collection and storage** for evaluation, and the use of logs for monitoring.
* **Detailed analysis and interpretation of metrics** to evaluate the overall performance of the RAG system.