In [21]:
!pip install -q ragas langchain-groq groq datasets sentence_transformers langchain-huggingface

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m61.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m38.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [29]:
import os
import json
from groq import Groq
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
from ragas import evaluate
from ragas.metrics import answer_relevancy, answer_correctness
from datasets import Dataset
# Import the LangchainLLMWrapper for Ragas compatibility
from ragas.llms import LangchainLLMWrapper

In [3]:
# --- 2. Configuration & API Key Setup ---
# IMPORTANT: Set your GROQ_API_KEY.
# For Colab, you can add it to Colab Secrets (recommended) or directly here.
# To add to Colab Secrets:
# 1. Click the 'key' icon on the left sidebar (Secrets).
# 2. Click '+ New secret'.
# 3. For 'Name', enter 'GROQ_API_KEY'.
# 4. For 'Value', paste your actual GROQ API key.
# 5. Toggle 'Notebook access' to ON.
# Then, you can access it via os.environ.get("GROQ_API_KEY").

# If not using Colab Secrets, replace os.environ.get("GROQ_API_KEY") with your actual key string:
from google.colab import userdata
os.environ["GROQ_API_KEY"] = userdata.get('GROQ_API_KEY') # Ensure it's in env for ChatGroq

In [4]:
groq_api_key = os.environ.get("GROQ_API_KEY")

if not groq_api_key:
    raise ValueError("GROQ_API_KEY not found. Please set it in your environment or Colab Secrets.")

# --- 3. Initialize Groq Client and LangChain ChatGroq for RAGAS ---
# The direct Groq client for getting chatbot responses
groq_client = Groq(api_key=groq_api_key)

In [30]:
# A LangChain ChatGroq instance for RAGAS to use as its internal evaluation LLM.
# RAGAS metrics that use an LLM internally need a LangChain-compatible LLM.
# Choose a suitable GROQ model for evaluation. llama3-8b-8192 is a good balance.
groq_llm_for_ragas = LangchainLLMWrapper(
    ChatGroq(
        api_key=groq_api_key,
        model_name="llama3-8b-8192", # Or "llama3-70b-8192" for higher accuracy but slower evaluation
        temperature=0.0 # Keep temperature low for deterministic evaluation
    )
)

In [31]:
# Initialize a HuggingFace Embedding model for RAGAS.
# This avoids the default OpenAIEmbeddings and its API key requirement.
# 'sentence-transformers/all-MiniLM-L6-v2' is a good general-purpose model.
ragas_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

In [32]:
# --- 4. Define your Chatbot's Response Function ---
def get_chatbot_response(user_question: str) -> str:
    """
    Calls the Groq API to get a chat completion response for the chatbot.
    """
    try:
        chat_completion = groq_client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": user_question,
                }
            ],
            model="llama3-8b-8192", # Use the model you want to evaluate (e.g., your chatbot's model)
            temperature=0.7 # You can adjust temperature for your chatbot's responses
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        return f"Error getting response from Groq: {e}"

In [33]:
# --- 5. Prepare your Evaluation Dataset ---
# For a chatbot, you need questions, the chatbot's responses, and ground truth answers.
# 'contexts' is intentionally omitted as we are not performing RAG.
data = {
    'question': [
        "What is the capital of France?",
        "Who wrote 'Romeo and Juliet'?",
        "Explain the concept of photosynthesis.",
        "What is the main benefit of using fast language models?",
        "What is the purpose of a Dockerfile?",
        "What are the primary colors?",
        "How does a train move?",
        "What is the largest ocean on Earth?"
    ],
    'reference': [ # Renamed from 'ground_truths' to 'reference' for Ragas compatibility
        "Paris",
        "William Shakespeare",
        "Photosynthesis is the process by which green plants, algae, and cyanobacteria convert light energy into chemical energy, typically in the form of glucose. This process uses carbon dioxide and water as raw materials and releases oxygen as a byproduct.",
        "Fast language models enable quicker inference times, reduced computational costs, and better real-time application performance, making them suitable for latency-sensitive applications like chatbots and search.",
        "A Dockerfile is a text document that contains all the commands a user could call on the command line to assemble a Docker image. It automates the image creation process, ensuring consistency and reproducibility.",
        "Red, yellow, and blue.",
        "A train moves on railway tracks, propelled by an engine (locomotive) that generates power, typically through diesel, electricity, or steam. The wheels on the train are designed to fit the rails, guiding its movement.",
        "The Pacific Ocean."
    ],
    'response': [] # This will be populated by our Groq chatbot
}

In [34]:
# --- 6. Populate 'response' field by calling the Groq Chatbot ---
print("Populating responses from GROQ Chatbot...")
for i, q in enumerate(data['question']):
    print(f"Processing question {i+1}/{len(data['question'])}: {q[:50]}...")
    response = get_chatbot_response(q)
    data['response'].append(response)
    # Optional: Add a small delay to avoid hitting rate limits if you have many questions
    # import time
    # time.sleep(0.1)
# Convert the dictionary to a Ragas Dataset
dataset = Dataset.from_dict(data)
print("Responses populated. Dataset created.")

Populating responses from GROQ Chatbot...
Processing question 1/8: What is the capital of France?...
Processing question 2/8: Who wrote 'Romeo and Juliet'?...
Processing question 3/8: Explain the concept of photosynthesis....
Processing question 4/8: What is the main benefit of using fast language mo...
Processing question 5/8: What is the purpose of a Dockerfile?...
Processing question 6/8: What are the primary colors?...
Processing question 7/8: How does a train move?...
Processing question 8/8: What is the largest ocean on Earth?...
Responses populated. Dataset created.


In [35]:
# --- 7. Initialize Ragas metrics ---
# Assign the LangChain ChatGroq LLM to the metrics that require an LLM.
answer_relevancy.llm = groq_llm_for_ragas
answer_correctness.llm = groq_llm_for_ragas

In [36]:
# --- 8. Run the Ragas evaluation ---
print("\n--- Running RAGAS Evaluation ---")
result = evaluate(
    dataset,
    metrics=[
        answer_relevancy,
        answer_correctness,
        # faithfulness, # Faithfulness requires 'contexts', so it's not directly applicable here.
    ],
    embeddings=ragas_embeddings # Pass the HuggingFace embeddings here
)

# --- 9. Print the results ---
print("\n" + "="*70)
print("RAGAS Evaluation Results:")
print(result)
print("="*70 + "\n")


--- Running RAGAS Evaluation ---


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

ERROR:ragas.executor:Exception raised in Job[9]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[1]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[3]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[5]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[7]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[11]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[12]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[13]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[15]: TimeoutError()



RAGAS Evaluation Results:
{'answer_relevancy': 0.9767, 'answer_correctness': nan}



In [38]:
print("\nIndividual scores (DataFrame):")
# Convert results to a pandas DataFrame for easier viewing
results_df = result.to_pandas()
print("\n--- Evaluation Complete ---")
print("You can analyze the 'answer_relevancy' and 'answer_correctness' scores.")
print("Higher scores (closer to 1) indicate better performance.")
results_df.head()


Individual scores (DataFrame):

--- Evaluation Complete ---
You can analyze the 'answer_relevancy' and 'answer_correctness' scores.
Higher scores (closer to 1) indicate better performance.


Unnamed: 0,user_input,response,reference,answer_relevancy,answer_correctness
0,What is the capital of France?,The capital of France is Paris.,Paris,1.0,
1,Who wrote 'Romeo and Juliet'?,'Romeo and Juliet' is a tragedy written by the...,William Shakespeare,0.95346,
2,Explain the concept of photosynthesis.,"Photosynthesis is the process by which plants,...",Photosynthesis is the process by which green p...,0.902641,
3,What is the main benefit of using fast languag...,The main benefits of using fast language model...,Fast language models enable quicker inference ...,0.980925,
4,What is the purpose of a Dockerfile?,A Dockerfile is a text file that contains a se...,A Dockerfile is a text document that contains ...,1.0,
