### Chatbot And RAG Evaluation

1. Creating a dataset with questions and their expected answers
2. Running your RAG application on those questions
3. Using evaluators to measure how well your application performed, looking at factors like:
 - Answer relevance
 - Answer accuracy
 - Retrieval quality

In [16]:
import os
from dotenv import load_dotenv
load_dotenv()
from langsmith import Client
client = Client()
from groq import Groq

groq_client = Groq(
    # This is the default and can be omitted
    api_key=os.environ.get("GROQ_API"),
)

os.environ["LANGSMITH_API_KEY"]=os.getenv("LANGSMITH_API_KEY")
os.environ["LANGSMITH_TRACING"]="true"

In [3]:
from langsmith import Client

client = Client()

# Define dataset: these are your test cases
dataset_name = "MyChatbot Evaluation"
dataset = client.create_dataset(dataset_name)
client.create_examples(
    dataset_id=dataset.id,
    examples=[
        {
        "inputs": {"question": "LangChain چیست؟"},
        "outputs": {"answer": "چارچوبی برای ساخت برنامه‌های LLM"},
        },
        {
        "inputs": {"question": "LangSmith چیست؟"},
        "outputs": {"answer": "پلتفرمی برای مشاهده و ارزیابی برنامه‌های LLM"},
        },
        {
        "inputs": {"question": "OpenAI چیست؟"},
        "outputs": {"answer": "شرکتی که مدل‌های زبان بزرگ ایجاد می‌کند"},
        },
        {
        "inputs": {"question": "گوگل چیست؟"},
        "outputs": {"answer": "شرکتی که زبان‌های بزرگ ایجاد می‌کند"},
        },
        {
        "inputs": {"question": "Mistral چیست؟"},
        "outputs": {"answer":" شرکتی که زبان‌های بزرگ ایجاد می‌کند مدل‌ها"},
        }
        ]
        )

{'example_ids': ['390e9d8c-5afa-4da2-b4e4-0e30d399e58c',
  'f82f7d92-5f15-45e6-87a3-f5b1ed8664ef',
  '202def5a-b2b4-4324-816f-d130db654df8',
  '675ed529-72b3-487d-a08b-a067d440711a',
  '4daee9d6-3eb0-45dc-ab70-2c00a20e5422'],
 'count': 5}

In [18]:
from openai import OpenAI
from langsmith import wrappers
import os

# ai_client = openai_client=wrappers.wrap_openai(openai.OpenAI())

eval_instructions = (
    "You are an expert professor specialized in grading students' answers to questions. "
    "Respond ONLY with CORRECT or INCORRECT."
)

def correctness(inputs: dict, outputs: dict, reference_outputs: dict) -> bool:
    user_content = f"""
You are grading the following question:
{inputs['question']}

Here is the real answer:
{reference_outputs['answer']}

You are grading the following predicted answer:
{outputs['response']}

Respond with CORRECT or INCORRECT:
Grade:
""".strip()

    response = groq_client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        temperature=0,
        messages=[
            {"role": "system", "content": eval_instructions},
            {"role": "user", "content": user_content},
        ],
    ).choices[0].message.content.strip()

    return response == "CORRECT"


In [12]:
def concision(outputs: dict, reference_outputs: dict) -> bool:
    return int(len(outputs["response"]) < 2 * len(reference_outputs["answer"]))

In [19]:
default_instructions = "Respond to the users question in a short, concise manner (one short sentence)."
def my_app(question: str, model: str = "llama-3.3-70b-versatile", instructions: str = default_instructions) -> str:
    return groq_client.chat.completions.create(
        model=model,
        temperature=0,
        messages=[
            {"role": "system", "content": instructions},
            {"role": "user", "content": question},
        ],
    ).choices[0].message.content

In [20]:
def ls_target(inputs: str) -> dict:
    return {"response": my_app(inputs["question"])}

In [21]:
experiment_results=client.evaluate(
    ls_target, ## Your AI system
    data="MyChatbot Evaluation",
    evaluators=[correctness,concision],
    experiment_prefix="groq-chatbot"
)

View the evaluation results for experiment: 'groq-chatbot-0a71485f' at:
https://smith.langchain.com/o/2d81f2a4-dffa-456c-9c4b-a16f6ccc7ae8/datasets/856336c4-f610-4c69-8eaf-1b1099c222f7/compare?selectedSessions=004092b3-7940-4d79-95bd-ff8efc90d762




0it [00:00, ?it/s]

In [None]:
from groq import Groq
import os
client = Groq(
    # This is the default and can be omitted
    api_key=os.environ.get("GROQ_API"),
)
prompt = [{"role": "user", "content": "Come up with a new business idea for AI Agents"}]
stream = client.chat.completions.create(model="llama-3.3-70b-versatile", messages=prompt, stream=True)
    def event_stream(stream):
        for chunk in stream:
            text = chunk.choices[0].delta.content
            if text:
                lines = text.split("\n")
                for line in lines:
                    yield f"data: {line}\n"
                yield "\n"
print(event_stream(stream))

Here's a new business idea for AI agents:

**Business Idea:** "DreamScout" - AI-Powered Personalized Travel Planning and Itinerary Builder

**Description:** DreamScout is an AI-powered travel planning platform that utilizes machine learning algorithms to create personalized travel itineraries for users. The platform will use natural language processing (NLP) to understand users' preferences, interests, and budget constraints, and then generate customized travel plans that include destinations, activities, accommodations, and transportation options.

**How it works:**

1. **User Input:** Users will input their travel preferences, interests, and budget constraints through a conversational interface (e.g., chatbot or voice assistant).
2. **AI Analysis:** The AI agent will analyze the user's input and generate a personalized travel profile, including recommendations for destinations, activities, and accommodations.
3. **Itinerary Builder:** The AI agent will then create a customized travel