# Step 0: Install the Required Libraries

create the conda environment using the env.yml file

In [None]:
# %pip install 'smolagents[telemetry]'
# %pip install opentelemetry-sdk opentelemetry-exporter-otlp openinference-instrumentation-smolagents
# %pip install langfuse datasets 'smolagents[gradio]' gradio
# pip install google-genai google-auth smolagents[litellm]

# Step 1: Instrument Your Agent

create a .env file in this directory that looks like the .env-example file using your API keys

In [1]:
import os
import base64
from dotenv import load_dotenv

load_dotenv()  # Loads variables from .env

LANGFUSE_AUTH = base64.b64encode(
    f"{os.environ.get('LANGFUSE_PUBLIC_KEY')}:{os.environ.get('LANGFUSE_SECRET_KEY')}".encode()
).decode()

os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = os.environ.get("LANGFUSE_HOST") + "/api/public/otel"
os.environ["OTEL_EXPORTER_OTLP_HEADERS"] = f"Authorization=Basic {LANGFUSE_AUTH}"

### Test Google Gemini LLM connection, will use for LLM-as-a-judge

In [3]:
use_google = True
from google import genai

client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY"))

response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents="1+1",
)

print(response.text)

2



In [7]:
from opentelemetry.sdk.trace import TracerProvider
from openinference.instrumentation.smolagents import SmolagentsInstrumentor
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.trace.export import SimpleSpanProcessor

# Create a TracerProvider for OpenTelemetry
trace_provider = TracerProvider()

# Add a SimpleSpanProcessor with the OTLPSpanExporter to send traces
trace_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter()))

# Set the global default tracer provider
from opentelemetry import trace
trace.set_tracer_provider(trace_provider)
tracer = trace.get_tracer(__name__)

# Instrument smolagents with the configured provider
SmolagentsInstrumentor().instrument(tracer_provider=trace_provider)

# Step 2: Test Your Instrumentation

In [16]:
from smolagents import HfApiModel, CodeAgent, LiteLLMModel

if use_google:
    model = LiteLLMModel(
    model_id="gemini/gemini-2.0-flash", # you can see other model names here: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models. It is important to prefix the name with "gemini/"
    api_key=os.getenv("GEMINI_API_KEY"),
    max_tokens=8192
)
else:
    model = HfApiModel()

# Create a simple agent to test instrumentation
agent = CodeAgent(
    tools=[],
    model=model
)

agent.run("1+1=")

2

# Step 3: Observe and Evaluate a More Complex Agent

In [5]:
from smolagents import DuckDuckGoSearchTool

search_tool = DuckDuckGoSearchTool()
agent = CodeAgent(tools=[search_tool], model=model)

agent.run("How many Rubik's Cubes could you fit inside the Notre Dame Cathedral?")

'Approximately 36.3 million'

## [Trace](https://us.cloud.langfuse.com/project/cm8p07ra800wvad07nhjyaa55/traces/85838359b2036e9fa9b2eec31c602f4f?timestamp=2025-03-25T23:30:04.332Z&display=details)

### Trace with tags, user and session id

In [8]:
from opentelemetry import trace

search_tool = DuckDuckGoSearchTool()
agent = CodeAgent(
    tools=[search_tool],
    model=model
)

with tracer.start_as_current_span("Smolagent-Trace") as span:
    span.set_attribute("langfuse.user.id", "smolagent-user-123")
    span.set_attribute("langfuse.session.id", "smolagent-session-123456789")
    span.set_attribute("langfuse.tags", ["city-question", "testing-agents"])

    agent.run("What is the capital of Germany?")

## [Trace](https://us.cloud.langfuse.com/project/cm8p07ra800wvad07nhjyaa55/traces/8601410ec6e4a46505fd03ee5a083e21?timestamp=2025-03-25T23:35:12.019Z&display=details)

In [None]:
import gradio as gr
from opentelemetry.trace import format_trace_id
from smolagents import (CodeAgent, HfApiModel)
from langfuse import Langfuse

langfuse = Langfuse()
agent = CodeAgent(tools=[], model=model, add_base_tools=True)

formatted_trace_id = None  # We'll store the current trace_id globally for demonstration

def respond(prompt, history):
    with trace.get_tracer(__name__).start_as_current_span("Smolagent-Trace") as span:
        output = agent.run(prompt)

        current_span = trace.get_current_span()
        span_context = current_span.get_span_context()
        trace_id = span_context.trace_id
        global formatted_trace_id
        formatted_trace_id = str(format_trace_id(trace_id))
        langfuse.trace(id=formatted_trace_id, input=prompt, output=output)

    history.append({"role": "assistant", "content": str(output)})
    return history

def handle_like(data: gr.LikeData):
    # For demonstration, we map user feedback to a 1 (like) or 0 (dislike)
    if data.liked:
        langfuse.score(
            value=1,
            name="user-feedback",
            trace_id=formatted_trace_id
        )
    else:
        langfuse.score(
            value=0,
            name="user-feedback",
            trace_id=formatted_trace_id
        )

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(label="Chat", type="messages")
    prompt_box = gr.Textbox(placeholder="Type your message...", label="Your message")

    # When the user presses 'Enter' on the prompt, we run 'respond'
    prompt_box.submit(
        fn=respond,
        inputs=[prompt_box, chatbot],
        outputs=chatbot
    )

    # When the user clicks a 'like' button on a message, we run 'handle_like'
    chatbot.like(handle_like, None, None)

demo.launch()


* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




## [Trace with Score](https://us.cloud.langfuse.com/project/cm8p07ra800wvad07nhjyaa55/traces/fd148a268dd6bcb22c6de3db9e75e3ed?timestamp=2025-03-25T23%3A38%3A14.231Z&display=details&view=scores)
![image.png](attachment:image.png)

## 5. LLM-as-a-Judge

Requires LLM API
![Langfuse Evaluation Template](/home/ddd/documents/hugginface_agents_course/Langfuse_Eval_Template.png)

In [9]:
# Example: Checking if the agent’s output is toxic or not.

search_tool = DuckDuckGoSearchTool()
agent = CodeAgent(tools=[search_tool], model=model)

agent.run("Can eating carrots improve your vision?")


"Yes, eating carrots can improve your vision under certain conditions, as they are a good source of vitamin A, which is essential for good vision. However, they won't necessarily improve vision for people with already good vision or reverse poor eyesight to 20/20."

# Offline Evaluation

In [10]:
import pandas as pd
from datasets import load_dataset

# Fetch GSM8K from Hugging Face
dataset = load_dataset("openai/gsm8k", 'main', split='train')
df = pd.DataFrame(dataset)
print("First few rows of GSM8K dataset:")
display(df.head())

First few rows of GSM8K dataset:


Unnamed: 0,question,answer
0,Natalia sold clips to 48 of her friends in Apr...,Natalia sold 48/2 = <<48/2=24>>24 clips in May...
1,Weng earns $12 an hour for babysitting. Yester...,Weng earns 12/60 = $<<12/60=0.2>>0.2 per minut...
2,Betty is saving money for a new wallet which c...,"In the beginning, Betty has only 100 / 2 = $<<..."
3,"Julie is reading a 120-page book. Yesterday, s...",Maila read 12 x 2 = <<12*2=24>>24 pages today....
4,James writes a 3-page letter to 2 different fr...,He writes each friend 3*2=<<3*2=6>>6 pages a w...


In [11]:
from langfuse import Langfuse
langfuse = Langfuse()

langfuse_dataset_name = "gsm8k_dataset_huggingface"

# Create a dataset in Langfuse
langfuse.create_dataset(
    name=langfuse_dataset_name,
    description="GSM8K benchmark dataset uploaded from Huggingface",
    metadata={
        "date": "2025-03-10",
        "type": "benchmark"
    }
)

Dataset(id='cm8p5g14600c0ad07dvkzhd1b', name='gsm8k_dataset_huggingface', description='GSM8K benchmark dataset uploaded from Huggingface', metadata={'date': '2025-03-10', 'type': 'benchmark'}, project_id='cm8p07ra800wvad07nhjyaa55', created_at=datetime.datetime(2025, 3, 25, 23, 51, 9, 222000, tzinfo=datetime.timezone.utc), updated_at=datetime.datetime(2025, 3, 26, 22, 56, 10, 609000, tzinfo=datetime.timezone.utc))

In [12]:
for idx, row in df.iterrows():
    langfuse.create_dataset_item(
        dataset_name=langfuse_dataset_name,
        input={"text": row["question"]},
        expected_output={"text": row["answer"]},
        metadata={"source_index": idx}
    )
    if idx >= 9: # Upload only the first 10 items for demonstration
        break

In [17]:
from opentelemetry.trace import format_trace_id
from smolagents import (CodeAgent, HfApiModel, LiteLLMModel)

# Example: using HfApiModel or LiteLLMModel to access openai, anthropic, gemini, etc. models:

agent = CodeAgent(
    tools=[],
    model=model,
    add_base_tools=True
)

def run_smolagent(question):
    with tracer.start_as_current_span("Smolagent-Trace") as span:
        span.set_attribute("langfuse.tag", "dataset-run")
        output = agent.run(question)

        current_span = trace.get_current_span()
        span_context = current_span.get_span_context()
        trace_id = span_context.trace_id
        formatted_trace_id = format_trace_id(trace_id)

        langfuse_trace = langfuse.trace(
            id=formatted_trace_id,
            input=question,
            output=output
        )
    return langfuse_trace, output

In [18]:
dataset = langfuse.get_dataset(langfuse_dataset_name)

# Run our agent against each dataset item (limited to first 10 above)
for item in dataset.items:
    langfuse_trace, output = run_smolagent(item.input["text"])

    # Link the trace to the dataset item for analysis
    item.link(
        langfuse_trace,
        run_name="smolagent-notebook-run-01",
        run_metadata={ "model": model.model_id }
    )

    # Optionally, store a quick evaluation score for demonstration
    langfuse_trace.score(
        name="<example_eval>",
        value=1,
        comment="This is a comment"
    )

# Flush data to ensure all telemetry is sent
langfuse.flush()


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m



AgentGenerationError: Error in generating model output:
litellm.RateLimitError: litellm.RateLimitError: VertexAIException - {
  "error": {
    "code": 429,
    "message": "You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.",
    "status": "RESOURCE_EXHAUSTED",
    "details": [
      {
        "@type": "type.googleapis.com/google.rpc.QuotaFailure",
        "violations": [
          {
            "quotaMetric": "generativelanguage.googleapis.com/generate_content_free_tier_requests",
            "quotaId": "GenerateRequestsPerMinutePerProjectPerModel-FreeTier",
            "quotaDimensions": {
              "location": "global",
              "model": "gemini-2.0-flash"
            },
            "quotaValue": "15"
          }
        ]
      },
      {
        "@type": "type.googleapis.com/google.rpc.Help",
        "links": [
          {
            "description": "Learn more about Gemini API quotas",
            "url": "https://ai.google.dev/gemini-api/docs/rate-limits"
          }
        ]
      },
      {
        "@type": "type.googleapis.com/google.rpc.RetryInfo",
        "retryDelay": "13s"
      }
    ]
  }
}
