Commit a73f60b

feat: enhance telemetry with Prometheus metrics and comprehensive instrumentation (#592)
* feat: enhance telemetry with Prometheus metrics and comprehensive instrumentation

  - Add Prometheus HTTP server for metrics exposure on configurable port
  - Add comprehensive metrics collection:
    - Tool execution counters and duration histograms
    - Active conversation gauges
    - LLM request counters by provider/model/success
  - Add OpenAI and Anthropic API auto-instrumentation
  - Add distributed tracing to chat.step, API v2 operations, and tool execution
  - Update documentation with Prometheus setup, metrics examples, and queries
  - Add telemetry configuration via PROMETHEUS_PORT and PROMETHEUS_ADDR env vars

  This provides comprehensive observability for gptme operations including:

  - Performance monitoring and bottleneck identification
  - Resource usage tracking
  - API call success rates and latencies
  - Tool usage patterns and execution times

* minor fixes
* fix: prepare for otlp setup
* fix: fixed typing
* fix: improve tool call telemetry
1 parent 92b6818 commit a73f60b

File tree

11 files changed: +321 −41 lines

.github/workflows/lint.yml

Lines changed: 1 addition & 1 deletion

@@ -40,7 +40,7 @@ jobs:
       - name: Install dependencies
         run: |
           make build
-          poetry install -E server -E browser
+          poetry install -E server -E browser -E telemetry
           poetry run pip install tomli tomli_w
       - name: Typecheck
         run: |

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion

@@ -15,7 +15,7 @@ repos:
     rev: v1.12.0
     hooks:
       - id: mypy
-        additional_dependencies: [types-tabulate, types-docutils, tomli, tomli_w, opentelemetry-api, opentelemetry-sdk]
+        additional_dependencies: [types-tabulate, types-docutils, tomli, tomli_w, opentelemetry-api, opentelemetry-sdk, prometheus-client, prompt_toolkit, click, pytest, openai, rich, tomlkit]
         args: [--ignore-missing-imports, --check-untyped-defs]
   - repo: local
     hooks:

docs/contributing.rst

Lines changed: 80 additions & 4 deletions

@@ -67,37 +67,113 @@ To enable telemetry during development:
        -p 9411:9411 \
        cr.jaegertracing.io/jaegertracing/jaeger:latest

-3. Set the telemetry environment variable:
+3. (Optional) Run Prometheus for metrics collection:
+
+   .. code-block:: bash
+
+      # Simple default Prometheus
+      docker run --name prometheus -d -p 127.0.0.1:9090:9090 prom/prometheus
+
+      # Or with custom config to scrape gptme metrics on port 8000
+      cat > scripts/prometheus.yml << EOF
+      global:
+        scrape_interval: 15s
+      scrape_configs:
+        - job_name: 'gptme'
+          static_configs:
+            - targets: ['host.docker.internal:8000']
+          metrics_path: '/metrics'
+      EOF
+
+      docker run --rm --name prometheus \
+        -p 9090:9090 \
+        -v $(pwd)/scripts/prometheus.yml:/etc/prometheus/prometheus.yml \
+        prom/prometheus --enable-feature=otlp-write-receiver
+
+4. Set the telemetry environment variables:

    .. code-block:: bash

       export GPTME_TELEMETRY_ENABLED=true
       export OTLP_ENDPOINT=http://localhost:4317  # optional (default)
+      export PROMETHEUS_PORT=8000  # optional (default)
+      export PROMETHEUS_ADDR=0.0.0.0  # optional (default: localhost, use 0.0.0.0 for Docker access)

-4. Run gptme:
+5. Run gptme:

    .. code-block:: bash

       poetry run gptme 'hello'
       # or gptme-server
       poetry run gptme-server

-5. View traces in Jaeger UI:
+6. View data:

-   You can view traces in the Jaeger UI at http://localhost:16686.
+   - **Traces**: Jaeger UI at http://localhost:16686
+   - **Metrics**: Prometheus UI at http://localhost:9090
+   - **Raw metrics**: Direct metrics endpoint at http://localhost:8000/metrics

 Once enabled, gptme will automatically:

 - Trace function execution times
 - Record token processing metrics
 - Monitor request durations
 - Instrument Flask and HTTP requests
+- Expose Prometheus metrics at the ``/metrics`` endpoint

 The telemetry data helps identify:

 - Slow operations and bottlenecks
 - Token processing rates
 - Tool execution performance
+- Resource usage patterns
+
+Available Metrics
+~~~~~~~~~~~~~~~~~
+
+The following metrics are automatically collected:
+
+- ``gptme_tokens_processed_total``: Counter of tokens processed by type
+- ``gptme_request_duration_seconds``: Histogram of request durations by endpoint
+- ``gptme_tool_calls_total``: Counter of tool calls made by tool name
+- ``gptme_tool_duration_seconds``: Histogram of tool execution durations by tool name
+- ``gptme_active_conversations``: Gauge of currently active conversations
+- ``gptme_llm_requests_total``: Counter of LLM API requests by provider, model, and success status
+- HTTP request metrics (from Flask instrumentation)
+- OpenAI/Anthropic API call metrics (from LLM instrumentations)
+
+Example Prometheus Queries
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Here are some useful Prometheus queries for monitoring gptme:
+
+.. code-block:: promql
+
+   # Average tool execution time by tool
+   rate(gptme_tool_duration_seconds_sum[5m]) / rate(gptme_tool_duration_seconds_count[5m])
+
+   # Most used tools
+   topk(10, rate(gptme_tool_calls_total[5m]))
+
+   # LLM request success rate
+   rate(gptme_llm_requests_total{success="true"}[5m]) / rate(gptme_llm_requests_total[5m])
+
+   # Tokens processed per second
+   rate(gptme_tokens_processed_total[5m])
+
+   # Active conversations
+   gptme_active_conversations
+
+   # Request latency percentiles
+   histogram_quantile(0.95, rate(gptme_request_duration_seconds_bucket[5m]))
+
+Environment Variables
+~~~~~~~~~~~~~~~~~~~~~
+
+- ``GPTME_TELEMETRY_ENABLED``: Enable/disable telemetry (default: false)
+- ``OTLP_ENDPOINT``: OTLP endpoint for traces (default: http://localhost:4317)
+- ``PROMETHEUS_PORT``: Port for Prometheus metrics endpoint (default: 8000)
+- ``PROMETHEUS_ADDR``: Address for Prometheus metrics endpoint (default: localhost, use 0.0.0.0 for Docker access)

 Release
 -------
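
The docs above point at three endpoints; a quick programmatic check of the raw metrics endpoint is a useful smoke test. A minimal sketch, assuming the default PROMETHEUS_PORT of 8000 and that gptme is running with GPTME_TELEMETRY_ENABLED=true:

# Minimal sketch: confirm the metrics endpoint is serving gptme series.
# Assumes GPTME_TELEMETRY_ENABLED=true and the default PROMETHEUS_PORT=8000.
from urllib.request import urlopen

with urlopen("http://localhost:8000/metrics", timeout=5) as resp:
    body = resp.read().decode()

# Print only gptme-specific samples, skipping the # HELP / # TYPE comments.
for line in body.splitlines():
    if line.startswith("gptme_"):
        print(line)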

gptme/chat.py

Lines changed: 1 addition & 0 deletions

@@ -336,6 +336,7 @@ def _wait_for_tts_if_enabled() -> None:
     stop()


+@trace_function(name="chat.step", attributes={"component": "chat"})
 def step(
     log: Log | list[Message],
     stream: bool,
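
The `@trace_function` decorator comes from gptme/telemetry.py; its body is not part of this diff. A minimal sketch of how such a decorator can be built on the public OpenTelemetry API, assuming the real implementation differs in details such as tracer lookup and a no-op path when telemetry is disabled:

# Hypothetical sketch of a trace_function decorator (the actual
# implementation lives in gptme/telemetry.py and is not shown here).
import functools
from collections.abc import Callable
from typing import Any

from opentelemetry import trace


def trace_function(
    name: str | None = None, attributes: dict[str, Any] | None = None
) -> Callable:
    """Wrap a function call in an OpenTelemetry span."""

    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            tracer = trace.get_tracer("gptme")
            # Start a span named after the function (or the given name)
            # and attach any static attributes to it.
            with tracer.start_as_current_span(name or func.__qualname__) as span:
                for key, value in (attributes or {}).items():
                    span.set_attribute(key, value)
                return func(*args, **kwargs)

        return wrapper

    return decorator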

gptme/llm/llm_openai.py

Lines changed: 2 additions & 0 deletions

@@ -185,6 +185,7 @@ def chat(messages: list[Message], model: str, tools: list[ToolSpec] | None) -> s
     api_model = model if is_proxy else base_model

     from openai import NOT_GIVEN  # fmt: skip
+    from openai.types.chat import ChatCompletionMessageToolCall  # fmt: skip

     messages_dicts, tools_dict = _prepare_messages_for_api(messages, model, tools)

@@ -201,6 +202,7 @@ def chat(messages: list[Message], model: str, tools: list[ToolSpec] | None) -> s
     result = []
     if choice.finish_reason == "tool_calls":
         for tool_call in choice.message.tool_calls or []:
+            assert isinstance(tool_call, ChatCompletionMessageToolCall)
             result.append(
                 f"@{tool_call.function.name}({tool_call.id}): {tool_call.function.arguments}"
             )
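
The added assert is presumably a narrowing aid for the type checker (the commit message mentions "fixed typing"): it pins down the element type of `message.tool_calls` before `.function.name` and `.function.arguments` are accessed. The same pattern in isolation, as a self-contained illustration rather than gptme code:

# Illustration only: after the assert, a type checker narrows `value`
# from str | bytes to str, so str-only methods are safe to call.
def shout(value: str | bytes) -> str:
    assert isinstance(value, str)
    return value.upper()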

gptme/server/api_v2_sessions.py

Lines changed: 4 additions & 1 deletion

@@ -11,11 +11,11 @@
 import threading
 import uuid
 from collections import defaultdict
+from collections.abc import Generator
 from dataclasses import dataclass, field
 from datetime import datetime, timedelta
 from enum import Enum
 from pathlib import Path
-from collections.abc import Generator

 import flask
 from dotenv import load_dotenv

@@ -27,6 +27,7 @@
 from ..llm.models import get_default_model
 from ..logmanager import LogManager, prepare_messages
 from ..message import Message
+from ..telemetry import trace_function
 from ..tools import ToolUse, get_tools, init_tools
 from .api_v2_common import ErrorEvent, EventType, msg2dict
 from .openapi_docs import (

@@ -168,6 +169,7 @@ def _append_and_notify(manager: LogManager, session: ConversationSession, msg: M
     )


+@trace_function("api_v2.step", attributes={"component": "api_v2"})
 def step(
     conversation_id: str,
     session: ConversationSession,

@@ -353,6 +355,7 @@ def start_tool_execution(

     # This function would ideally run asynchronously to not block the request
     # For simplicity, we'll run it in a thread
+    @trace_function("api_v2.execute_tool", attributes={"component": "api_v2"})
     def execute_tool_thread():
         config = Config.from_workspace(workspace=chat_config.workspace)
         config.chat = chat_config
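
One caveat worth noting here (an observation, not something this diff addresses): OpenTelemetry context in Python is stored in contextvars, which do not automatically flow into a manually created threading.Thread, so a span started inside `execute_tool_thread` may appear as a new root trace rather than a child of the request span. A minimal sketch of explicit propagation, with hypothetical names:

# Hypothetical sketch (names are illustrative, not from this diff):
# explicitly carry the current OpenTelemetry context into a worker
# thread so spans started there parent onto the request span.
import threading
from collections.abc import Callable

from opentelemetry import context, trace

tracer = trace.get_tracer("gptme")


def run_in_thread(work: Callable[[], None]) -> threading.Thread:
    ctx = context.get_current()  # capture context on the request thread

    def runner() -> None:
        token = context.attach(ctx)  # restore it inside the worker
        try:
            with tracer.start_as_current_span("api_v2.execute_tool"):
                work()
        finally:
            context.detach(token)

    thread = threading.Thread(target=runner, daemon=True)
    thread.start()
    return thread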

gptme/telemetry.py

Lines changed: 105 additions & 2 deletions

@@ -26,20 +26,31 @@
 _meter = None
 _token_counter = None
 _request_histogram = None
+_tool_counter = None
+_tool_duration_histogram = None
+_active_conversations_gauge = None
+_llm_request_counter = None

 TELEMETRY_AVAILABLE = False
 TELEMETRY_IMPORT_ERROR = None

 try:
     from opentelemetry import metrics, trace  # fmt: skip
-    from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter  # fmt: skip
+    from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
+        OTLPSpanExporter,  # fmt: skip
+    )
     from opentelemetry.exporter.prometheus import PrometheusMetricReader  # fmt: skip
+    from opentelemetry.instrumentation.anthropic import (
+        AnthropicInstrumentor,  # fmt: skip
+    )
     from opentelemetry.instrumentation.flask import FlaskInstrumentor  # fmt: skip
+    from opentelemetry.instrumentation.openai_v2 import OpenAIInstrumentor  # fmt: skip
     from opentelemetry.instrumentation.requests import RequestsInstrumentor  # fmt: skip
     from opentelemetry.sdk.metrics import MeterProvider  # fmt: skip
     from opentelemetry.sdk.resources import Resource  # fmt: skip
     from opentelemetry.sdk.trace import TracerProvider  # fmt: skip
     from opentelemetry.sdk.trace.export import BatchSpanProcessor  # fmt: skip
+    from prometheus_client import start_http_server  # fmt: skip

     TELEMETRY_AVAILABLE = True
 except ImportError as e:

@@ -56,9 +67,17 @@ def init_telemetry(
     service_name: str = "gptme",
     enable_flask_instrumentation: bool = True,
     enable_requests_instrumentation: bool = True,
+    enable_openai_instrumentation: bool = True,
+    enable_anthropic_instrumentation: bool = True,
+    prometheus_port: int = 8000,
 ) -> None:
     """Initialize OpenTelemetry tracing and metrics."""
     global _telemetry_enabled, _tracer, _meter, _token_counter, _request_histogram
+    global \
+        _tool_counter, \
+        _tool_duration_histogram, \
+        _active_conversations_gauge, \
+        _llm_request_counter

     # Check if telemetry is enabled via environment variable
     if os.getenv("GPTME_TELEMETRY_ENABLED", "").lower() not in ("true", "1", "yes"):

@@ -89,7 +108,15 @@
         if hasattr(tracer_provider, "add_span_processor"):
             tracer_provider.add_span_processor(span_processor)  # type: ignore

-        # Initialize metrics
+        # Initialize metrics with Prometheus reader
+        prometheus_port = int(os.getenv("PROMETHEUS_PORT", prometheus_port))
+        prometheus_addr = os.getenv("PROMETHEUS_ADDR", "localhost")
+
+        # Start Prometheus HTTP server to expose metrics
+        start_http_server(port=prometheus_port, addr=prometheus_addr)
+
+        # Initialize PrometheusMetricReader which pulls metrics from the SDK
+        # on-demand to respond to scrape requests
         prometheus_reader = PrometheusMetricReader()
         metrics.set_meter_provider(MeterProvider(metric_readers=[prometheus_reader]))
         _meter = metrics.get_meter(service_name)

@@ -107,13 +134,43 @@
             unit="seconds",
         )

+        _tool_counter = _meter.create_counter(
+            name="gptme_tool_calls",
+            description="Number of tool calls made",
+            unit="calls",
+        )
+
+        _tool_duration_histogram = _meter.create_histogram(
+            name="gptme_tool_duration_seconds",
+            description="Tool execution duration in seconds",
+            unit="seconds",
+        )
+
+        _active_conversations_gauge = _meter.create_up_down_counter(
+            name="gptme_active_conversations",
+            description="Number of active conversations",
+            unit="conversations",
+        )
+
+        _llm_request_counter = _meter.create_counter(
+            name="gptme_llm_requests",
+            description="Number of LLM API requests made",
+            unit="requests",
+        )
+
         # Auto-instrument Flask and requests if enabled
         if enable_flask_instrumentation:
             FlaskInstrumentor().instrument()

         if enable_requests_instrumentation:
             RequestsInstrumentor().instrument()

+        if enable_openai_instrumentation:
+            OpenAIInstrumentor().instrument()
+
+        if enable_anthropic_instrumentation:
+            AnthropicInstrumentor().instrument()
+
         _telemetry_enabled = True

         # Import console for user-visible messages

@@ -122,6 +179,9 @@
         # Log to console so users know telemetry is active
         console.log("📊 Telemetry enabled - performance metrics will be collected")
         console.log(f"🔍 Traces will be sent via OTLP to {otlp_endpoint}")
+        console.log(
+            f"📈 Prometheus metrics available at http://{prometheus_addr}:{prometheus_port}/metrics"
+        )

     except Exception as e:
         logger.error(f"Failed to initialize telemetry: {e}")

@@ -181,6 +241,49 @@ def record_request_duration(
         _request_histogram.record(duration, {"endpoint": endpoint, "method": method})


+def record_tool_call(
+    tool_name: str,
+    duration: float | None = None,
+    success: bool = True,
+    error_type: str | None = None,
+    error_message: str | None = None,
+) -> None:
+    """Record tool call metrics."""
+    if not is_telemetry_enabled() or _tool_counter is None:
+        return
+
+    attributes = {"tool_name": tool_name, "success": str(success).lower()}
+
+    if error_type:
+        attributes["error_type"] = error_type
+    if error_message:
+        # Truncate long error messages
+        attributes["error_message"] = error_message[:200]
+
+    _tool_counter.add(1, attributes)
+
+    if duration is not None and _tool_duration_histogram is not None:
+        _tool_duration_histogram.record(duration, attributes)
+
+
+def record_conversation_change(delta: int) -> None:
+    """Record change in active conversations (+1 for new, -1 for ended)."""
+    if not is_telemetry_enabled() or _active_conversations_gauge is None:
+        return
+
+    _active_conversations_gauge.add(delta)
+
+
+def record_llm_request(provider: str, model: str, success: bool = True) -> None:
+    """Record LLM API request metrics."""
+    if not is_telemetry_enabled() or _llm_request_counter is None:
+        return
+
+    _llm_request_counter.add(
+        1, {"provider": provider, "model": model, "success": str(success).lower()}
+    )
+
+
 def measure_tokens_per_second(func: F) -> F:
     """Decorator to measure tokens per second for LLM operations."""
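
The diff does not show where these helpers are called from. A plausible call site for record_tool_call, sketched with a hypothetical run_tool standing in for the real tool executor:

# Hypothetical call site for record_tool_call; run_tool is a stand-in
# for the real tool executor, not a gptme function.
import time
from collections.abc import Callable

from gptme.telemetry import record_tool_call


def run_tool_with_metrics(tool_name: str, run_tool: Callable[[], None]) -> None:
    start = time.monotonic()
    try:
        run_tool()
    except Exception as e:
        # Record the failure with its duration and error details, then re-raise.
        record_tool_call(
            tool_name,
            duration=time.monotonic() - start,
            success=False,
            error_type=type(e).__name__,
            error_message=str(e),
        )
        raise
    record_tool_call(tool_name, duration=time.monotonic() - start, success=True)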
