# Kubernetes Simulation Control

This notebook demonstrates a simplistic simulation of a Kubernetes environment. We can manipulate the cluster (add nodes, deploy services, move pods) and observe metrics.

In [18]:
import k8s_sim
import time
import json

# Initialize the cluster
cluster = k8s_sim.Cluster()

# Add some nodes
cluster.add_node("node-1", cpu_capacity=4.0, memory_capacity=16.0)
cluster.add_node("node-2", cpu_capacity=4.0, memory_capacity=16.0)
cluster.add_node("node-3", cpu_capacity=8.0, memory_capacity=32.0)

print("Cluster initialized with 3 nodes.")

Cluster initialized with 3 nodes.


In [19]:
# Deploy services
print("Deploying services...")
cluster.deploy_service("frontend", replicas=3, cpu_request=0.5, memory_request=1.0)
cluster.deploy_service("backend", replicas=2, cpu_request=1.0, memory_request=2.0)
cluster.deploy_service("database", replicas=1, cpu_request=2.0, memory_request=4.0)

# Check initial state
print(json.dumps(cluster.get_metrics(), indent=2))

Deploying services...
{
  "timestamp": 1767184656.736337,
  "nodes": {
    "node-1": {
      "cpu_usage": 3.5,
      "cpu_capacity": 4.0,
      "cpu_utilization_pct": 87.5,
      "memory_usage": 7.0,
      "pod_count": 5,
      "latency_ms": 106.26320346761432
    },
    "node-2": {
      "cpu_usage": 2.0,
      "cpu_capacity": 4.0,
      "cpu_utilization_pct": 50.0,
      "memory_usage": 4.0,
      "pod_count": 1,
      "latency_ms": 69.66369050399591
    },
    "node-3": {
      "cpu_usage": 0,
      "cpu_capacity": 8.0,
      "cpu_utilization_pct": 0.0,
      "memory_usage": 0,
      "pod_count": 0,
      "latency_ms": 17.841047899572523
    }
  },
  "services": {
    "frontend": {
      "pod_count": 3,
      "running_pods": 3,
      "avg_latency_ms": 106.26320346761433,
      "request_completion_rate": 78.74735930647714
    },
    "backend": {
      "pod_count": 2,
      "running_pods": 2,
      "avg_latency_ms": 106.26320346761432,
      "request_completion_rate": 78.7473593064771

In [20]:
# Manipulation: Scale up backend
print("Scaling up backend to 5 replicas...")
cluster.scale_service("backend", 5)

# Observe metrics after scaling
metrics = cluster.get_metrics()
print(f"Backend running pods: {metrics['services']['backend']['running_pods']}")
print(f"Node-1 CPU Utilization: {metrics['nodes']['node-1']['cpu_utilization_pct']:.1f}%")

Scaling up backend to 5 replicas...
Backend running pods: 5
Node-1 CPU Utilization: 87.5%


In [21]:
# Manipulation: Move a pod
# Let's find a pod on node-1 and move it to node-3
node1 = cluster.nodes["node-1"]
if node1.pods:
    pod_id = list(node1.pods.keys())[0]
    pod = node1.pods[pod_id]
    print(f"Moving pod {pod.id} ({pod.service_name}) from node-1 to node-3...")
    
    try:
        cluster.move_pod(pod_id, "node-3")
        print("Move successful.")
    except Exception as e:
        print(f"Move failed: {e}")
else:
    print("No pods on node-1 to move.")

# Final metrics check
print(json.dumps(cluster.get_metrics(), indent=2))

Moving pod c93ba755 (frontend) from node-1 to node-3...
Move successful.
{
  "timestamp": 1767184656.752928,
  "nodes": {
    "node-1": {
      "cpu_usage": 3.0,
      "cpu_capacity": 4.0,
      "cpu_utilization_pct": 75.0,
      "memory_usage": 6.0,
      "pod_count": 4,
      "latency_ms": 91.84241234946532
    },
    "node-2": {
      "cpu_usage": 4.0,
      "cpu_capacity": 4.0,
      "cpu_utilization_pct": 100.0,
      "memory_usage": 8.0,
      "pod_count": 3,
      "latency_ms": 123.97628920305316
    },
    "node-3": {
      "cpu_usage": 1.5,
      "cpu_capacity": 8.0,
      "cpu_utilization_pct": 18.75,
      "memory_usage": 3.0,
      "pod_count": 2,
      "latency_ms": 40.05862905965093
    }
  },
  "services": {
    "frontend": {
      "pod_count": 3,
      "running_pods": 3,
      "avg_latency_ms": 74.58115125286052,
      "request_completion_rate": 85.0837697494279
    },
    "backend": {
      "pod_count": 5,
      "running_pods": 5,
      "avg_latency_ms": 94.33920643293

# Orchestrator Agent

We will now implement an Orchestrator Agent using the Google Agent Development Kit (ADK). 
This agent will be responsible for maintaining the following Service Level Agreements (SLAs):

*   **Frontend Latency**: < 100ms
*   **Backend Latency**: < 150ms
*   **Node CPU Usage**: < 80%

The agent will have access to tools to observe metrics and manipulate the cluster.

In [22]:
from google.adk.agents import LlmAgent
from google.adk.models.google_llm import Gemini
from google.adk.runners import InMemoryRunner
from google.genai import types
import os

# Ensure you have your Google API Key set
# os.environ["GOOGLE_API_KEY"] = "YOUR_API_KEY"

# --- Tool Definitions ---

def get_cluster_metrics() -> dict:
    """Retrieves the current performance metrics of the Kubernetes cluster.
    
    Returns:
        dict: A dictionary containing metrics for nodes, services, and the cluster summary.
              Includes CPU usage, latency, and pod counts.
    """
    print("\n[Tool Call] get_cluster_metrics")
    try:
        metrics = cluster.get_metrics()
        return {"status": "ok", "result": metrics}
    except Exception as e:
        return {"status": "error", "error": str(e)}

def scale_service(service_name: str, replicas: int) -> dict:
    print(f"\n[Tool Call] scale_service(service_name='{service_name}', replicas={replicas})")
    """Scales a specific service to a target number of replicas.
    
    Args:
        service_name: The name of the service to scale (e.g., 'frontend', 'backend').
        replicas: The desired number of replicas.
        
    Returns:
        dict: A status-aware response with keys `status` and `message` or `error`.
    """
    try:
        cluster.scale_service(service_name, replicas)
        return {"status": "ok", "message": f"Successfully scaled {service_name} to {replicas} replicas."}
    except Exception as e:
        return {"status": "error", "error": str(e)}

def move_pod(pod_id: str, target_node: str) -> dict:
    print(f"\n[Tool Call] move_pod(pod_id='{pod_id}', target_node='{target_node}')")
    """Moves a specific pod to a different node.
    
    Args:
        pod_id: The unique identifier of the pod to move.
        target_node: The name of the destination node (e.g., 'node-1').
        
    Returns:
        dict: A status-aware response with keys `status` and `message` or `error`.
    """
    try:
        cluster.move_pod(pod_id, target_node)
        return {"status": "ok", "message": f"Successfully moved pod {pod_id} to {target_node}."}
    except Exception as e:
        return {"status": "error", "error": str(e)}

print("Tools defined.")

Tools defined.


In [23]:
import dotenv
dotenv.load_dotenv()

retry_config = types.HttpRetryOptions(
    attempts=5,  # Maximum retry attempts
    exp_base=7,  # Delay multiplier
    initial_delay=1,
    http_status_codes=[429, 500, 503, 504],  # Retry on these HTTP errors
)

In [None]:
# --- Agent Definition ---
from google.adk.models import BaseLlm, LlmResponse, LlmRequest
from google.adk.models.lite_llm import LiteLlm
from google.genai.types import Content, Part
import httpx
from typing import AsyncGenerator, Any

MODEL = 'gpt-oss:120b-cloud'

orchestrator_agent = LlmAgent(
    name="Orchestrator",
    model=LiteLlm(model=f"ollama_chat/{MODEL}"),#, retry_options=retry_config),
    instruction="""
    You are a smart Kubernetes Orchestrator responsible for maintaining system stability and SLAs. 
    
    The deployed services are: 
    `frontend`, `backend`, `database`

    The given SLAs are:
        *   Frontend Latency: Must be under 100ms.
        *   Backend Latency: Must be under 150ms.
        *   Node CPU Utilization: Should ideally be under 80% to prevent degradation.

    For SLA violation requests you must specifically follow the following steps:
    1. Use `get_cluster_metrics()` to get metrics (CPU utilization, latency, pod numbers) from the cluster you orchestrate.
    2. If any SLA is violated or at risk, take corrective actions:
        *   High Latency: Scale up the service to distribute load using `scale_service()`
        *   High Node CPU: Move pods from the overloaded node to a node with spare capacity using `move_pod()`
    3. After taking action, verify the result by checking the metrics again using `get_cluster_metrics()`
    
    Check the "status" field in each tool's response for errors. If any tool returns status "error", explain the issue to the user clearly.
    
    Don't ask questions. The only available information can be accessed through the given tools. Be decisive and use the corresponding tools to complete the request. 
    """,
    tools=[get_cluster_metrics, scale_service, move_pod]
)

runner = InMemoryRunner(agent=orchestrator_agent)
print("Orchestrator Agent initialized with Ollama.")

App name mismatch detected. The runner is configured with app name "InMemoryRunner", but the root agent was loaded from "/Users/gtzanettis/Documents/Projects/agentic-service-broker/.venv/lib/python3.12/site-packages/google/adk/agents", which implies app name "agents".


Orchestrator Agent initialized with Ollama.


In [25]:
# --- Run the Agent ---

# Let's induce a problem first to see the agent fix it.
# We'll scale down the frontend to cause high load/latency (simulated)
print("Inducing high load on frontend...")
cluster.scale_service("frontend", 1) 

# Run the agent
import asyncio

async def run_orchestrator():
    print("Starting Orchestrator run...")
    # Pass the input as a keyword argument
    result = await runner.run_debug(
        "There seems to be an issue with the cluster performance. Identify and resolve any SLA violations."
    )
    print("\nAgent Output:")
    print(result)

# Note: In a real notebook environment, you can just await run_orchestrator()
# await run_orchestrator()
print("Run 'await run_orchestrator()' to execute the agent.")

Inducing high load on frontend...
Run 'await run_orchestrator()' to execute the agent.


In [26]:
await run_orchestrator()

Starting Orchestrator run...

 ### Created new session: debug_session_id

User > There seems to be an issue with the cluster performance. Identify and resolve any SLA violations.

[Tool Call] get_cluster_metrics

[Tool Call] move_pod(pod_id='backend-pod-1', target_node='node-3')

[Tool Call] move_pod(pod_id='database-pod-1', target_node='node-3')

[Tool Call] scale_service(service_name='backend', replicas=6)

[Tool Call] move_pod(pod_id='backend-0', target_node='node-3')

[Tool Call] get_cluster_metrics

[Tool Call] scale_service(service_name='backend', replicas=5)

[Tool Call] move_pod(pod_id='backend-0', target_node='node-3')

[Tool Call] scale_service(service_name='frontend', replicas=2)

[Tool Call] move_pod(pod_id='frontend-0', target_node='node-3')

[Tool Call] get_cluster_metrics

[Tool Call] move_pod(pod_id='database-0', target_node='node-3')

[Tool Call] move_pod(pod_id='database-pod-0', target_node='node-3')

[Tool Call] scale_service(service_name='backend', replicas=4)

[Too

In [27]:
import litellm
litellm._turn_on_debug()

# Debug

In [28]:
saved_request

NameError: name 'saved_request' is not defined

In [None]:
model = OllamaModel(model="gemma3:4b")

In [None]:
# Test local Ollama model (uses existing `model`, `LlmRequest`, `Content`, `Part`)
req = LlmRequest(contents=[Content(parts=[Part(text="Hello Ollama, please respond with 'yang'")])])
resp = model.generate_content(saved_request)
print("Response:", resp.content.parts[0].text)

AttributeError: 'NoneType' object has no attribute 'contents'