<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/Nemotron_3_Nano_30B_A3B_BF16_DEMO_WM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install vllm requests -q

In [4]:
import torch

# Report whether PyTorch can see a CUDA device and, if so, describe device 0.
if not torch.cuda.is_available():
    print("CUDA is NOT available. PyTorch cannot access a GPU.")
    print("Please ensure your Colab runtime type is set to GPU (Runtime -> Change runtime type -> Hardware accelerator: GPU).")
else:
    gpu_index = 0
    print("CUDA is available! PyTorch can access your GPU.")
    gpu_name = torch.cuda.get_device_name(gpu_index)
    print(f"GPU Name: {gpu_name}")
    # total_memory is divided by 1024**3 to express it in GiB-sized units.
    total_memory = torch.cuda.get_device_properties(gpu_index).total_memory
    print(f"GPU Memory: {total_memory / (1024**3):.2f} GB")

CUDA is available! PyTorch can access your GPU.
GPU Name: NVIDIA A100-SXM4-80GB
GPU Memory: 79.32 GB


In [3]:
# List processes that have a network socket on port 8000, the port used by the
# vLLM server URL in this notebook. Useful for spotting a stale server before launching a new one.
!lsof -i :8000

In [7]:
# Installation (must be run after every restart)
#!pip install vllm requests

import subprocess
import time
import requests

VLLM_MODEL = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
VLLM_URL = "http://localhost:8000"

# Value passed to vLLM's --gpu-memory-utilization flag.
GPU_UTILIZATION_TARGET = "0.85"

# Where the server's combined stdout/stderr is written.
VLLM_LOG_PATH = "vllm_server.log"
# Give up waiting for the server after this many seconds.
STARTUP_TIMEOUT_SECONDS = 480
# Pause between readiness probes.
POLL_INTERVAL_SECONDS = 10
# How many trailing log lines to show if the server dies during startup.
LOG_TAIL_LINES = 50

print(f"--- Starting Nemotron-3-Nano vLLM Server (Targeting {GPU_UTILIZATION_TARGET} utilization) ---")

# Start the vLLM server as a background process.
# Its output goes to a log file rather than subprocess.PIPE: nothing in this
# notebook reads the pipe while the server is running, so a filled pipe buffer
# would eventually block the server on write.
with open(VLLM_LOG_PATH, "w") as vllm_log_file:
    vllm_process = subprocess.Popen([
        "vllm", "serve", VLLM_MODEL,
        "--tensor-parallel-size", "1",
        "--enable-expert-parallel",
        "--trust-remote-code",
        "--gpu-memory-utilization", GPU_UTILIZATION_TARGET,
        "--served-model-name", "nemotron-nano"
    ], stdout=vllm_log_file, stderr=subprocess.STDOUT)

# Poll the models endpoint until it answers 200, the process dies, or we time out.
print("Waiting for vLLM server to start and model to load...")
start_time = time.time()
server_ready = False
server_crashed = False

while time.time() - start_time < STARTUP_TIMEOUT_SECONDS:
    # Check for a dead process on every iteration, whatever the probe outcome was.
    if vllm_process.poll() is not None:
        server_crashed = True
        break

    try:
        response = requests.get(f"{VLLM_URL}/v1/models", timeout=5)
        if response.status_code == 200:
            print(f"\n‚úÖ Server is READY! Took {int(time.time() - start_time)} seconds.")
            server_ready = True
            break
    except requests.exceptions.ConnectionError:
        # Port not open yet: the usual state while the model is loading.
        print(".", end="", flush=True)
    except Exception as e:
        print(f"Error during check: {e}")

    # Sleep after every unsuccessful probe (including non-200 replies) to avoid a busy loop.
    time.sleep(POLL_INTERVAL_SECONDS)

if server_crashed:
    print("\n!!! ERROR: vLLM server process crashed. Check logs below !!!")
    with open(VLLM_LOG_PATH, errors="replace") as vllm_log_file:
        print("".join(vllm_log_file.readlines()[-LOG_TAIL_LINES:]))
elif not server_ready:
    print("\n!!! ERROR: Server failed to start within the timeout period. Please check GPU/VRAM.")
    if vllm_process.poll() is None:
        vllm_process.terminate()

--- Starting Nemotron-3-Nano vLLM Server (Targeting 0.85 utilization) ---
Waiting for vLLM server to start and model to load...
...........
‚úÖ Server is READY! Took 110 seconds.


In [13]:
import requests
from requests.exceptions import ConnectionError, Timeout

# --- CORE AGENT LOGIC (placeholder) ---
# NOTE: run_agent_workflow must be defined before the execution block at the
# bottom of this cell calls it; otherwise that call raises NameError.
# The definition below is only a stub standing in for the full agent logic.

def run_agent_workflow(target_goal: str):
    """Stub for the agent workflow: announces the goal and does nothing else.

    Args:
        target_goal: Natural-language description of what the agent should achieve.

    Returns:
        None.
    """
    # The real sequencing / planner / fallback logic is not implemented here.
    announcement = f"Agent logic for goal '{target_goal}' is now running."
    print(announcement)

# --- API CHECK FUNCTION (Your Code) ---
def check_api_availability(url: str, check_timeout: int = 5) -> bool:
    """Probe ``url`` with a HEAD request and report whether a server answered.

    Args:
        url: Address to probe.
        check_timeout: Seconds to wait for the response.

    Returns:
        True when the response status is 200, 404, 405 or 503; False for any
        other status, for a connection failure/timeout, or for any other error.
    """
    try:
        status = requests.head(url, timeout=check_timeout).status_code
        # These codes are all treated as evidence that something is listening.
        reachable = status in (200, 404, 405, 503)
        if reachable:
            print(f"[‚úÖ API Check] Server responded with status code {status}. Proceeding.")
        else:
            print(f"[‚ö†Ô∏è API Check] Server responded with unexpected status code {status}.")
        return reachable
    except (ConnectionError, Timeout):
        print(f"[üõë API Check] Could not connect to the server at {url}. Connection timed out or refused.")
        return False
    except Exception as e:
        print(f"[üõë API Check] An unexpected error occurred: {e}")
        return False

# --- EXECUTION BLOCK (Your Code) ---
if __name__ == "__main__":
    target_goal = "Turn off the light and lock the door before going to bed."
    BASE_URL = "http://localhost:8000"

    # Run the (stub) workflow only when the server answered the availability probe.
    if check_api_availability(BASE_URL):
        run_agent_workflow(target_goal)
    else:
        print("\nFATAL ERROR: Cannot start agent. VLLM server is unavailable.")

# Assuming the full agent logic (World Model, Nemotron Planner, etc.)
# is placed inside the run_agent_workflow function definition.

[‚úÖ API Check] Server responded with status code 404. Proceeding.
Agent logic for goal 'Turn off the light and lock the door before going to bed.' is now running.


In [8]:
# 1. Install necessary libraries
!pip install vllm requests litellm jsonformer -q

## CASE 1: SLEEP

In [18]:
import requests
import json
import re
from requests.exceptions import ConnectionError, Timeout

# --- Configuration (Centralized and Accurate Endpoints) ---
# All endpoint URLs below are derived from this single base address.
NEMOTRON_BASE_URL = "http://localhost:8000"
NEMOTRON_API_URL = f"{NEMOTRON_BASE_URL}/v1/chat/completions" # Chat-completions endpoint the planner POSTs to
NEMOTRON_HEALTH_URL = f"{NEMOTRON_BASE_URL}/v1/models"        # Endpoint polled by the availability check; 200 means ready
# Model name sent in each request payload; the server cell launches vLLM with
# --served-model-name set to this same value.
NEMOTRON_MODEL_NAME = "nemotron-nano"


# --- 1. World Model (Internal State Tracker) ---
def update_world_model(current_state: dict, action: str) -> str:
    """Apply one action to the bedtime world state, mutating it in place.

    Args:
        current_state: State dict; ``light_on`` is read and written for the
            light actions, ``door_locked`` is written for ``LOCK_DOOR``.
        action: Action name; compared case-insensitively.

    Returns:
        "SUCCESS" for a recognised action, "ERROR" for an unknown one
        (in which case the state is left untouched).
    """
    normalized = action.upper()
    # Both names mean "make sure the light is off".
    light_off_aliases = ("TOGGLE_LIGHT", "TURN_OFF_LIGHT")

    if normalized not in light_off_aliases and normalized != "LOCK_DOOR":
        print(f"[WM Update] ERROR: Unknown action '{normalized}'. State unchanged.")
        return "ERROR"

    if normalized == "LOCK_DOOR":
        current_state['door_locked'] = True
        print("[WM Update] Door is now LOCKED.")
        return "SUCCESS"

    # Light action: switching off an already-off light still counts as success.
    if not current_state['light_on']:
        print("[WM Update] Light is already OFF. State aligned.")
        return "SUCCESS"

    current_state['light_on'] = False
    print("[WM Update] Light is now OFF.")
    return "SUCCESS"

# --- 2. LLM Planner (Robust Call with Increased Timeout) ---
def nemotron_planner(state: dict) -> str | None:
    """Calls the Nemotron LLM to get the next action, returning raw output or None on failure."""

    prompt = (
        f"CURRENT WORLD STATE: {state}. GOAL: Turn off the light and lock the door before going to bed."
        f"Provide your full reasoning (Chain of Thought), and conclude with the next single action in the exact format: "
        f"THE NEXT ACTION IS: [ACTION_NAME]"
    )

    messages = [
        {"role": "system", "content": "You are a planning assistant. Be verbose in your reasoning."},
        {"role": "user", "content": prompt}
    ]

    payload = {
        "model": NEMOTRON_MODEL_NAME,
        "messages": messages,
        "temperature": 0.0,
        "max_tokens": 512,
    }

    try:
        # Using the confirmed necessary 60-second timeout
        print(f"[API Call] Sending request to Nemotron (Timeout: 60s)...")
        response = requests.post(
            NEMOTRON_API_URL,
            headers={"Content-Type": "application/json"},
            json=payload,
            timeout=60
        )
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"].strip()
    except requests.exceptions.RequestException as e:
        print(f"!!! LLM PLANNING FAILED (API Error/Timeout). Details: {e.__class__.__name__}")
        # Return None to trigger the Simulation Fallback
        return None

# --- 3. Resilient Parser with Simulation Fallback ---
def parse_and_act(raw_llm_output: str | None, state: dict) -> str:
    """Extracts action from output or uses the fallback, then executes the action."""

    action = None

    if raw_llm_output:
        # Brittle Parser Logic (Searching for the specific keyword "THE NEXT ACTION IS:")
        match = re.search(r"THE NEXT ACTION IS:\s*(\w+)", raw_llm_output)
        if match:
            action = match.group(1).strip().upper()
            print(f"[LLM Extracted] Action found by parser: {action}")
        else:
            print("[‚ö†Ô∏è Parser Fail] Could not find the action keyword. Falling back to Simulation.")

    # Simulation Fallback (The Resilience Layer)
    if action is None:
        print("!!! Triggering Simulation Fallback !!!")

        # Fallback logic based on the goal: Light first, then Door.
        if state['light_on']:
            action = "TOGGLE_LIGHT"
        elif not state['door_locked']:
            action = "LOCK_DOOR"
        else:
            return "SUCCESS: Goal Complete"

        print(f"[Fallback Action] Executing: {action}")

    # Execute the determined action (either from LLM or Fallback)
    return update_world_model(state, action)

# --- 4. System Check (Fixed for Accuracy) ---
def check_api_availability(health_url: str, check_timeout: int = 5) -> bool:
    """Report whether a GET on ``health_url`` returns exactly 200.

    Args:
        health_url: Endpoint to probe.
        check_timeout: Seconds to wait for the response.

    Returns:
        True only for a 200 response; False for any other status, for a
        connection failure/timeout, or for any other error.
    """
    try:
        status = requests.get(health_url, timeout=check_timeout).status_code
        healthy = status == 200
        if not healthy:
            print(f"[‚ö†Ô∏è API Check] Server responded with status code {status} at {health_url}. Status is not 200 OK.")
        else:
            print(f"[‚úÖ API Check] Server responded with status code 200 OK at {health_url}. Proceeding.")
        return healthy
    except (ConnectionError, Timeout):
        print(f"[üõë API Check] Could not connect to the server at {health_url}. Connection timed out or refused.")
        return False
    except Exception as e:
        print(f"[üõë API Check] An unexpected error occurred: {e}")
        return False

# --- 5. The Agent Workflow Orchestrator (The Main Loop) ---
def run_agent_workflow(target_goal: str, max_steps: int = 5):
    """Run the plan -> parse -> act loop for the bedtime scenario.

    Args:
        target_goal: Goal description; used only in the start banner (the
            planner is called with the state alone).
        max_steps: Maximum number of plan/act iterations before giving up.

    Returns:
        None. Progress and the final verdict are printed.
    """

    # Initial State
    current_state = {'location': 'LivingRoom', 'light_on': True, 'door_locked': False}

    def goal_reached() -> bool:
        # Goal: light off AND door locked.
        return not current_state['light_on'] and current_state['door_locked']

    print(f"\n--- STARTING AGENT WORKFLOW for: {target_goal} ---")

    for step in range(1, max_steps + 1):
        print(f"\n======== STEP {step} ========")
        print(f"[üó∫Ô∏è Current State:] {current_state}")

        # Check for immediate goal completion
        if goal_reached():
            print("--- AGENT SUCCESS: Goal completed! ---")
            return

        # 1. LLM Planning
        raw_output = nemotron_planner(current_state)

        # 2. Parse and Execute
        status = parse_and_act(raw_output, current_state)

        if status == "SUCCESS: Goal Complete":
            print("--- AGENT SUCCESS: Goal completed! ---")
            return

    # The action taken in the last allowed step may itself have completed the
    # goal; the in-loop check only runs at the start of a step, so check once more.
    if goal_reached():
        print(f"\n[üó∫Ô∏è Current State:] {current_state}")
        print("--- AGENT SUCCESS: Goal completed! ---")
        return

    print("--- AGENT FAILURE: Max steps reached before goal completion. ---")


# --- Execution Block ---
if __name__ == "__main__":
    target_goal = "Turn off the light and lock the door before going to bed."

    # Start the agent only after the health endpoint confirms the server is up.
    if check_api_availability(NEMOTRON_HEALTH_URL):
        run_agent_workflow(target_goal)
    else:
        print("\nFATAL ERROR: Cannot start agent. VLLM server is unavailable.")

[‚úÖ API Check] Server responded with status code 200 OK at http://localhost:8000/v1/models. Proceeding.

--- STARTING AGENT WORKFLOW for: Turn off the light and lock the door before going to bed. ---

[üó∫Ô∏è Current State:] {'location': 'LivingRoom', 'light_on': True, 'door_locked': False}
[API Call] Sending request to Nemotron (Timeout: 60s)...
[LLM Extracted] Action found by parser: TURN_OFF_LIGHT
[WM Update] Light is now OFF.

[üó∫Ô∏è Current State:] {'location': 'LivingRoom', 'light_on': False, 'door_locked': False}
[API Call] Sending request to Nemotron (Timeout: 60s)...
[LLM Extracted] Action found by parser: LOCK_DOOR
[WM Update] Door is now LOCKED.

[üó∫Ô∏è Current State:] {'location': 'LivingRoom', 'light_on': False, 'door_locked': True}
--- AGENT SUCCESS: Goal completed! ---


## CASE 2: COFFEE

In [21]:
import requests
import json
import re
from requests.exceptions import ConnectionError, Timeout

# --- Configuration (Centralized and Accurate Endpoints) ---
# All endpoint URLs below are derived from this single base address.
NEMOTRON_BASE_URL = "http://localhost:8000"
# Chat-completions endpoint the planner POSTs to.
NEMOTRON_API_URL = f"{NEMOTRON_BASE_URL}/v1/chat/completions"
# Endpoint polled by the availability check; 200 means ready.
NEMOTRON_HEALTH_URL = f"{NEMOTRON_BASE_URL}/v1/models"
# Model name sent in each request payload; the server cell launches vLLM with
# --served-model-name set to this same value.
NEMOTRON_MODEL_NAME = "nemotron-nano"
# Goal text interpolated into the planner prompt and the workflow banner.
TARGET_GOAL = "Make and serve a fresh cup of coffee and place it on the desk."

# --- 1. World Model (Internal State Tracker) ---
def update_coffee_world_model(current_state: dict, action: str) -> str:
    """Apply one coffee-making action to the world state, mutating it in place.

    Actions with a precondition are rejected (state unchanged) when that
    precondition does not hold: BREW_COFFEE needs ground beans, and
    PUT_DOWN_CUP needs a cup currently being held.

    Args:
        current_state: State dict with the keys ``beans_ground``,
            ``coffee_brewed``, ``cup_held`` and ``location``.
        action: Action name; compared case-insensitively.

    Returns:
        "SUCCESS" if the action was applied, "ERROR" if it is unknown or its
        precondition is not met.
    """
    action = action.upper()

    if action == "GRIND_BEANS":
        current_state['beans_ground'] = True
        print("[WM Update] Beans are now ground.")
        return "SUCCESS"

    elif action == "BREW_COFFEE":
        if current_state['beans_ground']:
            current_state['coffee_brewed'] = True
            print("[WM Update] Coffee is now brewed.")
            return "SUCCESS"
        else:
            print("[WM Update] ERROR: Cannot BREW_COFFEE, beans are not ground.")
            return "ERROR"

    elif action == "PICK_UP_CUP":
        current_state['cup_held'] = True
        current_state['location'] = 'CarryingCup'
        print("[WM Update] Cup is now held.")
        return "SUCCESS"

    elif action == "PUT_DOWN_CUP":
        # Without this guard a cup that was never picked up could be "placed"
        # on the desk, letting the workflow's goal test pass spuriously.
        if not current_state['cup_held']:
            print("[WM Update] ERROR: Cannot PUT_DOWN_CUP, no cup is being held.")
            return "ERROR"
        current_state['cup_held'] = False
        current_state['location'] = 'Desk'
        print("[WM Update] Cup is now on the desk.")
        return "SUCCESS"

    else:
        print(f"[WM Update] ERROR: Unknown action '{action}'. State unchanged.")
        return "ERROR"

# --- 2. LLM Planner (Enhanced with Goal Gap and Strict Constraints) ---
def nemotron_planner(state: dict) -> str | None:
    """Calls the Nemotron LLM to get the next action, returning raw output or None on failure."""

    # --- PROMPT: Goal Gap Analysis (To guide the LLM's reasoning) ---
    missing_steps = []
    if not state.get('beans_ground'):
         missing_steps.append("GRINDING THE BEANS")
    if state.get('beans_ground') and not state.get('coffee_brewed'):
        missing_steps.append("BREWING THE COFFEE")
    if state.get('coffee_brewed') and not state.get('cup_held'):
        missing_steps.append("PICKING UP THE CUP")
    if state.get('cup_held') and state.get('location') != 'Desk':
        missing_steps.append("PUTTING THE CUP ON THE DESK")

    goal_gap_text = f"CRITICAL GAPS (Must prioritize fixing these): {', '.join(missing_steps)}. " if missing_steps else ""

    prompt = (
        f"You are a sequence planner. The GOAL is: {TARGET_GOAL}. "
        f"CURRENT WORLD STATE: {state}. {goal_gap_text}"
        f"Available Actions: GRIND_BEANS, BREW_COFFEE, PICK_UP_CUP, PUT_DOWN_CUP. "
        f"Provide your Chain of Thought *briefly*, then output the action. "
        f"‚ö†Ô∏è STRICTLY FOLLOW THIS FORMAT ONLY: THE NEXT ACTION IS: [ACTION_NAME]"
    )
    # --- END PROMPT ---

    messages = [
        {"role": "system", "content": "You are a planning assistant. Your response must be short and end with the required format."},
        {"role": "user", "content": prompt}
    ]

    payload = {
        "model": NEMOTRON_MODEL_NAME,
        "messages": messages,
        "temperature": 0.0,
        "max_tokens": 512,
    }

    try:
        print(f"[API Call] Sending request to Nemotron (Timeout: 60s)...")
        response = requests.post(NEMOTRON_API_URL, headers={"Content-Type": "application/json"}, json=payload, timeout=60)
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"].strip()
    except requests.exceptions.RequestException as e:
        print(f"!!! LLM PLANNING FAILED (API Error/Timeout). Details: {e.__class__.__name__}")
        return None

# --- 3. The Two-Stage, Resilient Parser with Simulation Fallback ---
def parse_and_act(raw_llm_output: str | None, state: dict) -> str:
    """Extracts action using a two-stage parser, or uses the ultimate fallback, then executes the action."""

    action = None

    if raw_llm_output:
        # --- STAGE 1: Strict Parser (Looking for the required prefix) ---
        match_strict = re.search(r"THE NEXT ACTION IS:\s*(\w+)", raw_llm_output)
        if match_strict:
            action = match_strict.group(1).strip().upper()
            print(f"[LLM Extracted: STAGE 1] Action found by prefix: {action}")

        # --- STAGE 2: Ultra-Forgiving Parser (Looking for keywords in suboptimal states) ---
        if action is None:
            # We only look for the *necessary* next step by keyword to avoid ambiguity
            if not state['coffee_brewed'] and "BREW_COFFEE" in raw_llm_output.upper():
                action = "BREW_COFFEE"
                print(f"[LLM Extracted: STAGE 2] Action salvaged: {action}")
            elif state['coffee_brewed'] and state['cup_held'] and "PUT_DOWN_CUP" in raw_llm_output.upper():
                action = "PUT_DOWN_CUP"
                print(f"[LLM Extracted: STAGE 2] Action salvaged: {action}")

            if action is None:
                 print("[‚ö†Ô∏è Parser Fail] Could not find action by prefix or keyword salvage. Falling back.")

    # Simulation Fallback (The ultimate safety net, only triggered if both Stages failed)
    if action is None:
        print("!!! Triggering Simulation Fallback !!!")

        # Fallback logic for coffee making sequence
        if not state['beans_ground']:
            action = "GRIND_BEANS"
        elif not state['coffee_brewed']:
            action = "BREW_COFFEE"
        elif not state['cup_held']:
            action = "PICK_UP_CUP"
        elif state['cup_held'] and state['location'] != 'Desk':
             action = "PUT_DOWN_CUP"
        else:
            return "SUCCESS: Goal Complete"

        print(f"[Fallback Action] Executing: {action}")

    # Execute the determined action
    return update_coffee_world_model(state, action)

# --- 4. System Check (Accurate Diagnostic Layer) ---
def check_api_availability(health_url: str, check_timeout: int = 5) -> bool:
    """Return True only when a GET on ``health_url`` answers with status 200.

    Args:
        health_url: Endpoint to probe.
        check_timeout: Seconds to wait for the response.

    Returns:
        True for a 200 response; False for any other status, a connection
        failure/timeout, or any other error. The outcome is also printed.
    """
    try:
        status_code = requests.get(health_url, timeout=check_timeout).status_code
        server_ok = status_code == 200
        if server_ok:
            print(f"[‚úÖ API Check] Server responded with status code 200 OK at {health_url}. Proceeding.")
        else:
            print(f"[‚ö†Ô∏è API Check] Server responded with status code {status_code} at {health_url}. Status is not 200 OK.")
        return server_ok
    except (ConnectionError, Timeout):
        print(f"[üõë API Check] Could not connect to the server at {health_url}. Connection timed out or refused.")
        return False
    except Exception as e:
        print(f"[üõë API Check] An unexpected error occurred: {e}")
        return False

# --- 5. The Agent Workflow Orchestrator (The Main Loop) ---
def run_coffee_agent_workflow(max_steps: int = 10):
    """Run the plan -> parse -> act loop for the coffee scenario.

    Args:
        max_steps: Maximum number of plan/act iterations before giving up.

    Returns:
        None. Progress and the final verdict are printed.
    """

    # Initial State for Coffee Agent
    current_state = {
        'location': 'Kitchen',
        'beans_ground': False,
        'coffee_brewed': False,
        'cup_held': False
    }

    def goal_reached() -> bool:
        # Goal: coffee brewed, cup no longer held, and location is the desk.
        return (
            current_state['coffee_brewed']
            and not current_state['cup_held']
            and current_state['location'] == 'Desk'
        )

    print(f"\n--- STARTING COFFEE AGENT WORKFLOW for: {TARGET_GOAL} ---")

    for step in range(1, max_steps + 1):
        print(f"\n======== STEP {step} ========")
        print(f"[üó∫Ô∏è Current State:] {current_state}")

        # Check for goal completion: Brewed and located on the desk (cup_held=False, location=Desk)
        if goal_reached():
            print("--- AGENT SUCCESS: Goal completed! ---")
            return

        # 1. LLM Planning
        raw_output = nemotron_planner(current_state)

        # 2. Parse and Execute
        status = parse_and_act(raw_output, current_state)

        if status == "SUCCESS: Goal Complete":
            print("--- AGENT SUCCESS: Goal completed! ---")
            return

    # The action taken in the last allowed step may itself have completed the
    # goal; the in-loop check only runs at the start of a step, so check once more.
    if goal_reached():
        print(f"\n[üó∫Ô∏è Current State:] {current_state}")
        print("--- AGENT SUCCESS: Goal completed! ---")
        return

    print("--- AGENT FAILURE: Max steps reached before goal completion. ---")


# --- Execution Block ---
if __name__ == "__main__":

    # Start the coffee agent only after the health endpoint confirms the server is up.
    if check_api_availability(NEMOTRON_HEALTH_URL):
        run_coffee_agent_workflow()
    else:
        print("\nFATAL ERROR: Cannot start agent. VLLM server is unavailable.")

[‚úÖ API Check] Server responded with status code 200 OK at http://localhost:8000/v1/models. Proceeding.

--- STARTING COFFEE AGENT WORKFLOW for: Make and serve a fresh cup of coffee and place it on the desk. ---

[üó∫Ô∏è Current State:] {'location': 'Kitchen', 'beans_ground': False, 'coffee_brewed': False, 'cup_held': False}
[API Call] Sending request to Nemotron (Timeout: 60s)...
[LLM Extracted: STAGE 1] Action found by prefix: GRIND_BEANS
[WM Update] Beans are now ground.

[üó∫Ô∏è Current State:] {'location': 'Kitchen', 'beans_ground': True, 'coffee_brewed': False, 'cup_held': False}
[API Call] Sending request to Nemotron (Timeout: 60s)...
[LLM Extracted: STAGE 1] Action found by prefix: BREW_COFFEE
[WM Update] Coffee is now brewed.

[üó∫Ô∏è Current State:] {'location': 'Kitchen', 'beans_ground': True, 'coffee_brewed': True, 'cup_held': False}
[API Call] Sending request to Nemotron (Timeout: 60s)...
[LLM Extracted: STAGE 1] Action found by prefix: PICK_UP_CUP
[WM Update] Cup is n