In [1]:
from transformers import AutoProcessor, AutoModelForImageTextToText, AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import torch
import json

# ----------------------------
# Step 1: Load Vision Model (VLM)
# ----------------------------
model_id_vlm = "LiquidAI/LFM2-VL-450M"
model_vlm = AutoModelForImageTextToText.from_pretrained(
    model_id_vlm,
    device_map={"": "cpu"},
    torch_dtype="float32",
    trust_remote_code=True
)
processor_vlm = AutoProcessor.from_pretrained(model_id_vlm, trust_remote_code=True)

# Load test image
image_path = "data/test_mm_7.36.jpg"
image = Image.open(image_path).convert("RGB")

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Read the gauge and output only '<value> mm'."},
        ],
    },
]

# Run VLM
inputs = processor_vlm.apply_chat_template(
    conversation, add_generation_prompt=True, return_tensors="pt"
).to(model_vlm.device)

outputs = model_vlm.generate(**inputs, max_new_tokens=64)
decoded = processor_vlm.batch_decode(outputs, skip_special_tokens=True)[0]
reading_text = decoded.split()[-2]  # e.g. "7.36"
meter_reading = float(reading_text)

print(f"\n--- Meter Reading ---\n{meter_reading} mm")

# ----------------------------
# Step 2: Define functions
# ----------------------------
def turn_on_light(room: str):
    return {"status": f"Light in {room} turned ON"}

def turn_off_light(room: str):
    return {"status": f"Light in {room} turned OFF"}

tools_json = [
    {
        "name": "turn_on_light",
        "description": "Turn on the light in a specified room",
        "parameters": {"type": "object", "properties": {"room": {"type": "string"}}, "required": ["room"]}
    },
    {
        "name": "turn_off_light",
        "description": "Turn off the light in a specified room",
        "parameters": {"type": "object", "properties": {"room": {"type": "string"}}, "required": ["room"]}
    }
]

# ----------------------------
# Step 3: Load LLM
# ----------------------------
model_id_llm = "LiquidAI/LFM2-350M"
model_llm = AutoModelForCausalLM.from_pretrained(
    model_id_llm,
    device_map={"": "cpu"},
    torch_dtype="float32",
    trust_remote_code=True
)
tokenizer_llm = AutoTokenizer.from_pretrained(model_id_llm)

# ----------------------------
# Step 4: Build decision prompt
# ----------------------------
prompt = f"""
<|startoftext|><|im_start|>system
List of tools: <|tool_list_start|>{json.dumps(tools_json)}<|tool_list_end|><|im_end|>
<|im_start|>user
The meter reading is {meter_reading} mm.
If the reading is greater than 80 mm, call turn_off_light with room="living_room".
If it is 80 or less, do nothing.
<|im_end|>
<|im_start|>assistant
"""

# Generate LLM decision
input_ids = tokenizer_llm.apply_chat_template(
    [{"role": "user", "content": prompt}],
    add_generation_prompt=True,
    return_tensors="pt"
).to(model_llm.device)

output = model_llm.generate(
    input_ids,
    temperature=0.2,
    max_new_tokens=128,
)

decoded_llm = tokenizer_llm.decode(output[0], skip_special_tokens=True)
print("\n--- LLM Decision ---")
print(decoded_llm)

`torch_dtype` is deprecated! Use `dtype` instead!


Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

AttributeError: 'str' object has no attribute 'to'

In [None]:
# ----------------------------
# Step 5: Execute function if needed
# ----------------------------
if "turn_off_light" in decoded_llm:
    result = turn_off_light("living_room")
    print("\n--- Action Taken ---")
    print(result)
else:
    print("\n--- Action Taken ---\nNo action (reading safe).")
