In [12]:
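# One-time environment fix: uncomment and run these lines, then restart the kernel,
# if importing transformers fails with "numpy.core.multiarray failed to import".
# That error appears when modules compiled against NumPy 1.x are loaded under NumPy 2.x.
# !pip uninstall numpy opencv-python opencv-python-headless -y
# !pip install numpy==1.24.4
# !pip install opencv-python-headless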

## Agent with GROQ


In [13]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file before importing the agent libraries,
# so anything they read from the environment at import time is already set.
load_dotenv()

from smolagents import CodeAgent
from smolagents.models import LiteLLMModel

# The Groq key is stored under GROK_TOKEN; LiteLLM's Groq provider reads
# GROQ_API_KEY. os.environ only accepts strings, so fail with a clear message
# here instead of a TypeError when the key is missing.
groq_token = os.getenv("GROK_TOKEN") or os.getenv("GROQ_API_KEY")
if not groq_token:
    raise RuntimeError(
        "Groq API key not found: set GROK_TOKEN (or GROQ_API_KEY) in the environment or in .env."
    )
os.environ["GROQ_API_KEY"] = groq_token

# Select the model with `model_id` only. Extra keyword arguments are forwarded
# to the completion call, so a second `model=` value would conflict with it.
model = LiteLLMModel(
    model_id="groq/llama3-70b-8192",
    api_base="https://api.groq.com/openai/v1",
    max_tokens=500,
)

# Required for Groq compatibility
# NOTE(review): this sets a private attribute; confirm it is still honoured by
# the installed smolagents version.
model._flatten_messages_as_text = False

## Getting the images

In [14]:
from PIL import Image
import requests
from io import BytesIO

image_urls = [
    "https://upload.wikimedia.org/wikipedia/commons/e/e8/The_Joker_at_Wax_Museum_Plus.jpg", # Joker image
    "https://upload.wikimedia.org/wikipedia/en/9/98/Joker_%28DC_Comics_character%29.jpg" # Joker image
]

# A browser-like User-Agent is sent with every request; it is constant, so it
# is built once instead of on each loop iteration.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}

# Download each URL and decode it into an RGB PIL image.
images = []
for url in image_urls:
    # The timeout keeps a stalled connection from hanging the notebook.
    response = requests.get(url, headers=headers, timeout=30)
    # Surface HTTP errors (403/404/...) here rather than letting PIL fail on
    # an HTML error page with a confusing "cannot identify image" message.
    response.raise_for_status()
    image = Image.open(BytesIO(response.content)).convert("RGB")
    images.append(image)

In [15]:
# Show the list of downloaded PIL images as this cell's output.
images

[<PIL.Image.Image image mode=RGB size=640x832>,
 <PIL.Image.Image image mode=RGB size=250x315>]

In [17]:
import torch
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration
from smolagents import CodeAgent
# smolagents ships its model classes in the single module `smolagents.models`,
# and the base class is named `Model`. It is aliased to `BaseModel` so code in
# this notebook that subclasses `BaseModel` keeps working.
from smolagents.models import Model as BaseModel
import os

# ----------------- Step 1: LLaVA Wrapper -----------------
class LLaVAWrapper:
    """Thin wrapper around a LLaVA checkpoint for single-image question answering.

    Loads the processor and model once in ``__init__`` and exposes ``ask`` to
    run one image + text prompt through the model.
    """

    # Placeholder that marks where the image features go in the prompt.
    IMAGE_TOKEN = "<image>"
    # Conversation layout used by the llava-1.5 checkpoints.
    # NOTE(review): other checkpoints may expect a different template - confirm
    # when passing a non-default ``model_id``.
    PROMPT_TEMPLATE = "USER: <image>\n{prompt} ASSISTANT:"

    def __init__(self, model_id="llava-hf/llava-1.5-7b-hf", device=None):
        """Load the processor and model.

        Args:
            model_id: Hugging Face model identifier to load.
            device: Device string; defaults to "cuda" when available, else "cpu".
        """
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.processor = AutoProcessor.from_pretrained(model_id)
        # Half precision on CUDA, full precision otherwise.
        self.model = LlavaForConditionalGeneration.from_pretrained(
            model_id, torch_dtype=torch.float16 if "cuda" in self.device else torch.float32
        ).to(self.device)

    def ask(self, image, prompt, max_tokens=300):
        """Answer ``prompt`` about ``image`` and return only the generated text.

        Args:
            image: Image passed to the processor (e.g. a PIL image).
            prompt: Question or instruction. If it does not already contain the
                ``<image>`` placeholder it is wrapped in ``PROMPT_TEMPLATE``.
            max_tokens: Maximum number of new tokens to generate.

        Returns:
            The decoded answer with the prompt tokens removed and surrounding
            whitespace stripped.
        """
        # Without the image placeholder the model has nowhere to attach the
        # image features, so wrap bare questions in the chat template.
        if self.IMAGE_TOKEN not in prompt:
            prompt = self.PROMPT_TEMPLATE.format(prompt=prompt)

        # Move inputs to the model's device AND dtype; otherwise float32 pixel
        # values are fed to float16 weights when running on CUDA.
        inputs = self.processor(text=prompt, images=image, return_tensors="pt").to(
            self.device, self.model.dtype
        )

        # No gradients are needed for generation.
        with torch.inference_mode():
            output = self.model.generate(**inputs, max_new_tokens=max_tokens)

        # generate() returns prompt tokens followed by the new tokens; keep
        # only the newly generated part so the prompt is not echoed back.
        prompt_length = inputs["input_ids"].shape[-1]
        answer_ids = output[0][prompt_length:]
        return self.processor.decode(answer_ids, skip_special_tokens=True).strip()



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.4 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\ProgramData\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\ProgramData\anaconda3\Lib\site-packages\traitlets\config\application.py", line 992, in launch_instance
    app.start()
  File "C:\ProgramData\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 711, in start
    self.io_loop.start()
  File "C:\ProgramData\anaconda3\Lib\site-packa

AttributeError: _ARRAY_API not found

RuntimeError: Failed to import transformers.models.auto.processing_auto because of the following error (look up to see its traceback):
numpy.core.multiarray failed to import

In [None]:
# ----------------- Step 2: Custom Model for smolagents -----------------
from smolagents.models import ChatMessage


class LocalVisionModel(BaseModel):
    """smolagents model backed by the local ``LLaVAWrapper``.

    Each image found in the call is answered separately against the text of
    the latest message, and the answers are joined into one assistant message.
    """

    def __init__(self):
        # Let the smolagents base class set up its own state first.
        super().__init__()
        self.vlm = LLaVAWrapper()

    @staticmethod
    def _content_of(message):
        """Return the content of a message given as a dict or as an object."""
        if isinstance(message, dict):
            return message.get("content")
        return getattr(message, "content", None)

    @classmethod
    def _split_content(cls, message):
        """Split one message into ``(text, images)``.

        Content may be a plain string or a list of parts such as
        ``{"type": "text", "text": ...}`` and ``{"type": "image", "image": ...}``.
        """
        content = cls._content_of(message)
        if content is None:
            return "", []
        if isinstance(content, str):
            return content, []

        texts, found_images = [], []
        for part in content:
            if not isinstance(part, dict):
                texts.append(str(part))
            elif part.get("type") == "text":
                texts.append(part.get("text", ""))
            elif part.get("type") == "image" and part.get("image") is not None:
                found_images.append(part["image"])
        return "\n".join(texts), found_images

    def generate(self, messages, **kwargs):
        """Run the vision model on the latest prompt and every available image.

        Args:
            messages: Conversation so far; the last entry supplies the prompt.
            **kwargs: May contain ``images``. When absent, images embedded in
                the message contents are used instead. Other keys are ignored.

        Returns:
            A ``ChatMessage`` with role "assistant" whose content is the
            per-image answers separated by blank lines.

        Raises:
            ValueError: If ``messages`` is empty or no image can be found.
        """
        if not messages:
            raise ValueError("No messages provided for vision model.")

        # Get the latest prompt
        prompt, _ = self._split_content(messages[-1])

        # Prefer images passed explicitly; otherwise collect the ones the
        # agent embedded inside the message contents.
        images = list(kwargs.get("images") or [])
        if not images:
            for message in messages:
                images.extend(self._split_content(message)[1])

        if not images:
            raise ValueError("No images provided for vision model.")

        responses = [self.vlm.ask(img, prompt) for img in images]
        full_response = "\n\n".join(responses)
        # The agent reads `.content` from the reply, so return a ChatMessage
        # rather than a bare dict.
        return ChatMessage(role="assistant", content=full_response)

    def __call__(self, messages, **kwargs):
        """Same as ``generate``; kept so the model can also be called directly."""
        return self.generate(messages, **kwargs)


In [None]:
# ----------------- Step 3: Instantiate the Agent -----------------
# Load images as PIL objects
from io import BytesIO
import requests

image_urls = [
    "https://upload.wikimedia.org/wikipedia/commons/e/e8/The_Joker_at_Wax_Museum_Plus.jpg",
    "https://upload.wikimedia.org/wikipedia/en/9/98/Joker_%28DC_Comics_character%29.jpg"
]

# Constant request headers, built once for all downloads.
headers = {
    "User-Agent": "Mozilla/5.0"
}

images = []
for url in image_urls:
    # The timeout keeps a stalled connection from hanging the notebook.
    response = requests.get(url, headers=headers, timeout=30)
    # Fail on HTTP errors here instead of letting PIL choke on an error page.
    response.raise_for_status()
    img = Image.open(BytesIO(response.content)).convert("RGB")
    images.append(img)


In [None]:
# Build the agent on top of the local vision model.
model = LocalVisionModel()

# Agent configuration: no tools, one step, verbosity level 2.
agent_settings = {
    "tools": [],
    "model": model,
    "max_steps": 1,
    "verbosity_level": 2,
}
agent = CodeAgent(**agent_settings)

# ----------------- Step 4: Run Agent -----------------
# Task text handed to the agent together with the downloaded images.
task_prompt = """
    Describe the costume and makeup that the comic character in these photos is wearing and return the description.
    Tell me if the guest is The Joker or Wonder Woman.
    """
response = agent.run(task_prompt, images=images)

print("\nResponse from Agent:\n", response)