In [28]:
import os
from dotenv import load_dotenv
from datasets import load_dataset
from smolagents import CodeAgent, LiteLLMModel
import yaml
from tools.final_answer import FinalAnswerTool
#from tools.math_operations import MathOperationsTool
from tools.visit_webpage import VisitWebpageTool
from tools.web_search import DuckDuckGoSearchTool
from tools.wikipedia_search import WikipediaSearchTool
from tools.arxiv_search import ArxivSearchTool
from tools.rag_search import RAGSearchTool
from tools.code_execution import CodeExecutionTool
from tools.document_processing import DocumentProcessingTool
from tools.image_processing import ImageProcessingTool

In [29]:
# 2. Definir herramientas
visit_webpage = VisitWebpageTool()
internet_search = DuckDuckGoSearchTool(max_results=5)
#math_tools = MathOperationsTool()
final_answer = FinalAnswerTool()
wikipedia_search = WikipediaSearchTool(load_max_docs=2)
arxiv_search = ArxivSearchTool(load_max_docs=3)
rag_search = RAGSearchTool(persist_dir="rag_db")
code_execution = CodeExecutionTool()
document_processing = DocumentProcessingTool(temp_dir="doc_processing")
image_processing = ImageProcessingTool(output_dir="image_outputs")

In [31]:
tools = [    
    visit_webpage, 
    internet_search, 
    wikipedia_search, 
    arxiv_search,
    rag_search, 
    code_execution, 
    document_processing,
    image_processing,
    final_answer
]

additional_imports = [
    # Document processing
    "PyMuPDF",                              # PDF reading
    # HTTP & URLs
    "requests",                             # HTTP client for REST calls :contentReference[oaicite:0]{index=0}
    "urllib.parse",                         # URL parsing and construction :contentReference[oaicite:1]{index=1}

    # Data formats
    "json",                                 # JSON serialization/deserialization
    "csv",                                  # CSV reading/writing
    "xml.etree.ElementTree",                # XML parsing
    "bs4",                                  # BeautifulSoup for HTML parsing

    # Text processing
    "re",                                   # Regular expressions

    # File & OS
    "os",                                   # OS interactions (env vars, paths)
    "sys",                                  # Interpreter info
    "pathlib",                              # Object‑oriented filesystem paths
    "subprocess",                           # Safe subprocess calls

    # Computation
    "math",                                 # Advanced math functions
    "random",                               # Random sampling and shuffling

    # Date & time
    "datetime",                             # Date/time parsing and arithmetic

    # Data analysis
    "numpy",                                # Numerical arrays :contentReference[oaicite:2]{index=2}
    "pandas",                               # Tabular data manipulation

    # Imaging
    "PIL.Image",                            # Image loading/inspection

    # Logging
    "logging",                              # Structured debug/info logging    
]

In [32]:
# 3. Inicializar modelo Ollama local
local_model = LiteLLMModel(
    model_id="ollama/qwen2.5-coder:7b",    
    api_base="http://localhost:11434",
    temperature=0,
    max_tokens=2096,
    request_timeout=900,
)

In [37]:
# Cargar las plantillas de los prompts
with open("prompt.yaml", 'r') as stream:
    prompt_templates = yaml.safe_load(stream)

In [38]:
# 4. Crear agente con el modelo local
agent = CodeAgent(
    model=local_model,
    tools = tools, ## math_tools
    additional_authorized_imports = additional_imports,
    prompt_templates=prompt_templates,
    #max_steps=5,
    verbosity_level = 1,
)

In [35]:
dataset_validation = load_dataset(
    "gaia-benchmark/GAIA",
    '2023_all',    
    split="validation",
    token=os.getenv("HF_TOKEN"),    
    trust_remote_code=True,
)

In [4]:
dataset_test = load_dataset(
    "gaia-benchmark/GAIA",
    '2023_all',    
    split="test",
    token=os.getenv("HF_TOKEN"),
    trust_remote_code=True,
)

In [36]:
print("---- Test set ----")
indices = list(range(1))
split_dataset_validation = dataset_validation.select(indices)
for example in split_dataset_validation:
    print(example)  

---- Test set ----
{'task_id': 'c61d22de-5f6c-4958-a7f6-5e9707bd3466', 'Question': 'A paper about AI regulation that was originally submitted to arXiv.org in June 2022 shows a figure with three axes, where each axis has a label word at both ends. Which of these words is used to describe a type of society in a Physics and Society article submitted to arXiv.org on August 11, 2016?', 'Level': '2', 'Final answer': 'egalitarian', 'file_name': '', 'file_path': '', 'Annotator Metadata': {'Steps': '1. Go to arxiv.org and navigate to the Advanced Search page.\n2. Enter "AI regulation" in the search box and select "All fields" from the dropdown.\n3. Enter 2022-06-01 and 2022-07-01 into the date inputs, select "Submission date (original)", and submit the search.\n4. Go through the search results to find the article that has a figure with three axes and labels on each end of the axes, titled "Fairness in Agreement With European Values: An Interdisciplinary Perspective on AI Regulation".\n5. Note t

In [39]:
# 5. Ejecutar preguntas y recoger respuestas
predictions = []
for example in split_dataset_validation:
    q = example["Question"]
    print("Agent started running...")
    pred = agent.run(q)
    predictions.append(str(pred).strip())
    print("Agent finished running...")
    print(f"QUESTION: {q}\nANSWER: {pred}\n")
    



Agent started running...



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.



AgentGenerationError: Error in generating model output:
litellm.APIConnectionError: OllamaException - litellm.Timeout: Connection timed out after 900.0 seconds.

In [24]:
# 6. Calcular exact‑match accuracy
gold = [ex["Final answer"].strip() for ex in split_dataset_validation]
correct = sum(1 for p, g in zip(predictions, gold) if p == g)
acc = correct / len(gold)
print(f"Exact‑match accuracy on GAIA dev: {acc:.2%}")

Exact‑match accuracy on GAIA dev: 0.00%
