In [None]:
import os
import sys
import threading
import argparse
from datetime import datetime
from pathlib import Path
import shutil
import json
import requests

import pandas as pd
import datasets
from tqdm import tqdm
from dotenv import load_dotenv

from huggingface_hub import login, snapshot_download

from tools.reformulator import prepare_response
from tools.run_agents import get_single_file_description, get_zip_description
from tools.text_inspector_tool import TextInspectorTool
from tools.text_web_browser import (
    ArchiveSearchTool,
    FinderTool,
    FindNextTool,
    PageDownTool,
    PageUpTool,
    SimpleTextBrowser,
    VisitTool,
)
from tools.visual_qa import VisualQATool
from tools.agent_with_tools import create_plan_and_execute_agent, PlanExecute

from smolagents import (
    CodeAgent,
    GoogleSearchTool,  # make sure this is distinct from Serp/BSerp tools
    LiteLLMModel,
    Model,
    ToolCallingAgent,
)

from langchain.chat_models import init_chat_model, ChatOpenAI
from langchain.agents import initialize_agent, AgentType
from langchain.tools import tool
from langchain_community.utilities import GoogleSerperAPIWrapper, SerpAPIWrapper
from langchain_community.tools import BraveSearch
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.tools.ddg_search.tool import DuckDuckGoSearchResults
from langgraph.prebuilt import create_react_agent

# Setup: load environment variables via python-dotenv, then log in to the Hugging Face Hub.
# load_dotenv() must run before os.getenv("HF_TOKEN") so a token defined in a .env file is visible.
load_dotenv()
login(os.getenv("HF_TOKEN"))

# Module-level lock that append_answer() holds while it appends a record to the results file.
append_answer_lock = threading.Lock()


In [None]:
# Smoke test: run the plan-and-execute agent once on a single hand-written question.

question = "when is aymar de bergeyck born?"

# Passed to graph.invoke() as its config.
# NOTE(review): "recursion_limit" presumably caps the number of graph steps — confirm against LangGraph docs.
config = {"recursion_limit": 50}
# Every role (planner, executor, replanner, answer) uses "gpt-4.1-mini" in this test cell.
graph = create_plan_and_execute_agent(
    llm_name_planner="gpt-4.1-mini",
    llm_name_executor="gpt-4.1-mini",
    llm_name_replanner="gpt-4.1-mini",
    llm_name_answer="gpt-4.1-mini")

# Starting state: only the question is filled in; all other fields start empty / zero / None / False.
initial_state = PlanExecute(
    question=question,
    plan=[],
    intermediate_responses=[],
    response="",
    current_step=0,
    error_count=0,
    validation=None,
    agent_finished=False
)

# Bind `workflow` to the initial state first so the name exists even if invoke() raises;
# on success it is rebound to whatever graph.invoke() returns.
workflow = initial_state
workflow =  graph.invoke(initial_state, config=config)


In [None]:
# Prints the whole object bound to `workflow` by the previous cell (the value returned by
# graph.invoke), not only its "response" entry.
print("Final Response:", workflow)

In [None]:

def load_gaia_dataset(use_raw_dataset: bool, set_to_run: str) -> datasets.Dataset:
    """Load one split of the GAIA benchmark from ``data/gaia``, downloading it first if absent.

    Args:
        use_raw_dataset: When ``data/gaia`` does not exist yet, selects which Hub dataset
            repository is downloaded into it: ``gaia-benchmark/GAIA`` if true, otherwise
            ``smolagents/GAIA-annotated``. Ignored when the directory already exists.
        set_to_run: Split name handed to ``datasets.load_dataset`` and also used as the
            sub-directory when building attachment paths.

    Returns:
        The requested split with ``Question``/``Final answer``/``Level`` renamed to
        ``question``/``true_answer``/``task`` and every non-empty ``file_name`` prefixed
        with ``data/gaia/<set_to_run>/``.
    """
    gaia_dir = "data/gaia"

    if not os.path.exists(gaia_dir):
        print('gaia dataset path doesnt exist')
        if use_raw_dataset:
            print('in use raw dataset')
            source_repo = "gaia-benchmark/GAIA"
        else:
            # WARNING: this dataset is gated: make sure you visit the repo to request access.
            source_repo = "smolagents/GAIA-annotated"
        snapshot_download(
            repo_id=source_repo,
            repo_type="dataset",
            local_dir=gaia_dir,
            ignore_patterns=[".gitattributes", "README.md"],
        )

    attachment_prefix = f"data/gaia/{set_to_run}/"

    def preprocess_file_paths(row):
        # Rows without an attachment carry an empty file_name and are left untouched.
        has_attachment = len(row["file_name"]) > 0
        if has_attachment:
            row["file_name"] = attachment_prefix + row["file_name"]
        return row

    column_renames = {"Question": "question", "Final answer": "true_answer", "Level": "task"}

    loaded_split = datasets.load_dataset(
        "data/gaia/GAIA.py",
        name="2023_all",
        split=set_to_run,
    )
    renamed_split = loaded_split.rename_columns(column_renames)
    return renamed_split.map(preprocess_file_paths)

def append_answer(entry: dict, jsonl_file: str) -> None:
    """Append ``entry`` to ``jsonl_file`` as a single JSON line.

    Missing parent directories are created. The write happens while holding the
    module-level ``append_answer_lock``. After writing, the file's existence is
    asserted and its resolved path is printed.

    Args:
        entry: JSON-serialisable record to store.
        jsonl_file: Path of the JSON-lines file to append to.
    """
    jsonl_path = Path(jsonl_file)
    jsonl_path.parent.mkdir(parents=True, exist_ok=True)

    # Take the lock first, then open the file; the record is serialised only once the file is open.
    with append_answer_lock:
        with open(jsonl_file, "a", encoding="utf-8") as sink:
            sink.write(f"{json.dumps(entry)}\n")

    assert jsonl_path.exists(), "File not found!"
    print("Answer exported to file:", jsonl_path.resolve())

def _format_plan_with_answers(plan: list, intermediate_responses: list) -> str:
    """Render each intermediate response next to the plan step at the same index.

    Output format per entry: ``"<i>. INSTRUCTION: <step> ANSWER: <response>\\n"``, entries
    joined with ``"\\n"``. If there are more responses than plan steps, the missing
    instruction is replaced by a placeholder instead of raising ``IndexError``.
    """
    rendered = []
    for i, answer in enumerate(intermediate_responses):
        # Guard against a plan that is shorter than the list of collected responses.
        instruction = plan[i] if i < len(plan) else "<no matching plan step>"
        rendered.append(f"{i}. INSTRUCTION: {instruction} ANSWER: {answer}\n")
    return "\n".join(rendered)


def answer_single_question(
    example: dict, answers_file: str
) -> None:
    """Run the plan-and-execute agent on one example and append the result to ``answers_file``.

    If the example has an attached file, a description of that file (or of the files inside
    a zip) is appended to the question before it is handed to the agent. Any exception raised
    while the agent runs is caught, printed and stored in the record's ``agent_error`` field;
    in that case the remaining fields come from the untouched initial state.

    Args:
        example: Dataset row; reads ``question``, ``file_name``, ``task``, ``task_id`` and
            ``true_answer``.
        answers_file: JSON-lines file that receives the annotated record via ``append_answer``.
    """
    print('EXAMPLE', example)
    question = example["question"]

    # Tools used only to describe attached files before the agent runs.
    document_inspection_tool = TextInspectorTool(
        model=init_chat_model("gpt-4.1-mini", model_provider="openai", temperature=0),
        text_limit=10000,
    )
    visual_inspection_tool = VisualQATool(
        model=init_chat_model("gpt-4.1-mini", model_provider="openai", temperature=0)
    )

    attachment = example["file_name"]
    if attachment:
        if ".zip" in attachment:
            prompt_use_files = "\n\nTo solve the task above, you will have to use these attached files:\n"
            prompt_use_files += get_zip_description(
                attachment, None, visual_inspection_tool, document_inspection_tool
            )
        else:
            prompt_use_files = "\n\nTo solve the task above, you will have to use this attached file:\n"
            prompt_use_files += get_single_file_description(
                attachment, None, visual_inspection_tool, document_inspection_tool
            )

        question += prompt_use_files

    # CREATE AGENT
    config = {"recursion_limit": 50}
    graph = create_plan_and_execute_agent(
        llm_name_planner="gpt-4.1",
        llm_name_executor="gpt-4.1",
        llm_name_replanner="gpt-4.1",
        llm_name_answer="gpt-4.1",
        llm_text_inspector="gpt-4.1",
        llm_visual_qa="gpt-4.1",
    )

    initial_state = PlanExecute(
        question=question,
        plan=[],
        intermediate_responses=[],
        response="",
        current_step=0,
        error_count=0,
        validation=None,
        agent_finished=False
    )

    start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Fall back to the initial state so a record can still be written when the run fails.
    workflow = initial_state
    agent_error = None
    try:
        # Run Agent
        workflow = graph.invoke(initial_state, config=config)
    except Exception as e:
        # Deliberate best-effort: one failing question must not abort the whole evaluation.
        print("Error on", question, e)
        agent_error = str(e)

    end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    annotated_example = {
        "question": example["question"],
        "augmented_question": question,
        "response": workflow["response"].replace('FINAL ANSWER: ', ''),
        "plan": workflow["plan"],
        "intermediate_responses": workflow["intermediate_responses"],
        "current_step": workflow["current_step"],
        "plan_with_answers": _format_plan_with_answers(
            workflow["plan"], workflow["intermediate_responses"]
        ),
        "agent_error": agent_error,
        "task": example["task"],
        "task_id": example["task_id"],
        "true_answer": example["true_answer"],
        "start_time": start_time,
        "end_time": end_time,
    }

    print('-------------------------------------------------------------')
    for k in ('question', 'augmented_question', 'plan_with_answers', 'response', 'true_answer'):
        print(k)
        print(annotated_example[k])
    print('-------------------------------------------------------------')

    append_answer(annotated_example, answers_file)


def get_examples_to_answer(answers_file: str, eval_ds: datasets.Dataset) -> list[dict]:
    """Return the examples of ``eval_ds`` that still need to be answered.

    Previously answered questions are read from the ``question`` column of the JSON-lines
    file ``answers_file``. If that file cannot be read for any reason, the error is printed
    and nothing is treated as done.

    Note that only examples with a non-empty ``file_name`` (i.e. with an attached file)
    are returned; examples without an attachment are always skipped.

    Args:
        answers_file: Path to the JSON-lines file holding earlier results.
        eval_ds: Dataset whose rows provide ``question`` and ``file_name``.

    Returns:
        Rows of ``eval_ds`` as dicts, in dataset order, that are unanswered and have an attachment.
    """
    print(f"Loading answers from {answers_file}...")
    try:
        done_questions = pd.read_json(answers_file, lines=True)["question"].tolist()
        print(f"Found {len(done_questions)} previous results!")
    except Exception as e:
        # Best-effort: a missing or unreadable file simply means starting from scratch.
        print("Error when loading records: ", e)
        print("No usable records! ▶️ Starting new.")
        done_questions = []

    # Set lookup keeps the filter linear instead of scanning the list once per dataset row.
    already_answered = set(done_questions)
    return [
        line
        for line in eval_ds.to_list()
        if line["question"] not in already_answered and line["file_name"]
    ]




In [None]:
# Run configuration for this evaluation pass.
run_name = "test3"
set_to_run = "validation"
use_raw_dataset = True
levels_to_run = [1]
eval_ds = load_gaia_dataset(use_raw_dataset, set_to_run)

print("Loaded evaluation dataset:")
print(pd.DataFrame(eval_ds)["task"].value_counts())

answers_file = f"output/{set_to_run}/{run_name}.jsonl"
tasks_to_run = get_examples_to_answer(answers_file, eval_ds)

# Filter by level before looping so the progress bar total and ETA reflect only the
# tasks that are actually executed, not the ones that would be skipped.
selected_tasks = [task for task in tasks_to_run if int(task["task"]) in levels_to_run]

for example in tqdm(selected_tasks, desc="Processing tasks"):
    answer_single_question(example, answers_file)

print("All tasks processed.")


