In [None]:
# Should be first to load all the env vars for browser
import os
from dotenv import load_dotenv

load_dotenv()

# Fail fast with an actionable message instead of a bare AssertionError.
# The variable is presumably consumed by the `browser` import below — confirm.
assert os.getenv("CHROME_USER_DATA_DIR"), (
    "CHROME_USER_DATA_DIR is not set; define it in the environment or in .env "
    "before importing `browser`"
)

# Needed to use the static/ default folder with the cache
from browser import BRAVE_SEARCH_SESSION

# Displayed as the cell output to show which Chrome profile directory is in use.
BRAVE_SEARCH_SESSION.browser.chrome_user_data_dir

In [None]:
from typing import TypedDict

from generalist.agents.core import AgentPlan
from generalist.tools.data_model import ContentResource, ShortAnswer, Task

class ExecutionState(TypedDict):
    """State dictionary passed between the nodes of the workflow graph.

    `init_state` creates it with only `ask`, `context`, `answers` and
    `resources` populated (`task` and `step` start as None); `set_task`,
    `plan_next_step` and `execute` fill in and update the remaining keys.
    """

    # What the user is asking to be done for them.
    ask: str
    # The original question/task, the objective it was translated into, and
    # the plan for getting an answer. Populated by `set_task`.
    task: Task
    # Order index of the step of the task's plan that is being executed.
    # Set to 0 by `set_task` and incremented by `execute`.
    step: int
    # Clues, findings and answers to the previous subtasks.
    # Used to execute a capability plan step given already found information.
    context: str
    # Capability plan for this task (overwritten when a new subtask from the
    # main plan is picked up). Written by `plan_next_step`.
    capability_plan: AgentPlan
    # Capability plan step order.
    # NOTE(review): not read or written by any function visible in this notebook.
    capability_plan_step: int
    # Answers to subtasks; the last one should be the final answer to the task.
    answers: list[ShortAnswer]
    # All text resources that might be needed to execute the task.
    resources: list[ContentResource]
    # Tools that already got called.
    # TODO: see if this is needed (currently unused by the visible code).
    tools_called: str

# Step budget for one task: `evaluate_task_completion` ends the workflow once
# state["step"] reaches this value, whether or not an answer was found.
MAX_STEPS = 2

In [None]:
import json
from generalist.agents.core import AgentDeepWebSearch, AgentUnstructuredDataProcessor, \
    AgentCodeWriterExecutor, AgentAudioProcessor, AgentOutput
from generalist.tools.planning import determine_capabilities, create_plan
from generalist.tools.summarisers import construct_short_answer
from langgraph.graph import StateGraph, START, END


def init_state(ask: str, resources: list[ContentResource] | None = None) -> ExecutionState:
    """Build the state the workflow starts from.

    Args:
        ask: The user's request, stored verbatim under "ask".
        resources: Optional resources supplied up front; a falsy value
            (None or an empty list) is replaced by a new empty list.

    Returns:
        An ExecutionState with no task and no step yet, an empty context and
        an empty list of answers.
    """
    starting_resources = resources or []
    return ExecutionState(
        ask=ask,
        task=None,
        step=None,
        context="",
        answers=[],
        resources=starting_resources,
    )

def set_task(state: ExecutionState) -> ExecutionState:
    """Turn the user's ask into a Task and reset the step counter.

    The response of `create_plan` is parsed as JSON and must contain the keys
    "objective" and "plan". An optional, truthy "resource" entry is converted
    into a user-provided ContentResource and appended to state["resources"].

    Args:
        state: Current execution state; mutated in place.

    Returns:
        The same state object with "task" populated and "step" set to 0.
    """
    ask = state["ask"]
    plan_payload = json.loads(create_plan(ask))

    planned_task = Task(
        question=ask,
        objective=plan_payload["objective"],
        plan=plan_payload["plan"],
    )

    # The planner may point at a resource mentioned in the ask itself.
    resource_spec = plan_payload.get("resource")
    if resource_spec:
        mentioned_resource = ContentResource(
            provided_by="user",
            content=resource_spec.get("content"),
            link=resource_spec.get("link"),
            metadata={},
        )
        state["resources"].append(mentioned_resource)

    state["task"] = planned_task
    state["step"] = 0
    return state

def evaluate_task_completion(state: ExecutionState) -> str:
    """Decide whether the workflow stops or plans another step.

    Used as the routing function of the conditional edge leaving "execute".

    Args:
        state: Current execution state; only read, never modified.

    Returns:
        "end" when the step budget (MAX_STEPS) is exhausted or when the short
        answer built from the accumulated context is marked as answered,
        otherwise "continue".
    """
    # Check the step budget first: it is a plain integer comparison, and once
    # it is exhausted the route is "end" no matter what the answer check would
    # say, so the (presumably costly) answer construction can be skipped.
    if state["step"] >= MAX_STEPS:
        return "end"

    short_answer = construct_short_answer(
        state["task"].objective,
        state["context"],
    )

    # Early stopping if an answer exists.
    return "end" if short_answer.answered else "continue"

def plan_next_step(state: ExecutionState) -> ExecutionState:
    """Choose the capabilities for the next step from the task and context.

    Args:
        state: Current execution state; mutated in place.

    Returns:
        The same state object with "capability_plan" replaced by the plan
        parsed from the output of `determine_capabilities`.
    """
    plan_json = determine_capabilities(task=state["task"], context=state["context"])
    state["capability_plan"] = AgentPlan.from_json(plan_json)
    return state

def execute(state: ExecutionState) -> ExecutionState:
    """Run the first entry of the current capability plan and record its results.

    Args:
        state: Current execution state; mutated in place.

    Returns:
        The same state object with new answers and resources appended, the
        context extended by one line and "step" incremented by one.
    """
    activity, capability = state["capability_plan"].subplan[0]
    # Placeholder result, kept as-is when the capability is not recognised.
    output = AgentOutput(activity)
    capability_agent = capability(activity)

    # Agents that are run against the resources gathered so far.
    resource_driven_agents = (
        AgentUnstructuredDataProcessor,
        AgentCodeWriterExecutor,
        AgentAudioProcessor,
    )
    if capability is AgentDeepWebSearch:
        output = capability_agent.run()
    elif any(capability is agent_cls for agent_cls in resource_driven_agents):
        output = capability_agent.run(state["resources"])
    else:
        print("DEBUG | run_capability | Call to unidentified agent: ", capability)

    produced_answers = output.answers
    if produced_answers:
        state["answers"].extend(produced_answers)
    produced_resources = output.resources
    if produced_resources:
        state["resources"].extend(produced_resources)

    # Update context with step results.
    # NOTE(review): this formats the cumulative answers list, so answers from
    # earlier steps are repeated on every new context line — confirm intended.
    step_summary = f"\nStep {capability_agent.name}: {state['answers']}"
    state["context"] += step_summary
    state["step"] += 1

    return state


# Assemble the agent loop as a state graph over ExecutionState:
#   START -> set_task -> plan_next_step -> execute -> (plan_next_step | END)
workflow = StateGraph(state_schema=ExecutionState)

# One node per stage of the loop.
workflow.add_node("set_task", set_task)
workflow.add_node("plan_next_step", plan_next_step)
workflow.add_node("execute", execute)

# Fixed transitions up to the execution of a step.
workflow.add_edge(START, "set_task")
workflow.add_edge("set_task", "plan_next_step")
workflow.add_edge("plan_next_step", "execute")
# After each executed step, evaluate_task_completion returns "continue" or
# "end"; the mapping below turns that label into the next node.
workflow.add_conditional_edges(
    "execute",
    evaluate_task_completion,
    {
        "continue": "plan_next_step",
        "end": END,
    }
)

generalist_graph = workflow.compile()

# Render the compiled graph as an image in the notebook output.
# NOTE(review): draw_mermaid_png may need network access depending on the
# rendering method in use — confirm before relying on it offline.
from IPython.display import Image, display
display(Image(generalist_graph.get_graph().draw_mermaid_png()))

In [None]:
import os
import logging
from dotenv import load_dotenv

import mlflow
from huggingface_hub import snapshot_download
from datasets import load_dataset


# Set the root logger level to INFO and (re)load variables from .env.
logging.getLogger().setLevel(logging.INFO)
load_dotenv()


# Local folder for the GAIA benchmark files; None when the variable is unset.
gaia_path = os.environ.get("HUGGING_FACE_GAIA_FOLDER_PATH")
# NOTE(review): local_files_only=True presumably resolves the snapshot from
# files already on disk without downloading — confirm against huggingface_hub.
data_dir = snapshot_download(local_dir=gaia_path,  local_files_only=True, repo_id="gaia-benchmark/GAIA", repo_type="dataset")

# "2023_level1" configuration, validation split.
dataset = load_dataset(data_dir, "2023_level1", split="validation")
# Sample fields that are printed for every evaluated task.
gaia_keys = ['task_id', 'Question', 'Level', 'Final answer', 'file_name', 'file_path', 'Annotator Metadata']

# Named GAIA task ids, so that evaluation_tasks can list tasks by a readable
# name; trailing comments note the capabilities each task is expected to need.
sosa_many_studio_albums_task_id = "8e867cd7-cff9-4e6c-867a-ff5ddc2550be" # web search
running_to_the_moon_task_id = "e1fc63a2-da7a-432f-be78-7c4a95598703"  # web search + calculating
dr_who_season_9_eps_11_location_task_id = "4b6bb5f7-f634-410e-815d-e673ab7f8632" # web search and pdf reading
calc_sales_xlsx_task_id = "7bd855d8-463d-4ed5-93ca-5fe35145f733"
just_running_python_task_id = "f918266a-b3e0-4914-865d-4faa564f1aef"
looking_up_paper_authors_task_id = "46719c30-f4c3-4cad-be07-d5cb21eee6bb" # web search + possibly pdf reading from online
dinosaur_article_task_id = "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8" # web search
merriam_webster_word_task_id = "5188369a-3bbe-43d8-8b94-11558f909a08" # web search
reading_txt_and_solving_puzzle_task_id = "389793a7-ca17-4e82-81cb-2b3a2391b4b9"
teal_video_task_id = "9d191bce-651d-4746-be2d-7ef8ecadb9c2"
very_specific_web_search_download_task_id = "cabe07ed-9eca-40ea-8ead-410ef5e83f91"
listening_to_recipe_task_id = "99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3" # audio processing
scikit_learn_change_log_task_id = "d0633230-7067-47a9-9dbf-ee11e0a2cdd6" # web search + code processing
polish_raymond_task_id = "305ac316-eef6-4446-960a-92d80d542f82"
baseball_table_processing_task_id = "3f57289b-8c60-48be-bd80-01f8099ca449" # web search + online table processing
audio_processing_homework_task_id = "1f975693-876d-457b-a649-393859e79bf3"
mulitstep_browser_search_on_a_website_task_id = "a0068077-79f4-461a-adfe-75c1a4148545"
kuznetsov_paper_st_petersburg_task_id = "bda648d7-d618-4883-88f4-3466eabd860e" # web search + pdf analysis
country_with_list_medals_task_id = "cf106601-ab4f-4af9-b045-5295fe67b37d" # web search + small table processing
country_no_longer_exists_web_search_task_id = "5a0c1adf-205e-4841-a666-7c3ef95def9d"

# Task ids to run in this session; uncomment entries to enable them.
# With every entry commented out the list is empty and nothing is evaluated.
evaluation_tasks = [
    # just_running_python_task_id,
    # sosa_many_studio_albums_task_id,
    # calc_sales_xlsx_task_id,
    # running_to_the_moon_task_id,

    # merriam_webster_word_task_id,
    # looking_up_paper_authors_task_id,

    # baseball_table_processing_task_id,
    # NOTE(review): the name below is not among the ids defined in this cell;
    # uncommenting it as-is would raise NameError unless it is defined elsewhere.
    # just_browser_search_task_id
]

# Run every selected GAIA task through the graph and collect
# (sample, {"answers": ..., "experiment_url": ...}) tuples in `results`.
results = []
# Index the split by task id so each selected task is a direct lookup.
dataset_questions = {sample["task_id"]: sample for sample in dataset}
for sample_task_id in evaluation_tasks:
    sample = dataset_questions[sample_task_id]
    # Plain loop: the prints are side effects, no list needs to be built.
    for key in gaia_keys:
        print(key, "=", sample[key])

    mlflow.langchain.autolog()                                               # this is needed to register traces within the experiment
    # One experiment per task, named after the task id with "-" replaced by "_".
    # Built without nested double quotes inside an f-string, which is a
    # SyntaxError on Python versions before 3.12.
    experiment_name = "gaia_" + sample["task_id"].replace("-", "_")
    mlflow.set_experiment(experiment_name)
    mlflow.models.set_model(generalist_graph)
    logging.getLogger().setLevel(logging.INFO)

    # NOTE: despite the name, this holds whatever get_experiment_by_name
    # returns (the experiment record), not a URL string; the name and the
    # result key are kept so existing consumers of `results` keep working.
    experiment_url = mlflow.get_experiment_by_name(experiment_name)
    # mlflow.set_tracking_uri('http://localhost:5000')

    question = sample["Question"]
    resources = []
    if sample["file_path"]:
        # Attach the task's file so the agents can find it among the resources.
        resource = ContentResource(
            provided_by="user",
            content="file provided with the main task",
            # Reuse the GAIA folder resolved earlier in this cell.
            link=os.path.join(gaia_path, sample["file_path"]),
            metadata={"note":"the file is already in the list of available resources"}
        )
        print(resource.link)
        resources.append(resource)
    initial_state = init_state(question, resources=resources)
    final_state = generalist_graph.invoke(initial_state)
    answers = final_state["answers"]
    results.append((sample, {"answers":answers, "experiment_url": experiment_url}))

In [None]:
# Plain loop instead of a comprehension: as the last expression of a cell the
# comprehension would also echo a list of None values as the cell output.
for result in results:
    print(result, "\n")