In [1]:
import os

import hashlib
import shutil
from pathlib import Path
import time
import logging


from ldp.agent import AgentConfig
from ldp.alg.rollout import RolloutManager
from ldp.data_structures import Trajectory, Transition

from fhda.data_analysis_env import DataAnalysisEnv
from fhda.notebook_env import NBEnvironment
from fhda.utils import NBLanguage
from fhda import prompts
import fhda.config as cfg

from dotenv import load_dotenv 
load_dotenv()

True

In [2]:
def setup_data_analysis_env(
    query: str, dataset: Path, language: NBLanguage = NBLanguage.PYTHON
) -> DataAnalysisEnv:
    """Stage a dataset in a fresh working directory and build its environment.

    A directory named ``<sha256(query)>-<timestamp>`` is created under
    ``tmp_results_dir`` (resolved against the current working directory), the
    dataset is copied into it, and the query is wrapped with the prompt
    templates from ``fhda.prompts`` before being passed to ``DataAnalysisEnv``.

    Args:
        query: The user's question; also hashed to form the problem id.
        dataset: A single file, or a directory whose files and sub-directories
            are copied into the working directory.
        language: Notebook language; R additionally appends
            ``prompts.R_SPECIFIC_GUIDELINES`` to the task text.

    Returns:
        A ``DataAnalysisEnv`` whose ``work_dir`` is the new directory and whose
        ``nb_path`` points at ``NBEnvironment.NOTEBOOK_NAME`` inside it.

    Raises:
        FileNotFoundError: If ``dataset`` does not exist.
    """
    dataset = Path(dataset)
    # Fail fast, before anything is created on disk, so a mistyped path does
    # not leave an empty orphan directory behind in tmp_results_dir.
    if not dataset.exists():
        raise FileNotFoundError(f"Dataset path does not exist: {dataset}")

    # Hash the task to get an identifier; the timestamp suffix keeps repeated
    # runs of the same query in separate directories.
    task_hash = hashlib.sha256(query.encode()).hexdigest()
    trajectory_path = (
        Path(os.path.abspath("tmp_results_dir")) / f"{task_hash}-{time.time()}"
    )
    trajectory_path.mkdir(parents=True, exist_ok=True)
    nb_path = trajectory_path / NBEnvironment.NOTEBOOK_NAME

    # Copy task data to trajectory path
    if dataset.is_dir():
        for item in dataset.iterdir():
            if item.is_file():
                shutil.copy2(item, trajectory_path)
            elif item.is_dir():
                shutil.copytree(item, trajectory_path / item.name, dirs_exist_ok=True)
    else:
        shutil.copy2(dataset, trajectory_path)

    # Augment incoming task with CoT instructions.
    # The indentation inside the template is part of the prompt text that is
    # sent on, so it is deliberately left exactly as written (no dedent).
    augmented_task = f"""\
    Here is the user query to address:


    <query>
    {query}
    </query>

    {prompts.CHAIN_OF_THOUGHT_AGNOSTIC.format(language=language.name)}
    {prompts.GENERAL_NOTEBOOK_GUIDELINES.format(language=language.name)}"""

    if language == NBLanguage.R:
        augmented_task += f"\n{prompts.R_SPECIFIC_GUIDELINES}"

    return DataAnalysisEnv(
        problem_id=f"data-analysis-task-{task_hash}",
        problem=augmented_task,
        eval_mode=None,
        nb_path=nb_path,
        work_dir=trajectory_path,
        language=language,
        system_prompt=prompts.CAPSULE_SYSTEM_PROMPT_QUERY,
        use_tmp_work_dir=False,
    )

In [10]:
# ENVIRONMENT CONFIGURATION
# ANTHROPIC_API_KEY and OPENAI_API_KEY handled by dotenv
# Module-level flag on fhda.config, switched off here.
# NOTE(review): presumably this makes the notebook kernel run locally instead
# of in a container — confirm against fhda.config.
cfg.USE_DOCKER = False
# Notebook language handed to setup_data_analysis_env in the rollout cell.
LANGUAGE = NBLanguage.PYTHON
# Passed as max_steps to RolloutManager.sample_trajectories.
MAX_STEPS = 30
# Used as the "name" entry of the agent's llm_model settings; the commented
# line is an alternative model kept for quick switching.
# MODEL_NAME = "claude-sonnet-4-20250514"
MODEL_NAME = "gpt-4.1-2025-04-14"

In [11]:
# NOTE(review): looks like a leftover scratch cell; nothing in the visible
# cells depends on it — consider deleting before sharing.
print('hello')

hello


In [12]:
# AVIARY ROLLOUT
logger = logging.getLogger(__name__)
logger.info("Setting up data analysis environment")

dataset = Path("cohort-data.csv")
query = """The attached dataset contains information about participants in a longitudinal study. 
Examine how educational attainment and income level relate to mental health outcomes using linear regression anlayses adjusted for sex.
"""
environment = setup_data_analysis_env(query, dataset, LANGUAGE)

agent = AgentConfig(
    agent_type="ReActAgent",
    agent_kwargs={
        "llm_model": {
            "parallel_tool_calls": False,
            "num_retries": 3,
            "temperature": 1.0,
            "name": MODEL_NAME,
        },
        "hide_old_env_states": True,
    },
)

agent = agent.construct_agent()
rollout = RolloutManager(agent=agent)

# You can see the notebook updating live in the tmp_results_dir folder
result = await rollout.sample_trajectories(
    environments=[environment], max_steps=MAX_STEPS
)

print("Trajectory completed! Final notebook available at: \n", environment.nb_path)
print(f"Final agent answer:\n{environment.state.answer}")



Trajectory completed! Final notebook available at: 
 /home/jovyan/agentic-systems-workspace/automated-epidemiology/tmp_results_dir/baf30796853c5f310e7075aecf8e99b94fb5594147a9ba1c819f69834256ad2f-1750703801.3971999/notebook.ipynb
Final agent answer:
The notebook is complete and fully addresses the user's query. It demonstrates, via linear regression adjusted for sex, that both higher education and higher income are independently associated with lower depression symptoms. All findings and diagnostics are documented, with clear supporting plots and tables.
