In [1]:
from dotenv import load_dotenv
import os, httpx, json
import nest_asyncio
from typing import Optional

from imagelib.datasets.bids import BIDSTree, SelectBIDSDatasetInfo

from pydantic import BaseModel, Field
from bids import BIDSLayout
from pydantic_ai import Agent, RunContext
from pydantic_ai.models.openai import OpenAIModel
from pydantic_ai.providers.openai import OpenAIProvider
from openai import AsyncOpenAI

In [2]:
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
api_key
# nest_asyncio.apply()

'fa4ee01c1e764e0bbb79bd34c8dd8cdb'

In [3]:
nest_asyncio.apply()

In [4]:
client = AsyncOpenAI(
    base_url="https://apimd.mdanderson.edu/dig/llm/llama31-70b/v1/",
    # api_key="unused",
    default_headers={"Ocp-Apim-Subscription-Key": api_key, "Content-Type": "application/json"}
)
provider = OpenAIProvider(openai_client=client)
model = OpenAIModel("meta-llama/Llama-3.1-70B-Instruct", provider=provider)
# agent = Agent(model=model, system_prompt="You are a doctor at MD Anderson Cancer Center. You are tasked with diagnosing a patient with a rare form of cancer", model_settings={"temperature": 0.2})

# result = await agent.run(user_prompt="What is the meaning of cancer?")
# result

## Building a Pydantic-AI agent that interprets a BIDS dataset

In [5]:
# class BIDSDatasetInfo(BaseModel):
#     dataset_name: str = Field(description="The name of the dataset. This should be a human-readable name that describes the dataset.")
#     dataset_description: str = Field(description="A description of the dataset. This should include information about the data, its purpose, and any relevant details.")
#     analysis_description: str = Field(description="A description of the analysis performed on the dataset. This should include information about the methods used, the results obtained, and any relevant details.")
#     analysis_results: str = Field(description="The results of the analysis performed on the dataset. This should include any relevant metrics, visualizations, or other information that summarizes the results.")
#     detailed_summary: str = Field(description="A detailed summary of the dataset and analysis. This should include any relevant information that is not covered in the other fields, such as limitations, future work, or other details.")

class BIDSDatasetInfo(BaseModel):
    dataset_name: str = Field(description="The name of the dataset. This should be a human-readable name that describes the dataset.")
    n_files: int = Field(description="The number of files in the dataset. This should include all files in the dataset, including raw data, processed data, and metadata files.")
    
bids_agent_system_prompt = f"""You are an expert in the BIDS (Brain Imaging Data Structure) format.
You are tasked with answering questions about a BIDS dataset, including its structure, contents, and pipelines.
You are provided with a BIDS dataset root directory. Load the dataset, analyze the files, and provide a summary of the dataset.

Here is a gist of the structure -
- The root of the directory consists of the dataset description file (dataset_description.json) along with the raw data (within sub-xxx/ses-xxx directories).
- The dataset_description.json file contains the dataset name, description, and other relevant information.
- Within the derivatives directory, you will find the results of the data pipelines/analysis performed on the raw data.
- The dataset_description.json file within the derivatives directory contains the name and description of the analysis performed on the dataset.
- The analysis results are stored in the derivatives directory, which may include processed data, visualizations, or other relevant information.
- There could be relevant metadata files (e.g., README, CHANGELOG) that provide additional context about the dataset and analysis.
- There could statistics files (e.g., in csv format) that provide additional context about the dataset and analysis.

"""

bids_agent: Agent = Agent(
    model=model,
    system_prompt=bids_agent_system_prompt,
    result_type=BIDSDatasetInfo,
    deps_type=str,
    retries=3
)

@bids_agent.tool
def get_bids_files(ctx: RunContext[str]) -> list[str]:
    # Load the BIDS dataset and return all the files in the dataset
    layout: BIDSLayout = BIDSLayout(ctx.deps)
    files: list[str] = layout.get(return_type="file")
    return files

# @bids_agent.tool_plain
# def read_json(filepath: str) -> Optional[dict]:
#     # Read a JSON file and return its contents
#     if not filepath.endswith(".json"):
#         print(f"File {filepath} is not a JSON file.")
#         return None
#     try:
#         with open(filepath, "r") as f:
#             data = json.load(f)
#         return data
#     except Exception as e:
#         print(f"Error reading JSON file: {e}")
#         return None

In [6]:
bids_root: str = "/Users/cmokashi/data/bids_datasets/open_neuro/ds005596-1.1.1"

result = bids_agent.run_sync(
    user_prompt="Analyze the BIDS dataset and provide a detailed summary of the dataset and analysis.",
    deps=bids_root
)

In [7]:
result.all_messages()

[ModelRequest(parts=[SystemPromptPart(content='You are an expert in the BIDS (Brain Imaging Data Structure) format.\nYou are tasked with answering questions about a BIDS dataset, including its structure, contents, and pipelines.\nYou are provided with a BIDS dataset root directory. Load the dataset, analyze the files, and provide a summary of the dataset.\n\nHere is a gist of the structure -\n- The root of the directory consists of the dataset description file (dataset_description.json) along with the raw data (within sub-xxx/ses-xxx directories).\n- The dataset_description.json file contains the dataset name, description, and other relevant information.\n- Within the derivatives directory, you will find the results of the data pipelines/analysis performed on the raw data.\n- The dataset_description.json file within the derivatives directory contains the name and description of the analysis performed on the dataset.\n- The analysis results are stored in the derivatives directory, which

In [10]:
result.data.model_dump()

{'dataset_name': 'BIDS Dataset',
 'dataset_description': 'The BIDS dataset contains raw and processed brain imaging data.',
 'analysis_description': 'The analysis performed on the dataset includes data cleaning, preprocessing, and visualization.',
 'analysis_results': 'The results of the analysis include statistics and visualizations of the data.',
 'detailed_summary': 'The dataset and analysis provide insights into brain function and structure.'}