In [14]:
import os
import dotenv
from openai import OpenAI
dotenv.load_dotenv()


# Client for the OpenAI-compatible endpoint at ellm.nrp-nautilus.io. The API
# key is read from the NRP_API_KEY environment variable (a missing key raises
# KeyError here rather than failing later inside a request).
client = OpenAI(
    api_key=os.environ["NRP_API_KEY"],
    base_url="https://ellm.nrp-nautilus.io/v1",
)

# Sanity check for the LLM response: send one small chat request.
completion = client.chat.completions.create(
    model="gemma3",
    messages=[
        {"role": "system", "content": "Talk like a pirate."},
        {
            "role": "user",
            "content": "How do I check if a Python object is an instance of a class?",
        },
    ],
)

# Show the reply; without this the check silently discards the response and
# proves nothing about the model output.
print(completion.choices[0].message.content)

In [None]:
import json

# Tool (function-calling) schema passed to the LLM through the `tools=`
# argument of chat.completions.create. The "name" must match a key of
# `available_functions`, and the property names must match the keyword
# parameters of that Python function, because process_query dispatches with
# available_functions[name](**arguments).
tools = [
    {
        "type": "function",
        "function": {
            "name": "search_redbiom_metadata",
            "description": "Search metadata using NLP-based queries. Use 'where' clauses for filtering (e.g., 'where age_days < 30'). Use operators: & (and), | (or), - (not). Set categories=True to search for metadata column names instead of values.",
            "parameters": {
                "type": "object",
                "properties": {
                    # Free-text search expression; the only required argument.
                    "query": {
                        "type": "string",
                        "description": "The search query (e.g., 'beer' or 'where age_days < 30')"
                    },
                    # Optional switch; the Python function maps it to --categories.
                    "categories": {
                        "type": "boolean",
                        "description": "If True, search for metadata column names instead of values",
                        "default": False
                    }
                },
                "required": ["query"]
            }
        }
    }
]


In [19]:

def search_redbiom_metadata(query: str, categories: bool = False) -> str:
    """
    Build (but do not run) a ``redbiom search metadata`` shell command.

    Use 'where' clauses for filtering: 'where age_days < 30'
    Use operators: & (and), | (or), - (not)
    Set categories=True to search for metadata column names instead of values.

    Args:
        query: Search expression. It is shell-quoted when it contains
            characters the shell would interpret (spaces, ``<``, ``&``,
            ``|`` ...), so it always reaches redbiom as one argument.
        categories: When True, add the ``--categories`` flag.

    Returns:
        The command line as a single string, e.g.
        ``redbiom search metadata beer``.
    """
    # Local import keeps this cell runnable on its own.
    import shlex

    cmd = ["redbiom", "search", "metadata"]

    if categories:
        cmd.append("--categories")

    # Without quoting, "where age_days < 30" would be split into several
    # words and "<" would be treated as an input redirection by the shell.
    cmd.append(shlex.quote(query))

    return " ".join(cmd)

# Dispatch table: tool name reported by the LLM -> Python callable to run.
available_functions = dict(search_redbiom_metadata=search_redbiom_metadata)

In [None]:
def process_query(user_query: str) -> dict:
    """
    Answer a user query with at most one round of tool calls.

    Steps:
    1. Ask the LLM, offering it the module-level ``tools`` schema.
    2. If it requests tool calls, run each one through
       ``available_functions`` and append the result to the conversation.
    3. Ask the LLM again (without tools) for the final explanation.

    A tool call that cannot be executed (unknown tool name, unparsable or
    mismatching arguments) does not raise; an error string is sent back to
    the model as the tool result instead.

    Args:
        user_query: Natural-language request from the user.

    Returns:
        dict with keys ``explanation``, ``tool_used``, ``tool_args`` and
        ``tool_result``. The ``tool_*`` values describe the LAST tool call of
        the round and are all ``None`` when the model answered directly.

    Raises:
        KeyError: if ``NRP_API_KEY`` is missing from the environment.
    """

    # Initial message to the LLM
    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant that helps users search redbiom metadata. When you use tools, always explain your reasoning about why you chose that tool and how you constructed the query."
        },
        {
            "role": "user",
            "content": user_query
        }
    ]

    # First API call - LLM decides whether to use tools
    client = OpenAI(
        api_key=os.environ["NRP_API_KEY"],
        base_url="https://ellm.nrp-nautilus.io/v1"
    )

    response = client.chat.completions.create(
        model="qwen3",
        messages=messages,
        tools=tools,
        tool_choice="auto"
    )

    response_message = response.choices[0].message
    messages.append(response_message)

    # No tool was called, return the direct response
    if not response_message.tool_calls:
        return {
            "explanation": response_message.content,
            "tool_used": None,
            "tool_args": None,
            "tool_result": None
        }

    # Execute each tool call
    for tool_call in response_message.tool_calls:
        function_name = tool_call.function.name
        function_args = None
        handler = available_functions.get(function_name)

        if handler is None:
            # The model may invent a tool name; report it instead of crashing.
            function_response = f"Error: unknown tool '{function_name}'"
        else:
            try:
                # Some servers send an empty string for "no arguments".
                function_args = json.loads(tool_call.function.arguments or "{}")
                function_response = handler(**function_args)
            except ValueError as exc:  # includes json.JSONDecodeError
                function_response = (
                    f"Error: could not parse arguments for '{function_name}': {exc}"
                )
            except TypeError as exc:  # unexpected / missing keyword arguments
                function_response = (
                    f"Error: invalid arguments for '{function_name}': {exc}"
                )

        # Add the function response to messages
        messages.append({
            "role": "tool",
            "tool_call_id": tool_call.id,
            "name": function_name,
            "content": function_response
        })

    # Second API call - LLM generates explanation with results
    final_response = client.chat.completions.create(
        model="qwen3",
        messages=messages
    )

    return {
        "explanation": final_response.choices[0].message.content,
        "tool_used": function_name,
        "tool_args": function_args,
        "tool_result": function_response
    }


In [22]:
user_query = "Get all samples with beer in their metadata"

result = process_query(user_query)

# Banner + explanation, followed by one blank line.
print("=" * 60, "EXPLANATION:", "=" * 60, sep="\n")
print(result["explanation"], end="\n\n")

# Tool details are only shown when the model actually called a tool.
if result["tool_used"]:
    print("=" * 60, "TOOL DETAILS:", "=" * 60, sep="\n")
    print(f"Tool: {result['tool_used']}")
    print(f"Arguments: {result['tool_args']}")
    print()
    print("=" * 60, "COMMAND:", "=" * 60, sep="\n")
    # Generated command rendered as a fenced bash block, then a blank line.
    print("```bash", result["tool_result"], "```", sep="\n", end="\n\n")

EXPLANATION:


I used the `search_redbiom_metadata` tool with the query term `"beer"` because the user requested all samples containing "beer" in their metadata. This tool searches across **all metadata fields** (e.g., sample descriptions, environmental context, study titles, etc.) in the Redbiom database for the specified keyword. 

The query `beer` is case-insensitive by default in Redbiom, so it will match variations like "Beer," "BEER," or "beers." This ensures comprehensive results for any sample where "beer" appears in metadata (e.g., studies involving fermented beverages, dietary surveys mentioning beer consumption, or environmental samples from breweries).

**Result**:  
The tool returned matching sample IDs (not shown here due to simulation constraints), which would typically include identifiers like `sample-12345`, `brewery-soil-67890`, etc., depending on the database contents. To retrieve the actual sample data, a follow-up command like `redbiom fetch samples --from-ids [ids

In [None]:
import os
import dotenv
from openai import OpenAI
import json

dotenv.load_dotenv()

# Module-level client for the OpenAI-compatible endpoint; the key comes from
# the NRP_API_KEY environment variable.
client = OpenAI(
    base_url="https://ellm.nrp-nautilus.io/v1",
    api_key=os.environ["NRP_API_KEY"],
)

# Tool (function-calling) schemas passed to the LLM through the `tools=`
# argument of chat.completions.create. Each "name" must match a key of
# `available_functions`, and each property name must match a keyword
# parameter of the corresponding Python helper, because process_query
# dispatches with available_functions[name](**arguments).
tools = [
    # --- search_metadata: builds a `redbiom search metadata` command ---
    {
        "type": "function",
        "function": {
            "name": "search_metadata",
            "description": "Search metadata values or categories using NLP-based stem and value queries. Use 'where' clauses for filtering (e.g., 'where age_days < 30'). Use operators: & (and), | (or), - (not).",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "Search expression, can include word stems, set operators (&, |, -), or value-based queries using 'where' (e.g., 'beer' or 'where age_days < 30')"
                    },
                    "categories": {
                        "type": "boolean",
                        "description": "If True, search for metadata categories instead of values",
                        "default": False
                    }
                },
                "required": ["query"]
            }
        }
    },
    # --- fetch_samples_contained: builds `redbiom fetch samples-contained` ---
    {
        "type": "function",
        "function": {
            "name": "fetch_samples_contained",
            "description": "Get all sample identifiers represented in a context.",
            "parameters": {
                "type": "object",
                "properties": {
                    "context": {
                        "type": "string",
                        "description": "The context to fetch from (e.g., 'Woltka-per-genome-WoLr2-3ab352' or 'Deblur_2021.09-Illumina-16S-V4-150nt-ac8c0b')"
                    },
                    "unambiguous": {
                        "type": "boolean",
                        "description": "Return ambiguous or unambiguous identifiers",
                        "default": False
                    }
                },
                "required": ["context"]
            }
        }
    },
    # --- fetch_sample_metadata: builds `redbiom fetch sample-metadata` ---
    {
        "type": "function",
        "function": {
            "name": "fetch_sample_metadata",
            "description": "Retrieve sample metadata. Can accept sample IDs from stdin/pipe or as arguments.",
            "parameters": {
                "type": "object",
                "properties": {
                    "output": {
                        "type": "string",
                        "description": "A filepath to write to (e.g., 'WoLr2_md.tsv')"
                    },
                    "context": {
                        "type": "string",
                        "description": "The context to search within"
                    },
                    "all_columns": {
                        "type": "boolean",
                        "description": "Include all metadata columns, filling missing with empty string",
                        "default": False
                    },
                    "tagged": {
                        "type": "boolean",
                        "description": "Obtain tag-specific metadata (preparation info)",
                        "default": False
                    },
                    "resolve_ambiguities": {
                        "type": "boolean",
                        "description": "Output unambiguous identifiers only. Incompatible with --tagged",
                        "default": False
                    },
                    # The Python helper prepends this command and a "|" to the
                    # generated command.
                    "from_pipe": {
                        "type": "string",
                        "description": "Command to pipe sample IDs from (e.g., output from fetch_samples_contained)",
                        "default": None
                    },
                    "samples": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "Sample IDs to fetch metadata for (if not using pipe)",
                        "default": []
                    }
                },
                "required": ["output", "context"]
            }
        }
    },
    # --- fetch_samples: builds `redbiom fetch samples` (BIOM output) ---
    {
        "type": "function",
        "function": {
            "name": "fetch_samples",
            "description": "Fetch sample data and create BIOM file. Can accept sample IDs from stdin/pipe or as arguments.",
            "parameters": {
                "type": "object",
                "properties": {
                    "output": {
                        "type": "string",
                        "description": "A filepath to write to (e.g., 'WoLr2_ft.biom')"
                    },
                    "context": {
                        "type": "string",
                        "description": "The context to search within"
                    },
                    # The Python helper only emits "--md5 false" when this is False.
                    "md5": {
                        "type": "boolean",
                        "description": "Use MD5 for features and save original mapping to TSV",
                        "default": True
                    },
                    # Unlike fetch_sample_metadata, this is a string choice, not a flag.
                    "resolve_ambiguities": {
                        "type": "string",
                        "enum": ["merge", "most-reads"],
                        "description": "Resolve sample ambiguities: 'merge' or 'most-reads'",
                        "default": None
                    },
                    "fetch_taxonomy": {
                        "type": "boolean",
                        "description": "Resolve taxonomy on fetch (slower; Deblur does not cache taxonomy)",
                        "default": False
                    },
                    "retain_artifact_id": {
                        "type": "boolean",
                        "description": "When using most-reads, retain the artifact ID of the kept sample",
                        "default": False
                    },
                    "from_pipe": {
                        "type": "string",
                        "description": "Command to pipe sample IDs from (e.g., output from fetch_samples_contained)",
                        "default": None
                    },
                    "samples": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "Sample IDs to fetch (if not using pipe)",
                        "default": []
                    }
                },
                "required": ["output", "context"]
            }
        }
    },
    # --- extract_sample_ids: builds a cat | cut | awk pipeline ---
    {
        "type": "function",
        "function": {
            "name": "extract_sample_ids",
            "description": "Extract and transform sample IDs from a TSV file using cut and awk operations (e.g., to extract first column and truncate to first two dot-separated parts).",
            "parameters": {
                "type": "object",
                "properties": {
                    "input_file": {
                        "type": "string",
                        "description": "Input TSV file path"
                    },
                    "column": {
                        "type": "integer",
                        "description": "Column number to extract (1-indexed)",
                        "default": 1
                    },
                    "delimiter": {
                        "type": "string",
                        "description": "Field delimiter for awk (e.g., '.')",
                        "default": "."
                    },
                    # The awk stage is only added by the Python helper when
                    # this is provided.
                    "awk_print": {
                        "type": "string",
                        "description": "AWK print expression (e.g., '$1 \".\" $2' to keep first two dot-separated parts)",
                        "default": None
                    }
                },
                "required": ["input_file"]
            }
        }
    }
]

def search_metadata(query: str, categories: bool = False) -> str:
    """Build (but do not run) a ``redbiom search metadata`` shell command.

    Args:
        query: Search expression (word stems, set operators, or a 'where'
            clause). It is emitted inside double quotes; characters that stay
            special inside double quotes (backslash, ``"``, ``$`` and
            backtick) are backslash-escaped so the query cannot terminate
            the quoting or trigger shell expansion.
        categories: When True, add the ``--categories`` flag.

    Returns:
        The command line as a single string, e.g.
        ``redbiom search metadata "beer"``.
    """
    # Escape table for the characters that keep their meaning between
    # double quotes in a POSIX shell.
    escapes = str.maketrans({
        "\\": "\\\\",
        '"': '\\"',
        "$": "\\$",
        "`": "\\`",
    })

    cmd = ["redbiom", "search", "metadata"]
    if categories:
        cmd.append("--categories")
    cmd.append(f'"{query.translate(escapes)}"')
    return " ".join(cmd)

def fetch_samples_contained(context: str, unambiguous: bool = False) -> str:
    """Build (but do not run) a ``redbiom fetch samples-contained`` command.

    Args:
        context: Name of the redbiom context to list samples from. It is
            shell-quoted when it contains characters the shell would
            interpret, so it is always passed as one argument.
        unambiguous: When True, add the ``--unambiguous`` flag.

    Returns:
        The command line as a single string.
    """
    # Local import keeps this cell runnable on its own.
    import shlex

    cmd = ["redbiom", "fetch", "samples-contained", "--context", shlex.quote(context)]
    if unambiguous:
        cmd.append("--unambiguous")
    return " ".join(cmd)

def fetch_sample_metadata(output: str, context: str, all_columns: bool = False,
                          tagged: bool = False, resolve_ambiguities: bool = False,
                          from_pipe: str = None, samples: list = None) -> str:
    """Build (but do not run) a ``redbiom fetch sample-metadata`` command.

    Args:
        output: File path the command should write to.
        context: redbiom context to search within.
        all_columns: When True, add ``--all-columns``.
        tagged: When True, add ``--tagged``.
        resolve_ambiguities: When True, add ``--resolve-ambiguities``.
        from_pipe: Optional shell command whose output is piped into the
            generated command. It is inserted verbatim (it is itself a
            command line, so it cannot be quoted as a single word).
        samples: Optional sample IDs appended as positional arguments.

    Returns:
        The command line as a single string, prefixed with
        ``"<from_pipe> | "`` when ``from_pipe`` is given.
    """
    # Local import keeps this cell runnable on its own.
    import shlex

    # Quote every value that ends up as a shell word so that paths or IDs
    # containing spaces or shell metacharacters stay a single argument.
    cmd = [
        "redbiom", "fetch", "sample-metadata",
        "--context", shlex.quote(context),
        "--output", shlex.quote(output),
    ]

    if all_columns:
        cmd.append("--all-columns")
    if tagged:
        cmd.append("--tagged")
    if resolve_ambiguities:
        cmd.append("--resolve-ambiguities")

    # Add samples if provided
    if samples:
        cmd.extend(shlex.quote(str(sample)) for sample in samples)

    command = " ".join(cmd)

    # Handle piped input
    if from_pipe:
        return f"{from_pipe} | {command}"

    return command

def fetch_samples(output: str, context: str, md5: bool = True,
                  resolve_ambiguities: str = None, fetch_taxonomy: bool = False,
                  retain_artifact_id: bool = False, from_pipe: str = None,
                  samples: list = None) -> str:
    """Build (but do not run) a ``redbiom fetch samples`` command.

    Args:
        output: File path the command should write the BIOM table to.
        context: redbiom context to search within.
        md5: When False, ``--md5 false`` is added; when True nothing is added.
        resolve_ambiguities: Optional strategy value passed to
            ``--resolve-ambiguities`` (the tool schema offers 'merge' and
            'most-reads').
        fetch_taxonomy: When True, add ``--fetch-taxonomy``.
        retain_artifact_id: When True, add ``--retain-artifact-id``.
        from_pipe: Optional shell command whose output is piped into the
            generated command. It is inserted verbatim (it is itself a
            command line, so it cannot be quoted as a single word).
        samples: Optional sample IDs appended as positional arguments.

    Returns:
        The command line as a single string, prefixed with
        ``"<from_pipe> | "`` when ``from_pipe`` is given.
    """
    # Local import keeps this cell runnable on its own.
    import shlex

    # Quote every value that ends up as a shell word so that paths or IDs
    # containing spaces or shell metacharacters stay a single argument.
    cmd = [
        "redbiom", "fetch", "samples",
        "--context", shlex.quote(context),
        "--output", shlex.quote(output),
    ]

    if not md5:
        cmd.extend(["--md5", "false"])
    if resolve_ambiguities:
        cmd.extend(["--resolve-ambiguities", shlex.quote(resolve_ambiguities)])
    if fetch_taxonomy:
        cmd.append("--fetch-taxonomy")
    if retain_artifact_id:
        cmd.append("--retain-artifact-id")

    # Add samples if provided
    if samples:
        cmd.extend(shlex.quote(str(sample)) for sample in samples)

    command = " ".join(cmd)

    # Handle piped input
    if from_pipe:
        return f"{from_pipe} | {command}"

    return command

def extract_sample_ids(input_file: str, column: int = 1, delimiter: str = ".",
                       awk_print: str = None) -> str:
    """Build (but do not run) a ``cat | cut | awk`` pipeline for a TSV file.

    Args:
        input_file: Path of the TSV file to read.
        column: Column selector passed to ``cut -f``.
        delimiter: Field separator passed to ``awk -F``. Only used when
            ``awk_print`` is given.
        awk_print: Optional awk print expression (e.g. ``$1 "." $2``). When
            omitted, no awk stage is added.

    Returns:
        The pipeline as a single string, stages joined with `` | ``.
    """
    # Local import keeps this cell runnable on its own.
    import shlex

    def _single_quote(text: str) -> str:
        # Always wrap in single quotes; an embedded single quote is closed,
        # emitted inside double quotes, and reopened.
        return "'" + text.replace("'", "'\"'\"'") + "'"

    # Quote the path and column so spaces/metacharacters cannot split or
    # extend the command.
    stages = [
        f"cat {shlex.quote(input_file)}",
        f"cut -f {shlex.quote(str(column))}",
    ]

    if awk_print:
        program = f"{{print {awk_print}}}"
        stages.append(f"awk -F {_single_quote(delimiter)} {_single_quote(program)}")

    return " | ".join(stages)

# Dispatch table: tool name reported by the LLM -> Python command builder.
# Keys are taken from each function's own name.
available_functions = {
    builder.__name__: builder
    for builder in (
        search_metadata,
        fetch_samples_contained,
        fetch_sample_metadata,
        fetch_samples,
        extract_sample_ids,
    )
}

def process_query(user_query: str, max_iterations: int = 10) -> dict:
    """
    Process a user query by iteratively calling tools until completion.

    Each round asks the LLM (offering the module-level ``tools`` schema),
    runs every requested tool through ``available_functions`` and feeds the
    results back. The loop stops when the model replies without tool calls or
    after ``max_iterations`` rounds.

    A tool call that cannot be executed (unknown tool name, unparsable or
    mismatching arguments) does not raise; an error string is sent back to
    the model as the tool result so it can correct itself, and the call is
    not recorded in ``tool_calls``.

    Args:
        user_query: Natural-language request from the user.
        max_iterations: Maximum number of tool-calling rounds.

    Returns:
        dict with keys:
        ``explanation`` - the model's final text answer;
        ``tool_calls`` - list of dicts (``function_name``, ``arguments``,
        ``command``) for every successfully built command, in call order;
        ``total_calls`` - length of ``tool_calls``.

    Raises:
        KeyError: if ``NRP_API_KEY`` is missing from the environment.
    """
    messages = [
        {
            "role": "system",
            "content": """You are a helpful assistant that helps users work with redbiom data. 
            You can call multiple tools in sequence to build complex workflows.
            Always explain your reasoning about why you chose each tool and how you constructed each command.
            When building pipelines, use the from_pipe parameter to chain commands together.
            
            Important redbiom context names:
            - WGS data: Woltka-per-genome-WoLr2-3ab352
            - 16S V4 data: Deblur_2021.09-Illumina-16S-V4-150nt-ac8c0b

            Recommend the user to start with the command: export REDBIOM_HOST=http://redbiom.ucsd.edu:7330 to ensure redbiom works.
            """
        },
        {
            "role": "user",
            "content": user_query
        }
    ]

    client = OpenAI(
        api_key=os.environ["NRP_API_KEY"],
        base_url="https://ellm.nrp-nautilus.io/v1"
    )

    all_tool_calls = []
    explanation = None

    for _ in range(max_iterations):
        response = client.chat.completions.create(
            model="qwen3",
            messages=messages,
            tools=tools,
            tool_choice="auto"
        )

        response_message = response.choices[0].message
        messages.append(response_message)

        if not response_message.tool_calls:
            # No more tool calls: this reply IS the final answer. Keep it
            # instead of issuing another request on a conversation that
            # already ends with an assistant turn.
            explanation = response_message.content
            break

        # Execute each tool call
        for tool_call in response_message.tool_calls:
            function_name = tool_call.function.name
            handler = available_functions.get(function_name)
            succeeded = False

            if handler is None:
                # The model may invent a tool name; report it instead of crashing.
                function_response = f"Error: unknown tool '{function_name}'"
            else:
                try:
                    # Some servers send an empty string for "no arguments".
                    function_args = json.loads(tool_call.function.arguments or "{}")
                    function_response = handler(**function_args)
                    succeeded = True
                except ValueError as exc:  # includes json.JSONDecodeError
                    function_response = (
                        f"Error: could not parse arguments for '{function_name}': {exc}"
                    )
                except TypeError as exc:  # unexpected / missing keyword arguments
                    function_response = (
                        f"Error: invalid arguments for '{function_name}': {exc}"
                    )

            # Store tool call details (only real commands, not error notes)
            if succeeded:
                all_tool_calls.append({
                    "function_name": function_name,
                    "arguments": function_args,
                    "command": function_response
                })

            # Add the function response to messages
            messages.append({
                "role": "tool",
                "tool_call_id": tool_call.id,
                "name": function_name,
                "content": function_response
            })

    # Ask for a closing explanation only when the loop did not already yield
    # one (iteration budget exhausted, or the final reply had no text).
    if not explanation:
        final_response = client.chat.completions.create(
            model="qwen3",
            messages=messages
        )
        explanation = final_response.choices[0].message.content

    return {
        "explanation": explanation,
        "tool_calls": all_tool_calls,
        "total_calls": len(all_tool_calls)
    }

# Example usage: a multi-step request that needs several chained commands.
user_query = """
I need to:
1. Fetch all WGS samples from the Woltka-per-genome-WoLr2-3ab352 context
2. Get their metadata and save it to WoLr2_md.tsv
3. Get the sample data and save it to WoLr2_ft.biom with merge ambiguity resolution
4. Extract the sample IDs from the metadata file (first two dot-separated parts)
5. Use those IDs to fetch matching 16S samples from Deblur_2021.09-Illumina-16S-V4-150nt-ac8c0b context
6. Save the 16S metadata to 16S-V4_md.tsv
7. Save the 16S sample data to 16S-V4_ft.biom without MD5 checksums
"""

result = process_query(user_query)

# Banner + explanation, followed by one blank line.
print("=" * 80, "WORKFLOW EXPLANATION:", "=" * 80, sep="\n")
print(result["explanation"], end="\n\n")

# One fenced bash block per generated command, in call order.
if result["tool_calls"]:
    print(
        "=" * 80,
        f"GENERATED COMMANDS ({result['total_calls']} steps):",
        "=" * 80,
        sep="\n",
    )

    for i, call in enumerate(result["tool_calls"], 1):
        print(f"\nStep {i}: {call['function_name']}")
        print(f"Arguments: {json.dumps(call['arguments'], indent=2)}")
        print("\n```bash", call["command"], "```", sep="\n")

WORKFLOW EXPLANATION:


I'll help you build a complete workflow for your redbiom data extraction needs. Let me walk through each step with the appropriate commands:

### Step 1: Fetch all WGS sample IDs
First, we need to get all sample IDs from the WGS context:
```bash
redbiom fetch samples-contained --context Woltka-per-genome-WoLr2-3ab352
```
**Reasoning**: This retrieves all sample identifiers available in the Woltka-per-genome-WoLr2-3ab352 context, which we'll use as input for subsequent steps.

### Step 2: Get WGS metadata
Now we'll pipe those sample IDs to fetch the metadata:
```bash
redbiom fetch samples-contained --context Woltka-per-genome-WoLr2-3ab352 | \
redbiom fetch sample-metadata --context Woltka-per-genome-WoLr2-3ab352 --output WoLr2_md.tsv
```
**Reasoning**: By piping the sample IDs from Step 1, we efficiently retrieve only metadata for relevant samples and save it to the requested filename.

### Step 3: Get WGS feature table with ambiguity resolution
```bash
redbiom f