In [5]:
import os
from openai import OpenAI
import dotenv

dotenv.load_dotenv()

client = OpenAI(
    # This is the default and can be omitted
    api_key = "ifjO4oMMtU6MAXw2wc9cdaCjGU3FfdZs",
    base_url = "https://ellm.nrp-nautilus.io/v1"
)

completion = client.chat.completions.create(
    model="gemma3",
    messages=[
        {"role": "system", "content": "Talk like a pirate."},
        {
            "role": "user",
            "content": "How do I check if a Python object is an instance of a class?",
        },
    ],
)

print(completion.choices[0].message.content)

Ahoy, matey! Ye be wantin' to know how to see if a scallywag o' an object be truly one o' a certain crew (a class, ye landlubber!)? 

There be a few ways to do it, aye!

**1. The `isinstance()` function - The most trusty method!**

This here be the best way, 'tis! It checks if an object be an instance o' a class, *or* if it be an instance o' a subclass o' that class. Think o' it as checkin' if the scallywag be part o' the whole fleet!

```python
class Ship:
    pass

class Galleon(Ship): #A Galleon be a type o' Ship, ye see?
    pass

me_ship = Ship()
me_galleon = Galleon()

print(isinstance(me_ship, Ship))    # Prints: True
print(isinstance(me_galleon, Galleon)) # Prints: True
print(isinstance(me_galleon, Ship))   # Prints: True! 'Cause a Galleon *is* a Ship!
print(isinstance(me_ship, Galleon))   # Prints: False - This ship ain't a Galleon!
```

**2. `type()` - A bit more strict, beware!**

Ye can also use `type()`, but it's a bit more particular. It only checks if the object be *exac

# Tool Making and Calling

In [7]:
import os
from openai import OpenAI
import dotenv

dotenv.load_dotenv()

client = OpenAI(
    # This is the default and can be omitted
    api_key = os.environ.get("NRP_API_KEY"),
    base_url = "https://ellm.nrp-nautilus.io/v1"
)

completion = client.chat.completions.create(
    model="gemma3",
    messages=[
        {"role": "system", "content": "Talk like a pirate."},
        {
            "role": "user",
            "content": "How do I check if a Python object is an instance of a class?",
        },
    ],
)

print(completion.choices[0].message.content)

Ahoy there, matey! Ye want to know how to see if a Python object be a scion o' a certain class, aye? A right useful skill fer a pirate coder, that be!

Ye be usin' the `isinstance()` function, ye scurvy dog!

Here's how it works, see?

```python
class Ship:
  def __init__(self, name):
    self.name = name

class Galleon(Ship):
  def __init__(self, name, cannons):
    super().__init__(name)
    self.cannons = cannons

me_ship = Ship("The Sea Serpent")
me_galleon = Galleon("The Black Pearl", 40)

# Check if me_ship be an instance o' the Ship class
if isinstance(me_ship, Ship):
  print("Shiver me timbers! 'Tis a Ship, it be!")

# Check if me_galleon be an instance o' the Ship class (it inherits from it!)
if isinstance(me_galleon, Ship):
  print("Avast! A Ship o' sorts, indeed!")

# Check if me_galleon be an instance o' the Galleon class
if isinstance(me_galleon, Galleon):
  print("Aye, 'tis a Galleon, fit fer plunderin'!")

# Check if me_ship be an instance o' the Galleon class (it ain't!

In [2]:
import subprocess
import os
import json
from typing import Optional, List, Dict, Any
from pathlib import Path

from langchain_core.tools import tool
from openai import OpenAI
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage, SystemMessage
from pydantic import BaseModel, Field

NRP_API_KEY = os.environ.get("NRP_API_KEY")

client = OpenAI(
    # This is the default and can be omitted
    api_key = NRP_API_KEY,
    base_url = "https://llm.nrp-nautilus.io/"
)

In [9]:
from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful assistant that translates {input_language} to {output_language}.",
        ),
        ("human", "{input}"),
    ]
)
llm = ChatOpenAI(
    model="gemma3",
    openai_api_key= os.environ.get("NRP_API_KEY"),
    openai_api_base="https://ellm.nrp-nautilus.io/v1/",
)

chain = prompt | llm
chain.invoke(
    {
        "input_language": "English",
        "output_language": "German",
        "input": "I love programming.",
    }
)

AIMessage(content='Here are a few options for translating "I love programming" into German, depending on the nuance you want to convey:\n\n* **Ich liebe das Programmieren.** (This is a very direct translation, and quite common. It emphasizes a deep love for the *activity* of programming.)\n* **Ich programmiere sehr gerne.** (This translates to "I really like to program." It\'s a bit more casual and focuses on enjoying the act of programming.)\n* **Programmieren macht mir viel Spaß.** (This means "Programming gives me a lot of fun."  It\'s another common and natural way to express enjoyment.)\n\nWhich one is best depends on the context, but **Ich liebe das Programmieren** is probably the closest and most passionate translation.\n\n\n\n', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 158, 'prompt_tokens': 25, 'total_tokens': 183, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'google/gemma-3-27b-it', 'system

In [37]:
from pydantic import BaseModel, Field
from langchain_openai import ChatOpenAI, custom_tool

llm = ChatOpenAI(
    model="gemma3",
    openai_api_key=os.environ["NRP_API_KEY"],
    openai_api_base="https://ellm.nrp-nautilus.io/v1",
)

def get_weather(location: str) -> str:
    """Get weather at a location."""
    return "It's sunny."

class GetWeather(BaseModel):
    """Get the current weather in a given location"""

    location: str = Field(..., description="The city and state, e.g. San Francisco, CA")


llm_with_tools = llm.bind_tools([GetWeather])

ai_msg = llm_with_tools.invoke("what is the weather like in San Francisco")

# The tool is called but not executed yet
if ai_msg.tool_calls:
    for tool_call in ai_msg.tool_calls:
        # Execute the tool
        result = llm_with_tools.invoke(tool_call['args'])
        print(f"Result: {result}")

ValueError: Invalid input type <class 'dict'>. Must be a PromptValue, str, or list of BaseMessages.

In [None]:
# ai_msg = llm_with_tools.invoke(
#     "what is the weather like in San Francisco",
# )
# ai_msg

AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'chatcmpl-tool-dbe2acbb5ca44e2ca4c91aaa43b052ae', 'function': {'arguments': '{"location": "San Francisco, CA"}', 'name': 'GetWeather'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 12, 'prompt_tokens': 288, 'total_tokens': 300, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'google/gemma-3-27b-it', 'system_fingerprint': None, 'id': 'chatcmpl-c79014e4-4d15-48ec-a2a1-71917267606f', 'service_tier': None, 'finish_reason': 'tool_calls', 'logprobs': None}, id='lc_run--c0a3bd11-1cf8-487b-aeac-22d5d968dfd9-0', tool_calls=[{'name': 'GetWeather', 'args': {'location': 'San Francisco, CA'}, 'id': 'chatcmpl-tool-dbe2acbb5ca44e2ca4c91aaa43b052ae', 'type': 'tool_call'}], usage_metadata={'input_tokens': 288, 'output_tokens': 12, 'total_tokens': 300, 'input_token_details': {}, 'output_token_details': {}})

In [41]:
ai_msg

AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'chatcmpl-tool-650a878ef919488db148a5108c76144e', 'function': {'arguments': '{"location": "San Francisco, CA"}', 'name': 'GetWeather'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 12, 'prompt_tokens': 288, 'total_tokens': 300, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'google/gemma-3-27b-it', 'system_fingerprint': None, 'id': 'chatcmpl-32a5a759-23cb-46b5-9f25-646e7063ca54', 'service_tier': None, 'finish_reason': 'tool_calls', 'logprobs': None}, id='lc_run--52a472f0-4213-4103-88fb-8fcba8bf3850-0', tool_calls=[{'name': 'GetWeather', 'args': {'location': 'San Francisco, CA'}, 'id': 'chatcmpl-tool-650a878ef919488db148a5108c76144e', 'type': 'tool_call'}], usage_metadata={'input_tokens': 288, 'output_tokens': 12, 'total_tokens': 300, 'input_token_details': {}, 'output_token_details': {}})

In [40]:
from pydantic import BaseModel, Field
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage, ToolMessage

llm = ChatOpenAI(
    model="gemma3",
    openai_api_key=os.environ["NRP_API_KEY"],
    openai_api_base="https://ellm.nrp-nautilus.io/v1",
)

class GetWeather(BaseModel):
    """Get the current weather in a given location"""
    location: str = Field(..., description="The city and state, e.g. San Francisco, CA")

# Bind tools
llm_with_tools = llm.bind_tools([GetWeather])

# Invoke
ai_msg = llm_with_tools.invoke("what is the weather like in San Francisco")

# Execute the tool if called
if ai_msg.tool_calls:
    for tool_call in ai_msg.tool_calls:
        if tool_call['name'] == 'GetWeather':
            location = tool_call['args']['location']
            # Execute your actual function
            result = f"It's sunny in {location}."
            print(f"Tool executed: {result}")

Tool executed: It's sunny in San Francisco, CA.


# Functions to write the commands themselves

In [42]:
from langchain_core.tools import tool
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(
    model="gemma3",
    openai_api_key=os.environ["NRP_API_KEY"],
    openai_api_base="https://ellm.nrp-nautilus.io/v1",
)

@tool
def get_weather(location: str) -> str:  # ✅ Correct return type
    """Get the current weather in a given location.
    
    Args:
        location: The city and state, e.g. San Francisco, CA
    """
    return f"It's sunny in {location}."

# Bind the actual tool function
llm_with_tools = llm.bind_tools([get_weather])  # ✅ Pass the function, not a class

# Invoke
ai_msg = llm_with_tools.invoke("what is the weather like in San Francisco")

# The tool is called but not executed yet
if ai_msg.tool_calls:
    for tool_call in ai_msg.tool_calls:
        # Execute the tool
        result = get_weather.invoke(tool_call['args'])
        print(f"Result: {result}")

Result: It's sunny in San Francisco, CA.


In [None]:
from langchain_core.tools import tool
from typing import List
import subprocess

@tool
def search_features(context: str, features: List[str], exact: bool = False, min_count: int = 1) -> str:
    """Search for samples containing specified microbiome features.
    
    Args:
        context: The context to search within (e.g., Woltka-per-genome-WoLr2-3ab352)
        features: List of feature IDs to search for
        exact: If True, all samples must contain all specified features
        min_count: Minimum number of times the feature was observed
    """
    cmd = f"redbiom search features --context {context}"
    if exact:
        cmd += " --exact"
    cmd += f" --min-count {min_count}"
    cmd += " " + " ".join(features)
    
    # Execute command
    try:
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
        return result.stdout
    except Exception as e:
        return f"Error: {str(e)}"

# Bind and use
llm_with_tools = llm.bind_tools([search_features])

messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": "Find samples with Bacteroides in Woltka-per-genome-WoLr2-3ab352"}
]

response = llm_with_tools.invoke(messages)

# Execute tools if called
if response.tool_calls:
    for tool_call in response.tool_calls:
        result = search_features.invoke(tool_call['args'])
        print(f"Command executed. Result:\n{result}")

In [None]:
import os
from openai import OpenAI

# ---- 1. Configure OpenAI client for NRP ----
client = OpenAI(
    api_key=os.environ["NRP_API_KEY"],
    base_url="https://ellm.nrp-nautilus.io/v1"
)

# ---- 2. Define your tool schema ----
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the weather at a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state, e.g. San Diego, CA"
                    }
                },
                "required": ["location"]
            }
        }
    }
]

# ---- 3. Define the actual function ----
def get_weather(location: str) -> str:
    """Get the weather at a given location."""
    
    return f"The weather in {location} is sunny with mild temperatures."

# ---- 4. Make the API call ----
messages = [
    {"role": "user", "content": "what is the weather like in San Diego"}
]

response = client.chat.completions.create(
    model="gemma3",
    messages=messages,
    tools=tools,
    tool_choice="auto"  # Let the model decide when to use tools
)

print("Response:")
print(response)

# ---- 5. Handle tool calls ----
message = response.choices[0].message

if message.tool_calls:
    print("\nTool was called!")
    
    # Append assistant's message to conversation
    messages.append(message)
    
    # Execute each tool call
    for tool_call in message.tool_calls:
        function_name = tool_call.function.name
        function_args = eval(tool_call.function.arguments)  # Parse JSON string
        
        print(f"\nTool: {function_name}")
        print(f"Arguments: {function_args}")
        
        # Execute the function
        if function_name == "get_weather":
            function_response = get_weather(**function_args)
            print(f"Result: {function_response}")
            
            # Append tool result to messages
            messages.append({
                "role": "tool",
                "tool_call_id": tool_call.id,
                "name": function_name,
                "content": function_response
            })
    
    # Get final response with tool results
    final_response = client.chat.completions.create(
        model="gemma3",
        messages=messages
    )
    
    print("\nFinal response:")
    print(final_response.choices[0].message.content)
else:
    print("\nNo tool calls. Direct response:")
    print(message.content)

Response:
ChatCompletion(id='chatcmpl-260633eb-25bb-4f89-8f36-b068809a8ea8', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageFunctionToolCall(id='chatcmpl-tool-2fd97c1e174d47c0a5e55bcf29317247', function=Function(arguments='{"location": "San Diego, CA"}', name='get_weather'), type='function')], reasoning_content=None), stop_reason=106, token_ids=None)], created=1760607725, model='google/gemma-3-27b-it', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=13, prompt_tokens=288, total_tokens=301, completion_tokens_details=None, prompt_tokens_details=None), prompt_logprobs=None, prompt_token_ids=None, kv_transfer_params=None)

Tool was called!

Tool: get_weather
Arguments: {'location': 'San Diego, CA'}
Result: The weather in San Diego, CA is sunny with mil

In [56]:
import os
import json
import subprocess
from openai import OpenAI

client = OpenAI(
    api_key=os.environ["NRP_API_KEY"],
    base_url="https://ellm.nrp-nautilus.io/v1"
)

# Redbiom tool
tools = [
    {
        "type": "function",
        "function": {
            "name": "search_features",
            "description": "Search for samples containing specified microbiome features in Redbiom",
            "parameters": {
                "type": "object",
                "properties": {
                    "context": {
                        "type": "string",
                        "description": "The context to search within (e.g., Woltka-per-genome-WoLr2-3ab352)"
                    },
                    "features": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "List of feature IDs to search for"
                    },
                    "exact": {
                        "type": "boolean",
                        "description": "If true, all samples must contain all specified features",
                        "default": False
                    },
                    "min_count": {
                        "type": "integer",
                        "description": "Minimum number of times the feature was observed",
                        "default": 1
                    }
                },
                "required": ["context", "features"]
            }
        }
    }
]

def search_features(context: str, features: list, exact: bool = False, min_count: int = 1) -> str:
    """Execute redbiom search features command"""
    cmd = f"redbiom search features --context {context}"
    if exact:
        cmd += " --exact"
    cmd += f" --min-count {min_count}"
    cmd += " " + " ".join(features)
    
    try:
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
        return result.stdout if result.stdout else result.stderr
    except Exception as e:
        return f"Error: {str(e)}"

available_functions = {
    "search_features": search_features
}

# Use it
messages = [
    {"role": "system", "content": "You are a microbiome data analyst. Use the search_features tool to find samples."},
    {"role": "user", "content": "Find samples with Bacteroides in Woltka-per-genome-WoLr2-3ab352"}
]

response = client.chat.completions.create(
    model="gemma3",
    messages=messages,
    tools=tools,
    tool_choice="auto"
)

message = response.choices[0].message
print(message)

if message.tool_calls:
    messages.append(message)
    
    for tool_call in message.tool_calls:
        function_name = tool_call.function.name
        function_args = json.loads(tool_call.function.arguments)
        
        result = available_functions[function_name](**function_args)
        
        print(f"Command executed: {function_name}")
        print(f"Result:\n{result}")
        
        messages.append({
            "role": "tool",
            "tool_call_id": tool_call.id,
            "name": function_name,
            "content": result
        })
    
    final_response = client.chat.completions.create(
        model="gemma3",
        messages=messages
    )
    
    print(f"\nFinal answer:\n{final_response.choices[0].message.content}")

ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageFunctionToolCall(id='chatcmpl-tool-8fd02b69f95d417e84f3f9d856369e08', function=Function(arguments='{"context": "Woltka-per-genome-WoLr2-3ab352", "features": ["Bacteroides"]}', name='search_features'), type='function')], reasoning_content=None)
Command executed: search_features
Result:


Final answer:
Okay, the tool returned no results. This means there are no features specifically identified as "Bacteroides" within the "Woltka-per-genome-WoLr2-3ab352" dataset as directly searchable features. 

However, this *doesn't* necessarily mean Bacteroides isn't *present* in the data. It just means it isn't flagged as a key feature or directly annotated as such in the way the tool is indexing the data. 

To find samples *containing* Bacteroides, I'd need to use a different approach. I would need to search for reads/sequences that *taxonomically* cl

In [57]:
import os
import json
import subprocess
from openai import OpenAI

client = OpenAI(
    api_key=os.environ["NRP_API_KEY"],
    base_url="https://ellm.nrp-nautilus.io/v1"
)

# Tool definition based on the system prompt
tools = [
    {
        "type": "function",
        "function": {
            "name": "fetch_sample_metadata",
            "description": "Retrieve sample metadata from Redbiom and save to a file",
            "parameters": {
                "type": "object",
                "properties": {
                    "output": {
                        "type": "string",
                        "description": "A filepath to write the metadata to (e.g., 'WoLr2_md.tsv'). REQUIRED."
                    },
                    "context": {
                        "type": "string",
                        "description": "The context to search within - use 'Woltka-per-genome-WoLr2-3ab352'"
                    },
                    "samples": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "List of sample IDs to fetch metadata for. If not provided, fetches all samples in context."
                    },
                    "all_columns": {
                        "type": "boolean",
                        "description": "Include all metadata columns, filling missing with empty string",
                        "default": False
                    },
                    "resolve_ambiguities": {
                        "type": "boolean",
                        "description": "Output unambiguous identifiers only. Incompatible with --tagged.",
                        "default": False
                    },
                    "tagged": {
                        "type": "boolean",
                        "description": "Obtain tag-specific metadata (preparation info)",
                        "default": False
                    },
                    "force_categories": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "Force output to include specific metadata variables/columns"
                    }
                },
                "required": ["output"]
            }
        }
    }
]

def fetch_sample_metadata(
    output: str,
    context: str = None,
    samples: list = None,
    all_columns: bool = False,
    resolve_ambiguities: bool = False,
    tagged: bool = False,
    force_categories: list = None
) -> str:
    """Execute redbiom fetch sample-metadata command"""
    
    # Start building command
    cmd = "redbiom fetch sample-metadata"
    
    # Add required output
    cmd += f" --output {output}"
    
    # Add optional context
    if context:
        cmd += f" --context {context}"
    
    # Add optional flags
    if all_columns:
        cmd += " --all-columns"
    
    if resolve_ambiguities:
        cmd += " --resolve-ambiguities"
    
    if tagged:
        cmd += " --tagged"
    
    # Add force categories if provided
    if force_categories:
        for category in force_categories:
            cmd += f" --force-category {category}"
    
    # Add sample IDs if provided
    if samples:
        cmd += " " + " ".join(samples)
    
    print(f"Executing command: {cmd}")
    
    try:
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=300)
        
        if result.returncode == 0:
            return f"Success! Metadata saved to {output}\n{result.stdout}"
        else:
            return f"Error executing command:\n{result.stderr}"
    except subprocess.TimeoutExpired:
        return f"Command timed out after 300 seconds"
    except Exception as e:
        return f"Error: {str(e)}"

available_functions = {
    "fetch_sample_metadata": fetch_sample_metadata
}

# Example usage
messages = [
    {
        "role": "system", 
        "content": """You are a microbiome data analyst. When users ask to fetch sample metadata, 
        use the fetch_sample_metadata tool. The output parameter is REQUIRED and should be a .tsv file.
        Available context: Woltka-per-genome-WoLr2-3ab352"""
    },
    {
        "role": "user", 
        "content": "Fetch sample metadata from Woltka-per-genome-WoLr2-3ab352 and save it to WoLr2_md.tsv"
    }
]

response = client.chat.completions.create(
    model="gemma3",
    messages=messages,
    tools=tools,
    tool_choice="auto"
)

message = response.choices[0].message

if message.tool_calls:
    print("Tool was called!\n")
    messages.append(message)
    
    for tool_call in message.tool_calls:
        function_name = tool_call.function.name
        function_args = json.loads(tool_call.function.arguments)
        
        print(f"Function: {function_name}")
        print(f"Arguments: {json.dumps(function_args, indent=2)}\n")
        
        # Execute the function
        result = available_functions[function_name](**function_args)
        
        print(f"Result:\n{result}\n")
        
        # Add tool response to messages
        messages.append({
            "role": "tool",
            "tool_call_id": tool_call.id,
            "name": function_name,
            "content": result
        })
    
    # Get final response
    final_response = client.chat.completions.create(
        model="gemma3",
        messages=messages
    )
    
    print(f"Final answer:\n{final_response.choices[0].message.content}")
else:
    print(f"No tool called. Direct response:\n{message.content}")

Tool was called!

Function: fetch_sample_metadata
Arguments: {
  "output": "WoLr2_md.tsv",
  "context": "Woltka-per-genome-WoLr2-3ab352"
}

Executing command: redbiom fetch sample-metadata --output WoLr2_md.tsv --context Woltka-per-genome-WoLr2-3ab352


KeyboardInterrupt: 

In [5]:
import os
import json
import subprocess
from typing import Optional, List
from openai import OpenAI
import json

with open("./Langchain_attempts/tools.json") as f:
    tools = json.load(f)["tools"]

# client = OpenAI(
#     api_key=os.environ["NRP_API_KEY"],
#     base_url="https://ellm.nrp-nautilus.io/v1",
#     # tools=tools
# )

# ============================================================================
# FUNCTION IMPLEMENTATIONS
# ============================================================================

def search_features(context: str, features: List[str], exact: bool = False, min_count: int = 1) -> str:
    """Execute redbiom search features command"""
    cmd = f"redbiom search features --context {context}"
    if exact:
        cmd += " --exact"
    cmd += f" --min-count {min_count}"
    cmd += " " + " ".join(features)
    
    print(f"Executing: {cmd}")
    try:
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=300)
        return result.stdout if result.stdout else result.stderr
    except Exception as e:
        return f"Error: {str(e)}"

def search_samples(context: str, samples: List[str], exact: bool = False, min_count: int = 1) -> str:
    """Execute redbiom search samples command"""
    cmd = f"redbiom search samples --context {context}"
    if exact:
        cmd += " --exact"
    cmd += f" --min-count {min_count}"
    cmd += " " + " ".join(samples)
    
    print(f"Executing: {cmd}")
    try:
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=300)
        return result.stdout if result.stdout else result.stderr
    except Exception as e:
        return f"Error: {str(e)}"

def search_metadata(query: str, categories: bool = False) -> str:
    """Execute redbiom search metadata command"""
    cmd = "redbiom search metadata"
    if categories:
        cmd += " --categories"
    cmd += f' "{query}"'
    
    print(f"Executing: {cmd}")
    try:
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=300)
        return result.stdout if result.stdout else result.stderr
    except Exception as e:
        return f"Error: {str(e)}"

def search_taxon(context: str, taxon: str) -> str:
    """Execute redbiom search taxon command"""
    cmd = f'redbiom search taxon --context {context} "{taxon}"'
    
    print(f"Executing: {cmd}")
    try:
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=300)
        return result.stdout if result.stdout else result.stderr
    except Exception as e:
        return f"Error: {str(e)}"

def fetch_sample_metadata(
    output: str,
    context: str = None,
    samples: List[str] = None,
    all_columns: bool = False,
    resolve_ambiguities: bool = False,
    tagged: bool = False,
    force_categories: List[str] = None
) -> str:
    """Execute redbiom fetch sample-metadata command"""
    cmd = f"redbiom fetch sample-metadata --output {output}"
    
    if context:
        cmd += f" --context {context}"
    if all_columns:
        cmd += " --all-columns"
    if resolve_ambiguities:
        cmd += " --resolve-ambiguities"
    if tagged:
        cmd += " --tagged"
    if force_categories:
        for cat in force_categories:
            cmd += f" --force-category {cat}"
    if samples:
        cmd +=  " --samples".join(samples)
    
    print(f"Executing: {cmd}")
    try:
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=300)
        if result.returncode == 0:
            return f"Success! Metadata saved to {output}\n{result.stdout}"
        return result.stderr
    except Exception as e:
        return f"Error: {str(e)}"

def fetch_features(
    output: str,
    context: str,
    features: List[str],
    exact: bool = False,
    md5: bool = False,
    resolve_ambiguities: str = "none",
    fetch_taxonomy: bool = False
) -> str:
    """Execute redbiom fetch features command"""
    cmd = f"redbiom fetch features --context {context} --output {output}"
    
    if exact:
        cmd += " --exact"
    if md5:
        cmd += " --md5"
    if resolve_ambiguities != "none":
        cmd += f" --resolve-ambiguities {resolve_ambiguities}"
    if fetch_taxonomy:
        cmd += " --fetch-taxonomy"
    cmd += " " + " ".join(features)
    
    print(f"Executing: {cmd}")
    try:
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=300)
        if result.returncode == 0:
            return f"Success! Features saved to {output}\n{result.stdout}"
        return result.stderr
    except Exception as e:
        return f"Error: {str(e)}"

def fetch_samples(
    output: str,
    context: str,
    samples: List[str],
    md5: bool = False,
    resolve_ambiguities: str = "none",
    fetch_taxonomy: bool = False
) -> str:
    """Execute redbiom fetch samples command"""
    cmd = f"redbiom fetch samples --context {context} --output {output}"
    
    if md5:
        cmd += " --md5"
    if resolve_ambiguities != "none":
        cmd += f" --resolve-ambiguities {resolve_ambiguities}"
    if fetch_taxonomy:
        cmd += " --fetch-taxonomy"
    cmd += " " + " ".join(samples)
    
    print(f"Executing: {cmd}")
    try:
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=300)
        if result.returncode == 0:
            return f"Success! Samples saved to {output}\n{result.stdout}"
        return result.stderr
    except Exception as e:
        return f"Error: {str(e)}"

def fetch_qiita_study(
    study_id: int,
    context: str,
    output_basename: str,
    resolve_ambiguities: str = "none",
    fetch_taxonomy: bool = False,
    remove_blanks: bool = False,
    md5: bool = True
) -> str:
    """Execute redbiom fetch qiita-study command"""
    cmd = f"redbiom fetch qiita-study --study-id {study_id} --context {context} --output-basename {output_basename}"
    
    if resolve_ambiguities != "none":
        cmd += f" --resolve-ambiguities {resolve_ambiguities}"
    if fetch_taxonomy:
        cmd += " --fetch-taxonomy"
    if remove_blanks:
        cmd += " --remove-blanks"
    if md5:
        cmd += " --md5 True"
    
    print(f"Executing: {cmd}")
    try:
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=600)
        if result.returncode == 0:
            return f"Success! Study saved with basename {output_basename}\n{result.stdout}"
        return result.stderr
    except Exception as e:
        return f"Error: {str(e)}"

def fetch_samples_contained(context: str = None, unambiguous: bool = False) -> str:
    """Execute redbiom fetch samples-contained command"""
    cmd = "redbiom fetch samples-contained"
    
    if context:
        cmd += f" --context {context}"
    if unambiguous:
        cmd += " --unambiguous"
    
    print(f"Executing: {cmd}")
    try:
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=300)
        return result.stdout if result.stdout else result.stderr
    except Exception as e:
        return f"Error: {str(e)}"

def fetch_features_contained(context: str = None) -> str:
    """Execute redbiom fetch features-contained command"""
    cmd = "redbiom fetch features-contained"
    
    if context:
        cmd += f" --context {context}"
    
    print(f"Executing: {cmd}")
    try:
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=300)
        return result.stdout if result.stdout else result.stderr
    except Exception as e:
        return f"Error: {str(e)}"

def summarize_contexts() -> str:
    """Execute redbiom summarize contexts command"""
    cmd = "redbiom summarize contexts"
    
    print(f"Executing: {cmd}")
    try:
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=300)
        return result.stdout if result.stdout else result.stderr
    except Exception as e:
        return f"Error: {str(e)}"

def summarize_metadata_category(category: str, counter: bool = False, descending: bool = False, dump: bool = False) -> str:
    """Execute redbiom summarize metadata-category command"""
    cmd = f"redbiom summarize metadata-category --category {category}"
    
    if counter:
        cmd += " --counter"
    if descending:
        cmd += " --descending"
    if dump:
        cmd += " --dump"
    
    print(f"Executing: {cmd}")
    try:
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=300)
        return result.stdout if result.stdout else result.stderr
    except Exception as e:
        return f"Error: {str(e)}"

def summarize_metadata(categories: List[str] = None, descending: bool = False) -> str:
    """Execute redbiom summarize metadata command"""
    cmd = "redbiom summarize metadata"
    
    if descending:
        cmd += " --descending"
    if categories:
        cmd += " " + " ".join(categories)
    
    print(f"Executing: {cmd}")
    try:
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=300)
        return result.stdout if result.stdout else result.stderr
    except Exception as e:
        return f"Error: {str(e)}"

def summarize_features(category: str, context: str, features: List[str], exact: bool = False) -> str:
    """Execute redbiom summarize features command"""
    cmd = f"redbiom summarize features --category {category} --context {context}"
    
    if exact:
        cmd += " --exact"
    cmd += " " + " ".join(features)
    
    print(f"Executing: {cmd}")
    try:
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=300)
        return result.stdout if result.stdout else result.stderr
    except Exception as e:
        return f"Error: {str(e)}"

def summarize_samples(category: str, samples: List[str]) -> str:
    """Execute redbiom summarize samples command"""
    cmd = f"redbiom summarize samples --category {category}"
    cmd += " " + " ".join(samples)
    
    print(f"Executing: {cmd}")
    try:
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=300)
        return result.stdout if result.stdout else result.stderr
    except Exception as e:
        return f"Error: {str(e)}"

def summarize_taxonomy(context: str, features: List[str], normalize_ranks: str = "kpcofgs") -> str:
    """Execute redbiom summarize taxonomy command"""
    cmd = f"redbiom summarize taxonomy --context {context} --normalize-ranks {normalize_ranks}"
    cmd += " " + " ".join(features)
    
    print(f"Executing: {cmd}")
    try:
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=300)
        return result.stdout if result.stdout else result.stderr
    except Exception as e:
        return f"Error: {str(e)}"

def select_samples_from_metadata(context: str, query: str, samples: List[str]) -> str:
    """Execute redbiom select samples-from-metadata command"""
    cmd = f'redbiom select samples-from-metadata --context {context} "{query}"'
    cmd += " " + " ".join(samples)
    
    print(f"Executing: {cmd}")
    try:
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=300)
        return result.stdout if result.stdout else result.stderr
    except Exception as e:
        return f"Error: {str(e)}"

def select_features_from_samples(context: str, samples: List[str], exact: bool = False) -> str:
    """Execute redbiom select features-from-samples command"""
    cmd = f"redbiom select features-from-samples --context {context}"
    
    if exact:
        cmd += " --exact"
    cmd += " " + " ".join(samples)
    
    print(f"Executing: {cmd}")
    try:
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=300)
        return result.stdout if result.stdout else result.stderr
    except Exception as e:
        return f"Error: {str(e)}"

# ============================================================================
# FUNCTION REGISTRY
# ============================================================================

available_functions = {
    "search_features": search_features,
    "search_samples": search_samples,
    "search_metadata": search_metadata,
    "search_taxon": search_taxon,
    "fetch_sample_metadata": fetch_sample_metadata,
    "fetch_features": fetch_features,
    "fetch_samples": fetch_samples,
    "fetch_qiita_study": fetch_qiita_study,
    "fetch_samples_contained": fetch_samples_contained,
    "fetch_features_contained": fetch_features_contained,
    "summarize_contexts": summarize_contexts,
    "summarize_metadata_category": summarize_metadata_category,
    "summarize_metadata": summarize_metadata,
    "summarize_features": summarize_features,
    "summarize_samples": summarize_samples,
    "summarize_taxonomy": summarize_taxonomy,
    "select_samples_from_metadata": select_samples_from_metadata,
    "select_features_from_samples": select_features_from_samples
}

In [7]:
def run_agent(client: OpenAI, user_query: str, max_iterations: int = 5) -> dict:
    """
    Run the agent to process a user query and generate redbiom commands.
    
    Args:
        client: OpenAI client instance
        user_query: Natural language query from user
        max_iterations: Maximum number of tool calling iterations
    
    Returns:
        Dictionary containing conversation history, commands, and results
    """
    messages = [{"role": "user", "content": user_query}]
    commands_generated = []
    iteration = 0
    
    # client = OpenAI(
#     api_key=os.environ["NRP_API_KEY"],
#     base_url="https://ellm.nrp-nautilus.io/v1",
#     # tools=tools
# )
    while iteration < max_iterations:
        iteration += 1
        
        # Call the LLM with tools
        response = client.chat.completions.create(
            model="qwen3",
            messages=messages,
            tools=tools,
            tool_choice="auto"
        )
        
        response_message = response.choices[0].message
        messages.append(response_message)
        
        # Check if the model wants to call functions
        tool_calls = response_message.tool_calls
        
        if not tool_calls:
            # No more function calls, we're done
            break
        
        # Process each tool call
        for tool_call in tool_calls:
            function_name = tool_call.function.name
            function_args = json.loads(tool_call.function.arguments)
            
            print(f"\n[Tool Call] {function_name}")
            print(f"[Arguments] {json.dumps(function_args, indent=2)}")
            
            # Get the function from our registry
            function_to_call = available_functions.get(function_name)
            
            if function_to_call:
                # Call the function to get the command
                command = function_to_call(**function_args)
                print(f"[Command Generated] {command}")
                
                commands_generated.append({
                    "function": function_name,
                    "arguments": function_args,
                    "command": command
                })
                
                # Add the function response to messages
                messages.append({
                    "role": "tool",
                    "tool_call_id": tool_call.id,
                    "name": function_name,
                    "content": command
                })
            else:
                # Function not found
                messages.append({
                    "role": "tool",
                    "tool_call_id": tool_call.id,
                    "name": function_name,
                    "content": f"Error: Function {function_name} not found"
                })
    
    return {
        "messages": messages,
        "commands": commands_generated,
        "final_response": response_message.content if response_message.content else None
    }

In [8]:
client = OpenAI(
    api_key=os.environ["NRP_API_KEY"],
    base_url="https://ellm.nrp-nautilus.io/v1"
)

# Example queries
example_queries = [
    "Find samples containing Bacteroides in the Woltka-per-genome-WoLr2-3ab352 context",
    "Search for samples where age is less than 30 days and antibiotics were used",
    "Get all available contexts in the database",
    "Download study 10317 from Qiita using context Woltka-per-genome-WoLr2-3ab352 and save it as study_10317",
    "Find features associated with Escherichia coli in Woltka-per-genome-WoLr2-3ab352"
]

print("=" * 80)
print("REDBIOM AGENT - EXAMPLE QUERIES")
print("=" * 80)

for i, query in enumerate(example_queries, 1):
    print(f"\n{'='*80}")
    print(f"Query {i}: {query}")
    print('='*80)
    
    result = run_agent(client, query)
    
    if result["commands"]:
        print(f"\n✓ Generated {len(result['commands'])} command(s):")
        for j, cmd_info in enumerate(result["commands"], 1):
            print(f"\n  Command {j}:")
            print(f"    Function: {cmd_info['function']}")
            print(f"    Command:  {cmd_info['command']}")
    
    if result["final_response"]:
        print(f"\n💬 AI Response: {result['final_response']}")
    
    print()

print("=" * 80)
print("INTERACTIVE MODE")
print("=" * 80)
print("Enter your queries (type 'exit' to quit):\n")

while True:
    user_input = input("You: ").strip()
    
    if user_input.lower() in ['exit', 'quit', 'q']:
        print("Goodbye!")
        break
    
    if not user_input:
        continue
    
    print("\nProcessing...")
    result = run_agent(client, user_input)
    
    if result["commands"]:
        print(f"\n✓ Generated {len(result['commands'])} command(s):")
        for cmd_info in result["commands"]:
            print(f"  → {cmd_info['command']}")
    
    if result["final_response"]:
        print(f"\nAI: {result['final_response']}")
    
    print()

REDBIOM AGENT - EXAMPLE QUERIES

Query 1: Find samples containing Bacteroides in the Woltka-per-genome-WoLr2-3ab352 context

[Tool Call] search_taxon
[Arguments] {
  "context": "Woltka-per-genome-WoLr2-3ab352",
  "taxon": "Bacteroides"
}
Executing: redbiom search taxon --context Woltka-per-genome-WoLr2-3ab352 "Bacteroides"
[Command Generated] 

[Tool Call] search_features
[Arguments] {
  "context": "Woltka-per-genome-WoLr2-3ab352",
  "features": [
    "Bacteroides_feature_1",
    "Bacteroides_feature_2",
    "... (250 total)"
  ],
  "exact": false
}
Executing: redbiom search features --context Woltka-per-genome-WoLr2-3ab352 --min-count 1 Bacteroides_feature_1 Bacteroides_feature_2 ... (250 total)
[Command Generated] /bin/sh: -c: line 0: syntax error near unexpected token `('
/bin/sh: -c: line 0: `redbiom search features --context Woltka-per-genome-WoLr2-3ab352 --min-count 1 Bacteroides_feature_1 Bacteroides_feature_2 ... (250 total)'


[Tool Call] search_features
[Arguments] {
  "conte

KeyboardInterrupt: 

In [None]:
output = "beer.biom"
# context ="Woltka-per-genome-WoLr2-3ab352"
context = "Pick_closed-reference_OTUs-Greengenes-Illumina-16S-V4-5c6506"
samples = "10105.Ingredient.18"
result = available_functions["fetch_samples"](
    output=output,
    context=context,
    samples=[samples],
)

print(result)

Executing: redbiom fetch samples --context Pick_closed-reference_OTUs-Greengenes-Illumina-16S-V4-5c6506 --output beer.biom 10105.Ingredient.18
Success! Samples saved to beer.biom



In [None]:
# Additional example queries that should work:

# Example 1: Simple fetch
messages = [
    {"role": "system", "content": "You are a microbiome analyst. Use fetch_sample_metadata tool."},
    {"role": "user", "content": "Get all metadata and save to my_metadata.tsv"}
]

# Example 2: Fetch specific samples
messages = [
    {"role": "system", "content": "You are a microbiome analyst. Use fetch_sample_metadata tool."},
    {"role": "user", "content": "Fetch metadata for samples sample1 sample2 sample3 and save to output.tsv"}
]

# Example 3: Fetch with all columns
messages = [
    {"role": "system", "content": "You are a microbiome analyst. Use fetch_sample_metadata tool."},
    {"role": "user", "content": "Get sample metadata with all columns from Woltka-per-genome-WoLr2-3ab352, save to full_metadata.tsv"}
]

# Example 4: User specifies file type
messages = [
    {"role": "system", "content": "You are a microbiome analyst. Use fetch_sample_metadata tool."},
    {"role": "user", "content": "Fetch metadata and save as CSV to data.csv"}
]

In [None]:
import os
import dotenv
from openai import OpenAI
dotenv.load_dotenv()

client = OpenAI(
        api_key=os.environ["NRP_API_KEY"],
        base_url="https://ellm.nrp-nautilus.io/v1"
    )
# sanity check for LLM response
completion = client.chat.completions.create(
    model="gemma3",
    messages=[
        {"role": "system", "content": "Talk like a pirate."},
        {
            "role": "user",
            "content": "How do I check if a Python object is an instance of a class?",
        },
    ],
)

print(completion.choices[0].message.content)


Ahoy, matey! Ye be askin' how to see if a treasure (a Python object, aye!) be truly from a certain ship (a class)? Shiver me timbers, it be easier than swabbin' the deck! 

Ye use the `isinstance()` function, ye do! 

Here's how it be workin':

```python
isinstance(object, classinfo)
```

*   `object`: This be the treasure ye be inspectin'.
*   `classinfo`: This be the ship (class) ye be checkin' if the treasure came from.

It'll return `True` if the treasure *is* an instance o' that ship, and `False` if it ain't.

**Example, ye say? Aye, here be one!**

```python
class Pirate:
    def __init__(self, name):
        self.name = name

class Captain(Pirate):
    def __init__(self, name, ship):
        super().__init__(name)
        self.ship = ship

jack = Pirate("Jack Sparrow")
barbossa = Captain("Hector Barbossa", "Black Pearl")

print(isinstance(jack, Pirate))      # Output: True - Jack be a Pirate!
print(isinstance(barbossa, Pirate))  # Output: True - Barbossa be a Pirate too, seein' 

In [3]:
tools = [
    {
        "type": "function",
        "function": {
            "name": "search_redbiom_metadata",
            "description": "Search metadata using NLP-based queries. Use 'where' clauses for filtering (e.g., 'where age_days < 30'). Use operators: & (and), | (or), - (not). Set categories=True to search for metadata column names instead of values.",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "The search query (e.g., 'beer' or 'where age_days < 30')"
                    },
                    "categories": {
                        "type": "boolean",
                        "description": "If True, search for metadata column names instead of values",
                        "default": False
                    }
                },
                "required": ["query"]
            }
        }
    }
]

In [None]:

def search_redbiom_metadata(query: str, categories: bool = False) -> str:
    """
    Search metadata using NLP-based queries.
    
    Use 'where' clauses for filtering: 'where age_days < 30'
    Use operators: & (and), | (or), - (not)
    Set categories=True to search for metadata column names instead of values.
    """
    cmd = ["redbiom", "search", "metadata"]
    
    if categories:
        cmd.append("--categories")
    
    cmd.append(query)
    
    # try:
    #     result = subprocess.run(
    #         cmd, 
    #         capture_output=True, 
    #         text=True, 
    #         check=True,
    #         env={**os.environ, "REDBIOM_HOST": "http://redbiom.ucsd.edu:7330"}
    #     )
    #     return result.stdout
    # except subprocess.CalledProcessError as e:
    #     return f"Error: {e.stderr}"
    
    return cmd
available_functions = {"search_redbiom_metadata": search_redbiom_metadata}

In [None]:
def process_query(user_query: str) -> dict:
    """
    Process a user query by:
    1. Having the LLM decide if it needs to call a tool
    2. Execute the tool if needed
    3. Generate a final response with explanation
    """
    
    # Initial message to the LLM
    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant that helps users search redbiom metadata. When you use tools, always explain your reasoning about why you chose that tool and how you constructed the query."
        },
        {
            "role": "user",
            "content": user_query
        }
    ]
    
    # First API call - LLM decides whether to use tools
    client = OpenAI(
        api_key=os.environ["NRP_API_KEY"],
        base_url="https://ellm.nrp-nautilus.io/v1"
    )

    response = client.chat.completions.create(
        model="qwen3",
        messages=messages,
        tools=tools,
        tool_choice="auto"
    )
    
    response_message = response.choices[0].message
    messages.append(response_message)
    
    # Check if the LLM wants to call a function
    if response_message.tool_calls:
        # Execute each tool call
        for tool_call in response_message.tool_calls:
            function_name = tool_call.function.name
            function_args = json.loads(tool_call.function.arguments)
            
            # Call the actual function
            function_response = available_functions[function_name](**function_args)
            
            # Add the function response to messages
            messages.append({
                "role": "tool",
                "tool_call_id": tool_call.id,
                "name": function_name,
                "content": function_response
            })
        
        # Second API call - LLM generates explanation with results
        final_response = client.chat.completions.create(
            model="qwen3",
            messages=messages
        )
        
        return {
            "explanation": final_response.choices[0].message.content,
            "tool_used": function_name,
            "tool_args": function_args,
            "tool_result": function_response
        }
    else:
        # No tool was called, return the direct response
        return {
            "explanation": response_message.content,
            "tool_used": None,
            "tool_args": None,
            "tool_result": None
        }


In [9]:
user_query = "Get all the samples in which the word 'beer' is found"
    
result = process_query(user_query)

print("=" * 60)
print("EXPLANATION:")
print("=" * 60)
print(result["explanation"])
print()

if result["tool_used"]:
    print("=" * 60)
    print("TOOL DETAILS:")
    print("=" * 60)
    print(f"Tool: {result['tool_used']}")
    print(f"Arguments: {result['tool_args']}")
    print()
    print("=" * 60)
    print("RESULTS (first 10 lines):")
    print("=" * 60)
    print('\n'.join(result["tool_result"].split('\n')[:10]))

EXPLANATION:
```tool_code
[search_redbiom_metadata(query='beer')]
```



In [10]:
result

{'explanation': "```tool_code\n[search_redbiom_metadata(query='beer')]\n```",
 'tool_used': None,
 'tool_args': None,
 'tool_result': None}

# Retrieving Qiita studies

In [1]:
import os
from collections import defaultdict
import redbiom
import redbiom.search
import redbiom.summarize


def search_qiita_studies(query, search_type='metadata'):
    """
    Search Qiita's public redbiom database for microbiome studies.
    
    Args:
        query (str): Your search query
            Examples:
            - "gut microbiome obesity" (metadata)
            - "where age_years > 50 and body_site == 'gut'" (metadata)
            - "TACGGAGGATGCGAGCGTTATCCGG" (feature/sequence)
            - "4479944" (feature ID)
            - "Bacteroides" (taxon)
        
        search_type (str): 'metadata', 'feature', or 'taxon' (default: 'metadata')
    
    Returns:
        list: List of matching studies, each with:
            - study_id: Study identifier
            - study_title: Study name
            - num_artifacts: Number of artifacts found
            - num_samples: Number of samples found
            - samples: List of sample IDs
    
    Example:
        >>> results = search_qiita_studies("gut samples", "metadata")
        >>> for study in results:
        ...     print(f"{study['study_title']}: {study['num_samples']} samples")
    """
    # Set to use Qiita's public redbiom server
    os.environ['REDBIOM_HOST'] = 'http://qiita.ucsd.edu:7379'
    
    # Get available contexts (databases)
    contexts = redbiom.summarize.contexts()
    if not contexts:
        print("Warning: No redbiom contexts available")
        return []
    
    # Use the first/main context
    context = contexts[0]
    print(f"Searching in context: {context}")
    
    # Perform the search based on type
    sample_ids = []
    
    try:
        if search_type == 'metadata':
            # Natural language metadata search
            sample_ids = redbiom.search.metadata_full(query, context)
            
        elif search_type == 'feature':
            # Feature/sequence search
            if set(query.upper()) <= set('ACGT'):
                # It's a DNA sequence
                results = redbiom.search.features([query], context, exact=True)
            else:
                # It's a feature ID
                results = redbiom.search.features([query], context, exact=False)
            
            # Flatten results
            if isinstance(results, dict):
                sample_ids = []
                for samples in results.values():
                    sample_ids.extend(samples)
        
        elif search_type == 'taxon':
            # Taxonomic search
            sample_ids = redbiom.search.taxon(query, context)
        
        else:
            raise ValueError(f"Invalid search_type: {search_type}")
        
        if not sample_ids:
            print(f"No samples found for query: '{query}'")
            return []
        
        print(f"Found {len(sample_ids)} matching samples")
        
        # Get metadata to group by study
        metadata = redbiom.summarize.sample_metadata(sample_ids, context)
        
        # Group samples by study
        studies = defaultdict(lambda: {
            'samples': [],
            'title': 'Unknown Study',
            'study_id': None
        })
        
        for sample_id in sample_ids:
            if sample_id in metadata.index:
                row = metadata.loc[sample_id]
                study_id = row.get('qiita_study_id', 'unknown')
                study_title = row.get('qiita_study_title', f'Study {study_id}')
                
                studies[study_id]['samples'].append(sample_id)
                studies[study_id]['title'] = study_title
                studies[study_id]['study_id'] = study_id
        
        # Format results
        results = []
        for study_id, data in studies.items():
            results.append({
                'study_id': study_id,
                'study_title': data['title'],
                'num_artifacts': 1,  # Simplified
                'num_samples': len(data['samples']),
                'samples': data['samples']
            })
        
        # Sort by number of samples (most relevant first)
        results.sort(key=lambda x: x['num_samples'], reverse=True)
        
        return results
        
    except Exception as e:
        print(f"Search error: {e}")
        return []


def search_and_display(query, search_type='metadata', max_results=10):
    """
    Search and display results in a nice format.
    
    Args:
        query: Search query
        search_type: 'metadata', 'feature', or 'taxon'
        max_results: Maximum number of studies to display
    """
    print(f"\n{'='*70}")
    print(f"Searching for: '{query}' (type: {search_type})")
    print(f"{'='*70}\n")
    
    results = search_qiita_studies(query, search_type)
    
    if not results:
        print("No results found.")
        return
    
    print(f"\nFound {len(results)} studies with matching samples:\n")
    
    for i, study in enumerate(results[:max_results], 1):
        print(f"{i}. Study ID: {study['study_id']}")
        print(f"   Title: {study['study_title']}")
        print(f"   Samples: {study['num_samples']}")
        print(f"   First 5 samples: {', '.join(study['samples'][:5])}")
        if len(study['samples']) > 5:
            print(f"   ... and {len(study['samples']) - 5} more")
        print()
    
    if len(results) > max_results:
        print(f"... and {len(results) - max_results} more studies")

# Example usage
search_and_display("gut microbiome obesity", search_type='metadata')


Searching for: 'gut microbiome obesity' (type: metadata)



ConnectionError: HTTPConnectionPool(host='qiita.ucsd.edu', port=7379): Max retries exceeded with url: /HGETALL/state:contexts.json (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x115e57810>: Failed to establish a new connection: [Errno 61] Connection refused'))