In [9]:
from datasets import load_dataset
from biomni.eval import BiomniEval1
from biomni.agent import A1
from biomni.eval import BiomniEval1
import pandas as pd
import os
file_path = 'biomni_eval1_dataset.parquet'
evaluator = BiomniEval1()
# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("biomni/Eval1")
if os.path.exists(file_path):
    df = pd.read_parquet(file_path)
    print(f"Successfully loaded '{file_path}' into DataFrame 'df'. Shape: {df.shape}")
else:
    print(f"Error: The file '{file_path}' was not found.")
    print("Please make sure the file is in the same directory as this script.")
    # Exit script or handle error if file is critical
    df = None # Ensure df is explicitly not defined if file is missing

Loaded BiomniEval1 dataset: 433 instances across 10 tasks
Successfully loaded 'biomni_eval1_dataset.parquet' into DataFrame 'df'. Shape: (433, 6)


In [10]:
# Initialize agent and evaluator
agent = A1(path='./data', llm='claude-sonnet-4-20250514')
evaluator = BiomniEval1()

# Get validation instances for a task
val_instances = evaluator.get_instances_by_task(
    'gwas_causal_gene_opentargets', 
    split='val'
)
#val_instances
selected_val_instances = val_instances.iloc[1:4]
print(selected_val_instances)

ðŸŽ“ Academic mode: Using all datasets (including non-commercial)

ðŸ”§ BIOMNI CONFIGURATION
ðŸ“‹ DEFAULT CONFIG (Including Database LLM):
  Path: ./data
  Timeout Seconds: 600
  Llm: claude-sonnet-4-5
  Temperature: 0.7
  Use Tool Retriever: True
  Commercial Mode: Academic (all datasets)

ðŸ¤– AGENT LLM (Constructor Override):
  LLM Model: claude-sonnet-4-20250514

Checking and downloading missing data lake files...
Loaded BiomniEval1 dataset: 433 instances across 10 tasks
     instance_id  task_instance_id  \
334          334               619   
335          335               760   
336          336               645   

                                                prompt  \
334  Your task is to identify likely causal genes w...   
335  Your task is to identify likely causal genes w...   
336  Your task is to identify likely causal genes w...   

                        task_name split answer  
334  gwas_causal_gene_opentargets   val  PPARG  
335  gwas_causal_gene_opentargets   

In [11]:
from biomni.agent import A1
from biomni.eval import BiomniEval1
import re

def extract_answer(agent_output):
    """
    Extract the answer from <solution></solution> tags in the agent output.
    
    Args:
        agent_output: String output from agent.go() containing <solution> tags
        
    Returns:
        str: The extracted answer, or None if no solution tag found
    """
    # Handle None or empty input
    if not agent_output:
        return None
    
    # Convert to string if needed (in case it's not already)
    output_str = str(agent_output)
    
    # Use regex to find content between <solution> and </solution> tags
    # re.DOTALL allows . to match newlines
    match = re.search(r'<solution>(.*?)</solution>', output_str, re.DOTALL)
    
    if match:
        # Extract the content and strip whitespace
        answer = match.group(1).strip()
        return answer
    
    # If no solution tag found, return None
    return None
# Initialize agent and evaluator
#    split='val'
#)
evaluations = []
for _, row in selected_val_instances.iterrows():
    # Run agent on prompt
    result = agent.go(row['prompt'])
    
    # Extract answer from agent output (implementation-specific)
    user_answer = extract_answer(result)  # Custom function
    
    evaluations.append((
        row['task_name'],
        row['task_instance_id'],
        user_answer
    ))
evaluations

Using prompt-based retrieval with the agent's LLM

Your task is to identify likely causal genes within a locus for a given GWAS phenotype. From the list, provide only the likely causal gene (matching one of the given genes). 
Identify the causal gene.
GWAS phenotype: Type 2 diabetes (type II diabetes mellitus)
Genes in locus: {CAND2},{ENSG00000289809},{MKRN2},{MKRN2OS},{PPARG},{RAF1},{RPL32},{SYN2},{TIMP4},{TMEM40},{TSEN2}


I'll help you identify the likely causal gene for Type 2 diabetes from the given list. Let me create a systematic plan to analyze each gene and determine which one is most likely to be causally related to Type 2 diabetes.

## Plan:
1. [ ] Research each gene in the locus to understand their functions and known associations
2. [ ] Query databases for Type 2 diabetes associations with each gene
3. [ ] Look up literature evidence for each gene's role in diabetes/metabolism
4. [ ] Analyze the biological pathways and mechanisms
5. [ ] Identify the most likely causal gene

[('gwas_causal_gene_opentargets', 619, '\\nPPARG\\n'),
 ('gwas_causal_gene_opentargets', 760, '\\nCHEK2\\n'),
 ('gwas_causal_gene_opentargets', 645, '\\nRREB1\\n')]

In [12]:
def clean_answer(answer): 
    """
    Clean answer by removing all whitespace, newlines, and escaped newlines
    """
    if answer is None:
        return ""
    
    answer_str = str(answer)
    
    # Remove escaped newlines (\\n)
    answer_str = answer_str.replace('\\n', '')
    
    # Remove actual newlines (\n)
    answer_str = answer_str.replace('\n', '')
    
    # Remove all whitespace
    answer_str = answer_str.strip()
    
    return answer_str

def calculate_accuracy(evaluations, eval_df):
    """
    Calculate accuracy by matching evaluation results with ground truth.
    """
    correct = 0
    total = len(evaluations)
    
    print("=" * 60)
    print("EVALUATION RESULTS")
    print("=" * 60)
    
    for task_name, task_instance_id, user_answer in evaluations:
        # Find ground truth
        ground_truth_row = eval_df[eval_df['task_instance_id'] == task_instance_id]
        
        if len(ground_truth_row) > 0:
            ground_truth = ground_truth_row.iloc[0]['answer']
            
            # Clean both answers
            user_clean = clean_answer(user_answer)
            truth_clean = clean_answer(ground_truth)
            
            # Case-insensitive comparison
            is_correct = user_clean.upper() == truth_clean.upper()
            
            if is_correct:
                correct += 1
                status = "âœ“ CORRECT"
            else:
                status = "âœ— INCORRECT"
            
            print(f"\nTask Instance ID: {task_instance_id}")
            print(f"  User Answer (raw):    '{user_answer}'")
            print(f"  User Answer (clean):  '{user_clean}'")
            print(f"  Ground Truth (clean): '{truth_clean}'")
            print(f"  Status: {status}")
        else:
            print(f"\nâš  Task Instance ID {task_instance_id}: No ground truth found")
    
    accuracy = correct / total if total > 0 else 0
    
    print("\n" + "=" * 60)
    print(f"Total: {total} | Correct: {correct} | Accuracy: {accuracy:.2%}")
    print("=" * 60)
    
    return {
        'total': total,
        'correct': correct,
        'accuracy': accuracy
    }

# Your evaluation results
evaluations = [
    ('gwas_causal_gene_opentargets', 619, 'PPARG'),
    ('gwas_causal_gene_opentargets', 760, '\\nCHEK2\\n'),
    ('gwas_causal_gene_opentargets', 645, '\\nRREB1\\n')
]

# Load eval DataFrame
eval_df = pd.read_parquet('biomni_eval1_dataset.parquet')

# Calculate accuracy
results = calculate_accuracy(evaluations, eval_df)

EVALUATION RESULTS

Task Instance ID: 619
  User Answer (raw):    'PPARG'
  User Answer (clean):  'PPARG'
  Ground Truth (clean): 'PPARG'
  Status: âœ“ CORRECT

Task Instance ID: 760
  User Answer (raw):    '\nCHEK2\n'
  User Answer (clean):  'CHEK2'
  Ground Truth (clean): 'CHEK2'
  Status: âœ“ CORRECT

Task Instance ID: 645
  User Answer (raw):    '\nRREB1\n'
  User Answer (clean):  'RREB1'
  Ground Truth (clean): 'RREB1'
  Status: âœ“ CORRECT

Total: 3 | Correct: 3 | Accuracy: 100.00%


In [15]:
results

{'total': 3, 'correct': 3, 'accuracy': 1.0}