# BERDL Pangenome Data Exploration - Simple & Robust

This notebook uses simpler queries to avoid timeouts on the BERDL API.

**Strategy**: Use small, focused queries instead of large JOINs

## 1. Setup

In [None]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import time

# Global plotting defaults: seaborn "whitegrid" theme first (it resets rcParams),
# then the default matplotlib figure size.
sns.set_theme(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Confirmation that the setup cell ran to completion.
print("‚úÖ Imports successful")

In [None]:
# Load the KBase auth token from the local .env file.
# Expected line: KBASE_AUTH_TOKEN="<token>"  (quotes are optional).
AUTH_TOKEN = None
with open('.env', 'r') as f:
    for line in f:
        # Split on the first '=' so only the exact key matches
        # (a longer key such as KBASE_AUTH_TOKEN_OLD is ignored).
        key, sep, value = line.strip().partition('=')
        if not sep or key.strip() != 'KBASE_AUTH_TOKEN':
            continue
        quoted_parts = value.split('"')
        if len(quoted_parts) >= 3:
            # Quoted value: take the text between the first pair of double quotes.
            AUTH_TOKEN = quoted_parts[1]
        else:
            # Unquoted (or single-quoted) value.
            AUTH_TOKEN = value.strip().strip("'")
        break

# Fail here with a clear message instead of a NameError further down.
if not AUTH_TOKEN:
    raise RuntimeError(
        "KBASE_AUTH_TOKEN not found (or empty) in .env; "
        'expected a line like KBASE_AUTH_TOKEN="<token>"'
    )

BASE_URL = "https://hub.berdl.kbase.us/apis/mcp"
DATABASE = "kbase_ke_pangenome"

# Headers sent with every request; the token itself is never printed.
HEADERS = {
    "Authorization": f"Bearer {AUTH_TOKEN}",
    "Content-Type": "application/json"
}

print(f"‚úÖ Connected to BERDL: {BASE_URL}")
print(f"‚úÖ Database: {DATABASE}")

In [None]:
def query_berdl(sql, limit=1000, timeout_retry=False):
    """Run a SQL query against the BERDL query endpoint.

    Args:
        sql: SQL text sent as the ``query`` field of the request payload.
        limit: Value sent as the ``limit`` field of the request payload.
        timeout_retry: When True, a request that times out (client-side
            timeout, or HTTP 503/504 from the server) is retried once after
            a short pause. When False (default) the request is tried once.

    Returns:
        A DataFrame built from the ``results`` field of the JSON response,
        an empty DataFrame if the response has no ``results`` field, or
        None if every attempt timed out.

    Raises:
        requests.exceptions.HTTPError: for any HTTP error other than 503/504.
    """
    url = f"{BASE_URL}/delta/tables/query"
    payload = {"query": sql, "limit": limit}

    max_attempts = 2 if timeout_retry else 1
    retry_delay_s = 2  # pause before the retry so the server can recover

    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.post(url, headers=HEADERS, json=payload, timeout=60)
            response.raise_for_status()
            data = response.json()

            if 'results' in data:
                return pd.DataFrame(data['results'])
            return pd.DataFrame()
        except requests.exceptions.Timeout:
            print("‚è± Query timeout")
        except requests.exceptions.HTTPError as e:
            # Decide on the real status code rather than searching the
            # exception message for the digits "503"/"504".
            status = getattr(e.response, "status_code", None)
            if status not in (503, 504):
                raise
            print(f"‚è± Server timeout: {e}")

        if attempt < max_attempts:
            time.sleep(retry_delay_s)

    # Every attempt timed out.
    return None

print("‚úÖ Helper functions defined")

## 2. Start Small - Query Pangenome Table Only

We start by querying the pangenome table on its own (no JOINs) to get per-species pangenome statistics.

In [None]:
# Single-table query with a small row limit: per-species pangenome counts,
# ordered so the species with the most genomes come first.
sql = f"""
SELECT 
    gtdb_species_clade_id,
    no_genomes,
    no_core,
    no_aux_genome,
    no_singleton_gene_clusters,
    no_gene_clusters
FROM {DATABASE}.pangenome
ORDER BY no_genomes DESC
LIMIT 100
"""

print("Querying top 100 species...")
df_pangenome = query_berdl(sql, limit=100)

# Treat both a missing result (None) and an empty frame as failure.
if df_pangenome is None or len(df_pangenome) == 0:
    print("‚ùå Query failed or timed out")
else:
    print(f"‚úÖ Loaded {len(df_pangenome)} species")
    display(df_pangenome.head(10))

## 3. Basic Pangenome Statistics

In [None]:
if df_pangenome is None or len(df_pangenome) == 0:
    print("‚ö†Ô∏è No data to analyze")
else:
    # Express each gene-cluster category as a percentage of all gene clusters.
    total_clusters = df_pangenome['no_gene_clusters']
    pct_sources = {
        'pct_core': 'no_core',
        'pct_accessory': 'no_aux_genome',
        'pct_singletons': 'no_singleton_gene_clusters',
    }
    for pct_col, count_col in pct_sources.items():
        df_pangenome[pct_col] = (df_pangenome[count_col] / total_clusters * 100).round(2)

    # Summary statistics of the raw counts.
    summary_cols = ['no_genomes', 'no_core', 'no_aux_genome', 'no_singleton_gene_clusters']
    print("\nüìä Pangenome Statistics (top 100 species):")
    print(df_pangenome[summary_cols].describe())

## 4. Visualizations

In [None]:
if df_pangenome is None or len(df_pangenome) == 0:
    print("‚ö†Ô∏è No data to visualize")
else:
    # Histogram of how many genomes each species has.
    histogram_options = dict(
        x='no_genomes',
        title='Distribution of Genomes per Species (Top 100)',
        labels={'no_genomes': 'Number of Genomes'},
        nbins=30,
    )
    fig = px.histogram(df_pangenome, **histogram_options)
    fig.show()

In [None]:
if df_pangenome is None or len(df_pangenome) == 0:
    print("‚ö†Ô∏è No data to visualize")
else:
    # Core vs accessory gene counts on log-log axes; marker size is the genome count.
    axis_labels = {
        'no_core': 'Core Genes',
        'no_aux_genome': 'Accessory Genes',
        'no_genomes': 'Genomes',
    }
    scatter_options = dict(
        x='no_core',
        y='no_aux_genome',
        size='no_genomes',
        hover_data=['gtdb_species_clade_id', 'no_genomes'],
        title='Core vs Accessory Genes (Top 100 Species)',
        labels=axis_labels,
        log_x=True,
        log_y=True,
    )
    fig = px.scatter(df_pangenome, **scatter_options)
    fig.show()

In [None]:
if df_pangenome is None or len(df_pangenome) == 0:
    print("‚ö†Ô∏è No data to visualize")
else:
    # One box per percentage column, added in a fixed order.
    box_specs = (
        ('pct_core', 'Core %'),
        ('pct_accessory', 'Accessory %'),
        ('pct_singletons', 'Singletons %'),
    )

    fig = go.Figure()
    for pct_col, trace_name in box_specs:
        fig.add_trace(go.Box(y=df_pangenome[pct_col], name=trace_name))

    fig.update_layout(
        title='Pangenome Composition Distribution (Top 100 Species)',
        yaxis_title='Percentage',
        showlegend=True,
    )
    fig.show()

## 5. Try to Get More Species (Incrementally)

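Let's try to get more data by re-running the same query with a larger row limit (500 instead of 100). If it fails, we keep the dataset loaded earlier.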

In [None]:
# Same single-table query as before, now asking for 500 rows.
sql = f"""
SELECT 
    gtdb_species_clade_id,
    no_genomes,
    no_core,
    no_aux_genome,
    no_singleton_gene_clusters,
    no_gene_clusters
FROM {DATABASE}.pangenome
ORDER BY no_genomes DESC
LIMIT 500
"""

print("Attempting to load 500 species...")
df_pangenome_larger = query_berdl(sql, limit=500)

if df_pangenome_larger is None or len(df_pangenome_larger) == 0:
    # Leave df_pangenome untouched so later cells keep using the earlier result.
    print("‚ùå Query failed - sticking with previous dataset")
else:
    print(f"‚úÖ Loaded {len(df_pangenome_larger)} species")
    df_pangenome = df_pangenome_larger  # Switch to the larger dataset

    # The new frame has no percentage columns yet; compute them again.
    total_clusters = df_pangenome['no_gene_clusters']
    pct_sources = {
        'pct_core': 'no_core',
        'pct_accessory': 'no_aux_genome',
        'pct_singletons': 'no_singleton_gene_clusters',
    }
    for pct_col, count_col in pct_sources.items():
        df_pangenome[pct_col] = (df_pangenome[count_col] / total_clusters * 100).round(2)

    print("\nUpdated statistics:")
    print(df_pangenome[['no_genomes', 'no_core', 'no_aux_genome']].describe())

## 6. Query Species Names (Separate Query)

Get species names from gtdb_species_clade table separately

In [None]:
# Get a few species names
if df_pangenome is not None and len(df_pangenome) > 0:
    # Take top 10 species IDs
    top_species_ids = df_pangenome.head(10)['gtdb_species_clade_id'].tolist()
    
    # Query their names
    species_ids_str = "','" .join(top_species_ids)
    sql = f"""
    SELECT 
        gtdb_species_clade_id,
        GTDB_species,
        mean_intra_species_ANI,
        ANI_circumscription_radius
    FROM {DATABASE}.gtdb_species_clade
    WHERE gtdb_species_clade_id IN ('{species_ids_str}')
    """
    
    print("Fetching species names for top 10...")
    df_species_info = query_berdl(sql, limit=20)
    
    if df_species_info is not None and len(df_species_info) > 0:
        print(f"‚úÖ Got {len(df_species_info)} species names")
        
        # Merge with pangenome data
        df_merged = df_pangenome.merge(df_species_info, on='gtdb_species_clade_id', how='left')
        
        print("\nTop 10 Species:")
        display(df_merged[['GTDB_species', 'no_genomes', 'no_core', 'no_aux_genome', 'mean_intra_species_ANI']].head(10))
    else:
        print("‚ùå Could not fetch species names")
else:
    print("‚ö†Ô∏è No pangenome data available")

## 7. Export Data for Further Analysis

Save the data we've collected

In [None]:
if df_pangenome is None or len(df_pangenome) == 0:
    print("‚ö†Ô∏è No data to export")
else:
    # Write the current dataset (including any derived columns) without the index.
    output_file = 'pangenome_data_sample.csv'
    df_pangenome.to_csv(output_file, index=False)

    saved_rows = len(df_pangenome)
    saved_columns = list(df_pangenome.columns)
    print(f"‚úÖ Saved {saved_rows} species to {output_file}")
    print(f"\nColumns: {saved_columns}")

## Summary

This notebook demonstrates a more robust approach to querying BERDL:

1. **Start small** - Query limited rows first
2. **Avoid complex JOINs** - Query tables separately and merge in pandas
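3. **Handle timeouts gracefully** - Client-side timeouts and HTTP 503/504 responses return `None` instead of crashing the notebook
4. **Incremental loading** - Start with a small row limit and raise it only once the small query succeeds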

Next steps:
- Query specific species of interest
- Explore genome quality metrics
- Analyze functional annotations for specific clades
- Use saved CSV files for offline analysis