# BERDL Pangenome Data Exploration

Exploratory data analysis of the KBase pangenome database containing:
- **293,059 genomes** across **27,690 species-level clades**
- Gene clusters, functional annotations, and ANI metrics
- GTDB taxonomy and quality metadata

**Database**: `kbase_ke_pangenome` in the BERDL Data Lakehouse

## 1. Setup & Connection

In [None]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json
from typing import Dict, List, Optional

# Set plotting style
# seaborn's "whitegrid" theme plus a 12 x 6 default matplotlib figure size
# (matplotlib's figsize is expressed in inches).
sns.set_theme(style="whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# Reached only if every import above succeeded.
print("✅ Imports successful")

In [None]:
# Load authentication token from .env
def _read_env_value(path: str, key: str) -> Optional[str]:
    """Return the value assigned to *key* in a dotenv-style file, or None.

    Accepts ``KEY="value"``, ``KEY='value'`` and unquoted ``KEY=value`` lines,
    with an optional leading ``export``. Only an exact key match counts, so a
    line such as ``KEY_OLD=...`` is ignored.

    Raises:
        FileNotFoundError: if *path* does not exist.
    """
    with open(path, 'r', encoding='utf-8') as env_file:
        for raw_line in env_file:
            name, separator, value = raw_line.partition('=')
            name = name.strip()
            if name.startswith('export '):
                name = name[len('export '):].strip()
            if not separator or name != key:
                continue

            value = value.strip()
            if value[:1] in ('"', "'"):
                # Quoted value: take the text up to the matching closing quote.
                closing = value.find(value[0], 1)
                return value[1:closing] if closing != -1 else value[1:]
            # Unquoted value: drop a trailing inline comment, if any.
            return value.split(' #', 1)[0].strip()
    return None


AUTH_TOKEN = _read_env_value('.env', 'KB_AUTH_TOKEN')
if not AUTH_TOKEN:
    # Fail here with a clear message instead of a NameError further down.
    raise RuntimeError(
        'KB_AUTH_TOKEN not found in .env; add a line like KB_AUTH_TOKEN="<token>"'
    )

# BERDL API configuration
BASE_URL = "https://hub.berdl.kbase.us/apis/mcp"
DATABASE = "kbase_ke_pangenome"

# Sent with every request made by the helper functions.
HEADERS = {
    "Authorization": f"Bearer {AUTH_TOKEN}",
    "Content-Type": "application/json"
}

# NOTE: no request is made in this cell; these lines only echo the configuration.
print(f"✅ Connected to BERDL API: {BASE_URL}")
print(f"✅ Database: {DATABASE}")

### Helper Functions

In [None]:
def query_berdl(sql: str, limit: int = 10000, offset: int = 0,
                timeout: Optional[float] = 300.0) -> pd.DataFrame:
    """Execute a SQL query against BERDL and return results as DataFrame.

    Args:
        sql: SQL text sent as the ``query`` field of the request payload.
        limit: Passed through to the API as ``limit``.
        offset: Passed through to the API as ``offset``.
        timeout: Seconds handed to ``requests.post`` so a stalled connection
            cannot hang the notebook forever; ``None`` waits indefinitely.

    Returns:
        A DataFrame built from the response rows, or an empty DataFrame when
        the response carries no rows.

    Raises:
        requests.exceptions.HTTPError: on a 4xx/5xx response.
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    url = f"{BASE_URL}/delta/tables/query"
    payload = {
        "query": sql,
        "limit": limit,
        "offset": offset
    }

    response = requests.post(url, headers=HEADERS, json=payload, timeout=timeout)
    response.raise_for_status()

    data = response.json()

    # Handle both 'result' and 'results' keys (API format changed). Falling
    # through on an empty/None 'result' means rows under 'results' are still used.
    rows = data.get('result') or data.get('results') or []

    if rows:
        return pd.DataFrame(rows)
    return pd.DataFrame()

def get_table_count(table: str, timeout: Optional[float] = 300.0) -> int:
    """Get total row count for a table.

    Args:
        table: Table name inside ``DATABASE``.
        timeout: Seconds handed to ``requests.post``; ``None`` waits indefinitely.

    Returns:
        The ``count`` field of the JSON response.

    Raises:
        requests.exceptions.HTTPError: on a 4xx/5xx response.
        requests.exceptions.Timeout: if the server does not answer in time.
        KeyError: if the response has no ``count`` field.
    """
    url = f"{BASE_URL}/delta/tables/count"
    payload = {"database": DATABASE, "table": table}

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.post(url, headers=HEADERS, json=payload, timeout=timeout)
    response.raise_for_status()

    return response.json()['count']

def get_table_schema(table: str, timeout: Optional[float] = 300.0) -> List[str]:
    """Get column names for a table.

    Args:
        table: Table name inside ``DATABASE``.
        timeout: Seconds handed to ``requests.post``; ``None`` waits indefinitely.

    Returns:
        The ``columns`` field of the JSON response.

    Raises:
        requests.exceptions.HTTPError: on a 4xx/5xx response.
        requests.exceptions.Timeout: if the server does not answer in time.
        KeyError: if the response has no ``columns`` field.
    """
    url = f"{BASE_URL}/delta/databases/tables/schema"
    payload = {"database": DATABASE, "table": table}

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.post(url, headers=HEADERS, json=payload, timeout=timeout)
    response.raise_for_status()

    return response.json()['columns']

def sample_table(table: str, limit: int = 5,
                 timeout: Optional[float] = 300.0) -> pd.DataFrame:
    """Get a sample of rows from a table.

    Args:
        table: Table name inside ``DATABASE``.
        limit: Passed through to the API as ``limit``.
        timeout: Seconds handed to ``requests.post``; ``None`` waits indefinitely.

    Returns:
        A DataFrame built from the ``sample`` field of the JSON response.

    Raises:
        requests.exceptions.HTTPError: on a 4xx/5xx response.
        requests.exceptions.Timeout: if the server does not answer in time.
        KeyError: if the response has no ``sample`` field.
    """
    url = f"{BASE_URL}/delta/tables/sample"
    payload = {"database": DATABASE, "table": table, "limit": limit}

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.post(url, headers=HEADERS, json=payload, timeout=timeout)
    response.raise_for_status()

    data = response.json()
    return pd.DataFrame(data['sample'])

print("✅ Helper functions defined")

## 2. Data Overview

In [None]:
# Get table counts with timeout handling
import time

tables = ['genome', 'pangenome', 'gtdb_species_clade', 'gene_cluster',
          'eggnog_mapper_annotations', 'gapmind_pathways', 'genome_ani']

# HTTP statuses reported as "table too large" rather than as a generic error.
TIMEOUT_STATUSES = (503, 504)

table_info = []
for table in tables:
    # Start from the generic failure row; it is overwritten on success or on
    # a recognised timeout, so every table ends up with exactly one row.
    row = {'Table': table, 'Row Count': 'Error', 'Columns': 'N/A'}
    try:
        print(f"Fetching info for {table}...", end=" ")
        count = get_table_count(table)
        columns = get_table_schema(table)
        row['Row Count'] = f"{count:,}"
        row['Columns'] = len(columns)
        print("✓")
    except requests.exceptions.Timeout:
        # Client-side timeout: treated the same as a gateway timeout.
        print("⏱ Timeout (will skip)")
        row['Row Count'] = 'Timeout - Table too large'
    except requests.exceptions.HTTPError as e:
        # Read the status code from the response itself; matching digits in
        # str(e) could also hit the URL or the reason text.
        status = e.response.status_code if e.response is not None else None
        if status in TIMEOUT_STATUSES:
            print("⏱ Timeout (will skip)")
            row['Row Count'] = 'Timeout - Table too large'
        else:
            print(f"✗ Error: {e}")
    except Exception as e:
        # Best effort: one failing table must not abort the overview.
        print(f"✗ Error: {e}")
    table_info.append(row)
    time.sleep(0.5)  # Small delay between requests

df_tables = pd.DataFrame(table_info)
display(df_tables)

## 3. Species-Level Analysis

### 3.1 Distribution of Genomes per Species

In [None]:
# Query pangenome statistics
# One row per species clade: pangenome counts joined with the species-level
# taxonomy and ANI columns, largest species first.
sql = f"""
SELECT 
    p.gtdb_species_clade_id,
    s.GTDB_species,
    s.GTDB_taxonomy,
    p.no_genomes,
    p.no_core,
    p.no_aux_genome as no_accessory,
    p.no_singleton_gene_clusters as no_singletons,
    p.no_gene_clusters,
    s.mean_intra_species_ANI,
    s.ANI_circumscription_radius
FROM {DATABASE}.pangenome p
JOIN {DATABASE}.gtdb_species_clade s 
    ON p.gtdb_species_clade_id = s.gtdb_species_clade_id
ORDER BY p.no_genomes DESC
"""

df_pangenome = query_berdl(sql, limit=30000)

# Coerce the count/ANI columns to numeric so the sorting, arithmetic and
# histograms in later cells work. The species deep-dive cell does the same for
# its own query results, which suggests the API may return numbers as strings
# (TODO confirm). The coercion leaves already-numeric columns unchanged.
pangenome_numeric_cols = [
    'no_genomes', 'no_core', 'no_accessory', 'no_singletons',
    'no_gene_clusters', 'mean_intra_species_ANI', 'ANI_circumscription_radius',
]
for pangenome_col in pangenome_numeric_cols:
    # An empty result has no columns at all, hence the membership check.
    if pangenome_col in df_pangenome.columns:
        df_pangenome[pangenome_col] = pd.to_numeric(df_pangenome[pangenome_col], errors='coerce')

print(f"Loaded {len(df_pangenome):,} species pangenomes")
df_pangenome.head(10)

In [None]:
# Distribution of genomes per species
genome_counts = df_pangenome['no_genomes']

# One entry per subplot column: (grid column, trace name, x-axis settings).
histogram_panels = [
    (1, 'Linear', {"title_text": "Genomes per Species"}),
    (2, 'Log', {"title_text": "Genomes per Species (log)", "type": "log"}),
]

fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=('Distribution of Genomes per Species', 'Log Scale'),
)

# Both panels plot the same counts; only the x-axis scale differs.
for panel_col, trace_name, x_axis in histogram_panels:
    fig.add_trace(
        go.Histogram(x=genome_counts, nbinsx=50, name=trace_name),
        row=1, col=panel_col,
    )

for panel_col, trace_name, x_axis in histogram_panels:
    fig.update_xaxes(row=1, col=panel_col, **x_axis)

for panel_col, trace_name, x_axis in histogram_panels:
    fig.update_yaxes(title_text="Number of Species", row=1, col=panel_col)

fig.update_layout(height=400, showlegend=False, title_text="Genome Sampling Across Species")
fig.show()

# Summary statistics
print("\nGenomes per species statistics:")
print(genome_counts.describe())

### 3.2 Top Species by Genome Count

In [None]:
# Top 20 species by genome count
ranking_columns = ['GTDB_species', 'no_genomes', 'no_core', 'no_accessory']
top_species = df_pangenome.nlargest(20, 'no_genomes')[ranking_columns]

# Horizontal bars, one per species.
bar_labels = {'no_genomes': 'Number of Genomes', 'GTDB_species': 'Species'}
fig = px.bar(
    top_species,
    x='no_genomes',
    y='GTDB_species',
    orientation='h',
    title='Top 20 Species by Genome Count',
    labels=bar_labels,
    height=600,
)
# Order the category axis by bar total.
fig.update_layout(yaxis=dict(categoryorder='total ascending'))
fig.show()

display(top_species)

## 4. Pangenome Characteristics

### 4.1 Core vs Accessory vs Singleton Genes

In [None]:
# Calculate percentages
# Each gene-cluster category expressed as a share of all gene clusters in the
# species, rounded to two decimals. Maps new column -> source count column.
share_columns = {
    'pct_core': 'no_core',
    'pct_accessory': 'no_accessory',
    'pct_singletons': 'no_singletons',
}
total_clusters = df_pangenome['no_gene_clusters']
for pct_column, count_column in share_columns.items():
    df_pangenome[pct_column] = (df_pangenome[count_column] / total_clusters * 100).round(2)

# Summary stats
print("Pangenome composition (%):\n")
print(df_pangenome[list(share_columns)].describe())

In [None]:
# Scatter: Core genes vs genome count
# Plot at most 5000 species. The sample is drawn without a fixed seed, so the
# subset can differ between runs.
scatter_sample_size = min(5000, len(df_pangenome))
scatter_points = df_pangenome.sample(scatter_sample_size)

scatter_labels = {
    'no_genomes': 'Number of Genomes in Species',
    'pct_core': 'Core Genes (%)',
    'pct_singletons': 'Singletons (%)',
}

fig = px.scatter(
    scatter_points,
    x='no_genomes',
    y='pct_core',
    color='pct_singletons',
    title='Core Gene Percentage vs Genome Sampling',
    labels=scatter_labels,
    hover_data=['GTDB_species'],
    color_continuous_scale='Viridis',
)
# Log x-axis for the genome counts.
fig.update_xaxes(type="log")
fig.show()

### 4.2 Open vs Closed Pangenomes

In [None]:
# Calculate accessory:core ratio (higher = more open)
# A species with zero core genes would otherwise produce +inf, which turns the
# describe() mean into inf and the std into NaN. Mask zero denominators so
# those species get NaN and are skipped by the histogram and statistics.
core_gene_counts = df_pangenome['no_core'].replace(0, np.nan)
df_pangenome['accessory_core_ratio'] = (df_pangenome['no_accessory'] / core_gene_counts).round(2)

# Filter species with at least 10 genomes for meaningful analysis
df_filtered = df_pangenome[df_pangenome['no_genomes'] >= 10].copy()

fig = px.histogram(
    df_filtered,
    x='accessory_core_ratio',
    nbins=50,
    title='Pangenome Openness (Accessory:Core Ratio, species with ≥10 genomes)',
    labels={'accessory_core_ratio': 'Accessory:Core Gene Ratio'},
    marginal='box'
)
fig.show()

print("\nAccessory:Core ratio statistics (species with ≥10 genomes):")
print(df_filtered['accessory_core_ratio'].describe())

## 5. Taxonomic Patterns

In [None]:
# Parse taxonomy into levels
def parse_taxonomy(tax_string) -> Dict[str, str]:
    """Parse GTDB taxonomy string into components.

    The string is split on ``;`` and each segment of the form
    ``<level>__<name>`` becomes one ``level -> name`` entry, e.g.
    ``"d__Bacteria;p__Bacillota"`` gives ``{'d': 'Bacteria', 'p': 'Bacillota'}``.

    Args:
        tax_string: Taxonomy string; None, NaN or any other non-string value
            is treated as missing.

    Returns:
        Mapping of level prefix to name. Segments without ``__`` are skipped.
        An empty name (``"g__"``) is kept as an empty string.
    """
    levels: Dict[str, str] = {}
    # Missing values (None, NaN, pd.NA) are not strings; neither is anything
    # else that lacks .split(), so one check covers both cases.
    if not isinstance(tax_string, str):
        return levels

    for segment in tax_string.split(';'):
        # Strip padding so "d__Bacteria; p__X" does not yield a key of ' p'.
        level, separator, name = segment.strip().partition('__')
        if separator:
            levels[level] = name
    return levels

# Extract phylum and class
tax_parsed = df_pangenome['GTDB_taxonomy'].apply(parse_taxonomy)

# New column -> rank prefix in the parsed taxonomy; ranks that are absent
# from a row become 'Unknown'.
rank_columns = (('phylum', 'p'), ('class', 'c'))
for rank_column, rank_prefix in rank_columns:
    df_pangenome[rank_column] = tax_parsed.apply(
        lambda levels, prefix=rank_prefix: levels.get(prefix, 'Unknown')
    )

print("✅ Taxonomy parsed")

### 5.1 Species Count by Phylum

In [None]:
# Count species per phylum
# value_counts() sorts descending, so head(20) keeps the 20 largest phyla.
phylum_counts = df_pangenome['phylum'].value_counts().head(20)

phylum_bar_options = dict(
    orientation='h',
    title='Top 20 Phyla by Species Count',
    labels={'x': 'Number of Species', 'y': 'Phylum'},
    height=600,
)
fig = px.bar(x=phylum_counts.values, y=phylum_counts.index, **phylum_bar_options)
fig.update_layout(yaxis=dict(categoryorder='total ascending'))
fig.show()

### 5.2 Genome Sampling by Phylum

In [None]:
# Total genomes per phylum
# Sum the per-species genome counts within each phylum, then keep the 20
# phyla with the largest totals.
phylum_genome_totals = df_pangenome.groupby('phylum')['no_genomes'].sum()
genomes_by_phylum = phylum_genome_totals.sort_values(ascending=False).head(20)

genome_bar_options = dict(
    orientation='h',
    title='Top 20 Phyla by Total Genome Count',
    labels={'x': 'Total Genomes', 'y': 'Phylum'},
    height=600,
)
fig = px.bar(x=genomes_by_phylum.values, y=genomes_by_phylum.index, **genome_bar_options)
fig.update_layout(yaxis=dict(categoryorder='total ascending'))
fig.show()

## 6. ANI (Average Nucleotide Identity) Analysis

In [None]:
# ANI distribution across species
# One entry per subplot column: (grid column, source column, trace name, x-axis title).
ani_panels = [
    (1, 'mean_intra_species_ANI', 'Mean ANI', "ANI (%)"),
    (2, 'ANI_circumscription_radius', 'Circumscription', "ANI Radius (%)"),
]

fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=('Mean Intra-Species ANI', 'ANI Circumscription Radius'),
)

for ani_col, ani_column, ani_trace_name, ani_axis_title in ani_panels:
    fig.add_trace(
        go.Histogram(x=df_pangenome[ani_column], nbinsx=50, name=ani_trace_name),
        row=1, col=ani_col,
    )

for ani_col, ani_column, ani_trace_name, ani_axis_title in ani_panels:
    fig.update_xaxes(title_text=ani_axis_title, row=1, col=ani_col)

for ani_col, ani_column, ani_trace_name, ani_axis_title in ani_panels:
    fig.update_yaxes(title_text="Number of Species", row=1, col=ani_col)

fig.update_layout(height=400, showlegend=False)
fig.show()

print("\nANI Statistics:")
print(df_pangenome[[panel[1] for panel in ani_panels]].describe())

## 7. Sample Species Deep Dive

Let's look at a specific well-sampled species in detail.

In [None]:
# Pick the most sampled species
most_sampled = df_pangenome.nlargest(1, 'no_genomes')
top_species_id = most_sampled['gtdb_species_clade_id'].values[0]
top_species_name = most_sampled['GTDB_species'].values[0]

print(f"Exploring: {top_species_name}")
print(f"Species ID: {top_species_id}")

# Use LIKE query to avoid '--' metacharacter issue
# NOTE(review): '%name%' is a substring match, so clade IDs of other species
# that merely contain this name would also be returned — confirm this is acceptable.
species_name_for_query = top_species_name.replace('s__', '')

sql = f"""
SELECT 
    g.genome_id,
    g.gtdb_species_clade_id,
    g.ncbi_biosample_id,
    m.checkm_completeness,
    m.checkm_contamination,
    m.genome_size,
    m.gc_percentage,
    m.contig_count
FROM {DATABASE}.genome g
LEFT JOIN {DATABASE}.gtdb_metadata m ON g.genome_id = m.accession
WHERE g.gtdb_species_clade_id LIKE '%{species_name_for_query}%'
LIMIT 500
"""

print(f"\nFetching genomes for {top_species_name}...")
df_species_genomes = query_berdl(sql, limit=500)

if df_species_genomes is None or len(df_species_genomes) == 0:
    print("❌ Failed to fetch genome data")
    # None tells the following cell that there is nothing to plot.
    df_species_genomes = None
else:
    # Convert string columns to numeric; unparseable values become NaN.
    numeric_cols = ['checkm_completeness', 'checkm_contamination', 'genome_size', 'gc_percentage', 'contig_count']
    for col in numeric_cols:
        if col not in df_species_genomes.columns:
            continue
        df_species_genomes[col] = pd.to_numeric(df_species_genomes[col], errors='coerce')

    print(f"✅ Loaded {len(df_species_genomes)} genomes")
    display(df_species_genomes.head(10))

In [None]:
# Quality metrics for this species
if df_species_genomes is None or len(df_species_genomes) == 0:
    print("⚠️ No genome data available for quality metrics visualization")
else:
    # One entry per subplot: (column, subplot title, x-axis title, grid row, grid col).
    quality_panels = [
        ('checkm_completeness', 'Genome Completeness', "Completeness (%)", 1, 1),
        ('checkm_contamination', 'Contamination', "Contamination (%)", 1, 2),
        ('genome_size', 'Genome Size', "Size (bp)", 2, 1),
        ('gc_percentage', 'GC Content', "GC %", 2, 2),
    ]

    # Check which columns are available
    has_completeness, has_contamination, has_size, has_gc = [
        panel[0] in df_species_genomes.columns for panel in quality_panels
    ]
    panel_available = (has_completeness, has_contamination, has_size, has_gc)

    if not any(panel_available):
        print("⚠️ No quality metric columns available in the data")
    else:
        # A missing metric keeps its grid slot but is titled 'N/A'.
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=tuple(
                panel[1] if available else 'N/A'
                for panel, available in zip(quality_panels, panel_available)
            ),
        )

        for panel, available in zip(quality_panels, panel_available):
            if not available:
                continue
            metric_column, panel_title, axis_title, panel_row, panel_col = panel
            fig.add_trace(
                go.Histogram(x=df_species_genomes[metric_column].dropna(), nbinsx=30),
                row=panel_row, col=panel_col,
            )

        # Axis titles are set for every slot, whether or not a trace was added.
        for metric_column, panel_title, axis_title, panel_row, panel_col in quality_panels:
            fig.update_xaxes(title_text=axis_title, row=panel_row, col=panel_col)

        fig.update_layout(height=800, showlegend=False, title_text=f"Genome Quality Metrics: {top_species_name}")
        fig.show()

        # Print summary stats
        if has_completeness:
            completeness = df_species_genomes['checkm_completeness']
            print(f"\nCompleteness: mean={completeness.mean():.2f}%, median={completeness.median():.2f}%")
        if has_contamination:
            contamination = df_species_genomes['checkm_contamination']
            print(f"Contamination: mean={contamination.mean():.2f}%, median={contamination.median():.2f}%")
        if has_size:
            # Reported in Mbp: the raw values are divided by 1e6.
            genome_size = df_species_genomes['genome_size']
            print(f"Genome size: mean={genome_size.mean()/1e6:.2f} Mbp, median={genome_size.median()/1e6:.2f} Mbp")
        if has_gc:
            gc_content = df_species_genomes['gc_percentage']
            print(f"GC content: mean={gc_content.mean():.2f}%, median={gc_content.median():.2f}%")

## 8. Summary Statistics

In [None]:
# Overall dataset summary
# Metric label -> column whose median is reported.
median_sources = {
    'Median Genomes/Species': 'no_genomes',
    'Median Core Genes': 'no_core',
    'Median Accessory Genes': 'no_accessory',
    'Median Singleton Genes': 'no_singletons',
}

summary = {
    'Total Species': len(df_pangenome),
    'Total Genomes': df_pangenome['no_genomes'].sum(),
    **{
        metric_label: df_pangenome[source_column].median()
        for metric_label, source_column in median_sources.items()
    },
    'Mean ANI (%)': df_pangenome['mean_intra_species_ANI'].mean(),
    'Unique Phyla': df_pangenome['phylum'].nunique(),
}

# One row per metric, in the order defined above.
summary_df = pd.DataFrame(list(summary.items()), columns=['Metric', 'Value'])
display(summary_df)

## 9. Next Steps

Potential analyses to explore:

1. **Functional Analysis**: Query `eggnog_mapper_annotations` and `gapmind_pathways` for functional enrichment
2. **Mobile Elements**: Analyze `genomad_mobile_elements` for plasmid/virus patterns
3. **Pairwise ANI**: Explore `genome_ani` table for within-species diversity
4. **Gene Cluster Analysis**: Dive into `gene_cluster` and `gene_genecluster_junction` tables
5. **Environment Correlations**: Link to `sample` and `ncbi_env` metadata
6. **Phylogenetic Patterns**: Compare ANI vs phylogenetic distance
7. **Species-specific Analyses**: Deep dives into specific clades of interest