# Quantitative Laws Governing Bacterial Pangenome Structure

**Analysis notebook for publication**

This notebook generates analyses and figures for a paper on universal scaling laws of bacterial pangenomes across 27,690 species.

## Key Questions:
1. Is there a universal rarefaction curve for core genomes?
2. What determines pangenome openness (open vs closed)?
3. How does taxonomy structure pangenome architecture?
4. Can we predict pangenome openness from genome features?

## Setup

In [None]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
# Suppress all warnings for the rest of the session so cell output stays clean.
warnings.filterwarnings('ignore')

# Matplotlib defaults applied to every figure in this notebook.
for _rc_key, _rc_value in (
    ('figure.figsize', (10, 6)),
    ('figure.dpi', 150),
    ('font.size', 11),
    ('axes.labelsize', 12),
    ('axes.titlesize', 14),
):
    plt.rcParams[_rc_key] = _rc_value
sns.set_style("whitegrid")

# Shared colour palette: one colour per pangenome partition plus two neutrals.
COLORS = dict(
    core='#2ecc71',
    accessory='#3498db',
    singleton='#e74c3c',
    primary='#2c3e50',
    secondary='#7f8c8d',
)

# Load authentication
# The token is read from the first line of .env that starts with KB_AUTH_TOKEN.
# Expected form is KB_AUTH_TOKEN="<token>"; an unquoted value after '=' is
# accepted as well. AUTH_TOKEN is reset first so a stale value from an earlier
# kernel run can never be reused silently.
AUTH_TOKEN = None
with open('.env', 'r') as f:
    for line in f:
        if line.startswith('KB_AUTH_TOKEN'):
            if '"' in line:
                # Text between the first pair of double quotes.
                AUTH_TOKEN = line.split('"')[1]
            else:
                # Unquoted (or single-quoted) value: everything after '='.
                AUTH_TOKEN = line.partition('=')[2].strip().strip("'")
            break

if not AUTH_TOKEN:
    # Fail here with a clear message instead of a NameError when HEADERS is built.
    raise RuntimeError("KB_AUTH_TOKEN not found (or empty) in .env; cannot authenticate against BERDL")

BASE_URL = "https://hub.berdl.kbase.us/apis/mcp"
DATABASE = "kbase_ke_pangenome"
HEADERS = {"Authorization": f"Bearer {AUTH_TOKEN}", "Content-Type": "application/json"}

def query_berdl(sql, limit=10000, offset=0):
    """Run a SQL statement through the BERDL query endpoint.

    POSTs ``sql`` together with ``limit`` and ``offset`` to
    ``{BASE_URL}/delta/tables/query`` using the module-level ``HEADERS``.
    An HTTP error status is raised via ``raise_for_status``. The rows are
    read from the ``result`` key of the JSON reply (falling back to
    ``results``) and returned as a DataFrame; a reply without rows yields an
    empty DataFrame.
    """
    endpoint = f"{BASE_URL}/delta/tables/query"
    body = {"query": sql, "limit": limit, "offset": offset}

    reply = requests.post(endpoint, headers=HEADERS, json=body)
    reply.raise_for_status()

    parsed = reply.json()
    fallback_rows = parsed.get('results', [])
    rows = parsed.get('result', fallback_rows)
    if not rows:
        return pd.DataFrame()
    return pd.DataFrame(rows)

def spearmanr(x, y):
    """Calculate the Spearman rank correlation and its two-sided p-value.

    Pairs in which either value is NaN are dropped first. Tied values receive
    average ranks and rho is the Pearson correlation of those ranks, which is
    the tie-correct definition (the 1 - 6*sum(d^2)/(n(n^2-1)) shortcut only
    holds when there are no ties). The p-value comes from the usual
    t-approximation with n - 2 degrees of freedom.

    Args:
        x, y: equal-length sequences of numbers (anything np.asarray can turn
            into a float array).

    Returns:
        (rho, p_value). Returns (nan, 1.0) when fewer than 3 complete pairs
        remain or when either variable is constant (rho undefined).
    """
    from scipy import stats  # local import: only this helper needs scipy

    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    keep = ~(np.isnan(x) | np.isnan(y))
    x, y = x[keep], y[keep]
    n = len(x)
    if n < 3:
        return np.nan, 1.0

    # Average ranks so that ties do not depend on the input order.
    x_ranks = stats.rankdata(x)
    y_ranks = stats.rankdata(y)
    if x_ranks.min() == x_ranks.max() or y_ranks.min() == y_ranks.max():
        return np.nan, 1.0

    rho = float(np.corrcoef(x_ranks, y_ranks)[0, 1])
    # Clamp tiny floating-point overshoot so 1 - rho**2 cannot go negative.
    rho = max(-1.0, min(1.0, rho))

    denom = 1.0 - rho**2
    if denom <= 0.0:
        # Perfect monotonic relationship.
        return rho, 0.0
    t = rho * np.sqrt((n - 2) / denom)
    p_value = float(2 * stats.t.sf(abs(t), n - 2))
    return rho, p_value

print("Setup complete")

## 1. Data Loading & Quality Control

In [None]:
# Query pangenome data with taxonomy
# One row per species clade: gene-cluster counts from the `pangenome` table
# joined on gtdb_species_clade_id to the GTDB species name, lineage string and
# ANI columns of `gtdb_species_clade`. no_aux_genome and
# no_singleton_gene_clusters are aliased to no_accessory / no_singletons,
# which are the column names used by the rest of the notebook.
sql = f"""
SELECT 
    p.gtdb_species_clade_id,
    s.GTDB_species,
    s.GTDB_taxonomy,
    p.no_genomes,
    p.no_core,
    p.no_aux_genome as no_accessory,
    p.no_singleton_gene_clusters as no_singletons,
    p.no_gene_clusters,
    p.no_CDSes,
    s.mean_intra_species_ANI,
    s.ANI_circumscription_radius
FROM {DATABASE}.pangenome p
JOIN {DATABASE}.gtdb_species_clade s 
    ON p.gtdb_species_clade_id = s.gtdb_species_clade_id
ORDER BY p.no_genomes DESC
"""

# limit=30000 overrides query_berdl's default of 10000 rows.
# NOTE(review): if the table ever exceeds 30000 rows the result is truncated
# without any warning — confirm against the printed row count.
df = query_berdl(sql, limit=30000)
print(f"Loaded {len(df):,} species pangenomes")

# Parse taxonomy
def parse_taxonomy(tax_string):
    """Split a semicolon-separated lineage string into {rank_prefix: name}.

    Each ``prefix__name`` part (e.g. ``p__Pseudomonadota``) becomes one entry;
    only the first ``__`` separates prefix from name, so names may themselves
    contain ``__``. Parts without ``__`` are ignored. Whitespace around the
    prefix and the name is stripped so that strings written with ``"; "``
    separators still produce the plain keys (``'p'``, ``'g'``, ...) that
    callers look up.

    Args:
        tax_string: lineage string, or a missing value (None / NaN).

    Returns:
        dict mapping rank prefix to name; empty for a missing value.
    """
    levels = {}
    if pd.isna(tax_string):
        return levels
    for part in tax_string.split(';'):
        level, sep, name = part.partition('__')
        if sep:
            levels[level.strip()] = name.strip()
    return levels

# Expand each parsed lineage into one column per rank; a rank missing from the
# lineage string is recorded as 'Unknown'.
tax_parsed = df['GTDB_taxonomy'].apply(parse_taxonomy)
for _rank_column, _rank_prefix in (
    ('domain', 'd'),
    ('phylum', 'p'),
    ('class', 'c'),
    ('order', 'o'),
    ('family', 'f'),
    ('genus', 'g'),
):
    df[_rank_column] = tax_parsed.apply(
        lambda ranks, key=_rank_prefix: ranks.get(key, 'Unknown')
    )

# Calculate derived metrics
# Each partition as a percentage of all gene clusters in the species.
for _partition in ('core', 'accessory', 'singletons'):
    df[f'pct_{_partition}'] = (
        df[f'no_{_partition}'] / df['no_gene_clusters'] * 100
    ).round(2)
# Gene clusters present in two or more genomes.
df['pct_shared'] = (100 - df['pct_singletons']).round(2)
# Openness is the accessory:core ratio; the 0.01 offset keeps log10 finite
# when the ratio is zero.
df['openness'] = (df['no_accessory'] / df['no_core']).round(3)
df['log_openness'] = np.log10(df['openness'] + 0.01)
df['avg_genes_per_genome'] = (df['no_CDSes'] / df['no_genomes']).round(0)

print(f"Total genomes: {df['no_genomes'].sum():,}")
print(f"Unique phyla: {df['phylum'].nunique()}")
print(f"Unique genera: {df['genus'].nunique()}")

In [None]:
# Summary statistics
# Prints dataset-level totals followed by min / median / max of genomes per
# species, core genome % and openness, all computed over the full `df`.
print("\n" + "="*60)
print("DATASET SUMMARY")
print("="*60)
print(f"Species: {len(df):,}")
print(f"Genomes: {df['no_genomes'].sum():,}")
# Cluster counts are summed per species, so this is not a count of distinct
# clusters across the whole dataset.
print(f"Gene clusters: ~{df['no_gene_clusters'].sum()/1e6:.1f}M (sum across species)")
print(f"\nGenomes per species:")
print(f"  Min: {df['no_genomes'].min()}")
print(f"  Median: {df['no_genomes'].median():.0f}")
print(f"  Max: {df['no_genomes'].max():,}")
print(f"\nCore genome %:")
print(f"  Min: {df['pct_core'].min():.1f}%")
print(f"  Median: {df['pct_core'].median():.1f}%")
print(f"  Max: {df['pct_core'].max():.1f}%")
print(f"\nOpenness (accessory:core ratio):")
print(f"  Min: {df['openness'].min():.2f}")
print(f"  Median: {df['openness'].median():.2f}")
print(f"  Max: {df['openness'].max():.2f}")

---
# Figure 1: The Landscape of Bacterial Pangenomes

In [None]:
# Figure 1: Multi-panel overview
# 2x2 grid: (A) histogram of core genome %, (B) histogram of log-openness,
# (C) core % against genomes sampled for the 8 most frequent phyla,
# (D) median share of each pangenome partition. Saved as PNG (300 dpi) and PDF.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# 1A: Distribution of core genome %
ax = axes[0, 0]
ax.hist(df['pct_core'], bins=50, color=COLORS['core'], edgecolor='white', alpha=0.8)
ax.axvline(df['pct_core'].median(), color='black', linestyle='--', linewidth=2, label=f'Median: {df["pct_core"].median():.1f}%')
ax.set_xlabel('Core Genome (%)')
ax.set_ylabel('Number of Species')
ax.set_title('A. Distribution of Core Genome Size')
ax.legend()

# 1B: Distribution of openness (log scale)
ax = axes[0, 1]
ax.hist(df['log_openness'], bins=50, color=COLORS['accessory'], edgecolor='white', alpha=0.8)
# The histogram shows log10(openness + 0.01), so the median marker is placed on
# that same transformed scale; the legend still reports the raw median ratio.
ax.axvline(df['log_openness'].median(), color='black', linestyle='--', linewidth=2,
           label=f'Median: {df["openness"].median():.2f}')
ax.set_xlabel('Openness (log₁₀ Accessory:Core Ratio)')
ax.set_ylabel('Number of Species')
ax.set_title('B. Distribution of Pangenome Openness')
ax.legend()

# 1C: Core % vs genome sampling (by phylum)
ax = axes[1, 0]
top_phyla = df['phylum'].value_counts().head(8).index.tolist()
df_plot = df[df['phylum'].isin(top_phyla)].copy()
for phylum in top_phyla:
    subset = df_plot[df_plot['phylum'] == phylum]
    ax.scatter(subset['no_genomes'], subset['pct_core'], alpha=0.4, s=10, label=phylum)
ax.set_xscale('log')
ax.set_xlabel('Number of Genomes (log scale)')
ax.set_ylabel('Core Genome (%)')
ax.set_title('C. Core Genome vs Sampling Depth')
ax.legend(loc='upper right', fontsize=8, ncol=2)

# 1D: Pangenome composition summary
ax = axes[1, 1]
categories = ['Core\n(>95% genomes)', 'Shared Accessory\n(2-95%)', 'Singletons\n(1 genome)']
# Shared accessory = accessory minus singletons, computed per species and then
# summarised. Subtracting two medians (as before) does not give the median of
# the per-species quantity.
# NOTE(review): assumes the accessory count includes singletons, as the
# subtraction implies — confirm against the table definition.
shared_accessory_pct = df['pct_accessory'] - df['pct_singletons']
medians = [
    df['pct_core'].median(),
    shared_accessory_pct.median(),
    df['pct_singletons'].median()
]
colors = [COLORS['core'], COLORS['accessory'], COLORS['singleton']]
bars = ax.bar(categories, medians, color=colors, edgecolor='white')
# Value label just above each bar.
for bar, val in zip(bars, medians):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1, f'{val:.1f}%',
            ha='center', va='bottom', fontsize=11, fontweight='bold')
ax.set_ylabel('Median % of Pangenome')
ax.set_title('D. Pangenome Composition (Median Values)')
ax.set_ylim(0, 70)

plt.tight_layout()
plt.savefig('figure1_pangenome_landscape.png', dpi=300, bbox_inches='tight')
plt.savefig('figure1_pangenome_landscape.pdf', bbox_inches='tight')
plt.show()
print("Figure 1 saved.")

---
# Figure 2: Sampling Depth Effects (Heaps' Law)

In [None]:
# Analyze relationship between sampling depth and core genome
# For species with many genomes, we can model the core genome decay
#
# This is a cross-species fit: each point is one species (its genome count and
# its core %), not a rarefaction curve within a species.

# Heaps' Law-style power law: Core genes ~ N^(-alpha) where N is number of genomes
# log(pct_core) ~ -alpha * log(N)

# Restrict to species with at least 5 genomes; df_sampled is reused by the
# later ANI, taxonomy and modelling cells.
df_sampled = df[df['no_genomes'] >= 5].copy()

# Fit log-log relationship
log_n = np.log10(df_sampled['no_genomes'])
log_core = np.log10(df_sampled['pct_core'])

# Remove inf/nan
# (log10 of a zero core % is -inf, which would break the least-squares fit)
mask = np.isfinite(log_n) & np.isfinite(log_core)
log_n_clean = log_n[mask]
log_core_clean = log_core[mask]

# Linear regression
# Degree-1 polyfit returns (slope, intercept); alpha is reported as -slope.
slope, intercept = np.polyfit(log_n_clean, log_core_clean, 1)
print(f"Heap's Law exponent (alpha): {-slope:.3f}")
print(f"Intercept: {intercept:.3f}")

# Correlation
# Computed on the untransformed columns of df_sampled; the p-value is discarded.
rho, _ = spearmanr(df_sampled['no_genomes'], df_sampled['pct_core'])
print(f"Spearman correlation: {rho:.3f}")

In [None]:
# Figure 2: Heap's Law analysis
# Three panels: (A) log-log scatter of core % vs genome count with the global
# power-law fit, (B) histogram of per-genus decay exponents, (C) mean core %
# per genome-count bin. Uses slope / intercept / rho from the fitting cell.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# 2A: Log-log plot with fit
ax = axes[0]
ax.scatter(df_sampled['no_genomes'], df_sampled['pct_core'], alpha=0.3, s=8, c=COLORS['primary'])

# Fit line
x_fit = np.logspace(0.5, 4.2, 100)
y_fit = 10**(intercept + slope * np.log10(x_fit))
ax.plot(x_fit, y_fit, 'r-', linewidth=2, label=f'Fit: Core ∝ N^{slope:.2f}')

ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('Number of Genomes')
ax.set_ylabel('Core Genome (%)')
ax.set_title(f'A. Core Genome Decay (Heap\'s Law)\nρ = {rho:.3f}')
ax.legend()

# 2B: Distribution of decay rates by genus
ax = axes[1]
# Calculate slope for each genus with enough data: at least 10 species, at
# least one of them with 20+ genomes, and at least 5 finite log-log points.
# Loop-local names are used so the global log_n / mask from the fitting cell
# are not overwritten.
genus_slopes = []
for genus, grp in df_sampled.groupby('genus'):
    if len(grp) >= 10 and grp['no_genomes'].max() >= 20:
        genus_log_n = np.log10(grp['no_genomes'])
        genus_log_core = np.log10(grp['pct_core'])
        finite = np.isfinite(genus_log_n) & np.isfinite(genus_log_core)
        if finite.sum() >= 5:
            s, _ = np.polyfit(genus_log_n[finite], genus_log_core[finite], 1)
            # alpha = -slope, the same sign convention as the printed global exponent.
            genus_slopes.append({'genus': genus, 'slope': s, 'alpha': -s, 'n_species': len(grp)})

# Explicit columns keep the frame well-formed even when no genus qualifies.
df_slopes = pd.DataFrame(genus_slopes, columns=['genus', 'slope', 'alpha', 'n_species'])
# The axis is labelled as the decay exponent α, so plot α (= -slope) rather
# than the raw slope.
if not df_slopes.empty:
    ax.hist(df_slopes['alpha'], bins=30, color=COLORS['accessory'], edgecolor='white', alpha=0.8)
    ax.axvline(df_slopes['alpha'].median(), color='red', linestyle='--', linewidth=2,
               label=f'Median: {df_slopes["alpha"].median():.3f}')
ax.axvline(-slope, color='black', linestyle=':', linewidth=2, label=f'Global: {-slope:.3f}')
ax.set_xlabel('Decay Exponent (α)')
ax.set_ylabel('Number of Genera')
ax.set_title('B. Variation in Decay Rates Across Genera')
ax.legend()
print(f"Calculated decay exponents for {len(df_slopes)} genera")

# 2C: Binned analysis - mean core % by sampling bin
ax = axes[2]
# NOTE(review): pd.cut bins are right-closed by default, so with df_sampled
# limited to >= 5 genomes the '2-5' bin can only hold species with exactly 5.
df_sampled['genome_bin'] = pd.cut(df_sampled['no_genomes'],
                                   bins=[2, 5, 10, 20, 50, 100, 500, 1000, 20000],
                                   labels=['2-5', '5-10', '10-20', '20-50', '50-100', '100-500', '500-1K', '>1K'])
bin_stats = df_sampled.groupby('genome_bin', observed=True).agg({
    'pct_core': ['mean', 'std', 'count']
}).reset_index()
bin_stats.columns = ['bin', 'mean', 'std', 'count']

# Error bars are the standard error of the mean (std / sqrt(count)).
x_pos = range(len(bin_stats))
ax.bar(x_pos, bin_stats['mean'], yerr=bin_stats['std']/np.sqrt(bin_stats['count']),
       color=COLORS['core'], edgecolor='white', alpha=0.8, capsize=3)
ax.set_xticks(x_pos)
ax.set_xticklabels(bin_stats['bin'], rotation=45, ha='right')
ax.set_xlabel('Genomes per Species')
ax.set_ylabel('Mean Core Genome (%)')
ax.set_title('C. Core Genome by Sampling Depth')

# Add sample sizes
for i, (_, row) in enumerate(bin_stats.iterrows()):
    ax.text(i, row['mean'] + 3, f'n={int(row["count"])}', ha='center', fontsize=8)

plt.tight_layout()
plt.savefig('figure2_heaps_law.png', dpi=300, bbox_inches='tight')
plt.savefig('figure2_heaps_law.pdf', bbox_inches='tight')
plt.show()
print("Figure 2 saved.")

---
# Figure 3: ANI Constrains Core Genome

In [None]:
# Analyze ANI relationship
# Keep only the sampled species (>= 5 genomes) that have a mean intra-species
# ANI value; df_ani is reused by the Figure 3 cell.
df_ani = df_sampled[df_sampled['mean_intra_species_ANI'].notna()].copy()

# Rank correlation of ANI with core genome %; the p-value is discarded.
rho_ani, _ = spearmanr(df_ani['mean_intra_species_ANI'], df_ani['pct_core'])
print(f"Spearman correlation (ANI vs core %): {rho_ani:.3f}")

# Rank correlation of ANI with openness (accessory:core ratio).
rho_ani_open, _ = spearmanr(df_ani['mean_intra_species_ANI'], df_ani['openness'])
print(f"Spearman correlation (ANI vs openness): {rho_ani_open:.3f}")

In [None]:
# Figure 3: ANI constrains core genome
# Three panels: (A) core % vs ANI, (B) openness vs ANI on a log y-axis, both
# coloured by log10 genome count, and (C) mean core % per 1-point ANI bin.
# Uses df_ani, rho_ani and rho_ani_open from the preceding cell.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# 3A: Core % vs ANI
ax = axes[0]
scatter = ax.scatter(df_ani['mean_intra_species_ANI'], df_ani['pct_core'], 
                     c=np.log10(df_ani['no_genomes']), cmap='viridis', 
                     alpha=0.5, s=15)
plt.colorbar(scatter, ax=ax, label='log₁₀(Genomes)')
ax.set_xlabel('Mean Intra-Species ANI (%)')
ax.set_ylabel('Core Genome (%)')
ax.set_title(f'A. Core Genome vs ANI\nρ = {rho_ani:.3f}')

# 3B: Openness vs ANI
ax = axes[1]
scatter = ax.scatter(df_ani['mean_intra_species_ANI'], df_ani['openness'],
                     c=np.log10(df_ani['no_genomes']), cmap='viridis',
                     alpha=0.5, s=15)
ax.set_yscale('log')
plt.colorbar(scatter, ax=ax, label='log₁₀(Genomes)')
ax.set_xlabel('Mean Intra-Species ANI (%)')
ax.set_ylabel('Openness (Accessory:Core Ratio)')
ax.set_title(f'B. Pangenome Openness vs ANI\nρ = {rho_ani_open:.3f}')

# 3C: ANI bins analysis
ax = axes[2]
# NOTE(review): pd.cut bins are right-closed by default, so ANI values at or
# below 95 get no bin (NaN) and drop out of the groupby below.
df_ani['ani_bin'] = pd.cut(df_ani['mean_intra_species_ANI'],
                           bins=[95, 96, 97, 98, 99, 100],
                           labels=['95-96', '96-97', '97-98', '98-99', '99-100'])
# The dict-of-lists aggregation yields two-level columns, hence the tuple
# keys such as ('pct_core', 'mean') used below.
ani_stats = df_ani.groupby('ani_bin', observed=True).agg({
    'pct_core': ['mean', 'std', 'count'],
    'openness': ['mean', 'std']
}).reset_index()

# Error bars are the standard error of the mean (std / sqrt(count)).
x = range(len(ani_stats))
ax.bar(x, ani_stats[('pct_core', 'mean')], 
       yerr=ani_stats[('pct_core', 'std')]/np.sqrt(ani_stats[('pct_core', 'count')]),
       color=COLORS['core'], edgecolor='white', capsize=3)
ax.set_xticks(x)
ax.set_xticklabels([str(b) for b in ani_stats['ani_bin']])
ax.set_xlabel('Intra-Species ANI Range (%)')
ax.set_ylabel('Mean Core Genome (%)')
ax.set_title('C. Core Genome by ANI Category')

plt.tight_layout()
plt.savefig('figure3_ani_core.png', dpi=300, bbox_inches='tight')
plt.savefig('figure3_ani_core.pdf', bbox_inches='tight')
plt.show()
print("Figure 3 saved.")

---
# Figure 4: Taxonomic Structure of Pangenome Architecture

In [None]:
# Analyze taxonomic patterns
# For every taxonomic rank, report the share of the total variance in core
# genome % that is carried by the rank's group means.

df_tax = df_sampled.copy()

# Total variance
total_var = df_tax['pct_core'].var()

# Ranks are evaluated from broadest to narrowest.
levels = ['phylum', 'class', 'order', 'family', 'genus']

# Replace each species' value by the mean of its group; the variance of that
# column, relative to the total variance, is the percentage explained.
var_explained = {
    rank: df_tax.groupby(rank)['pct_core'].transform('mean').var() / total_var * 100
    for rank in levels
}

print("Variance in core genome % explained by taxonomy:")
for level, var in var_explained.items():
    print(f"  {level}: {var:.1f}%")

In [None]:
# Figure 4: Taxonomic patterns
# 2x2 grid: (A) core % and (B) openness box plots for the 12 most frequent
# phyla, (C) variance explained per taxonomic rank, (D) core % box plots for
# the 10 most frequent genera of one phylum. Uses df_tax and var_explained
# from the preceding cell.
fig, axes = plt.subplots(2, 2, figsize=(14, 12))

# 4A: Core % by phylum
ax = axes[0, 0]
top_phyla = df_tax['phylum'].value_counts().head(12).index.tolist()
df_phyla = df_tax[df_tax['phylum'].isin(top_phyla)]

# Phyla are ordered by descending median core %; panel B reuses this order so
# the two panels line up.
phyla_order = df_phyla.groupby('phylum')['pct_core'].median().sort_values(ascending=False).index
sns.boxplot(data=df_phyla, x='phylum', y='pct_core', order=phyla_order, ax=ax, palette='Set3')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_xlabel('Phylum')
ax.set_ylabel('Core Genome (%)')
ax.set_title('A. Core Genome Distribution by Phylum')

# 4B: Openness by phylum
ax = axes[0, 1]
sns.boxplot(data=df_phyla, x='phylum', y='openness', order=phyla_order, ax=ax, palette='Set3')
ax.set_yscale('log')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_xlabel('Phylum')
ax.set_ylabel('Openness (log scale)')
ax.set_title('B. Pangenome Openness by Phylum')

# 4C: Variance explained by taxonomy
ax = axes[1, 0]
x = range(len(var_explained))
bars = ax.bar(x, var_explained.values(), color=COLORS['primary'], edgecolor='white')
ax.set_xticks(x)
ax.set_xticklabels([l.capitalize() for l in var_explained.keys()])
ax.set_xlabel('Taxonomic Level')
ax.set_ylabel('Variance Explained (%)')
ax.set_title('C. Taxonomic Structure of Core Genome Variation')
# Value label just above each bar.
for bar, val in zip(bars, var_explained.values()):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5, f'{val:.1f}%',
            ha='center', fontsize=10)

# 4D: Genus-level comparison within a phylum
ax = axes[1, 1]
# Pick Pseudomonadota (most species)
# NOTE(review): the phylum name is hard-coded; "most species" is presumed from
# the dataset, not computed here — verify.
df_pseudo = df_tax[df_tax['phylum'] == 'Pseudomonadota']
top_genera = df_pseudo['genus'].value_counts().head(10).index.tolist()
df_genera = df_pseudo[df_pseudo['genus'].isin(top_genera)]

# Genera are ordered by descending median core %.
genera_order = df_genera.groupby('genus')['pct_core'].median().sort_values(ascending=False).index
sns.boxplot(data=df_genera, x='genus', y='pct_core', order=genera_order, ax=ax, palette='husl')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_xlabel('Genus')
ax.set_ylabel('Core Genome (%)')
ax.set_title('D. Variation Within Pseudomonadota (Top 10 Genera)')

plt.tight_layout()
plt.savefig('figure4_taxonomic_patterns.png', dpi=300, bbox_inches='tight')
plt.savefig('figure4_taxonomic_patterns.pdf', bbox_inches='tight')
plt.show()
print("Figure 4 saved.")

---
# Figure 5: Predictive Model of Pangenome Openness

In [None]:
# Build a predictive model for pangenome openness
# Features: genome count, ANI, avg genes per genome, taxonomy
# Target: log10(openness). A random forest is trained on an 80/20 split and
# its test-set R², MAE and feature importances are reported.

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score, mean_absolute_error

# Prepare features
df_model = df_sampled.dropna(subset=['mean_intra_species_ANI', 'avg_genes_per_genome', 'openness']).copy()

# log10 is undefined for a zero ratio and infinite when the core count was
# zero; dropna does not catch either case and the regressor rejects non-finite
# targets, so such species are removed before the transform.
valid_target = np.isfinite(df_model['openness']) & (df_model['openness'] > 0)
n_excluded = int((~valid_target).sum())
if n_excluded:
    print(f"Excluded {n_excluded} species with zero or non-finite openness (log10 undefined)")
df_model = df_model[valid_target].copy()

# Encode taxonomy
# Phylum names become integer codes so they can sit in the numeric matrix.
le_phylum = LabelEncoder()
df_model['phylum_enc'] = le_phylum.fit_transform(df_model['phylum'])

# Features
# Display names are keyed by column so the importance table cannot drift out
# of step with the feature order.
feature_labels = {
    'no_genomes': 'Genome Count',
    'mean_intra_species_ANI': 'Intra-Species ANI',
    'avg_genes_per_genome': 'Avg Genes/Genome',
    'phylum_enc': 'Phylum',
}
feature_cols = list(feature_labels)
X = df_model[feature_cols].values
y = np.log10(df_model['openness'].values)  # Predict log-openness

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest
rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

# Evaluate
y_pred = rf.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"Random Forest Performance:")
print(f"  R² = {r2:.3f}")
print(f"  MAE = {mae:.3f} (log-openness)")

# Feature importance
importance = pd.DataFrame({
    'feature': [feature_labels[col] for col in feature_cols],
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)
print(f"\nFeature Importance:")
print(importance.to_string(index=False))

In [None]:
# Figure 5: Predictive model
# Three panels: (A) feature importances, (B) predicted vs observed
# log-openness on the test split, (C) test residuals against ANI. Uses
# importance, y_test, y_pred, X_test and r2 from the modelling cell.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# 5A: Feature importance
ax = axes[0]
# `importance` is already sorted, so colours follow bar position (rank), not
# a fixed feature.
colors = [COLORS['primary'], COLORS['accessory'], COLORS['core'], COLORS['secondary']]
bars = ax.barh(importance['feature'], importance['importance'], color=colors)
ax.set_xlabel('Feature Importance')
ax.set_title('A. Predictors of Pangenome Openness')
# Put the most important feature at the top.
ax.invert_yaxis()

# 5B: Predicted vs Observed
ax = axes[1]
ax.scatter(y_test, y_pred, alpha=0.3, s=10, c=COLORS['primary'])
# Identity line spanning the observed range.
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', linewidth=2, label='Perfect prediction')
ax.set_xlabel('Observed log₁₀(Openness)')
ax.set_ylabel('Predicted log₁₀(Openness)')
ax.set_title(f'B. Predicted vs Observed Openness\nR² = {r2:.3f}')
ax.legend()

# 5C: Residuals by ANI
ax = axes[2]
residuals = y_test - y_pred
# Column 1 of X_test is mean_intra_species_ANI (second entry of feature_cols).
ax.scatter(X_test[:, 1], residuals, alpha=0.3, s=10, c=COLORS['accessory'])
ax.axhline(0, color='red', linestyle='--', linewidth=2)
ax.set_xlabel('Intra-Species ANI (%)')
ax.set_ylabel('Residual (Observed - Predicted)')
ax.set_title('C. Residuals by ANI')

plt.tight_layout()
plt.savefig('figure5_predictive_model.png', dpi=300, bbox_inches='tight')
plt.savefig('figure5_predictive_model.pdf', bbox_inches='tight')
plt.show()
print("Figure 5 saved.")

---
# Summary Statistics for Paper

In [None]:
# Generate key statistics for paper text
# Pure reporting cell: it reads values computed by earlier cells (df, slope,
# rho, rho_ani, rho_ani_open, r2, mae, importance, var_explained) and prints
# them under markdown-style headings.
print("="*70)
print("KEY STATISTICS FOR PAPER")
print("="*70)

# Dataset, core genome and openness figures use the full df, not df_sampled.
print(f"\n## Dataset")
print(f"- Species: {len(df):,}")
print(f"- Genomes: {df['no_genomes'].sum():,}")
print(f"- Phyla: {df['phylum'].nunique()}")
print(f"- Genera: {df['genus'].nunique()}")

print(f"\n## Core Genome")
print(f"- Median: {df['pct_core'].median():.1f}%")
print(f"- IQR: {df['pct_core'].quantile(0.25):.1f}% - {df['pct_core'].quantile(0.75):.1f}%")
print(f"- Range: {df['pct_core'].min():.1f}% - {df['pct_core'].max():.1f}%")

print(f"\n## Pangenome Openness (Accessory:Core Ratio)")
print(f"- Median: {df['openness'].median():.2f}")
print(f"- IQR: {df['openness'].quantile(0.25):.2f} - {df['openness'].quantile(0.75):.2f}")
print(f"- Range: {df['openness'].min():.2f} - {df['openness'].max():.2f}")

# alpha is reported as the negated log-log slope.
print(f"\n## Heap's Law")
print(f"- Global decay exponent (α): {-slope:.3f}")
print(f"- Sampling-core correlation: ρ = {rho:.3f}")

print(f"\n## ANI Relationship")
print(f"- ANI-core correlation: ρ = {rho_ani:.3f}")
print(f"- ANI-openness correlation: ρ = {rho_ani_open:.3f}")

# `importance` is sorted in descending order, so row 0 is the top predictor.
print(f"\n## Predictive Model")
print(f"- R² = {r2:.3f}")
print(f"- MAE = {mae:.3f} (log-openness)")
print(f"- Top predictor: {importance.iloc[0]['feature']} ({importance.iloc[0]['importance']:.3f})")

print(f"\n## Taxonomic Structure")
for level, var in var_explained.items():
    print(f"- {level.capitalize()}: {var:.1f}% variance explained")

In [None]:
# Save data for supplementary materials
df.to_csv('supplementary_table_s1_all_species.csv', index=False)
print(f"Saved supplementary table with {len(df):,} species")

---
# Next Steps

1. **Figure 6**: Query genomad mobile elements OR environmental metadata
2. **Supplementary**: Quality control filters, sensitivity analyses
3. **Additional**: Within-species rarefaction curves for top 20 species