# NB02: PHB Pathway Distribution Across the Bacterial Tree

**Purpose**: Map PHB pathway completeness across the GTDB taxonomy to identify clades enriched or depleted for PHB capability.

**Requires**: BERDL JupyterHub (Spark session)

**Inputs**: `data/phb_species_summary.tsv` from NB01

**Outputs**:
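- `data/phb_by_taxonomy.tsv` — per-species table: GTDB taxonomy, pangenome statistics, and PHB pathway status
- `data/phb_by_order.tsv` — PHB prevalence by order (grouped by phylum/class/order)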
- `figures/phb_prevalence_by_phylum.png` — stacked bar chart of PHB prevalence for the 20 largest phyla
- `figures/phb_core_vs_accessory.png` — Core vs accessory phaC across clades
- `figures/phb_pathway_completeness.png` — Pathway completeness distribution

In [None]:
# Spark session used for the pangenome queries in the cells below.
# NOTE(review): get_spark_session is neither defined nor imported in this
# notebook; presumably the BERDL JupyterHub kernel provides it — confirm.
spark = get_spark_session()

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

PROJECT_DIR = os.path.expanduser('~/BERIL-research-observatory/projects/phb_granule_ecology')
DATA_DIR = os.path.join(PROJECT_DIR, 'data')
FIG_DIR = os.path.join(PROJECT_DIR, 'figures')

# The figure cells call plt.savefig() into FIG_DIR, which fails with
# FileNotFoundError if the directory does not exist yet (e.g. fresh checkout).
# DATA_DIR is not created here: it must already hold the NB01 output read below.
os.makedirs(FIG_DIR, exist_ok=True)

# Load NB01 results (one row per species in which NB01 found PHB genes)
species_phb = pd.read_csv(os.path.join(DATA_DIR, 'phb_species_summary.tsv'), sep='\t')
print(f'Loaded {len(species_phb):,} species with PHB genes')

In [None]:
# Fetch the GTDB lineage and pangenome size statistics for every species clade.
# Both joins are inner joins, so a clade without a matching row in either the
# taxonomy table or the pangenome table does not appear in the result.
# NOTE(review): downstream counts assume one row per gtdb_species_clade_id;
# verify that neither join fans out.
taxonomy_sql = """
    SELECT sc.gtdb_species_clade_id,
           sc.GTDB_taxonomy,
           t.gtdb_phylum, t.gtdb_class, t.gtdb_order, t.gtdb_family, t.gtdb_genus,
           p.no_genomes, p.no_core, p.no_aux_genome, p.no_gene_clusters
    FROM kbase_ke_pangenome.gtdb_species_clade sc
    JOIN kbase_ke_pangenome.gtdb_taxonomy_r214v1 t
        ON sc.representative_genome_id = t.gtdb_taxonomy_id
    JOIN kbase_ke_pangenome.pangenome p
        ON sc.gtdb_species_clade_id = p.gtdb_species_clade_id
"""
taxonomy = spark.sql(taxonomy_sql).toPandas()

# Quick sanity summary of what came back.
n_taxonomy_rows = len(taxonomy)
n_phyla = taxonomy['gtdb_phylum'].nunique()
print(f'Total species with taxonomy: {n_taxonomy_rows:,}')
print(f'Phyla: {n_phyla}')

In [None]:
# Attach NB01's PHB calls to the full species list.
# The left join keeps every species from `taxonomy`; species with no row in
# `species_phb` come through with NaN and are labelled PHB-absent below.
# NOTE(review): assumes species_phb has at most one row per
# gtdb_species_clade_id; duplicates would multiply rows here — confirm.
phb_columns = ['gtdb_species_clade_id', 'phb_status', 'phb_genes_str',
               'phaC_is_core', 'phaC_is_aux']
tax_phb = pd.merge(taxonomy, species_phb[phb_columns],
                   on='gtdb_species_clade_id', how='left')

tax_phb['phb_status'] = tax_phb['phb_status'].fillna('absent')
status = tax_phb['phb_status']
# has_phaC is true for the two statuses listed here: 'complete' and 'synthase_only'.
tax_phb['has_phaC'] = (status == 'complete') | (status == 'synthase_only')
tax_phb['has_complete_pathway'] = status.eq('complete')

phac_pct = tax_phb['has_phaC'].mean() * 100
complete_pct = tax_phb['has_complete_pathway'].mean() * 100

print('PHB pathway status across all species:')
print(tax_phb['phb_status'].value_counts())
print(f'\nOverall phaC prevalence: {phac_pct:.1f}%')
print(f'Overall complete pathway: {complete_pct:.1f}%')

In [None]:
# PHB prevalence by phylum
# One row per phylum: species count, number/percent carrying phaC, and
# number/percent with the complete pathway. Percentages are rounded to one
# decimal. Rows are sorted by species count, largest first; the Figure 1 cell
# takes the first 20 rows of this table.
MIN_PHYLUM_SPECIES = 50  # phyla with fewer species are hidden from the display below

phylum_stats = tax_phb.groupby('gtdb_phylum').agg(
    n_species=('gtdb_species_clade_id', 'count'),
    n_phaC=('has_phaC', 'sum'),
    pct_phaC=('has_phaC', lambda x: x.mean() * 100),
    n_complete=('has_complete_pathway', 'sum'),
    pct_complete=('has_complete_pathway', lambda x: x.mean() * 100),
).round(1).sort_values('n_species', ascending=False)

# The label is derived from the threshold so it always matches the filter
# (the filter is inclusive: >=, not >).
print(f'PHB prevalence by phylum (>={MIN_PHYLUM_SPECIES} species):')
phylum_stats[phylum_stats['n_species'] >= MIN_PHYLUM_SPECIES]

In [None]:
# Figure 1: PHB prevalence by phylum (top 20 phyla by species count)
# phylum_stats is already ordered by species count, so head(20) picks the 20
# largest phyla; they are then re-ordered by phaC prevalence for plotting.
top_phyla = phylum_stats.head(20).sort_values('pct_phaC', ascending=True)

y_positions = range(len(top_phyla))
complete_pct = top_phyla['pct_complete']
# Stacked segment: species with phaC that lack the complete pathway.
synthase_only_pct = top_phyla['pct_phaC'] - complete_pct
row_labels = [f"{phylum} (n={int(n_sp)})"
              for phylum, n_sp in top_phyla['n_species'].items()]

fig, ax = plt.subplots(figsize=(10, 8))
bars = ax.barh(y_positions, complete_pct,
               label='Complete (phaC+phaA/B)', color='#2196F3', alpha=0.8)
ax.barh(y_positions, synthase_only_pct, left=complete_pct,
        label='Synthase only (phaC)', color='#90CAF9', alpha=0.8)

ax.set_yticks(y_positions)
ax.set_yticklabels(row_labels)
ax.set(xlabel='% of species with PHB pathway',
       title='PHB Pathway Prevalence Across Bacterial Phyla',
       xlim=(0, 100))
ax.legend(loc='lower right')

plt.tight_layout()
fig1_path = os.path.join(FIG_DIR, 'phb_prevalence_by_phylum.png')
plt.savefig(fig1_path, dpi=150, bbox_inches='tight')
plt.show()
print('Saved phb_prevalence_by_phylum.png')

In [None]:
# PHB prevalence by order (finer resolution)
# Grouping by phylum/class/order keeps the parent ranks in the index so each
# order can be read in its taxonomic context. Same columns and rounding as the
# phylum-level table; sorted by species count, largest first.
MIN_ORDER_SPECIES = 20  # ignore small orders, where percentages are noisy
ENRICHED_MIN_PCT = 50   # phaC prevalence (%) at or above which an order counts as enriched

order_stats = tax_phb.groupby(['gtdb_phylum', 'gtdb_class', 'gtdb_order']).agg(
    n_species=('gtdb_species_clade_id', 'count'),
    n_phaC=('has_phaC', 'sum'),
    pct_phaC=('has_phaC', lambda x: x.mean() * 100),
    n_complete=('has_complete_pathway', 'sum'),
    pct_complete=('has_complete_pathway', lambda x: x.mean() * 100),
).round(1).sort_values('n_species', ascending=False)

# Both filters are inclusive, and the label is built from the same constants
# so it cannot drift from the filter.
print(f'Orders with >={MIN_ORDER_SPECIES} species and '
      f'>={ENRICHED_MIN_PCT}% phaC prevalence:')
enriched = order_stats[(order_stats['n_species'] >= MIN_ORDER_SPECIES)
                       & (order_stats['pct_phaC'] >= ENRICHED_MIN_PCT)]
enriched.sort_values('pct_phaC', ascending=False)

In [None]:
# Orders depleted for PHB
# Keeps orders with at least 20 species (inclusive) whose phaC prevalence is
# strictly below 10%, listed from lowest prevalence upward.
print('Orders with >=20 species and <10% phaC prevalence:')
depleted = order_stats[(order_stats['n_species'] >= 20) & (order_stats['pct_phaC'] < 10)]
depleted.sort_values('pct_phaC', ascending=True)

In [None]:
# Figure 2: phaC core vs accessory across phyla
# For every phylum with at least 10 phaC-carrying species, show what share of
# those species have phaC flagged as core, accessory, or neither (unknown).

# Dict order doubles as the column (stacking) order of the bars.
phac_status_colors = {'core': '#4CAF50', 'accessory': '#FF9800', 'unknown': '#9E9E9E'}

phac_species = tax_phb[tax_phb['has_phaC']].copy()
phac_species['phaC_status'] = 'unknown'
phac_species.loc[phac_species['phaC_is_core'] == 1, 'phaC_status'] = 'core'
# Assigned last, so a species with both flags set ends up as 'accessory'.
phac_species.loc[phac_species['phaC_is_aux'] == 1, 'phaC_status'] = 'accessory'

# unstack() only creates columns for statuses that actually occur; reindexing
# to the full status set guarantees the 'core' column exists for the sort
# below (otherwise KeyError) and gives a fixed core/accessory/unknown order.
phac_by_phylum = (
    phac_species.groupby(['gtdb_phylum', 'phaC_status']).size()
    .unstack(fill_value=0)
    .reindex(columns=list(phac_status_colors), fill_value=0)
)
phac_by_phylum = phac_by_phylum.loc[phac_by_phylum.sum(axis=1) >= 10]
# Row-normalise counts to percentages of phaC-carrying species per phylum.
phac_by_phylum_pct = phac_by_phylum.div(phac_by_phylum.sum(axis=1), axis=0) * 100

fig, ax = plt.subplots(figsize=(10, 6))
phac_by_phylum_pct.sort_values('core', ascending=True).plot.barh(
    stacked=True, ax=ax, color=phac_status_colors
)
ax.set_xlabel('% of phaC-carrying species')
ax.set_title('phaC Gene: Core vs Accessory Status by Phylum')
ax.legend(title='phaC status')

plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, 'phb_core_vs_accessory.png'), dpi=150, bbox_inches='tight')
plt.show()
print('Saved phb_core_vs_accessory.png')

In [None]:
# Figure 3: Pathway completeness distribution
# Bar chart of how many species fall into each PHB pathway status, annotated
# with the count and its share of all species in tax_phb.
status_order = ['complete', 'synthase_only', 'precursors_only', 'accessory_only', 'absent']
status_colors = ['#2196F3', '#90CAF9', '#FFB74D', '#FFCC80', '#E0E0E0']

n_total = len(tax_phb)
# One pass over the column; statuses that never occur are reported as 0.
status_counts = tax_phb['phb_status'].value_counts().reindex(status_order, fill_value=0)
counts = status_counts.tolist()

# Statuses outside status_order are not plotted; say so rather than silently
# showing percentages that do not add up to 100%.
unlisted = sorted(set(tax_phb['phb_status'].unique()) - set(status_order))
if unlisted:
    print(f'Warning: statuses not shown in figure: {unlisted}')

fig, ax = plt.subplots(figsize=(8, 5))
# Numeric x positions with explicit ticks (instead of a categorical axis)
# so that set_xticklabels() below is applied to a fixed set of ticks.
x_pos = list(range(len(status_order)))
bars = ax.bar(x_pos, counts, color=status_colors)

for bar, count in zip(bars, counts):
    pct = count / n_total * 100 if n_total else 0.0  # avoid 0/0 on an empty table
    ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 100,
            f'{count:,}\n({pct:.1f}%)',
            ha='center', va='bottom', fontsize=9)

ax.set_ylabel('Number of species')
# The species count comes from the data rather than a hard-coded "27K".
ax.set_title(f'PHB Pathway Completeness Across {n_total:,} Bacterial Species')
ax.set_xticks(x_pos)
ax.set_xticklabels([s.replace('_', '\n') for s in status_order])

plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, 'phb_pathway_completeness.png'), dpi=150, bbox_inches='tight')
plt.show()
print('Saved phb_pathway_completeness.png')

In [None]:
# Persist the results for downstream notebooks.
# tax_phb is the per-species table (written without its index); order_stats
# keeps its phylum/class/order index as the leading columns of the TSV.
species_table_path = os.path.join(DATA_DIR, 'phb_by_taxonomy.tsv')
order_table_path = os.path.join(DATA_DIR, 'phb_by_order.tsv')

tax_phb.to_csv(species_table_path, sep='\t', index=False)
order_stats.to_csv(order_table_path, sep='\t')

n_species_saved = len(tax_phb)
n_orders_saved = len(order_stats)
print(f'Saved taxonomy data: {n_species_saved:,} species, {n_orders_saved:,} orders')

## Summary

### Key Findings (to be filled after execution)
- PHB prevalence across all bacteria: ?%
- Most enriched phyla: ?
- Most depleted phyla: ?
- phaC core vs accessory: ?

### Next Notebook (NB03)
Correlate PHB pathway presence with environmental metadata and AlphaEarth embeddings.