# NB02: Taxonomy Bridge and Functional Features

Build a bridge from ENIGMA genera to pangenome genera (via GTDB species clades), then compute genus-level functional proxies from eggNOG COG categories.

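Inputs:
- `../data/community_taxon_counts.tsv`
- Spark tables in `kbase_ke_pangenome`: `gtdb_species_clade`, `pangenome`, `gene_cluster`, `eggnog_mapper_annotations`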

Outputs:
- `../data/taxon_bridge.tsv`
- `../data/taxon_functional_features.tsv`


In [1]:
from pathlib import Path
import re
import pandas as pd

# Load the per-sample community taxon counts and open the Spark session
# that the later cells query.
DATA_DIR = Path('../data')
community_path = DATA_DIR / 'community_taxon_counts.tsv'
community = pd.read_csv(community_path, sep='\t')

n_community_rows = len(community)
n_enigma_genera = community['genus'].nunique()
print('Community rows:', n_community_rows)
print('Unique ENIGMA genera:', n_enigma_genera)

# NOTE(review): get_spark_session is not defined or imported in this
# notebook; presumably the notebook environment injects it -- confirm.
spark = get_spark_session()
print('Spark session ready')


Community rows: 41711
Unique ENIGMA genera: 1392


Spark session ready


In [2]:
def norm_genus(x: str) -> str:
    """Return a normalized join key for a genus name.

    Missing values (anything ``pd.isna`` reports as missing) map to the
    empty string. Otherwise the value is stringified, trimmed and
    lower-cased, a leading ``g__`` prefix is removed, every run of
    characters outside ``[a-z0-9]`` is collapsed to a single underscore,
    and leading/trailing underscores are stripped.
    """
    if pd.isna(x):
        return ''

    key = str(x).strip().lower()
    # Order matters: the prefix must be removed before its underscores are
    # swallowed by the generic "non-alphanumeric run" substitution.
    for pattern, replacement in ((r'^g__', ''), (r'[^a-z0-9]+', '_')):
        key = re.sub(pattern, replacement, key)
    return key.strip('_')

# One row per distinct, non-missing ENIGMA genus label, paired with its
# normalized join key; labels whose key normalizes to '' are discarded.
genus_labels = community[['genus']].dropna().drop_duplicates()
community_genus = genus_labels.assign(
    genus_norm=genus_labels['genus'].map(norm_genus)
)
has_usable_key = community_genus['genus_norm'] != ''
community_genus = community_genus[has_usable_key]
print('Community genus candidates:', len(community_genus))


Community genus candidates: 1392


In [3]:
# Pull the GTDB species-clade table and derive a normalized genus key for
# every species clade so it can be joined to the ENIGMA genera.
species = spark.sql("""
SELECT DISTINCT *
FROM kbase_ke_pangenome.gtdb_species_clade
""").toPandas()

print('gtdb_species_clade columns:', species.columns.tolist())

# The species-name column is looked up by trying several spellings, first
# match wins.
species_col = next(
    (
        c
        for c in ['GTDB_species', 'gtdb_species', 'species', 'gtdb_species_name']
        if c in species.columns
    ),
    None,
)

if species_col is None:
    raise RuntimeError('Could not find species name column in gtdb_species_clade')
if 'gtdb_species_clade_id' not in species.columns:
    raise RuntimeError('Expected gtdb_species_clade_id in gtdb_species_clade')

# Drop rows with a missing species name BEFORE stringifying: astype(str)
# would turn NaN/None into the literal text 'nan'/'None', which survives the
# "genus_norm != ''" filter below and shows up as a bogus genus.
species = (
    species[['gtdb_species_clade_id', species_col]]
    .rename(columns={species_col: 'species_name'})
    .dropna(subset=['species_name'])
    .copy()
)

# Genus guess = first '_'-delimited token after removing 's__'.
# NOTE(review): this also discards any '_'-separated suffix on the genus
# (e.g. a trailing '_A'), so such genera collapse onto the base name --
# presumably intended to match ENIGMA labels; confirm.
species['genus_guess'] = (
    species['species_name']
    .astype(str)
    .str.replace('s__', '', regex=False)
    .str.split('_')
    .str[0]
)
species['genus_norm'] = species['genus_guess'].map(norm_genus)
species = species[species['genus_norm'] != ''].drop_duplicates()

print('Species rows with genus parsed:', len(species))


gtdb_species_clade columns: ['gtdb_species_clade_id', 'representative_genome_id', 'GTDB_species', 'GTDB_taxonomy', 'ANI_circumscription_radius', 'mean_intra_species_ANI', 'min_intra_species_ANI', 'mean_intra_species_AF', 'min_intra_species_AF', 'no_clustered_genomes_unfiltered', 'no_clustered_genomes_filtered']


Species rows with genus parsed: 27690


In [4]:
# Left-join every ENIGMA genus onto the species clades sharing its normalized
# genus key; genera without any clade keep a single row with missing clade
# fields and are tagged 'unmapped'.
species_keys = species[['genus_norm', 'gtdb_species_clade_id', 'species_name']]
bridge = pd.merge(community_genus, species_keys, how='left', on='genus_norm')

has_clade = bridge['gtdb_species_clade_id'].notna()
bridge['mapping_tier'] = has_clade.map({True: 'genus_exact', False: 'unmapped'})

mapped = bridge[has_clade].copy()
print('Mapped genera:', mapped['genus'].nunique())
# Counts rows rather than distinct genera.
print('Unmapped genera:', (~has_clade).sum())

bridge_path = DATA_DIR / 'taxon_bridge.tsv'
bridge.to_csv(bridge_path, sep='\t', index=False)
print('Saved bridge table')


Mapped genera: 530
Unmapped genera: 862


Saved bridge table


In [5]:
# Build the two clade selections used for feature computation:
#   strict  -> exactly one clade per normalized genus (largest no_genomes)
#   relaxed -> every clade mapped to the genus
clade_cols = ['genus', 'genus_norm', 'gtdb_species_clade_id']

mapped_clades = mapped[clade_cols + ['species_name']].drop_duplicates()
if mapped_clades.empty:
    raise RuntimeError('No mapped genera. Cannot compute functional features.')

# Expose the (genus key, clade id) pairs to Spark SQL as a temp view.
clade_keys = mapped_clades[['genus_norm', 'gtdb_species_clade_id']].drop_duplicates()
spark.createDataFrame(clade_keys).createOrReplaceTempView('mapped_clades_tmp')

# Pull clade sizes to define a strict representative-clade mode.
clade_sizes = spark.sql("""
SELECT p.gtdb_species_clade_id, CAST(p.no_genomes AS DOUBLE) AS no_genomes
FROM kbase_ke_pangenome.pangenome p
JOIN mapped_clades_tmp m ON p.gtdb_species_clade_id = m.gtdb_species_clade_id
""").toPandas()

mapped_clades2 = mapped_clades.merge(clade_sizes, on='gtdb_species_clade_id', how='left')
# Clades with no (or non-numeric) size get -1 so they sort behind every
# clade with a real genome count.
mapped_clades2['no_genomes'] = pd.to_numeric(
    mapped_clades2['no_genomes'], errors='coerce'
).fillna(-1)

# Within each genus key: biggest clade first, clade id as the tie-break, then
# keep only the first row per genus key.
ranked_clades = mapped_clades2.sort_values(
    ['genus_norm', 'no_genomes', 'gtdb_species_clade_id'],
    ascending=[True, False, True],
)
strict_clades = ranked_clades.drop_duplicates(subset=['genus_norm'])[clade_cols]

relaxed_clades = mapped_clades2[clade_cols].drop_duplicates()

print('Strict clades:', len(strict_clades), 'Relaxed clades:', len(relaxed_clades))


Strict clades: 530 Relaxed clades: 7380


In [6]:
def build_mode_features(mode_name: str, clades_df: pd.DataFrame) -> pd.DataFrame:
    """Compute genus-level COG fraction features for one clade selection.

    Registers the distinct (genus_norm, gtdb_species_clade_id) pairs of
    ``clades_df`` as the Spark temp view ``mode_clades_tmp``, counts eggNOG
    annotations per genus / clade / COG category, and derives three
    fractions per genus (share of that genus's counted annotations):

    * ``cog_defense_fraction``    -- category string contains ``V``
    * ``cog_mobilome_fraction``   -- category string contains ``X``
    * ``cog_metabolism_fraction`` -- category string contains any of
      ``C, E, G, H, I, P``

    Args:
        mode_name: Label written to the ``mapping_mode`` column.
        clades_df: Frame with at least ``genus``, ``genus_norm`` and
            ``gtdb_species_clade_id`` columns.

    Returns:
        Long-format frame with columns ``genus``, ``genus_norm``,
        ``mapping_mode``, ``feature_name``, ``feature_value``; an empty frame
        with those columns when the query yields no rows.
    """
    clade_keys = clades_df[['genus_norm', 'gtdb_species_clade_id']].drop_duplicates()
    spark.createDataFrame(clade_keys).createOrReplaceTempView('mode_clades_tmp')

    # NOTE(review): only NULL and '' categories are excluded; if the table
    # uses another placeholder for "no category", those rows count toward the
    # denominator -- confirm against the annotation table.
    cog_counts = spark.sql("""
    SELECT
      m.genus_norm,
      gc.gtdb_species_clade_id,
      e.COG_category,
      COUNT(*) AS n_annotations
    FROM kbase_ke_pangenome.gene_cluster gc
    JOIN mode_clades_tmp m ON gc.gtdb_species_clade_id = m.gtdb_species_clade_id
    JOIN kbase_ke_pangenome.eggnog_mapper_annotations e
      ON gc.gene_cluster_id = e.query_name
    WHERE e.COG_category IS NOT NULL AND e.COG_category != ''
    GROUP BY m.genus_norm, gc.gtdb_species_clade_id, e.COG_category
    """).toPandas()

    if cog_counts.empty:
        return pd.DataFrame(columns=['genus', 'genus_norm', 'mapping_mode', 'feature_name', 'feature_value'])

    cog_counts['n_annotations'] = pd.to_numeric(cog_counts['n_annotations'], errors='coerce').fillna(0)

    # Denominator: all counted annotations per genus (summed over clades).
    genus_tot = (
        cog_counts.groupby('genus_norm', as_index=False)['n_annotations']
        .sum()
        .rename(columns={'n_annotations': 'total_ann'})
    )
    categories = cog_counts['COG_category'].astype(str)

    def fraction_with_any(letters, column):
        # A row counts once if its category string contains at least one of
        # the letters, so multi-letter categories are never double counted.
        hits = categories.map(lambda cat: any(letter in cat for letter in letters))
        numerators = (
            cog_counts[hits]
            .groupby('genus_norm', as_index=False)['n_annotations']
            .sum()
            .rename(columns={'n_annotations': 'hit_ann'})
        )
        table = genus_tot.merge(numerators, on='genus_norm', how='left').fillna(0)
        # A zero total becomes NA rather than triggering a division by zero.
        table[column] = table['hit_ann'] / table['total_ann'].replace(0, pd.NA)
        return table[['genus_norm', column]]

    defense = fraction_with_any(['V'], 'cog_defense_fraction')
    mobilome = fraction_with_any(['X'], 'cog_mobilome_fraction')
    met = fraction_with_any(['C', 'E', 'G', 'H', 'I', 'P'], 'cog_metabolism_fraction')

    wide = defense.merge(mobilome, on='genus_norm', how='outer')
    wide = wide.merge(met, on='genus_norm', how='outer').fillna(0)

    # Re-attach the original genus label(s) for each normalized key.
    genus_lookup = clades_df[['genus', 'genus_norm']].drop_duplicates()
    wide = genus_lookup.merge(wide, on='genus_norm', how='right')
    wide['mapping_mode'] = mode_name

    return wide.melt(
        id_vars=['genus', 'genus_norm', 'mapping_mode'],
        var_name='feature_name',
        value_name='feature_value',
    )

# Compute features for both clade-selection modes, stack them into one
# long table, write it out, and print a short summary.
feats_strict = build_mode_features(mode_name='strict_single_clade', clades_df=strict_clades)
feats_relaxed = build_mode_features(mode_name='relaxed_all_clades', clades_df=relaxed_clades)
feats = pd.concat([feats_strict, feats_relaxed], ignore_index=True)

features_path = DATA_DIR / 'taxon_functional_features.tsv'
feats.to_csv(features_path, sep='\t', index=False)
print('Saved functional feature table')

genera_per_mode = feats.groupby('mapping_mode')['genus_norm'].nunique()
print(genera_per_mode.rename('n_genera_with_features'))
print(feats.head().to_string(index=False))


Saved functional feature table
mapping_mode
relaxed_all_clades     530
strict_single_clade    530
Name: n_genera_with_features, dtype: int64
          genus      genus_norm        mapping_mode         feature_name  feature_value
          67-14           67_14 strict_single_clade cog_defense_fraction       0.012452
    Abiotrophia     abiotrophia strict_single_clade cog_defense_fraction       0.039560
Acetoanaerobium acetoanaerobium strict_single_clade cog_defense_fraction       0.022270
 Acetobacterium  acetobacterium strict_single_clade cog_defense_fraction       0.026019
  Achromobacter   achromobacter strict_single_clade cog_defense_fraction       0.018887
