# Shared Gene Content Analysis

This notebook extends the exploratory analysis to focus on **shared gene content** across genomes and species.

## Key Questions:
1. How do core, accessory, and singleton genes distribute within species?
2. How similar are core genomes across related species?
3. What patterns exist in gene sharing within genera?
4. How does genome sampling affect observed core genome size?

## 1. Setup & Data Loading

In [None]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Spearman correlation without scipy
def _average_ranks(values):
    """Return 1-based ranks of a 1-D array, giving tied values their mean rank."""
    _, inverse, counts = np.unique(values, return_inverse=True, return_counts=True)
    # cumsum(counts) is the highest ordinal rank inside each tie group; stepping
    # back by half the group width yields the average rank of that group.
    mean_rank = np.cumsum(counts) - (counts - 1) / 2.0
    return mean_rank[inverse.reshape(-1)]


def spearmanr(x, y):
    """Calculate the Spearman rank correlation coefficient using numpy only.

    Tied values receive the average of the ranks they span and rho is the
    Pearson correlation of the two rank vectors. This stays correct when the
    data contain ties (e.g. integer genome counts), unlike the
    ``1 - 6*sum(d^2) / (n*(n^2 - 1))`` shortcut, which assumes distinct ranks.

    Parameters
    ----------
    x, y : array-like of numbers
        Paired observations of equal length. Pairs in which either value is
        NaN are dropped before ranking.

    Returns
    -------
    tuple of (float, float)
        ``(rho, pvalue)``. With fewer than 3 valid pairs the result is
        ``(nan, 1.0)``. Otherwise ``pvalue`` is NaN because no significance
        test is implemented here; ``rho`` is NaN when either variable is
        constant, since the correlation is then undefined.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` do not contain the same number of elements.
    """
    x = np.asarray(x, dtype=float).ravel()
    y = np.asarray(y, dtype=float).ravel()
    if x.size != y.size:
        raise ValueError("x and y must have the same length")

    # Remove pairs with a missing value on either side
    mask = ~(np.isnan(x) | np.isnan(y))
    x, y = x[mask], y[mask]
    n = len(x)
    if n < 3:
        return np.nan, 1.0

    # Pearson correlation of the tie-aware ranks
    x_dev = _average_ranks(x)
    y_dev = _average_ranks(y)
    x_dev = x_dev - x_dev.mean()
    y_dev = y_dev - y_dev.mean()
    denom = np.sqrt(np.sum(x_dev ** 2) * np.sum(y_dev ** 2))
    if denom == 0:
        # At least one variable has no rank variation
        return np.nan, np.nan

    rho = float(np.sum(x_dev * y_dev) / denom)
    # Guard against floating-point drift just outside [-1, 1]
    rho = max(-1.0, min(1.0, rho))
    return rho, np.nan  # p-value intentionally not computed

# Set plotting style
sns.set_theme(style="whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# Load authentication token from the local .env file.
# Reset first so a token left in the kernel by an earlier run is never reused.
AUTH_TOKEN = None
with open('.env', 'r') as f:
    for line in f:
        if not line.startswith('KBASE_AUTH_TOKEN'):
            continue
        quoted_parts = line.split('"')
        if len(quoted_parts) >= 3:
            # KBASE_AUTH_TOKEN="<token>" -> text between the first pair of quotes
            candidate = quoted_parts[1]
        else:
            # Unquoted or single-quoted value: take everything after '='
            candidate = line.partition('=')[2].strip().strip("'\"")
        if candidate:
            AUTH_TOKEN = candidate
            break

if not AUTH_TOKEN:
    # Fail here with a clear message instead of a NameError further down
    raise RuntimeError(
        'KBASE_AUTH_TOKEN not found in .env; expected a line like KBASE_AUTH_TOKEN="<token>"'
    )

# BERDL API configuration
BASE_URL = "https://hub.berdl.kbase.us/apis/mcp"
DATABASE = "kbase_ke_pangenome"
HEADERS = {"Authorization": f"Bearer {AUTH_TOKEN}", "Content-Type": "application/json"}

def query_berdl(sql, limit=10000, offset=0, timeout=300):
    """Execute a SQL query against BERDL and return the rows as a DataFrame.

    Parameters
    ----------
    sql : str
        Query text sent as the ``query`` field of the JSON payload.
    limit : int, default 10000
        Sent as the ``limit`` field of the payload.
    offset : int, default 0
        Sent as the ``offset`` field of the payload.
    timeout : float or tuple, default 300
        Seconds passed to ``requests.post`` so a stalled connection raises
        instead of blocking the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        Rows read from the ``result`` key of the JSON response (falling back
        to ``results``); an empty DataFrame when no rows are present.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not respond within ``timeout``.
    """
    url = f"{BASE_URL}/delta/tables/query"
    payload = {"query": sql, "limit": limit, "offset": offset}
    response = requests.post(url, headers=HEADERS, json=payload, timeout=timeout)
    response.raise_for_status()
    data = response.json()
    results = data.get('result', data.get('results', []))
    return pd.DataFrame(results) if results else pd.DataFrame()

print("Setup complete")

In [None]:
# Load full pangenome data with taxonomy
sql = f"""
SELECT 
    p.gtdb_species_clade_id,
    s.GTDB_species,
    s.GTDB_taxonomy,
    p.no_genomes,
    p.no_core,
    p.no_aux_genome as no_accessory,
    p.no_singleton_gene_clusters as no_singletons,
    p.no_gene_clusters,
    p.no_CDSes,
    s.mean_intra_species_ANI,
    s.ANI_circumscription_radius
FROM {DATABASE}.pangenome p
JOIN {DATABASE}.gtdb_species_clade s 
    ON p.gtdb_species_clade_id = s.gtdb_species_clade_id
ORDER BY p.no_genomes DESC
"""

# Upper bound on rows requested in a single call
MAX_SPECIES_ROWS = 30000
df = query_berdl(sql, limit=MAX_SPECIES_ROWS)

if df.empty:
    # Stop here: every later cell indexes columns that an empty frame lacks
    raise RuntimeError("Pangenome query returned no rows; check the auth token and database name")
if len(df) >= MAX_SPECIES_ROWS:
    # Reaching the limit exactly means more rows may exist on the server
    print(f"WARNING: query returned {len(df):,} rows (the requested limit); results may be truncated")

# Parse taxonomy
def parse_taxonomy(tax_string):
    """Split a ';'-separated taxonomy string into a ``{prefix: name}`` dict.

    Every segment containing '__' is cut at its first '__': the text before
    it becomes the key and the text after it the value. Segments without
    '__' are ignored, and a missing input (per ``pd.isna``) yields ``{}``.
    """
    if pd.isna(tax_string):
        return {}
    tagged_segments = (segment for segment in tax_string.split(';') if '__' in segment)
    return dict(segment.split('__', 1) for segment in tagged_segments)

# Expand the parsed taxonomy into one column per rank ('Unknown' when absent)
tax_parsed = df['GTDB_taxonomy'].apply(parse_taxonomy)
rank_prefixes = {
    'domain': 'd',
    'phylum': 'p',
    'class': 'c',
    'order': 'o',
    'family': 'f',
    'genus': 'g',
}
for rank_column, rank_prefix in rank_prefixes.items():
    # Bind the prefix as a default so each lambda keeps its own value
    df[rank_column] = tax_parsed.apply(lambda ranks, key=rank_prefix: ranks.get(key, 'Unknown'))

# Calculate derived metrics
# Zero denominators become NaN so ratios are reported as missing rather than
# as +/-inf, which would distort medians and box plots downstream.
total_clusters = df['no_gene_clusters'].where(df['no_gene_clusters'] != 0)
core_clusters = df['no_core'].where(df['no_core'] != 0)

df['pct_core'] = (df['no_core'] / total_clusters * 100).round(2)
df['pct_accessory'] = (df['no_accessory'] / total_clusters * 100).round(2)
df['pct_singletons'] = (df['no_singletons'] / total_clusters * 100).round(2)
# Genes in 2+ genomes; a missing singleton count is treated as 0, matching
# how the rest of the notebook handles it.
df['shared_genes'] = df['no_core'] + df['no_accessory'] - df['no_singletons'].fillna(0)
df['pct_shared'] = (df['shared_genes'] / total_clusters * 100).round(2)
df['core_to_total_ratio'] = (df['no_core'] / total_clusters).round(3)
df['accessory_core_ratio'] = (df['no_accessory'] / core_clusters).round(2)

print(f"Loaded {len(df):,} species pangenomes")
df.head()

## 2. Within-Species Gene Sharing Patterns

### 2.1 Gene Sharing Spectrum

The pangenome divides into:
- **Core genes**: Present in >95% of genomes (shared by nearly all)
- **Accessory genes**: Present in 5-95% of genomes (variably shared)
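- **Singleton genes**: Present in only 1 genome (unique, not shared). The analysis below treats the accessory count (`no_accessory`) as already including singletons, so "shared accessory" means accessory minus singletons.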

In [None]:
# "Truly shared" clusters are those found in 2+ genomes (core + non-singleton accessory).
# Accessory already counts singletons, hence shared = total - singletons.
singleton_counts = df['no_singletons'].fillna(0)
df['truly_shared'] = df['no_gene_clusters'] - singleton_counts
df['pct_truly_shared'] = (df['truly_shared'] / df['no_gene_clusters'] * 100).round(2)

# One histogram panel per sharing category: (column, trace name, colour, panel title)
histogram_panels = [
    ('pct_core', 'Core', '#2ecc71', 'Core Genes (in >95% genomes)'),
    ('pct_accessory', 'Accessory', '#3498db', 'Accessory Genes (5-95%)'),
    ('pct_singletons', 'Singletons', '#e74c3c', 'Singleton Genes (unique)'),
    ('pct_truly_shared', 'Shared', '#9b59b6', 'Truly Shared (in 2+ genomes)'),
]
# Panels fill the 2x2 grid row by row
grid_cells = [(index // 2 + 1, index % 2 + 1) for index in range(len(histogram_panels))]

fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=tuple(panel_title for _, _, _, panel_title in histogram_panels)
)

for (pct_column, trace_name, trace_color, _), (grid_row, grid_col) in zip(histogram_panels, grid_cells):
    histogram = go.Histogram(x=df[pct_column], nbinsx=50, name=trace_name, marker_color=trace_color)
    fig.add_trace(histogram, row=grid_row, col=grid_col)

for grid_row, grid_col in grid_cells:
    fig.update_xaxes(title_text="% of pangenome", row=grid_row, col=grid_col)

fig.update_layout(height=600, showlegend=False, title_text="Gene Sharing Categories Across Species")
fig.show()

print("Gene sharing summary statistics:")
summary_columns = ['pct_core', 'pct_accessory', 'pct_singletons', 'pct_truly_shared']
print(df[summary_columns].describe())

### 2.2 Effect of Genome Sampling on Core Genome Size

In [None]:
# Restrict to species with at least 5 genomes so percentages are meaningful
has_enough_genomes = df['no_genomes'] >= 5
df_sampled = df[has_enough_genomes].copy()

# Scatter of core % against sampling depth, coloured by singleton share
sampling_labels = {
    'no_genomes': 'Number of Genomes Sampled',
    'pct_core': 'Core Genes (%)',
    'pct_singletons': 'Singletons (%)'
}
sampling_plot_options = dict(
    x='no_genomes',
    y='pct_core',
    color='pct_singletons',
    size='no_gene_clusters',
    size_max=20,
    hover_data=['GTDB_species', 'no_core', 'no_gene_clusters'],
    title='Core Genome Percentage vs Genome Sampling Depth',
    labels=sampling_labels,
    color_continuous_scale='RdYlGn_r',
)
fig = px.scatter(df_sampled, **sampling_plot_options)
fig.update_xaxes(type="log")
fig.show()

# Rank correlation between sampling depth and core percentage
corr, pval = spearmanr(df_sampled['no_genomes'], df_sampled['pct_core'])
print(f"\nSpearman correlation (genomes vs % core): r = {corr:.3f}")
print("Negative correlation indicates: more genomes sampled -> smaller core genome (expected)")

### 2.3 Core Genome Conservation Patterns

In [None]:
# Core genome percentage plotted against mean intra-species ANI
ani_labels = {
    'mean_intra_species_ANI': 'Mean Intra-Species ANI (%)',
    'pct_core': 'Core Genes (%)',
    'no_genomes': 'Genomes'
}
ani_plot_options = dict(
    x='mean_intra_species_ANI',
    y='pct_core',
    color='no_genomes',
    hover_data=['GTDB_species'],
    title='Core Genome Size vs Intra-Species Diversity',
    labels=ani_labels,
    color_continuous_scale='Viridis',
)
fig = px.scatter(df_sampled, **ani_plot_options)
fig.show()

# Rank correlation between ANI and core percentage
corr, pval = spearmanr(df_sampled['mean_intra_species_ANI'], df_sampled['pct_core'])
print(f"\nSpearman correlation (ANI vs % core): r = {corr:.3f}")
print("Positive correlation indicates: higher ANI (more similar genomes) -> larger core genome")

## 3. Cross-Species Gene Sharing Within Genera

### 3.1 Genus-Level Core Genome Comparison

In [None]:
# Keep species backed by at least 20 genomes, then summarise them per genus
well_sampled_mask = df['no_genomes'] >= 20
df_wellsampled = df[well_sampled_mask].copy()

# Named aggregation: species count, total genomes, mean core size and mean core %
genus_species_counts = (
    df_wellsampled
    .groupby('genus')
    .agg(
        n_species=('GTDB_species', 'count'),
        no_genomes=('no_genomes', 'sum'),
        no_core=('no_core', 'mean'),
        pct_core=('pct_core', 'mean'),
    )
    .sort_values('n_species', ascending=False)
)

print("Top genera with multiple well-sampled species (>=20 genomes each):")
print(genus_species_counts.head(20))

In [None]:
# Pick one genus with several species for a detailed comparison.
# Streptococcus is the default example (common, clinically relevant, diverse).

target_genus = 'Streptococcus'
df_genus = df_wellsampled[df_wellsampled['genus'] == target_genus].copy()

if df_genus.empty:
    # Default genus absent: fall back to the genus with the most well-sampled species
    target_genus = genus_species_counts.index[0]
    df_genus = df_wellsampled[df_wellsampled['genus'] == target_genus].copy()

print(f"\n{target_genus} species comparison ({len(df_genus)} species with >=20 genomes):")
comparison_columns = [
    'GTDB_species', 'no_genomes', 'no_core', 'no_accessory',
    'no_singletons', 'pct_core', 'mean_intra_species_ANI',
]
display(df_genus[comparison_columns].sort_values('no_genomes', ascending=False))

In [None]:
# Horizontal bars of core genome size per species, smallest core first
species_by_core = df_genus.sort_values('no_core')
core_bar_labels = {
    'no_core': 'Number of Core Genes',
    'GTDB_species': 'Species',
    'pct_core': 'Core %'
}
fig = px.bar(
    species_by_core,
    x='no_core',
    y='GTDB_species',
    orientation='h',
    color='pct_core',
    hover_data=['no_genomes', 'no_accessory'],
    title=f'Core Genome Size Comparison Within {target_genus}',
    labels=core_bar_labels,
    color_continuous_scale='Greens'
)
# Grow the figure with the number of species, never below 400 px
chart_height = max(400, len(df_genus) * 30)
fig.update_layout(height=chart_height)
fig.show()

In [None]:
# Stacked horizontal bars: core / shared accessory / singleton clusters per species
df_genus_sorted = df_genus.sort_values('no_gene_clusters', ascending=True)

# Accessory clusters that are not singletons (missing singleton counts count as 0)
accessory_non_singleton = df_genus_sorted['no_accessory'] - df_genus_sorted['no_singletons'].fillna(0)

species_axis = df_genus_sorted['GTDB_species']
# (legend name, values, colour) in stacking order
stack_layers = [
    ('Core', df_genus_sorted['no_core'], '#2ecc71'),
    ('Shared Accessory (2+ genomes)', accessory_non_singleton, '#3498db'),
    ('Singletons (1 genome)', df_genus_sorted['no_singletons'], '#e74c3c'),
]

fig = go.Figure()
for layer_name, layer_values, layer_color in stack_layers:
    fig.add_trace(go.Bar(
        name=layer_name,
        y=species_axis,
        x=layer_values,
        orientation='h',
        marker_color=layer_color
    ))

fig.update_layout(
    barmode='stack',
    title=f'Pangenome Composition Within {target_genus}',
    xaxis_title='Number of Gene Clusters',
    yaxis_title='Species',
    height=max(400, len(df_genus) * 30),
    legend=dict(orientation="h", yanchor="bottom", y=1.02)
)
fig.show()

### 3.2 Comparative Pangenome Metrics Across Genera

In [None]:
# Core percentage distribution for the 15 genera with the most well-sampled species
top_genera = list(genus_species_counts.head(15).index)
in_top_genera = df_wellsampled['genus'].isin(top_genera)
df_top = df_wellsampled[in_top_genera].copy()

core_box_options = dict(
    x='genus',
    y='pct_core',
    color='genus',
    points='all',
    hover_data=['GTDB_species', 'no_genomes'],
    title='Core Genome Percentage Distribution by Genus',
    labels={'pct_core': 'Core Genes (%)', 'genus': 'Genus'},
)
fig = px.box(df_top, **core_box_options)
fig.update_layout(showlegend=False, xaxis_tickangle=-45, height=500)
fig.show()

In [None]:
# Accessory:core ratio per genus as a proxy for open vs closed pangenomes
openness_box_options = dict(
    x='genus',
    y='accessory_core_ratio',
    color='genus',
    points='all',
    hover_data=['GTDB_species', 'no_genomes'],
    title='Pangenome Openness by Genus (Accessory:Core Ratio)',
    labels={'accessory_core_ratio': 'Accessory:Core Ratio', 'genus': 'Genus'},
)
fig = px.box(df_top, **openness_box_options)
fig.update_layout(showlegend=False, xaxis_tickangle=-45, height=500)
fig.show()

print("Higher ratio = more open pangenome (more variable gene content)")

## 4. Gene Cluster Type Analysis

In [None]:
# Per-category cluster counts for a handful of well-known species.
# Derived from the pangenome table already in memory rather than from slow gene_cluster queries.

target_species = [
    'Staphylococcus_aureus',
    'Escherichia_coli',
    'Salmonella_enterica',
    'Streptococcus_pneumoniae'
]

# Case-insensitive match of any target name within GTDB_species
species_pattern = '|'.join(target_species)
is_target = df['GTDB_species'].str.contains(species_pattern, case=False, na=False)
df_targets = df[is_target].copy()

if len(df_targets) == 0:
    print("No matching species found in dataset")
else:
    # One record per (species, category); the pangenome row already holds the counts
    cluster_stats = []
    for _, row in df_targets.iterrows():
        species_name = row['GTDB_species'].replace('s__', '').replace('_', ' ')
        singletons_known = pd.notna(row['no_singletons'])

        # Shared accessory = accessory minus singletons (missing singletons count as 0)
        shared_acc = row['no_accessory'] - (row['no_singletons'] if singletons_known else 0)

        # (is_core, is_auxiliary, is_singleton, cluster_count)
        category_rows = [
            (True, False, False, row['no_core']),
            (False, True, False, shared_acc),
        ]
        if singletons_known:
            category_rows.append((False, True, True, row['no_singletons']))

        cluster_stats.extend(
            {
                'is_core': core_flag, 'is_auxiliary': aux_flag, 'is_singleton': singleton_flag,
                'cluster_count': count, 'species': species_name
            }
            for core_flag, aux_flag, singleton_flag, count in category_rows
        )

    df_clusters = pd.DataFrame(cluster_stats)
    print(f"Retrieved cluster stats for {len(df_targets)} species from pangenome data:")
    display(df_clusters)

In [None]:
# Stacked bars of cluster categories for the selected species
if 'df_clusters' not in dir() or len(df_clusters) == 0:
    print("No cluster data available for visualization")
else:
    def categorize(row):
        """Map a cluster-stats row to 'Core', 'Singleton' or 'Shared Accessory'."""
        if row['is_core']:
            return 'Core'
        return 'Singleton' if row['is_singleton'] else 'Shared Accessory'

    df_clusters['category'] = df_clusters.apply(categorize, axis=1)

    # Species as rows, categories as columns, summed cluster counts as values
    df_pivot = df_clusters.pivot_table(
        index='species',
        columns='category',
        values='cluster_count',
        aggfunc='sum'
    ).fillna(0)

    # Dict order doubles as the stacking order of the bars
    colors = {'Core': '#2ecc71', 'Shared Accessory': '#3498db', 'Singleton': '#e74c3c'}

    fig = go.Figure()
    for cat, bar_color in colors.items():
        if cat not in df_pivot.columns:
            continue
        fig.add_trace(go.Bar(
            name=cat,
            x=df_pivot.index,
            y=df_pivot[cat],
            marker_color=bar_color
        ))

    fig.update_layout(
        barmode='stack',
        title='Gene Cluster Type Distribution by Species',
        xaxis_title='Species',
        yaxis_title='Number of Gene Clusters',
        height=500
    )
    fig.show()

## 5. Shared Gene Patterns by Taxonomic Level

In [None]:
# Per-phylum summary: species count, total genomes and medians of the sharing metrics
phylum_stats = (
    df
    .groupby('phylum')
    .agg(
        n_species=('GTDB_species', 'count'),
        no_genomes=('no_genomes', 'sum'),
        no_core=('no_core', 'median'),
        pct_core=('pct_core', 'median'),
        pct_truly_shared=('pct_truly_shared', 'median'),
        accessory_core_ratio=('accessory_core_ratio', 'median'),
    )
    .sort_values('n_species', ascending=False)
)

print("Gene sharing patterns by phylum (top 20):")
display(phylum_stats.head(20))

In [None]:
# Box plot of the shared-gene percentage for the 15 most species-rich phyla
top_phyla = list(phylum_stats.head(15).index)
df_phyla = df[df['phylum'].isin(top_phyla)].copy()

# Only species with at least 5 genomes are plotted
phyla_min_sampled = df_phyla[df_phyla['no_genomes'] >= 5]
phylum_box_options = dict(
    x='phylum',
    y='pct_truly_shared',
    color='phylum',
    title='Gene Sharing (% genes in 2+ genomes) by Phylum',
    labels={'pct_truly_shared': '% Genes Shared (in 2+ genomes)', 'phylum': 'Phylum'},
)
fig = px.box(phyla_min_sampled, **phylum_box_options)
fig.update_layout(showlegend=False, xaxis_tickangle=-45, height=500)
fig.show()

## 6. Summary: Key Findings on Shared Gene Content

In [None]:
# Summary statistics
print("=" * 60)
print("SHARED GENE CONTENT ANALYSIS SUMMARY")
print("=" * 60)

print(f"\nDataset: {len(df):,} species, {df['no_genomes'].sum():,} genomes")

print("\n--- Gene Sharing Categories (median values) ---")
print(f"Core genes (in >95% of genomes):       {df['pct_core'].median():.1f}% of pangenome")
print(f"Accessory genes (5-95%):               {df['pct_accessory'].median():.1f}% of pangenome")
print(f"Singleton genes (unique to 1 genome):  {df['pct_singletons'].median():.1f}% of pangenome")
print(f"Truly shared (in 2+ genomes):          {df['pct_truly_shared'].median():.1f}% of pangenome")

print("\n--- Core Genome Size ---")
print(f"Median core genes per species:         {df['no_core'].median():.0f} genes")
print(f"Range:                                 {df['no_core'].min():.0f} - {df['no_core'].max():.0f} genes")

print("\n--- Pangenome Openness ---")
# Openness and sampling statistics use species with at least 10 genomes
df_filtered = df[df['no_genomes'] >= 10]
print(f"Median accessory:core ratio:           {df_filtered['accessory_core_ratio'].median():.2f}")
print("(Higher ratio = more open pangenome, more gene content variation)")

print("\n--- Effect of Sampling ---")
corr, _ = spearmanr(df_filtered['no_genomes'], df_filtered['pct_core'])
print(f"Correlation (genomes vs % core):       r = {corr:.3f}")
# Report the interpretation that matches the computed sign instead of
# asserting the expected outcome unconditionally.
if corr < 0:
    print("More genomes sampled -> smaller observed core (as expected)")
elif corr > 0:
    print("More genomes sampled -> larger observed core (unexpected; check for sampling bias)")
else:
    # Covers both an exact zero and an undefined (NaN) correlation
    print("No monotonic relationship detected between genomes sampled and observed core")

print("\n" + "=" * 60)

## 7. Next Steps for Deeper Analysis

1. **Cross-species homology**: Identify orthologous gene clusters across species using functional annotations (eggNOG, COG, KEGG)
2. **Horizontal gene transfer**: Analyze genes shared between distantly related species
3. **Mobile elements**: Correlate accessory genome with plasmid/phage content from `genomad_mobile_elements`
4. **Rarefaction curves**: Model core/accessory genome discovery with increasing sampling
5. **Functional enrichment**: Compare functions of core vs accessory genes