# MetaPop: Metagenomic Population Diversity Analysis

This notebook provides an interactive interface for running MetaPop on Google Colab.

**MetaPop** analyzes:
- **Macrodiversity**: Sample-level diversity (richness, Shannon's H, Simpson, etc.)
- **Microdiversity**: Within-population genetic variation (pi, theta, Tajima's D, pN/pS)

---

## 1. Setup and Installation

Run this cell first to install all required dependencies.

In [None]:
# Step 1: Install system dependencies (samtools, bcftools, prodigal)
!apt-get update -qq 2>/dev/null
!apt-get install -y -qq samtools bcftools prodigal 2>/dev/null

# Step 2: Install Python dependencies
!pip install -q numpy pandas scipy pysam matplotlib seaborn scikit-learn plotly ipywidgets

# Step 3: Clone and install MetaPop (with bug fix)
import os
if not os.path.exists('metapop'):
    !git clone https://github.com/espickle1/metapop.git

%cd metapop
# Force reinstall to pick up any code changes
!pip install --force-reinstall --no-deps -q .
%cd ..

print("Base dependencies and MetaPop installed!")

## 2a. Apply Bug Fix (Required)

This cell patches a bug in MetaPop where relative paths cause file-not-found errors during microdiversity analysis.

## 2b. Mount Google Drive (Optional)

Mount your Google Drive to access data files and save results.

## 3. Import Libraries and Initialize

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output

# Plotly is optional: remember whether it imported so that the result cells
# can choose between interactive (plotly) and static (matplotlib) figures.
try:
    import plotly.express as px
    import plotly.graph_objects as go
except ImportError:
    PLOTLY_AVAILABLE = False
else:
    PLOTLY_AVAILABLE = True

print("Libraries loaded successfully!")
if PLOTLY_AVAILABLE:
    print("Interactive plots (Plotly): Available")
else:
    print("Interactive plots (Plotly): Not available")

# File path inputs
def _path_field(description, placeholder):
    """Return an empty, 90%-wide text box labelled `description` with `placeholder` as hint text."""
    return widgets.Text(
        value='',
        description=description,
        placeholder=placeholder,
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='90%'),
    )


print("=== File Paths ===")
print("Enter the full paths to your data directories:")
print()

input_dir = _path_field(
    'BAM Directory:',
    '/content/drive/MyDrive/your_project/bam_files',
)
reference = _path_field(
    'Reference Directory:',
    '/content/drive/MyDrive/your_project/references',
)
genes = _path_field(
    'Genes File (optional):',
    'Leave empty to auto-generate with Prodigal',
)
norm_file = _path_field(
    'Normalization File:',
    'Leave empty to auto-generate from read counts',
)
output_dir = _path_field(
    'Output Directory:',
    '/content/drive/MyDrive/your_project/results',
)

print("Note: Reference must be a DIRECTORY containing FASTA files, not a single file.")
print()
# The three required paths are shown first, followed by the two optional files.
display(input_dir, reference, output_dir, genes, norm_file)

In [None]:
# Filter parameters
print("=== Filter Parameters ===")
print("Configure quality filtering thresholds:")
print()

# Percentage-valued fields use Bounded* widgets so that a typo such as 950 or
# -5 is clamped in the UI instead of being handed to the metapop command line.
# NOTE(review): the tooltip advertises 80-100 for percent identity; only the
# hard 0-100 limit is enforced here. Raise `min` if 80 is a real lower limit.
min_pct_id = widgets.BoundedFloatText(
    value=95.0,
    min=0.0,
    max=100.0,
    description='Min % Identity:',
    tooltip='Minimum percent identity for read alignment (80-100)',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='400px')
)
# Read length and depth have no natural upper limit, so they stay unbounded.
min_length = widgets.IntText(
    value=50,
    description='Min Read Length:',
    tooltip='Minimum read length in base pairs (e.g., 25-200)',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='400px')
)
min_cov = widgets.BoundedIntText(
    value=20,
    min=0,
    max=100,
    description='Min Coverage (%):',
    tooltip='Minimum percent of genome covered (0-100)',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='400px')
)
min_dep = widgets.IntText(
    value=10,
    description='Min Depth:',
    tooltip='Minimum truncated average depth (e.g., 1-100)',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='400px')
)
# Limits taken from the tooltip below (0-49).
truncation = widgets.BoundedFloatText(
    value=10.0,
    min=0.0,
    max=49.0,
    description='Truncation (%):',
    tooltip='Truncate highest/lowest N% of depths (0-49)',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='400px')
)

print("Default values are shown. Modify as needed for your analysis.")
print()
display(min_pct_id, min_length, min_cov, min_dep, truncation)

In [None]:
# Analysis options
def _analysis_checkbox(description):
    """Return a pre-ticked, non-indented 500px checkbox labelled `description`."""
    return widgets.Checkbox(
        value=True,
        description=description,
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='500px'),
        indent=False,
    )


print("=== Analysis Options ===")
print("Select which analyses to run:")
print()

run_preproc = _analysis_checkbox('Preprocessing (Required for first run)')
run_microdiv = _analysis_checkbox('Microdiversity (SNPs, pN/pS, pi, theta)')
run_macrodiv = _analysis_checkbox('Macrodiversity (Alpha/Beta diversity)')
run_viz = _analysis_checkbox('Generate Visualizations')

print()
print("Computational Settings:")
threads = widgets.IntText(
    value=2,
    description='Threads:',
    tooltip='Number of CPU threads to use (1-8 recommended for Colab)',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='400px'),
)

# Checkboxes are rendered first, then a blank line, then the thread count.
display(run_preproc, run_microdiv, run_macrodiv, run_viz)
print()
display(threads)

In [None]:
# Analysis options
def _stage_toggle(label):
    """Return a pre-ticked checkbox labelled `label`."""
    return widgets.Checkbox(
        value=True,
        description=label,
        style={'description_width': 'initial'},
    )


print("=== Analysis Options ===")
run_preproc = _stage_toggle('Run Preprocessing')
run_microdiv = _stage_toggle('Run Microdiversity Analysis')
run_macrodiv = _stage_toggle('Run Macrodiversity Analysis')
run_viz = _stage_toggle('Generate Visualizations')

# Slider limited to 1-8 threads in steps of one, starting at 2.
threads = widgets.IntSlider(
    value=2,
    min=1,
    max=8,
    step=1,
    description='Threads:',
    style={'description_width': 'initial'},
)

display(run_preproc, run_microdiv, run_macrodiv, run_viz, threads)

import subprocess

def run_metapop():
    """Validate the widget inputs, build the ``metapop`` command line and run it.

    Reads the module-level widgets ``input_dir``, ``reference``, ``output_dir``,
    ``genes``, ``norm_file``, the filter widgets, the analysis checkboxes and
    ``threads``. Every validation problem is reported with ``print`` and the
    function returns early; nothing is raised to the caller.

    Returns:
        None. Progress, the streamed ``metapop`` output and the final status
        are written to the cell output.
    """
    clear_output()
    print("Starting MetaPop Pipeline...")
    print("="*50)

    # Surrounding whitespace (easy to pick up when pasting a path) would make
    # every os.path check below fail, so normalise the text fields once.
    bam_dir = (input_dir.value or '').strip()
    ref_path = (reference.value or '').strip()
    out_dir = (output_dir.value or '').strip()
    genes_path = (genes.value or '').strip()
    norm_path = (norm_file.value or '').strip()

    # Validate required inputs are provided
    if not bam_dir:
        print("Error: BAM Directory is required!")
        print("Please enter the path to your BAM files directory.")
        return

    if not ref_path:
        print("Error: Reference Directory is required!")
        print("Please enter the path to your reference FASTA files directory.")
        return

    if not out_dir:
        print("Error: Output Directory is required!")
        print("Please enter the path where you want results saved.")
        return

    # Hand MetaPop absolute paths only: section 2a of this notebook notes that
    # relative paths cause file-not-found errors during microdiversity analysis.
    bam_dir = os.path.abspath(bam_dir)
    ref_path = os.path.abspath(ref_path)
    out_dir = os.path.abspath(out_dir)
    if genes_path:
        genes_path = os.path.abspath(genes_path)
    if norm_path:
        norm_path = os.path.abspath(norm_path)

    # Validate inputs exist
    if not os.path.exists(bam_dir):
        print(f"Error: BAM directory not found: {bam_dir}")
        print("Please check the path and try again.")
        return

    if not os.path.isdir(bam_dir):
        print(f"Error: BAM path is not a directory: {bam_dir}")
        return

    # Handle reference path - MetaPop expects a directory
    if os.path.isfile(ref_path):
        parent_dir = os.path.dirname(ref_path)
        print("Warning: Reference path is a file, not a directory.")
        print(f"File: {ref_path}")
        print("MetaPop expects a directory containing FASTA files.")
        print(f"Using parent directory: {parent_dir}")
        ref_path = parent_dir
        print()

    if not os.path.exists(ref_path):
        print(f"Error: Reference directory not found: {ref_path}")
        print("Please check the path and try again.")
        return

    if not os.path.isdir(ref_path):
        print(f"Error: Reference path is not a directory: {ref_path}")
        return

    # Check if reference directory contains valid FASTA files
    ref_files = [f for f in os.listdir(ref_path) if os.path.isfile(os.path.join(ref_path, f))]
    fasta_extensions = ['.fasta', '.fa', '.fna', '.ffn', '.faa']
    # str.endswith accepts a tuple, which replaces the inner any(...) loop.
    fasta_files = [f for f in ref_files if f.lower().endswith(tuple(fasta_extensions))]
    compressed_files = [f for f in ref_files if f.lower().endswith('.gz')]
    bam_files = [f for f in ref_files if f.lower().endswith('.bam')]

    print("Validating directories...")
    print(f"  BAM directory: {bam_dir}")
    print(f"  Reference directory: {ref_path}")
    print(f"    - Total files: {len(ref_files)}")
    print(f"    - FASTA files: {len(fasta_files)}")
    if compressed_files:
        print(f"    - Compressed files (.gz): {len(compressed_files)}")
    if bam_files:
        print(f"    - BAM files: {len(bam_files)}")
    print()

    # Validate reference directory
    if not fasta_files:
        print("Error: No FASTA files found in reference directory!")
        print(f"Expected files with extensions: {', '.join(fasta_extensions)}")
        if ref_files:
            print(f"Files found: {', '.join(ref_files[:10])}")
            if len(ref_files) > 10:
                print(f"  ... and {len(ref_files) - 10} more")
        return

    if compressed_files:
        # Only a warning: at least one uncompressed FASTA file is present.
        print("Warning: Compressed files (.gz) found in reference directory.")
        print("MetaPop cannot read compressed FASTA files directly.")
        print("Please decompress them first using: gunzip *.gz")
        print()

    if bam_files:
        print("Error: BAM files found in reference directory!")
        print("BAM files are binary and cannot be read as FASTA files.")
        print()
        print("Required directory structure:")
        print("  - BAM directory: Contains only .bam files")
        print("  - Reference directory: Contains only .fasta/.fa/.fna files")
        return

    # Compare resolved paths so a trailing slash, a relative spelling or a
    # symlink cannot make one directory look like two different ones.
    if os.path.realpath(bam_dir) == os.path.realpath(ref_path):
        print("Error: BAM directory and Reference directory cannot be the same!")
        print()
        print("Required directory structure:")
        print("  - BAM directory: Contains only .bam files")
        print("  - Reference directory: Contains only .fasta/.fa/.fna files")
        return

    # Create output directory
    os.makedirs(out_dir, exist_ok=True)

    # Build command
    cmd = [
        'metapop',
        '--input_samples', bam_dir,
        '--reference', ref_path,
        '--output', out_dir,
        '--id_min', str(min_pct_id.value),
        '--min_len', str(min_length.value),
        '--min_cov', str(min_cov.value),
        '--min_dep', str(min_dep.value),
        '--trunc', str(truncation.value),
        '--threads', str(threads.value),
    ]

    if genes_path:
        cmd.extend(['--genes', genes_path])
    if norm_path:
        cmd.extend(['--norm', norm_path])
    if not run_microdiv.value:
        cmd.append('--no_micro')
    if not run_macrodiv.value:
        cmd.append('--no_macro')
    if not run_viz.value:
        cmd.append('--no_viz')

    # NOTE(review): run_preproc is only reported in the summary below; no
    # command-line flag is derived from it, so unticking it does not change
    # what metapop runs. Confirm the intended flag before wiring it in.
    analyses = [
        label
        for enabled, label in (
            (run_preproc.value, "Preprocessing"),
            (run_microdiv.value, "Microdiversity"),
            (run_macrodiv.value, "Macrodiversity"),
            (run_viz.value, "Visualization"),
        )
        if enabled
    ]

    print("Configuration Summary:")
    print(f"  Input BAMs: {bam_dir}")
    print(f"  References: {ref_path} ({len(fasta_files)} FASTA files)")
    print(f"  Output: {out_dir}")
    print(f"  Filters: ID≥{min_pct_id.value}%, Len≥{min_length.value}bp, Cov≥{min_cov.value}%, Depth≥{min_dep.value}x")
    print(f"  Threads: {threads.value}")
    print(f"  Analyses: {', '.join(analyses)}")
    print("="*50)
    print("\nRunning MetaPop...")
    print()

    # Run the pipeline
    try:
        # The context manager closes the stdout pipe and waits for the child
        # on exit, including when printing a line raises.
        with subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # interleave stderr with stdout
            text=True,
            bufsize=1  # line-buffered so output appears as it is produced
        ) as process:
            # Stream output in real-time
            for line in process.stdout:
                print(line, end='')

        if process.returncode == 0:
            print("\n" + "="*50)
            print("✓ MetaPop pipeline completed successfully!")
            print(f"✓ Results saved to: {out_dir}")
            print()
            print("Next steps:")
            print("  1. Run the cells in Section 6 to view results")
            print("  2. Use Section 7 to download results as a ZIP file")
        else:
            print("\n" + "="*50)
            print(f"✗ MetaPop exited with error code: {process.returncode}")
            print()
            print("Check the output above for error details.")

    except FileNotFoundError:
        # Raised by Popen when the `metapop` executable is not on PATH.
        print("\n" + "="*50)
        print("Error: MetaPop command not found.")
        print()
        print("This usually means MetaPop is not installed.")
        print("Please run Cell 1 (Setup and Installation) first.")
    except Exception as e:
        print("\n" + "="*50)
        print(f"Error during pipeline execution: {e}")
        import traceback
        traceback.print_exc()

# Create run button
def _start_pipeline(_button):
    """Click handler: ignores the button it receives and starts the pipeline."""
    return run_metapop()


run_button = widgets.Button(
    description='▶ Run MetaPop Pipeline',
    button_style='success',
    tooltip='Click to start the MetaPop analysis pipeline',
    icon='play',
    layout=widgets.Layout(width='300px', height='50px'),
)
run_button.on_click(_start_pipeline)

print()
print("Click the button below to start the analysis:")
display(run_button)

In [None]:
import subprocess

def run_metapop():
    """Validate the widget inputs, build the ``metapop`` command line and run it.

    Reads the module-level widgets ``input_dir``, ``reference``, ``output_dir``,
    ``genes``, ``norm_file``, the filter widgets, the analysis checkboxes and
    ``threads``. Every validation problem is reported with ``print`` and the
    function returns early; nothing is raised to the caller.

    Returns:
        None. Progress, the streamed ``metapop`` output and the final status
        are written to the cell output.
    """
    clear_output()
    print("Starting MetaPop Pipeline...")
    print("="*50)

    # Surrounding whitespace (easy to pick up when pasting a path) would make
    # every os.path check below fail, so normalise the text fields once.
    bam_dir = (input_dir.value or '').strip()
    ref_path = (reference.value or '').strip()
    out_dir = (output_dir.value or '').strip()
    genes_path = (genes.value or '').strip()
    norm_path = (norm_file.value or '').strip()

    # Required fields get an explicit message rather than a "not found" error
    # for an empty path (and os.makedirs('') would raise for an empty output).
    if not bam_dir:
        print("Error: BAM Directory is required!")
        print("Please enter the path to your BAM files directory.")
        return

    if not ref_path:
        print("Error: Reference Directory is required!")
        print("Please enter the path to your reference FASTA files directory.")
        return

    if not out_dir:
        print("Error: Output Directory is required!")
        print("Please enter the path where you want results saved.")
        return

    # Hand MetaPop absolute paths only: section 2a of this notebook notes that
    # relative paths cause file-not-found errors during microdiversity analysis.
    bam_dir = os.path.abspath(bam_dir)
    ref_path = os.path.abspath(ref_path)
    out_dir = os.path.abspath(out_dir)
    if genes_path:
        genes_path = os.path.abspath(genes_path)
    if norm_path:
        norm_path = os.path.abspath(norm_path)

    # Validate inputs
    if not os.path.exists(bam_dir):
        print(f"Error: BAM directory not found: {bam_dir}")
        return

    if not os.path.isdir(bam_dir):
        print(f"Error: BAM path is not a directory: {bam_dir}")
        return

    # Handle reference path - MetaPop expects a directory
    if os.path.isfile(ref_path):
        parent_dir = os.path.dirname(ref_path)
        print("Warning: Reference path is a file, not a directory.")
        print(f"File: {ref_path}")
        print("MetaPop expects a directory containing FASTA files.")
        print(f"Using parent directory: {parent_dir}")
        ref_path = parent_dir
        print()

    if not os.path.exists(ref_path):
        print(f"Error: Reference directory not found: {ref_path}")
        return

    if not os.path.isdir(ref_path):
        print(f"Error: Reference path is not a directory: {ref_path}")
        return

    # Check if reference directory contains valid FASTA files
    ref_files = [f for f in os.listdir(ref_path) if os.path.isfile(os.path.join(ref_path, f))]
    fasta_extensions = ['.fasta', '.fa', '.fna', '.ffn', '.faa']
    # str.endswith accepts a tuple, which replaces the inner any(...) loop.
    fasta_files = [f for f in ref_files if f.lower().endswith(tuple(fasta_extensions))]
    compressed_files = [f for f in ref_files if f.lower().endswith('.gz')]
    bam_files = [f for f in ref_files if f.lower().endswith('.bam')]

    print(f"Checking reference directory: {ref_path}")
    print(f"  Total files: {len(ref_files)}")
    print(f"  FASTA files: {len(fasta_files)}")
    if compressed_files:
        print(f"  Compressed files (.gz): {len(compressed_files)}")
    if bam_files:
        print(f"  BAM files: {len(bam_files)}")
    print()

    # Validate reference directory
    if not fasta_files:
        print("Error: No FASTA files found in reference directory!")
        print(f"Expected files with extensions: {', '.join(fasta_extensions)}")
        print(f"Files found: {ref_files[:10]}")  # Show first 10 files
        return

    if compressed_files:
        # Only a warning: at least one uncompressed FASTA file is present.
        print("Warning: Compressed files (.gz) found in reference directory.")
        print("MetaPop cannot read compressed FASTA files directly.")
        print("Please decompress them first using: gunzip *.gz")
        print()

    if bam_files:
        print("Error: BAM files found in reference directory!")
        print("BAM files are binary and cannot be read as FASTA files.")
        print("Please use separate directories for:")
        print("  - BAM files (--input_samples)")
        print("  - Reference FASTA files (--reference)")
        return

    # Compare resolved paths so a trailing slash, a relative spelling or a
    # symlink cannot make one directory look like two different ones.
    if os.path.realpath(bam_dir) == os.path.realpath(ref_path):
        print("Error: Input BAM directory and reference directory are the same!")
        print("These must be different directories:")
        print("  - BAM directory should contain .bam files")
        print("  - Reference directory should contain .fasta/.fa/.fna files")
        return

    # Create output directory
    os.makedirs(out_dir, exist_ok=True)

    # Build command
    cmd = [
        'metapop',
        '--input_samples', bam_dir,
        '--reference', ref_path,
        '--output', out_dir,
        '--id_min', str(min_pct_id.value),
        '--min_len', str(min_length.value),
        '--min_cov', str(min_cov.value),
        '--min_dep', str(min_dep.value),
        '--trunc', str(truncation.value),
        '--threads', str(threads.value),
    ]

    # Whitespace-only optional fields are treated as empty (already stripped).
    if genes_path:
        cmd.extend(['--genes', genes_path])
    if norm_path:
        cmd.extend(['--norm', norm_path])
    if not run_microdiv.value:
        cmd.append('--no_micro')
    if not run_macrodiv.value:
        cmd.append('--no_macro')
    if not run_viz.value:
        cmd.append('--no_viz')
    # NOTE(review): the run_preproc checkbox is not consulted here; no
    # command-line flag is derived from it. Confirm the intended flag before
    # wiring it in.

    print("Configuration:")
    print(f"  Input BAMs: {bam_dir}")
    print(f"  References: {ref_path} ({len(fasta_files)} FASTA files)")
    print(f"  Output: {out_dir}")
    print(f"  Threads: {threads.value}")
    print("="*50)
    print("\nRunning MetaPop...")
    print()

    # Run the pipeline
    try:
        # The context manager closes the stdout pipe and waits for the child
        # on exit, including when printing a line raises.
        with subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # interleave stderr with stdout
            text=True,
            bufsize=1  # line-buffered so output appears as it is produced
        ) as process:
            # Stream output in real-time
            for line in process.stdout:
                print(line, end='')

        if process.returncode == 0:
            print("\n" + "="*50)
            print("MetaPop pipeline completed successfully!")
            print(f"Results saved to: {out_dir}")
        else:
            print("\n" + "="*50)
            print(f"MetaPop exited with error code: {process.returncode}")

    except FileNotFoundError:
        # Raised by Popen when the `metapop` executable is not on PATH.
        print("\nError: MetaPop command not found.")
        print("Please make sure MetaPop is installed correctly.")
        print("Try running: !pip install -e /content/metapop")
    except Exception as e:
        print(f"\nError during pipeline execution: {e}")
        import traceback
        traceback.print_exc()

# Create run button
def _launch_metapop(_button):
    """Click handler: ignores the button it receives and starts the pipeline."""
    return run_metapop()


run_button = widgets.Button(
    description='Run MetaPop',
    button_style='success',
    tooltip='Click to start the pipeline',
    icon='play',
)
run_button.on_click(_launch_metapop)

display(run_button)

## 6. View Results

After the pipeline completes, use the cells below to explore results interactively.

In [None]:
def load_results():
    """Read the MetaPop result tables found under the configured output directory.

    Returns:
        dict | None: ``None`` when ``<output_dir>/MetaPop`` does not exist.
        Otherwise a dict containing whichever of the keys ``'alpha'``,
        ``'gene_micro'`` and ``'abundance'`` had a file on disk, each mapped
        to the DataFrame read from that tab-separated file.
    """
    base_dir = os.path.join(output_dir.value, 'MetaPop')
    if not os.path.exists(base_dir):
        print(f"Results not found at: {base_dir}")
        print("Please run the pipeline first.")
        return None

    # (result key, sub-directory, file name, extra read_csv options, summary line)
    table_specs = (
        ('alpha', '11.Macrodiversity', 'Alpha_diversity_stats.tsv',
         {'index_col': 0},
         lambda frame: f"Loaded alpha diversity: {len(frame)} samples"),
        ('gene_micro', '10.Microdiversity', 'global_gene_microdiversity.tsv',
         {},
         lambda frame: f"Loaded gene microdiversity: {len(frame)} genes"),
        ('abundance', '11.Macrodiversity', 'normalized_abundances_table.tsv',
         {'index_col': 0},
         lambda frame: f"Loaded abundance matrix: {frame.shape}"),
    )

    loaded = {}
    for key, folder, filename, read_options, summarize in table_specs:
        table_path = os.path.join(base_dir, folder, filename)
        if not os.path.exists(table_path):
            # Missing tables are simply left out of the returned dict.
            continue
        loaded[key] = pd.read_csv(table_path, sep='\t', **read_options)
        print(summarize(loaded[key]))

    return loaded

# Load whichever result tables exist; `results` is None when the MetaPop
# output folder is missing, which the display cells below check for.
results = load_results()

In [None]:
# Display alpha diversity summary
if results and 'alpha' in results:
    alpha_table = results['alpha']
    print("Alpha Diversity Summary:")
    display(alpha_table.describe())

    if 'Richness' not in alpha_table.columns:
        # Without this guard the plot code below would raise a KeyError.
        print("No 'Richness' column in the alpha diversity table; skipping the plot.")
    elif PLOTLY_AVAILABLE:
        # read_csv(index_col=0) names the index after the file's first header
        # field, so reset_index() only yields a column called 'index' when that
        # field is blank. Building the plotting frame explicitly gives the
        # x column a fixed name whatever the header says.
        richness_by_sample = pd.DataFrame({
            'Sample': alpha_table.index,
            'Richness': alpha_table['Richness'].to_numpy(),
        })
        # Interactive bar chart
        fig = px.bar(richness_by_sample, x='Sample', y='Richness',
                    title='Species Richness by Sample',
                    labels={'Sample': 'Sample', 'Richness': 'Species Richness'})
        fig.show()
    else:
        # Static matplotlib plot
        plt.figure(figsize=(10, 5))
        alpha_table['Richness'].plot(kind='bar')
        plt.title('Species Richness by Sample')
        plt.ylabel('Richness')
        plt.tight_layout()
        plt.show()

In [None]:
# Display pN/pS distribution
if results and 'gene_micro' in results:
    pnps = results['gene_micro']['pNpS_ratio'].dropna()
    # Filter outliers: keep non-infinite ratios below 10.
    is_outlier = np.isinf(pnps) | ~(pnps < 10)
    pnps = pnps[~is_outlier]

    purifying_count = (pnps < 1).sum()
    positive_count = (pnps > 1).sum()

    print(f"pN/pS Statistics:")
    print(f"  Mean: {pnps.mean():.3f}")
    print(f"  Median: {pnps.median():.3f}")
    print(f"  Genes with pN/pS < 1 (purifying): {purifying_count}")
    print(f"  Genes with pN/pS > 1 (positive): {positive_count}")

    if not PLOTLY_AVAILABLE:
        # Static histogram with a dashed red line at pN/pS = 1.
        plt.figure(figsize=(10, 5))
        plt.hist(pnps, bins=50, edgecolor='black')
        plt.axvline(1, color='red', linestyle='--', label='Neutral')
        plt.title('pN/pS Distribution')
        plt.xlabel('pN/pS Ratio')
        plt.ylabel('Gene Count')
        plt.legend()
        plt.tight_layout()
        plt.show()
    else:
        # Interactive histogram with an annotated dashed red line at pN/pS = 1.
        fig = px.histogram(
            pnps,
            nbins=50,
            title='pN/pS Ratio Distribution',
            labels={'value': 'pN/pS Ratio', 'count': 'Gene Count'},
        )
        fig.add_vline(
            x=1,
            line_dash='dash',
            line_color='red',
            annotation_text='Neutral (pN/pS=1)',
        )
        fig.show()

## 7. Download Results

Download the results as a ZIP file.

In [None]:
from google.colab import files
import shutil

def download_results():
    """Zip ``<output_dir>/MetaPop`` and pass the archive to ``files.download``.

    Prints a message and does nothing else when the results folder is missing.
    The archive is always written to ``/content/metapop_results.zip``.
    """
    source_dir = os.path.join(output_dir.value, 'MetaPop')

    if os.path.exists(source_dir):
        # make_archive appends the '.zip' suffix to this base name itself.
        archive_base = '/content/metapop_results'
        shutil.make_archive(archive_base, 'zip', source_dir)

        print(f"Downloading {archive_base}.zip...")
        files.download(f'{archive_base}.zip')
    else:
        print("No results to download.")

def _on_download_clicked(_button):
    """Click handler: ignores the button it receives and starts the download."""
    return download_results()


download_button = widgets.Button(
    description='Download Results',
    button_style='info',
    tooltip='Download results as ZIP',
    icon='download',
)
download_button.on_click(_on_download_clicked)

display(download_button)

---

## Need Help?

- **Documentation**: See the MetaPop README for detailed usage instructions
- **Issues**: Report bugs at https://github.com/espickle1/metapop/issues
- **Contact**: gregory.392@osu.edu / kenji.gerhardt@gmail.com