# 🧬 Protein Sequence Analysis Pipeline

A complete workflow for protein sequence embedding and analysis:

1. **FASTA Cleaning** - Clean sequences and parse metadata
2. **Embedding Generation** - Generate ESM-C embeddings
3. **Entropy Analysis** - Identify conserved and variable regions
4. **Logits Analysis** - Analyze amino acid propensities
5. **Export Results** - Save all outputs

---

In [None]:
# ============================================================
# SETUP - Run this first!
# ============================================================

print("üîß Setting up environment...\n")

# Check environment
try:
    from google.colab import files as colab_files
    IN_COLAB = True
    print("‚úÖ Running in Google Colab")
    
    import os
    import subprocess
    
    # Try to clone repository (public repos only)
    if not os.path.exists("sequence-cleaning"):
        print("üì• Cloning repository...")
        result = subprocess.run(
            ["git", "clone", "https://github.com/espickle1/sequence-cleaning.git"],
            capture_output=True, text=True
        )
        if result.returncode != 0:
            print("‚ö†Ô∏è Clone failed - repository may be private")
            print("\nüìã To fix this, either:")
            print("   1. Make your GitHub repo public, OR")
            print("   2. Download repo as ZIP, upload to Colab, and run:")
            print("      !unzip sequence-cleaning-main.zip")
            print("      import os; os.chdir('sequence-cleaning-main')")
            raise Exception("Clone failed")
    
    # Change to repo directory
    os.chdir("sequence-cleaning")
    print(f"üìÅ Working directory: {os.getcwd()}")
    
    # Install dependencies
    print("üì¶ Installing dependencies...")
    !pip install -q esm huggingface_hub ipywidgets pandas torch scikit-learn matplotlib
    
except ImportError:
    IN_COLAB = False
    print("‚úÖ Running in local environment")

# Standard imports
import sys
from pathlib import Path
from datetime import datetime
import pandas as pd
import torch

# Put the current working directory at the front of sys.path (once) so the
# pipeline packages can be imported from it
project_root = Path.cwd()
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# Probe the two pipeline packages; a failure is reported but not fatal
try:
    from embedding import fasta_cleaner
    from analysis import entropy_lib
except ImportError as e:
    print(f"‚ùå Package import failed: {e}")
    print("   Make sure you're in the sequence-cleaning directory")
else:
    print("‚úÖ Pipeline packages loaded")

# Pick the compute device and report it
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
if DEVICE == "cuda":
    print(f"‚úÖ GPU: {torch.cuda.get_device_name(0)}")
else:
    print("‚ö†Ô∏è No GPU - running on CPU")

print("\nüéâ Setup complete!")

---
## Step 1: FASTA Cleaning

Clean protein sequences and parse metadata from FASTA headers.

In [None]:
# ============================================================
# STEP 1: FASTA CLEANING
# ============================================================

from embedding.fasta_cleaner import process_fasta_files, save_results, process_fasta_content
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output

# Shared state: assigned by the upload callback, read by later cells
sequences_df = metadata_df = None

# File picker accepting one or more FASTA-like files
fasta_upload = widgets.FileUpload(
    description="Upload FASTA",
    button_style="primary",
    accept=".fasta,.fa,.faa,.txt",
    multiple=True,
)

# Output area the upload callback writes its messages and preview into
fasta_output = widgets.Output()

def on_fasta_upload(change):
    """Parse newly uploaded FASTA files and store the combined tables.

    Observer callback for the ``value`` trait of ``fasta_upload``. Each
    uploaded file is decoded as UTF-8 and passed to
    ``process_fasta_content``; the per-file sequence and metadata frames are
    concatenated and stored in the module-level ``sequences_df`` and
    ``metadata_df``. All messages and the preview are written into
    ``fasta_output``.

    Args:
        change: Trait-change dict; only ``change["new"]`` (the widget's new
            value) is read.
    """
    global sequences_df, metadata_df
    with fasta_output:
        clear_output()
        uploaded = change["new"]
        if not uploaded:
            return

        print("üîÑ Processing FASTA files...")

        # Normalise the two FileUpload value layouts to (filename, raw) pairs
        # so that decoding and parsing are written only once.
        if isinstance(uploaded, dict):
            # ipywidgets 7.x: {filename: {"metadata": ..., "content": bytes}}
            raw_files = [(name, info["content"]) for name, info in uploaded.items()]
        elif isinstance(uploaded, tuple):
            # ipywidgets 8.x: tuple of file records with .name and .content
            raw_files = [(info.name, info.content) for info in uploaded]
        else:
            print(f"‚ö†Ô∏è Unexpected upload format: {type(uploaded)}")
            print(f"   Content: {uploaded}")
            return

        all_seqs = []
        all_meta = []
        for filename, raw in raw_files:
            # ipywidgets 8.x hands over a memoryview, which has no .decode();
            # bytes() accepts both bytes and memoryview.
            content = bytes(raw).decode("utf-8")
            seq_df, meta_df = process_fasta_content(content, filename)
            all_seqs.append(seq_df)
            all_meta.append(meta_df)

        sequences_df = pd.concat(all_seqs, ignore_index=True)
        metadata_df = pd.concat(all_meta, ignore_index=True)

        print(f"‚úÖ Processed {len(sequences_df)} sequences")
        print(f"\nüìã Preview:")
        display(sequences_df.head())

# Invoke the callback whenever the upload widget's value changes
fasta_upload.observe(on_fasta_upload, names="value")

# Render heading, upload button and output area, in that order
for _view in (HTML("<h3>üìÅ Upload FASTA Files</h3>"), fasta_upload, fasta_output):
    display(_view)

In [None]:
# Write the cleaned tables to CSV in the working directory
if sequences_df is None:
    print("‚ö†Ô∏è Upload FASTA files first")
else:
    for _frame, _target in ((sequences_df, "sequences.csv"), (metadata_df, "metadata.csv")):
        _frame.to_csv(_target, index=False)
    print("‚úÖ Saved sequences.csv and metadata.csv")

---
## Step 2: Embedding Generation

Generate ESM-C protein embeddings using HuggingFace models.

In [None]:
# ============================================================
# STEP 2: EMBEDDING GENERATION
# ============================================================

from embedding.esmc_embed_lib import load_esmc_model, embed_single, save_embeddings
from huggingface_hub import login
from datetime import datetime

# Shared state, assigned by the button callbacks
model = embedding_results = None

# Input controls: access token and model choice
token_input = widgets.Password(
    description="HF Token:",
    placeholder="HuggingFace token",
    layout=widgets.Layout(width="400px"),
)

model_dropdown = widgets.Dropdown(
    description="Model:",
    options=[("ESMC 600M", "esmc_600m"), ("ESMC 300M", "esmc_300m")],
    value="esmc_600m",
)

# Action buttons
load_btn = widgets.Button(button_style="primary", description="üîê Load Model")
embed_btn = widgets.Button(button_style="success", description="üöÄ Generate Embeddings")

# Progress bar and the output area the callbacks print into
progress = widgets.IntProgress(description="Progress:", value=0, min=0, max=100)
embed_output = widgets.Output()

def run_embedding_for_sequence(model, seq_id, sequence):
    """Embed one sequence and package the outputs with its identifier.

    Calls ``embed_single`` requesting both embeddings and logits.

    Returns:
        dict with keys ``"seq_id"``, ``"embeddings"`` and ``"logits"``; the
        last two are taken from the ``embed_single`` result.
    """
    outputs = embed_single(model, sequence, return_embeddings=True, return_logits=True)
    record = {"seq_id": seq_id}
    for field in ("embeddings", "logits"):
        record[field] = outputs[field]
    return record

def on_load_click(btn):
    """Button handler: load the selected model into the global ``model``.

    Reads the token and model name from the widgets, calls
    ``load_esmc_model`` and reports success or the error text inside
    ``embed_output``. ``btn`` is unused.
    """
    global model
    with embed_output:
        clear_output()
        print("üîÑ Loading model...")
        try:
            hf_token, model_name = token_input.value, model_dropdown.value
            model = load_esmc_model(hf_token, model_name)
            print(f"‚úÖ Model loaded on {DEVICE}")
        except Exception as e:
            # UI boundary: show the problem instead of a traceback
            print(f"‚ùå Error: {e}")

def on_embed_click(btn):
    """Button handler: embed every uploaded sequence, one at a time.

    Requires a loaded ``model`` and a populated ``sequences_df``; otherwise a
    hint is printed and nothing happens. Results are collected in the global
    ``embedding_results`` dict:

    * ``sequence_id`` / ``embeddings`` / ``logits``: parallel lists with one
      entry per input row; a failed sequence keeps its id and gets ``None``
      for both embeddings and logits.
    * ``errors``: ``(sequence_id, message)`` tuples for the failures.
    * ``model_name`` / ``created_at``: selected model and ISO timestamp.

    ``btn`` is unused. Output goes to ``embed_output``.
    """
    global embedding_results
    with embed_output:
        clear_output()
        if model is None:
            print("‚ö†Ô∏è Load model first")
            return
        if sequences_df is None:
            print("‚ö†Ô∏è Upload FASTA first")
            return

        print("üîÑ Generating embeddings per sequence...")
        total = len(sequences_df)
        progress.max = total
        progress.value = 0

        # Build combined results dict for downstream steps
        embedding_results = {
            "sequence_id": [],
            "embeddings": [],
            "logits": [],
            "model_name": model_dropdown.value,
            "created_at": datetime.now().isoformat(),
            "errors": [],
        }

        # Iterate the two columns directly; only these fields are needed
        rows = zip(sequences_df["sequence_id"], sequences_df["sequence"])
        for done, (seq_id, sequence) in enumerate(rows, start=1):
            try:
                result = run_embedding_for_sequence(model, seq_id, sequence)
            except Exception as e:
                # Best effort: keep the lists aligned with the input rows and
                # carry on with the remaining sequences.
                embedding_results["sequence_id"].append(seq_id)
                embedding_results["embeddings"].append(None)
                embedding_results["logits"].append(None)
                embedding_results["errors"].append((seq_id, str(e)))
                print(f"   ‚ùå {seq_id}: {e}")
            else:
                embedding_results["sequence_id"].append(seq_id)
                embedding_results["embeddings"].append(result["embeddings"])
                embedding_results["logits"].append(result["logits"])
                print(f"   ‚úÖ {seq_id}")

            progress.value = done

        # Failed sequences are also in "sequence_id", so the success count
        # must exclude them.
        n_failed = len(embedding_results["errors"])
        print(f"\n‚úÖ Embedded {total - n_failed} sequences")
        if n_failed:
            print(f"   {n_failed} sequence(s) failed - see embedding_results['errors']")

# Connect each button to its handler
for _button, _handler in ((load_btn, on_load_click), (embed_btn, on_embed_click)):
    _button.on_click(_handler)

# Render heading, the two control rows, the progress bar and the output area
for _view in (
    HTML("<h3>üîê HuggingFace Login</h3>"),
    widgets.HBox([token_input, model_dropdown]),
    widgets.HBox([load_btn, embed_btn]),
    progress,
    embed_output,
):
    display(_view)

In [None]:
# Persist the in-memory embedding results to embeddings.pt
if embedding_results is None:
    print("‚ö†Ô∏è Generate embeddings first")
else:
    save_embeddings(embedding_results, "embeddings.pt")
    print("‚úÖ Saved embeddings.pt")

---
## Step 3: Entropy Analysis

Calculate Shannon entropy to identify conserved and variable positions.

In [None]:
# ============================================================
# STEP 3: ENTROPY ANALYSIS
# ============================================================

from analysis.entropy_lib import analyze_entropy, entropy_summary

# Load embeddings if needed.
# When Step 2 was not run in this session the name does not exist yet; treat
# that the same as "no results in memory" instead of raising NameError.
embedding_results = globals().get("embedding_results")
if embedding_results is None:
    if Path("embeddings.pt").exists():
        # NOTE(security): weights_only=False unpickles arbitrary Python
        # objects - only load an embeddings.pt that you produced yourself.
        # map_location="cpu" lets a file written on a GPU machine load on a
        # CPU-only one.
        embedding_results = torch.load(
            "embeddings.pt", map_location="cpu", weights_only=False
        )
        print("‚úÖ Loaded embeddings.pt")
    else:
        print("‚ö†Ô∏è Run Step 2 first or upload embeddings.pt")

In [None]:
# Run entropy analysis - one sequence at a time
all_entropy_results = []  # one analyze_entropy result per analysed sequence


def run_entropy_for_sequence(seq_id, logits):
    """Run ``analyze_entropy`` on a single sequence.

    The id and logits are wrapped in one-element lists under the keys
    ``"sequence_id"`` and ``"logits"``. Entropy uses base ``"e"`` with the
    10th / 90th percentiles as constrained / flexible cut-offs.

    Returns:
        Whatever ``analyze_entropy`` returns for that one-sequence input.
    """
    single_input = {"sequence_id": [seq_id], "logits": [logits]}
    settings = {
        "base": "e",
        "constrained_percentile": 10.0,
        "flexible_percentile": 90.0,
    }
    return analyze_entropy(single_input, **settings)

if embedding_results is not None:
    print("üîÑ Calculating entropy per sequence...")

    # Walk ids and logits in lockstep; entries whose logits are None are skipped
    for seq_id, logits in zip(embedding_results["sequence_id"], embedding_results["logits"]):
        if logits is not None:
            result = run_entropy_for_sequence(seq_id, logits)
            all_entropy_results.append(result)
            print(f"   ‚úÖ {seq_id}: mean entropy = {result['mean_entropy'][0]:.3f}")

    print(f"\n‚úÖ Analyzed {len(all_entropy_results)} sequences")

    # One summary line per analysed sequence
    for r in all_entropy_results:
        summary_parts = (
            f"   ‚Ä¢ {r['sequence_id'][0]}: {r['num_residues'][0]} residues, ",
            f"mean H = {r['mean_entropy'][0]:.3f}, ",
            f"constrained = {len(r['constrained_positions'][0])}, ",
            f"flexible = {len(r['flexible_positions'][0])}",
        )
        print("".join(summary_parts))

In [None]:
# Visualize entropy distribution for each sequence:
# one line plot per analysed sequence, with the constrained and flexible
# positions highlighted as scatter points.
if all_entropy_results:
    import matplotlib.pyplot as plt

    for result in all_entropy_results:
        # Tensor.numpy() only works for CPU tensors that do not require grad.
        # detach()/cpu() are no-ops in that case and make the cell work when
        # the results still live on the GPU.
        entropy_vals = result["entropy"][0].detach().float().cpu().numpy()
        seq_id = result["sequence_id"][0]

        fig, ax = plt.subplots(figsize=(12, 4))
        ax.plot(entropy_vals, alpha=0.7)
        ax.set_xlabel("Residue Position")
        ax.set_ylabel("Entropy (nats)")
        ax.set_title(f"Entropy Profile: {seq_id}")

        # Mark constrained and flexible regions; the position tensors are
        # used as integer indices into entropy_vals
        constrained = result["constrained_positions"][0].detach().long().cpu().numpy()
        flexible = result["flexible_positions"][0].detach().long().cpu().numpy()

        ax.scatter(constrained, entropy_vals[constrained], c="blue", s=10, alpha=0.5, label="Constrained")
        ax.scatter(flexible, entropy_vals[flexible], c="red", s=10, alpha=0.5, label="Flexible")
        ax.legend()

        plt.tight_layout()
        plt.show()

---
## Step 4: Logits Analysis

Analyze amino acid propensities at specific positions.

In [None]:
# ============================================================
# STEP 4: LOGITS ANALYSIS
# ============================================================

from analysis.logits_lib import analyze_residues, plot_heatmap, AA_VOCAB

def run_logits_for_sequence(seq_id, logits):
    """Run ``analyze_residues`` on one sequence, covering every index.

    Every index along the first axis of ``logits`` is labelled
    ``"Position <index + 1>"`` and passed as a residue of interest, with
    mean pooling and min-max scaling.
    NOTE(review): assumes the first axis of ``logits`` is the residue
    position - confirm against ``embed_single``'s output.

    Returns:
        Whatever ``analyze_residues`` returns for that one-sequence input.
    """
    position_labels = {}
    for index in range(logits.shape[0]):
        position_labels[index] = f"Position {index+1}"

    single_input = {"sequence_id": [seq_id], "logits": [logits]}
    return analyze_residues(
        single_input,
        residues_of_interest=position_labels,
        pool_method="mean",
        scale_method="minmax",
    )

In [None]:
# Logits analysis, one sequence at a time; results are collected as
# {"seq_id": ..., "analysis": ...} records
all_logits_analyses = []

if embedding_results is not None:
    print("üîÑ Analyzing logits per sequence...")

    # Walk ids and logits in lockstep; entries whose logits are None are skipped
    for seq_id, logits in zip(embedding_results["sequence_id"], embedding_results["logits"]):
        if logits is not None:
            analysis = run_logits_for_sequence(seq_id, logits)
            all_logits_analyses.append(dict(seq_id=seq_id, analysis=analysis))
            print(f"   ‚úÖ {seq_id}: {len(analysis['residue_labels'])} residues analyzed")

    print(f"\n‚úÖ Logits analysis complete for {len(all_logits_analyses)} sequences")

    # Show the probability table of every analysed sequence
    for item in all_logits_analyses:
        print(f"\nüìã Amino acid probabilities: {item['seq_id']}")
        display(item["analysis"]["probs"])

In [None]:
# Draw one propensity heatmap per analysed sequence
if all_logits_analyses:
    for item in all_logits_analyses:
        analysis = item["analysis"]
        # Rows are the residue labels, columns the amino-acid vocabulary
        heatmap_options = dict(
            row_labels=analysis["residue_labels"],
            col_labels=AA_VOCAB,
            title=f"Amino Acid Propensity Heatmap: {item['seq_id']}",
            figsize=(12, 5),
            cmap="coolwarm",
        )
        plot_heatmap(analysis["probs"], **heatmap_options)

---
## Step 5: Export Results

Save all analysis results.

In [None]:
# ============================================================
# STEP 5: EXPORT RESULTS
# ============================================================

from analysis.entropy_lib import save_entropy_results, entropy_summary
from analysis.logits_lib import save_analysis

import re


def _safe_filename_part(seq_id):
    """Return ``seq_id`` as text that is safe to embed in a file name.

    Each run of characters other than word characters, ``.`` and ``-`` is
    replaced by a single ``_``. FASTA identifiers often contain ``|`` or
    ``/``, which would otherwise yield invalid names or point into
    directories that do not exist. Identifiers without such characters are
    returned unchanged. Distinct identifiers that differ only in replaced
    characters map to the same name.
    """
    return re.sub(r"[^\w.\-]+", "_", str(seq_id))


output_dir = Path("results")
output_dir.mkdir(exist_ok=True)

# Save entropy results (one CSV per sequence)
if all_entropy_results:
    for r in all_entropy_results:
        seq_id = r["sequence_id"][0]
        df = entropy_summary(r)
        df.to_csv(output_dir / f"entropy_{_safe_filename_part(seq_id)}.csv", index=False)
    print(f"‚úÖ Saved {len(all_entropy_results)} entropy CSVs to results/")

# Save logits analysis (one CSV per sequence)
if all_logits_analyses:
    for item in all_logits_analyses:
        seq_id = item["seq_id"]
        save_analysis(
            item["analysis"],
            str(output_dir / f"logits_{_safe_filename_part(seq_id)}.csv"),
        )
    print(f"‚úÖ Saved {len(all_logits_analyses)} logits CSVs to results/")

# Save embeddings
if embedding_results is not None:
    save_embeddings(embedding_results, str(output_dir / "embeddings.pt"))
    print("‚úÖ Saved results/embeddings.pt")

print("\nüéâ All results saved!")

In [None]:
# Hand the results over: a zip download in Colab, otherwise just report the path
if not IN_COLAB:
    print(f"üìÅ Results saved to: {output_dir.absolute()}")
else:
    import shutil

    # Pack the results/ directory into results.zip and trigger the download
    shutil.make_archive("results", "zip", "results")
    colab_files.download("results.zip")
    print("üì• Downloading results.zip...")

---

## 📖 Pipeline Summary

This notebook orchestrates the complete protein analysis workflow:

| Step | Library | Input | Output |
|------|---------|-------|--------|
| 1. FASTA Cleaning | `embedding.fasta_cleaner` | FASTA files | `sequences.csv`, `metadata.csv` |
| 2. Embedding | `embedding.esmc_embed_lib` | `sequences.csv` | `embeddings.pt` |
| 3. Entropy | `analysis.entropy_lib` | `embeddings.pt` | `results/entropy_<sequence_id>.csv` (one per sequence) |
| 4. Logits | `analysis.logits_lib` | `embeddings.pt` | `results/logits_<sequence_id>.csv` (one per sequence), heatmaps |

### Using the libraries directly:

```python
# Import libraries
from embedding import process_fasta_files, load_esmc_model, embed_from_csv
from analysis import analyze_entropy, analyze_residues

# Process FASTA
seq_df, meta_df = process_fasta_files("proteins.fasta")

# Generate embeddings
model = load_esmc_model("hf_token")
results = embed_from_csv(model, "sequences.csv")

# Analyze
entropy = analyze_entropy(results)
logits = analyze_residues(results, residues_of_interest={100: "D100"})
```