# 🧬 Protein Sequence Analysis Pipeline

A complete workflow for protein sequence embedding and analysis:

1. **FASTA Cleaning** - Clean sequences and parse metadata
2. **Embedding Generation** - Generate ESM-C embeddings
3. **Entropy Analysis** - Identify conserved and variable regions
4. **Logits Analysis** - Analyze amino acid propensities
5. **Export Results** - Save all outputs

---

In [None]:
# ============================================================
# SETUP
# ============================================================

print("üîß Setting up environment...\n")

# Check environment
try:
    from google.colab import files as colab_files
    IN_COLAB = True
    print("‚úÖ Running in Google Colab")
    
    # Install dependencies
    !pip install -q esm huggingface_hub ipywidgets pandas torch scikit-learn matplotlib
except ImportError:
    IN_COLAB = False
    print("‚úÖ Running in local environment")

# Standard imports
import sys
from pathlib import Path
from datetime import datetime
import pandas as pd
import torch

# Make the current working directory importable (once) so that the
# project-local packages used by later cells can be resolved.
project_root = Path.cwd()
if not any(entry == str(project_root) for entry in sys.path):
    sys.path.insert(0, str(project_root))

# Pick the compute device: CUDA when torch reports one, otherwise the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
if DEVICE == "cpu":
    print("‚ö†Ô∏è No GPU - running on CPU")
else:
    # Report the name of the first CUDA device (index 0).
    print(f"‚úÖ GPU: {torch.cuda.get_device_name(0)}")

print("\nüéâ Setup complete!")

---
## Step 1: FASTA Cleaning

Clean protein sequences and parse metadata from FASTA headers.

In [None]:
# ============================================================
# STEP 1: FASTA CLEANING
# ============================================================

from embedding.fasta_cleaner import process_fasta_files, save_results
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output

# Notebook-level state, populated by the upload callback once files arrive.
sequences_df = metadata_df = None

# File picker restricted to common FASTA extensions; several files may be
# selected in a single upload.
fasta_upload = widgets.FileUpload(
    description="Upload FASTA",
    accept=".fasta,.fa,.faa,.txt",
    multiple=True,
    button_style="primary",
)

# Output area that receives everything the upload callback prints/displays.
fasta_output = widgets.Output()

def _iter_uploaded_files(value):
    """Yield ``(name, raw_bytes)`` for every file in a ``FileUpload.value``.

    ipywidgets 7.x exposes ``value`` as a dict keyed by filename whose entries
    hold the payload under ``"content"`` (``bytes``); ipywidgets 8.x exposes a
    tuple of per-file dicts with ``"name"`` and a ``memoryview`` ``"content"``.
    Both layouts are normalised here.
    """
    if isinstance(value, dict):
        entries = ((name, info["content"]) for name, info in value.items())
    else:
        entries = ((info["name"], info["content"]) for info in value)
    for name, content in entries:
        # bytes() copies a memoryview (which has no .decode) and leaves
        # bytes payloads unchanged.
        yield name, bytes(content)


def on_fasta_upload(change):
    """Observer for ``fasta_upload.value``: parse uploads into the global frames.

    Every uploaded file is decoded as UTF-8 and handed to
    ``process_fasta_content``; the per-file sequence and metadata frames are
    concatenated into the module-level ``sequences_df`` / ``metadata_df``.
    All output is routed to ``fasta_output``.

    Args:
        change: traitlets change dict; ``change["new"]`` is the widget's new
            ``value`` (empty when the upload list was cleared).
    """
    global sequences_df, metadata_df
    with fasta_output:
        clear_output()
        if not change["new"]:
            return

        print("üîÑ Processing FASTA files...")

        # Imported lazily so the widget can be created before the parser is needed.
        from embedding.fasta_cleaner import process_fasta_content

        all_seqs = []
        all_meta = []

        for file_name, raw in _iter_uploaded_files(change["new"]):
            content = raw.decode("utf-8")
            seq_df, meta_df = process_fasta_content(content, file_name)
            all_seqs.append(seq_df)
            all_meta.append(meta_df)

        sequences_df = pd.concat(all_seqs, ignore_index=True)
        metadata_df = pd.concat(all_meta, ignore_index=True)

        print(f"‚úÖ Processed {len(sequences_df)} sequences")
        print(f"\nüìã Preview:")
        display(sequences_df.head())

# Re-run the parser whenever the widget's `value` trait changes.
fasta_upload.observe(on_fasta_upload, names="value")

# Render the heading, the upload button and the callback's output area, in order.
display(
    HTML("<h3>üìÅ Upload FASTA Files</h3>"),
    fasta_upload,
    fasta_output,
)

In [None]:
# Persist the cleaned sequences and their metadata as CSV files in the
# working directory (only once an upload has been processed).
if sequences_df is None:
    print("‚ö†Ô∏è Upload FASTA files first")
else:
    for frame, csv_name in ((sequences_df, "sequences.csv"), (metadata_df, "metadata.csv")):
        frame.to_csv(csv_name, index=False)
    print("‚úÖ Saved sequences.csv and metadata.csv")

---
## Step 2: Embedding Generation

Generate ESM-C protein embeddings using HuggingFace models.

In [None]:
# ============================================================
# STEP 2: EMBEDDING GENERATION
# ============================================================

from embedding.esmc_embed_lib import load_esmc_model, embed_sequences, save_embeddings
from huggingface_hub import login

# Notebook-level state: the loaded model and the most recent embedding run.
model = None
embedding_results = None

# --- Input widgets ---------------------------------------------------------
# Masked text field for the HuggingFace access token.
token_input = widgets.Password(
    description="HF Token:",
    placeholder="HuggingFace token",
    layout=widgets.Layout(width="400px"),
)

# Model selector; each option is a (display label, value) pair.
model_dropdown = widgets.Dropdown(
    options=[
        ("ESMC 600M", "esmc_600m"),
        ("ESMC 300M", "esmc_300m"),
    ],
    value="esmc_600m",
    description="Model:",
)

# --- Action buttons --------------------------------------------------------
load_btn = widgets.Button(button_style="primary", description="üîê Load Model")
embed_btn = widgets.Button(button_style="success", description="üöÄ Generate Embeddings")

# --- Feedback --------------------------------------------------------------
# The bar's maximum is a placeholder until an embedding run resizes it.
progress = widgets.IntProgress(description="Progress:", min=0, max=100, value=0)
embed_output = widgets.Output()

def on_load_click(btn):
    """Button handler: load the selected model into the global ``model``.

    Reads the token and model name from the widgets and calls
    ``load_esmc_model``. Messages go to ``embed_output``; any exception
    raised while loading is printed there instead of propagating.

    Args:
        btn: the clicked button (unused).
    """
    global model
    with embed_output:
        clear_output()
        print("üîÑ Loading model...")
        try:
            hf_token, model_name = token_input.value, model_dropdown.value
            model = load_esmc_model(hf_token, model_name)
            print(f"‚úÖ Model loaded on {DEVICE}")
        except Exception as err:
            print(f"‚ùå Error: {err}")

def on_embed_click(btn):
    """Button handler: embed the uploaded sequences with the loaded model.

    Requires both the global ``model`` and ``sequences_df`` to be set; if
    either is missing a hint is printed and nothing else happens. On success
    the result of ``embed_sequences`` is stored in the global
    ``embedding_results``. On failure the error is printed to ``embed_output``
    (same style as ``on_load_click``) and the previous ``embedding_results``
    is left untouched.

    Args:
        btn: the clicked button (unused).
    """
    global embedding_results
    with embed_output:
        clear_output()
        if model is None:
            print("‚ö†Ô∏è Load model first")
            return
        if sequences_df is None:
            print("‚ö†Ô∏è Upload FASTA first")
            return

        print("üîÑ Generating embeddings...")
        # Reset the bar so a second run does not start from the previous
        # run's final position, then size it to the number of sequences.
        progress.value = 0
        progress.max = len(sequences_df)

        def update_progress(current, total):
            # Only `current` is needed; the bar's maximum was set above.
            progress.value = current

        try:
            results = embed_sequences(
                model, sequences_df,
                return_embeddings=True,
                return_logits=True,
                progress_callback=update_progress
            )
        except Exception as e:
            # Report failures the same way on_load_click does.
            print(f"‚ùå Error: {e}")
            return

        embedding_results = results
        print(f"‚úÖ Embedded {len(embedding_results['sequence_id'])} sequences")

# Hook each button up to its handler.
for button, handler in ((load_btn, on_load_click), (embed_btn, on_embed_click)):
    button.on_click(handler)

# Render the Step 2 UI top to bottom: heading, inputs row, buttons row,
# progress bar, and the shared output area.
display(
    HTML("<h3>üîê HuggingFace Login</h3>"),
    widgets.HBox([token_input, model_dropdown]),
    widgets.HBox([load_btn, embed_btn]),
    progress,
    embed_output,
)

In [None]:
# Write the embedding run to embeddings.pt in the working directory, if one exists.
if embedding_results is None:
    print("‚ö†Ô∏è Generate embeddings first")
else:
    save_embeddings(embedding_results, "embeddings.pt")
    print("‚úÖ Saved embeddings.pt")

---
## Step 3: Entropy Analysis

Calculate Shannon entropy to identify conserved and variable positions.

In [None]:
# ============================================================
# STEP 3: ENTROPY ANALYSIS
# ============================================================

from analysis.entropy_lib import analyze_entropy, entropy_summary

# Load embeddings if needed.
# Step 2 may not have been run in this session (e.g. resuming from a saved
# embeddings.pt), in which case the name does not exist yet; look it up
# defensively so this cell does not fail with NameError.
embedding_results = globals().get("embedding_results")
if embedding_results is None:
    if Path("embeddings.pt").exists():
        # SECURITY: weights_only=False performs full unpickling, which can
        # execute arbitrary code. Only load an embeddings.pt you created
        # yourself or otherwise trust.
        embedding_results = torch.load(
            "embeddings.pt",
            weights_only=False,
            # On a CPU-only host, remap storages that were saved from a GPU;
            # with CUDA available keep the default placement.
            map_location=None if torch.cuda.is_available() else "cpu",
        )
        print("‚úÖ Loaded embeddings.pt")
    else:
        print("‚ö†Ô∏è Run Step 2 first or upload embeddings.pt")

In [None]:
# Compute per-sequence entropy from the embedding run, then show a summary.
# `entropy_results` is reset first so a skipped or failed run leaves it None.
entropy_results = None

if embedding_results is not None:
    print("üîÑ Calculating entropy...")

    entropy_results = analyze_entropy(
        embedding_results, base="e", constrained_percentile=10.0, flexible_percentile=90.0
    )

    # Tabular summary, followed by the headline numbers.
    df = entropy_summary(entropy_results)
    report_lines = (
        f"\n‚úÖ Analyzed {len(df)} sequences",
        f"\nüìä Global mean entropy: {entropy_results['global_mean']:.3f}",
        "\nüìã Summary:",
    )
    print(*report_lines, sep="\n")
    display(df)

In [None]:
# Plot the per-residue entropy of the first analysed sequence and highlight
# the positions flagged as constrained / flexible.
if entropy_results is not None and len(entropy_results["entropy"]) > 0:
    import matplotlib.pyplot as plt

    entropy_vals = entropy_results["entropy"][0].numpy()

    fig, ax = plt.subplots(figsize=(12, 4))
    ax.plot(entropy_vals, alpha=0.7)
    ax.set(
        xlabel="Residue Position",
        ylabel="Entropy (nats)",
        title=f"Entropy Profile: {entropy_results['sequence_id'][0]}",
    )

    # Overlay one scatter series per position class, indexing the entropy
    # profile with the stored position arrays.
    for result_key, colour, legend_label in (
        ("constrained_positions", "blue", "Constrained"),
        ("flexible_positions", "red", "Flexible"),
    ):
        positions = entropy_results[result_key][0].numpy()
        ax.scatter(positions, entropy_vals[positions], c=colour, s=10, alpha=0.5, label=legend_label)
    ax.legend()

    plt.tight_layout()
    plt.show()

---
## Step 4: Logits Analysis

Analyze amino acid propensities at specific positions.

In [None]:
# ============================================================
# STEP 4: LOGITS ANALYSIS
# ============================================================

from analysis.logits_lib import analyze_residues, plot_heatmap, AA_VOCAB

# Residues to inspect in the logits analysis (edit to suit your protein).
# Mapping: position index -> human-readable label.
residues_of_interest = {
    0: "Position 1",
    10: "Position 11",
    20: "Position 21",
    50: "Position 51",
    100: "Position 101",
}

# Echo the selection so the configuration is visible in the cell output.
print("üìã Residues of interest:")
for position in residues_of_interest:
    print(f"   ‚Ä¢ {position}: {residues_of_interest[position]}")

In [None]:
# Run the logits analysis for the configured residues and show the
# resulting probabilities. `logits_analysis` is reset first so a skipped or
# failed run leaves it None.
logits_analysis = None

if embedding_results is not None:
    print("üîÑ Analyzing logits...")

    logits_analysis = analyze_residues(
        embedding_results,
        residues_of_interest=residues_of_interest,
        pool_method="mean",
        scale_method="minmax",
    )

    print("‚úÖ Analysis complete", "\nüìã Amino acid probabilities:", sep="\n")
    display(logits_analysis["probs"])

In [None]:
# Draw the scaled logits as a heatmap: one row per residue of interest,
# one column per entry of AA_VOCAB.
if logits_analysis is not None:
    heatmap_options = {
        "row_labels": logits_analysis["residue_labels"],
        "col_labels": AA_VOCAB,
        "title": "Amino Acid Propensity Heatmap",
        "figsize": (12, 5),
        "cmap": "coolwarm",
    }
    plot_heatmap(logits_analysis["scaled_logits"], **heatmap_options)

---
## Step 5: Export Results

Save all analysis results.

In [None]:
# ============================================================
# STEP 5: EXPORT RESULTS
# ============================================================

from analysis.entropy_lib import save_entropy_results
from analysis.logits_lib import save_analysis

# All exported artifacts go into ./results (created on first use).
output_dir = Path("results")
output_dir.mkdir(exist_ok=True)

# Track what was actually written so the closing message is truthful.
saved_files = []

# Save entropy results
if entropy_results is not None:
    entropy_summary(entropy_results).to_csv(output_dir / "entropy_summary.csv", index=False)
    saved_files.append("entropy_summary.csv")
    print("‚úÖ Saved results/entropy_summary.csv")

# Save logits analysis
if logits_analysis is not None:
    save_analysis(logits_analysis, str(output_dir / "logits_analysis.csv"))
    saved_files.append("logits_analysis.csv")
    print("‚úÖ Saved results/logits_analysis.csv")

# Save embeddings
if embedding_results is not None:
    save_embeddings(embedding_results, str(output_dir / "embeddings.pt"))
    saved_files.append("embeddings.pt")
    print("‚úÖ Saved results/embeddings.pt")

# Only claim success when at least one artifact was written; previously the
# success line was printed even if every step above had been skipped.
if saved_files:
    print("\nüéâ All results saved!")
else:
    print("\n‚ö†Ô∏è Nothing to save - run the analysis steps first")

In [None]:
# Hand the results to the user: in Colab, zip the results directory and
# trigger a browser download; elsewhere just report where the files are.
if not IN_COLAB:
    print(f"üìÅ Results saved to: {output_dir.absolute()}")
else:
    import shutil

    # Produces results.zip in the working directory from the results/ folder.
    shutil.make_archive(base_name="results", format="zip", root_dir="results")
    colab_files.download("results.zip")
    print("üì• Downloading results.zip...")

---

## 📖 Pipeline Summary

This notebook orchestrates the complete protein analysis workflow:

| Step | Library | Input | Output |
|------|---------|-------|--------|
| 1. FASTA Cleaning | `embedding.fasta_cleaner` | FASTA files | `sequences.csv`, `metadata.csv` |
| 2. Embedding | `embedding.esmc_embed_lib` | `sequences.csv` | `embeddings.pt` |
| 3. Entropy | `analysis.entropy_lib` | `embeddings.pt` | `entropy_summary.csv` |
| 4. Logits | `analysis.logits_lib` | `embeddings.pt` | `logits_analysis.csv`, heatmaps |

### Using the libraries directly:

```python
# Import libraries
from embedding import process_fasta_files, load_esmc_model, embed_from_csv
from analysis import analyze_entropy, analyze_residues

# Process FASTA
seq_df, meta_df = process_fasta_files("proteins.fasta")

# Generate embeddings
model = load_esmc_model("hf_token")
results = embed_from_csv(model, "sequences.csv")

# Analyze
entropy = analyze_entropy(results)
logits = analyze_residues(results, residues_of_interest={100: "D100"})
```