# 🧬 FASTA File Cleaner

A simple tool to clean and consolidate FASTA files with amino acid sequences.

## How to use:
1. **Run all cells** (Runtime → Run all)
2. **Upload your FASTA file(s)** using the button below
3. **Click "Process Files"** to clean and parse your sequences
4. **Download** the resulting CSV files

---

In [None]:
# ============================================================
# SETUP - Run this cell first!
# ============================================================

import hashlib
import re
from collections import defaultdict
from typing import Dict, List, Tuple

import pandas as pd

# Check if running in Google Colab
# The import only succeeds when the google.colab package is available;
# IN_COLAB records the outcome so later cells can branch on it.
try:
    from google.colab import files as colab_files
    IN_COLAB = True
    print("‚úÖ Running in Google Colab")
except ImportError:
    IN_COLAB = False
    print("‚úÖ Running in local Jupyter environment")

# Import widgets for file upload
# If ipywidgets is missing, install it and retry the same imports.
try:
    import ipywidgets as widgets
    from IPython.display import display, HTML, clear_output
    print(f"‚úÖ Widgets loaded (ipywidgets version: {widgets.__version__})")
except ImportError:
    print("‚ö†Ô∏è ipywidgets not found. Installing...")
    # `!` is IPython shell-escape syntax, so this cell must run in a notebook
    # kernel rather than as a plain Python script.
    !pip install ipywidgets
    import ipywidgets as widgets
    from IPython.display import display, HTML, clear_output

print("\nüéâ Setup complete! Proceed to the next cell.")

In [None]:
# ============================================================
# HELPER FUNCTIONS
# ============================================================

# Canonical 20 amino acids
CANONICAL_AA = set("ACDEFGHIKLMNPQRSTVWY")


def clean_sequence(sequence: str) -> str:
    """
    Clean an amino acid sequence.

    The input is upper-cased and then filtered character by character:
    - letters in CANONICAL_AA are kept as-is
    - any other alphabetic character is replaced with an underscore
    - everything else (whitespace, newlines, digits, symbols such as
      ``*`` or ``-``) is dropped

    Returns the cleaned sequence, or an empty string if nothing survives.
    """
    # Join once instead of growing a string with ``+=`` in a loop, which can
    # degrade to quadratic time on long sequences. Whitespace needs no
    # separate removal step: it is non-alphabetic and filtered out below.
    return "".join(
        char if char in CANONICAL_AA else "_"
        for char in sequence.upper()
        if char.isalpha()
    )


def hash_sequence(sequence: str) -> str:
    """
    Derive a short, deterministic ID for a sequence.

    The sequence is encoded with ``str.encode()`` and hashed with SHA-256;
    only the first 12 hexadecimal characters of the digest are returned to
    keep the ID readable.
    """
    hasher = hashlib.sha256()
    hasher.update(sequence.encode())
    full_digest = hasher.hexdigest()
    return full_digest[:12]


# Date pattern: YYYY-MM-DD, DD/MM/YYYY, MM-DD-YYYY, etc.
_DATE_PATTERN = re.compile(
    r"\b(\d{4}[-/]\d{1,2}[-/]\d{1,2}|\d{1,2}[-/]\d{1,2}[-/]\d{2,4})\b"
)

# Short tokens made only of upper-case letters and digits are treated as
# accession numbers.
_ACCESSION_PATTERN = re.compile(r"^[A-Z0-9]{4,12}$")

# Known database prefixes to skip when finding name
_DB_PREFIXES = frozenset(
    {"sp", "tr", "gb", "ref", "emb", "dbj", "pir", "prf", "uniprot"}
)

# Candidate delimiters in priority order: the first one present in the
# header is used for the whole header.
_HEADER_DELIMITERS = ("|", ";", "/", "\t")


def _split_outside_dates(header: str, delim: str) -> List[str]:
    """Split *header* on *delim*, skipping delimiters that lie inside a date."""
    date_spans = [match.span() for match in _DATE_PATTERN.finditer(header)]
    pieces = []
    start = 0
    for pos, char in enumerate(header):
        if char != delim:
            continue
        if any(lo <= pos < hi for lo, hi in date_spans):
            continue
        pieces.append(header[start:pos])
        start = pos + 1
    pieces.append(header[start:])
    return pieces


def parse_header(header: str) -> Dict[str, str]:
    """
    Parse FASTA header to extract metadata fields.
    Handles common formats:
    - UniProt: sp|P12345|PROTEIN_NAME|date|...
    - GenBank: gb|ABC123|NAME|...
    - Custom: delimited by | ; / or tab
    Attempts to identify 'name' and 'date' fields.

    Returns a dict that always contains "original_header" (the header
    without leading ">"), "name" and "date" (empty strings when not found),
    plus "field_1", "field_2", ... for every remaining non-empty field in
    order of appearance.

    A "/" inside a date such as 12/05/2020 is never treated as a field
    delimiter, so slash-formatted dates survive when "/" is the delimiter.
    """
    # Remove leading > if present
    header = header.lstrip(">")

    result = {
        "original_header": header,
        "name": "",
        "date": ""
    }

    # Split on the first delimiter (in priority order) found in the header
    fields = [header]
    for delim in _HEADER_DELIMITERS:
        if delim in header:
            fields = [f.strip() for f in _split_outside_dates(header, delim)]
            break

    # Process fields to find name and date
    extra_fields = []
    name_found = False

    for field in fields:
        field = field.strip()
        if not field:
            continue

        # Skip database prefixes
        if field.lower() in _DB_PREFIXES:
            continue

        # Accession-like tokens seen before the name are kept as extra
        # fields rather than being mistaken for the name.
        if not name_found and _ACCESSION_PATTERN.match(field):
            extra_fields.append(field)
            continue

        # Only the first date encountered is recorded
        date_match = _DATE_PATTERN.search(field)
        if date_match and not result["date"]:
            result["date"] = date_match.group(1)
            # If field is just the date, don't add to extras
            if field == date_match.group(1):
                continue

        # First meaningful field is the name
        if not name_found:
            result["name"] = field
            name_found = True
        else:
            extra_fields.append(field)

    # Add extra fields with numbered keys
    for i, field in enumerate(extra_fields, 1):
        result[f"field_{i}"] = field

    return result


def parse_fasta(content: str) -> List[Tuple[str, str]]:
    """
    Parse FASTA format content.
    Returns list of (header, sequence) tuples.

    The header is returned without its leading ">" and the sequence is the
    concatenation of all stripped, non-blank lines up to the next header.
    Lines that appear before the first header are discarded. A header with
    no sequence lines yields an empty sequence string.

    Accepts LF, CRLF and CR-only line endings, and ignores a leading
    byte-order mark so the first record is not lost.
    """
    sequences = []
    current_header = None
    current_seq = []

    # A BOM is not whitespace for str.strip(), so it would hide the ">" of
    # the first header line if left in place.
    text = content.lstrip("\ufeff")

    # Treat every CR as a line break; the blank lines this creates for CRLF
    # input are skipped below.
    for line in text.replace("\r", "\n").split("\n"):
        line = line.strip()
        if not line:
            continue

        if line.startswith(">"):
            # Save previous sequence if exists
            if current_header is not None:
                sequences.append((current_header, "".join(current_seq)))
            current_header = line[1:]  # Remove >
            current_seq = []
        else:
            current_seq.append(line)

    # Don't forget the last sequence
    if current_header is not None:
        sequences.append((current_header, "".join(current_seq)))

    return sequences


def handle_duplicate_metadata(metadata_list: List[Dict]) -> List[Dict]:
    """
    Handle entries with identical metadata but distinct sequences.
    Appends version marker to name field (e.g., _v2, _v3).

    Entries are grouped by their ("name", "date") pair. Within a group, each
    distinct "sequence_id" gets a version number in order of first
    appearance; entries carrying the first sequence keep their name, and
    entries carrying the n-th distinct sequence get "_v<n>" appended.
    Entries that share both metadata and sequence therefore keep the same
    name.

    The dicts in *metadata_list* are modified in place and the same list is
    returned. Entries in a group of two or more must contain "sequence_id".
    """
    # Group entry positions by (name, date) to find duplicates
    groups = defaultdict(list)
    for idx, meta in enumerate(metadata_list):
        groups[(meta.get("name", ""), meta.get("date", ""))].append(idx)

    for indices in groups.values():
        if len(indices) < 2:
            continue

        # sequence_id -> version number, assigned in first-seen order
        versions = {}
        for idx in indices:
            entry = metadata_list[idx]
            version = versions.setdefault(entry["sequence_id"], len(versions) + 1)
            if version > 1:  # The first sequence keeps the plain name
                original_name = entry.get("name", "")
                entry["name"] = f"{original_name}_v{version}"

    return metadata_list


def get_file_content(data) -> str:
    """
    Extract file content as string, handling different ipywidgets versions.

    - ``str`` is returned unchanged.
    - ``bytes`` / ``bytearray`` and any object exposing ``tobytes()`` (for
      example a ``memoryview``) are decoded as UTF-8; a leading byte-order
      mark is removed.
    - Anything else is converted with ``str()``.

    Raises UnicodeDecodeError if binary content is not valid UTF-8.
    """
    if isinstance(data, str):
        return data

    if isinstance(data, (bytes, bytearray)):
        raw = bytes(data)
    elif hasattr(data, "tobytes"):
        raw = data.tobytes()
    else:
        return str(data)

    # "utf-8-sig" decodes exactly like "utf-8" but drops a leading BOM, which
    # would otherwise end up in front of the first ">" of the file.
    return raw.decode("utf-8-sig")


# Confirmation printed after the helper definitions in this cell have run.
print("‚úÖ Helper functions loaded!")

In [None]:
# ============================================================
# FILE UPLOAD WIDGET
# ============================================================

# Storage for uploaded files
# Maps filename -> decoded text content; filled by the upload handler and
# read by the processing step.
uploaded_files = {}

# Create upload widget
upload_widget = widgets.FileUpload(
    accept=".fasta,.fa,.faa,.txt",  # Accept common FASTA extensions
    multiple=True,
    description="Upload FASTA",
    button_style="primary",
    layout=widgets.Layout(width="200px")
)

# Output area for status messages
# Used as a context manager by the upload handler so its prints appear here.
upload_output = widgets.Output()

def on_upload_change(change):
    """Handle file upload events - compatible with ipywidgets 7.x and 8.x.

    *change* is the traitlets change dict for the widget's ``value``; only
    ``change["new"]`` is read. Every uploaded file is decoded with
    ``get_file_content`` and stored in ``uploaded_files`` under its filename
    (a later upload with the same name overwrites the earlier one). Status
    lines are printed into ``upload_output``, which is cleared first.
    """
    with upload_output:
        clear_output()
        new_value = change["new"]

        if not new_value:
            return

        # Normalise both value layouts into (filename, raw content) pairs.
        if isinstance(new_value, dict):
            # ipywidgets 7.x: dict keyed by filename. Each entry is normally
            # a dict with a 'content' key; a bare payload is also accepted.
            entries = []
            for filename, file_data in new_value.items():
                if isinstance(file_data, dict) and "content" in file_data:
                    entries.append((filename, file_data["content"]))
                else:
                    entries.append((filename, file_data))
        elif isinstance(new_value, (list, tuple)):
            # ipywidgets 8.x: tuple/list of dicts carrying 'name' and
            # 'content'. Non-dict items are ignored.
            entries = [
                (file_info.get("name", "unknown"), file_info.get("content", b""))
                for file_info in new_value
                if isinstance(file_info, dict)
            ]
        else:
            entries = []

        for name, raw in entries:
            content = get_file_content(raw)
            uploaded_files[name] = content
            # NOTE: the figure is the length of the decoded text.
            print(f"üìÑ Uploaded: {name} ({len(content):,} bytes)")

        print(f"\nüìÅ Total files ready: {len(uploaded_files)}")

# Invoke the handler whenever the widget's `value` trait changes.
upload_widget.observe(on_upload_change, names="value")

# Display
# Heading and hint first, then the button, then the status area beneath it.
display(HTML("<h3>üì§ Step 1: Upload Your FASTA Files</h3>"))
display(HTML("<p>Click the button below to select one or more FASTA files:</p>"))
display(upload_widget)
display(upload_output)

In [None]:
# ============================================================
# PROCESS FILES
# ============================================================

# Output area for processing
process_output = widgets.Output()

# Storage for results
# Both stay None until processing has run; the download handlers check for
# None (and for emptiness) before exporting.
sequences_df = None
metadata_df = None


def process_files(btn):
    """Process all uploaded FASTA files.

    For every file in ``uploaded_files`` the content is parsed, each
    sequence is cleaned, and identical cleaned sequences are stored once
    under a shared ``sequence_id``. One metadata row is produced per FASTA
    record whose cleaned sequence is non-empty. The results are stored in
    the module-level ``sequences_df`` and ``metadata_df`` and previewed in
    ``process_output``.

    *btn* is the clicked button passed by ipywidgets; it is not used.

    If no record yields a non-empty sequence, both DataFrames are set to
    empty frames (with their standard columns) and a warning is printed
    instead of the previews.
    """
    global sequences_df, metadata_df

    with process_output:
        clear_output()

        if not uploaded_files:
            print("‚ö†Ô∏è No files uploaded! Please upload FASTA files first.")
            return

        print("üîÑ Processing files...\n")

        all_sequences = []  # dicts: {sequence_id, sequence, length}
        all_metadata = []   # dicts: {sequence_id, original_header, name, date, ...}
        seen_sequences = {} # cleaned_seq -> sequence_id (for deduplication)

        for filename, content in uploaded_files.items():
            print(f"üìÑ Processing: {filename}")
            parsed = parse_fasta(content)
            print(f"   Found {len(parsed)} sequences")

            for header, raw_seq in parsed:
                # Clean sequence
                cleaned = clean_sequence(raw_seq)

                if not cleaned:
                    print(f"   ‚ö†Ô∏è Skipping empty sequence: {header[:50]}...")
                    continue

                # Get or create sequence ID
                if cleaned in seen_sequences:
                    seq_id = seen_sequences[cleaned]
                else:
                    seq_id = hash_sequence(cleaned)
                    seen_sequences[cleaned] = seq_id
                    all_sequences.append({
                        "sequence_id": seq_id,
                        "sequence": cleaned,
                        "length": len(cleaned)
                    })

                # Parse metadata
                meta = parse_header(header)
                meta["sequence_id"] = seq_id
                meta["source_file"] = filename
                all_metadata.append(meta)

        # Handle duplicate metadata with different sequences
        all_metadata = handle_duplicate_metadata(all_metadata)

        # Create DataFrames. Explicit columns keep the sequences frame
        # well-formed even when no record survived cleaning.
        sequences_df = pd.DataFrame(
            all_sequences, columns=["sequence_id", "sequence", "length"]
        )
        metadata_df = pd.DataFrame(all_metadata)

        # Reorder metadata columns. reindex() is used instead of
        # []-selection because an empty frame has no columns and []-selection
        # would raise KeyError.
        priority_cols = ["sequence_id", "original_header", "name", "date", "source_file"]
        other_cols = [c for c in metadata_df.columns if c not in priority_cols]
        metadata_df = metadata_df.reindex(columns=priority_cols + other_cols)

        if metadata_df.empty:
            print("\nNo valid sequences were found in the uploaded files.")
            return

        print("\n" + "="*50)
        print("‚úÖ PROCESSING COMPLETE!")
        print("="*50)
        print("\nüìä Results:")
        print(f"   ‚Ä¢ Unique sequences: {len(sequences_df)}")
        print(f"   ‚Ä¢ Metadata entries: {len(metadata_df)}")

        # Show preview
        print("\nüìã Sequences Preview:")
        display(sequences_df.head())

        print("\nüìã Metadata Preview:")
        display(metadata_df.head())

        print("\nüëá Proceed to the next cell to download the CSV files.")


# Create process button
process_btn = widgets.Button(
    description="üî¨ Process Files",
    button_style="success",
    layout=widgets.Layout(width="200px", height="40px")
)
# ipywidgets passes the clicked button to the callback as its only argument.
process_btn.on_click(process_files)

# Display
# Heading and hint first, then the button, then the output area beneath it.
display(HTML("<h3>‚öôÔ∏è Step 2: Process Your Files</h3>"))
display(HTML("<p>Click the button below to clean and parse your sequences:</p>"))
display(process_btn)
display(process_output)

In [None]:
# ============================================================
# DOWNLOAD RESULTS
# ============================================================

# Output area that receives the status messages of the download handlers.
download_output = widgets.Output()


def _export_csv(df, filename, empty_message, clear=True):
    """Write *df* to *filename* as CSV and report the result in download_output.

    Prints *empty_message* and writes nothing when *df* is None or empty.
    In Colab a browser download of the written file is triggered as well.
    With ``clear=False`` earlier messages in the output area are kept.
    """
    with download_output:
        if clear:
            clear_output()
        if df is None or df.empty:
            print(empty_message)
            return

        # The file is written in both environments; only Colab needs the
        # extra step that hands it to the browser.
        df.to_csv(filename, index=False)
        if IN_COLAB:
            colab_files.download(filename)
            print(f"‚úÖ Downloading {filename}...")
        else:
            print(f"‚úÖ Saved to: {filename}")


def download_sequences(btn):
    """Download sequences CSV.

    *btn* is the clicked button passed by ipywidgets; it is not used.
    """
    _export_csv(
        sequences_df,
        "sequences.csv",
        "‚ö†Ô∏è No sequences to download. Process files first!",
    )


def download_metadata(btn):
    """Download metadata CSV.

    *btn* is the clicked button passed by ipywidgets; it is not used.
    """
    _export_csv(
        metadata_df,
        "metadata.csv",
        "‚ö†Ô∏è No metadata to download. Process files first!",
    )


def download_both(btn):
    """Download both CSV files.

    The output area is cleared once, before the first export, so the status
    lines of both exports remain visible.
    """
    _export_csv(
        sequences_df,
        "sequences.csv",
        "‚ö†Ô∏è No sequences to download. Process files first!",
    )
    _export_csv(
        metadata_df,
        "metadata.csv",
        "‚ö†Ô∏è No metadata to download. Process files first!",
        clear=False,
    )


# Create download buttons
# One button per CSV plus a third that exports both.
seq_btn = widgets.Button(
    description="üì• Sequences CSV",
    button_style="info",
    layout=widgets.Layout(width="150px")
)
seq_btn.on_click(download_sequences)

meta_btn = widgets.Button(
    description="üì• Metadata CSV",
    button_style="info",
    layout=widgets.Layout(width="150px")
)
meta_btn.on_click(download_metadata)

both_btn = widgets.Button(
    description="üì• Download Both",
    button_style="warning",
    layout=widgets.Layout(width="150px")
)
both_btn.on_click(download_both)

# Display
# The three buttons share one row; status messages appear beneath them.
display(HTML("<h3>üíæ Step 3: Download Your Results</h3>"))
display(HTML("<p>Click the buttons below to download your cleaned data:</p>"))
display(widgets.HBox([seq_btn, meta_btn, both_btn]))
display(download_output)

---

## 📖 Output File Descriptions

### `sequences.csv`
| Column | Description |
|--------|-------------|
| `sequence_id` | Unique 12-character hash ID for each sequence |
| `sequence` | Cleaned amino acid sequence (non-canonical AAs replaced with `_`) |
| `length` | Number of amino acids in the sequence |

### `metadata.csv`
| Column | Description |
|--------|-------------|
| `sequence_id` | Links to `sequences.csv` |
| `original_header` | Original FASTA header line |
| `name` | Extracted protein/sequence name |
| `date` | Extracted date (if present) |
| `source_file` | Original filename |
| `field_N` | Additional parsed fields from header |

> **Note:** Empty cells indicate missing data in the original header. These are intentionally preserved as empty.