<a href="https://colab.research.google.com/github/eoinleen/PDB-tools/blob/main/Multi_chain_pdb_to_fasta_robust.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# Enhanced PDB to FASTA Converter for Google Colab
# =================================================
# This script extracts amino acid sequences from PDB files
# and saves them in FASTA format. It supports both Google Drive
# and direct file upload methods, with multi-chain extraction.
#
# Features:
# - Two input methods: Google Drive path or direct upload
# - Extracts sequences from specified chains or all chains
# - Handles multiple filename patterns
# - Outputs individual FASTA files per design
# - Creates a combined 'all_sequences.txt' FASTA file
# - Multi-chain support with chain-specific naming
#
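# Requirements:
# - A Google Colab runtime (google.colab is used for Drive mounting,
#   file upload and file download)
# - Biopython is installed by the %pip line below, although the parsing
#   code in this script only uses the Python standard library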
#
# Usage:
# - Choose input method (1 for Google Drive, 2 for upload)
# - Set chain preferences (specific chains or 'ALL')
# - Run the script in Google Colab
# =================================================

# ===== SETTINGS - EDIT THESE =====
# Input source: 1 mounts Google Drive and reads PDB_DIRECTORY, 2 prompts for a
# direct file upload. Any other value prints an error and stops the run.
INPUT_METHOD = 2  # 1 = Google Drive, 2 = Direct Upload
# Folder scanned for *.pdb files; the generated .txt FASTA files are written
# into this same folder.
PDB_DIRECTORY = "/content/drive/MyDrive/PDB-files/"  # Only used if INPUT_METHOD = 1
# Comma-separated chain IDs (upper-cased when parsed), or "ALL" to extract
# every chain found in each file.
CHAIN_IDS = "ALL"  # Options: "A", "A,B", "ALL" for all chains
# ================================

# Install required packages
%pip install -q biopython

# Import necessary libraries
import os
import re
import glob
from google.colab import drive
from google.colab import files
import tempfile
import shutil

# Lookup table: three-letter residue name -> one-letter amino acid code.
# Only the 20 standard amino acids are listed.
three_to_one = {
    'ALA': 'A',  # alanine
    'CYS': 'C',  # cysteine
    'ASP': 'D',  # aspartic acid
    'GLU': 'E',  # glutamic acid
    'PHE': 'F',  # phenylalanine
    'GLY': 'G',  # glycine
    'HIS': 'H',  # histidine
    'ILE': 'I',  # isoleucine
    'LYS': 'K',  # lysine
    'LEU': 'L',  # leucine
    'MET': 'M',  # methionine
    'ASN': 'N',  # asparagine
    'PRO': 'P',  # proline
    'GLN': 'Q',  # glutamine
    'ARG': 'R',  # arginine
    'SER': 'S',  # serine
    'THR': 'T',  # threonine
    'VAL': 'V',  # valine
    'TRP': 'W',  # tryptophan
    'TYR': 'Y',  # tyrosine
}

def get_all_chains_from_pdb(pdb_file):
    """Return the sorted unique chain IDs found on ATOM records of a PDB file.

    The chain ID is read from column 22 (string index 21) of every line that
    starts with ``ATOM``. Blank chain IDs are ignored.

    Args:
        pdb_file: Path of the PDB file to scan.

    Returns:
        A sorted list of one-character chain ID strings; empty when the file
        has no ATOM record with a non-blank chain ID.
    """
    chains = set()
    # errors="replace": only fixed columns of ATOM lines are inspected, so a
    # stray non-UTF-8 byte elsewhere in the file should not abort the scan.
    with open(pdb_file, 'r', encoding="utf-8", errors="replace") as f:
        for line in f:
            if not line.startswith('ATOM'):
                continue
            # Slice instead of index so a truncated ATOM line yields an empty
            # chain ID rather than raising IndexError.
            chain_id = line[21:22]
            if chain_id.strip():  # Only add non-empty chain IDs
                chains.add(chain_id)
    return sorted(chains)

def extract_sequence_from_pdb(pdb_file, chain_id):
    """Extract the one-letter amino acid sequence of one chain of a PDB file.

    Only lines starting with ``ATOM`` whose chain ID (column 22) equals
    ``chain_id`` are considered. Each residue contributes one letter, the
    first time its residue number is seen. Residue names that are not keys
    of ``three_to_one`` contribute nothing.

    Args:
        pdb_file: Path of the PDB file to read.
        chain_id: Single-character chain identifier to extract.

    Returns:
        The sequence as a string of one-letter codes; empty when the chain
        has no recognised residues.
    """
    letters = []
    seen_residues = set()

    # errors="replace": only fixed columns of ATOM lines are inspected, so a
    # stray non-UTF-8 byte elsewhere in the file should not abort parsing.
    with open(pdb_file, 'r', encoding="utf-8", errors="replace") as f:
        for line in f:
            # Slicing keeps truncated ATOM lines from raising IndexError.
            if not line.startswith('ATOM') or line[21:22] != chain_id:
                continue

            res_name = line[17:20].strip()
            # Residue number (columns 23-26) plus insertion code (column 27):
            # residues such as 52 and 52A are distinct and must both count.
            res_key = line[22:27].strip()

            if res_key in seen_residues:
                continue
            seen_residues.add(res_key)

            one_letter = three_to_one.get(res_name)
            if one_letter is not None:
                letters.append(one_letter)

    return "".join(letters)

def parse_design_number(filename):
    """Pick the design identifier out of a PDB filename.

    Tried in order: the digits after ``dldesign_``, the digits right before
    ``_bind``, the numerically largest run of digits anywhere in the name,
    and finally the filename without its extension.
    """
    for pattern in (r'dldesign_(\d+)', r'(\d+)_bind'):
        hit = re.search(pattern, filename)
        if hit is not None:
            return hit.group(1)

    digit_runs = re.findall(r'\d+', filename)
    if not digit_runs:
        # No digits at all: fall back to the bare file stem.
        return os.path.splitext(filename)[0]

    # Compared as integers, but the original text (leading zeros) is returned.
    return max(digit_runs, key=int)

def parse_sequence_number(filename):
    """Return the sequence tag embedded in a filename, or "" if there is none.

    The digits after ``_bind_`` win when present; otherwise an
    ``mpnn<N>_model<M>`` fragment is returned in that same form.
    """
    bind_hit = re.search(r'_bind_(\d+)', filename)
    if bind_hit is not None:
        return bind_hit.group(1)

    mpnn_hit = re.search(r'mpnn(\d+)_model(\d+)', filename)
    if mpnn_hit is None:
        return ""

    mpnn_idx, model_idx = mpnn_hit.groups()
    return f"mpnn{mpnn_idx}_model{model_idx}"

def setup_upload_directory():
    """Prompt for an upload and stage the uploaded ``.pdb`` files.

    Returns:
        Path of a freshly created temporary directory holding the uploaded
        files whose names end in ``.pdb``, or None when nothing was uploaded.
    """
    print("Please upload your PDB files:")
    uploaded = files.upload()

    if not uploaded:
        print("No files uploaded.")
        return None

    # Staging area for the files that will be converted
    temp_dir = tempfile.mkdtemp()

    for filename in uploaded:
        if not filename.endswith('.pdb'):
            print(f"Skipping {filename} (not a PDB file)")
            continue

        # Uploaded files are looked up under /content and moved to staging
        shutil.move(os.path.join('/content', filename),
                    os.path.join(temp_dir, filename))
        print(f"Moved {filename} to processing directory")

    return temp_dir

def parse_chain_ids(chain_input):
    """Parse the chain selection string from the settings.

    Args:
        chain_input: Either "ALL" (any case, surrounding whitespace allowed)
            or a comma-separated list of chain IDs such as "A" or "A, B".

    Returns:
        The string "ALL", or a list of upper-cased chain IDs with empty
        items removed and duplicates dropped (first occurrence wins).
    """
    cleaned = chain_input.strip().upper()
    if cleaned == "ALL":
        return "ALL"

    stripped = (chain.strip() for chain in cleaned.split(','))
    # dict.fromkeys de-duplicates while keeping the order the user typed, so
    # "A,A" does not emit the same chain twice.
    return list(dict.fromkeys(chain for chain in stripped if chain))

def convert_pdb_to_fasta(pdb_dir, chain_ids):
    """Convert every ``*.pdb`` file in a directory to FASTA text files.

    For each PDB file one ``.txt`` file is written into ``pdb_dir`` holding
    the FASTA entries of the selected chains; a combined
    ``all_sequences.txt`` with every entry is written as well. Output names
    and FASTA headers are derived from the design / sequence numbers parsed
    out of the PDB filename.

    NOTE: two PDB files that parse to the same design and sequence numbers
    map to the same output filename, so the later one overwrites the earlier.

    Args:
        pdb_dir: Directory that is scanned for ``*.pdb`` files and receives
            the output files.
        chain_ids: The string "ALL" to use every chain found in each file,
            or a list of chain ID strings.

    Returns:
        A ``(success, entries)`` tuple. ``success`` is False when ``pdb_dir``
        does not exist or contains no ``*.pdb`` file, True otherwise.
        ``entries`` is the list of FASTA entry strings that were produced
        (empty on failure or when no chain yielded a sequence).
    """
    # Failure paths return the same (bool, list) shape as the success path so
    # callers can always unpack or index the result.
    if not os.path.isdir(pdb_dir):
        print(f"Error: Directory not found: {pdb_dir}")
        return False, []

    # glob order is unspecified; sort so the combined file is reproducible.
    pdb_files = sorted(glob.glob(os.path.join(pdb_dir, "*.pdb")))
    if not pdb_files:
        print(f"No PDB files found in {pdb_dir}")
        return False, []

    print(f"Processing {len(pdb_files)} PDB files...")
    all_fasta_entries = []
    processed_count = 0

    for pdb_file in pdb_files:
        filename = os.path.basename(pdb_file)
        design_num = parse_design_number(filename)
        seq_num = parse_sequence_number(filename)
        # Shared stem for both the FASTA headers and the output filename
        base_name = f"{design_num}_{seq_num}" if seq_num else f"{design_num}"

        # Determine which chains to process
        if chain_ids == "ALL":
            chains_to_process = get_all_chains_from_pdb(pdb_file)
            print(f"Found chains {chains_to_process} in {filename}")
        else:
            chains_to_process = chain_ids

        # Chain suffixes are only added when more than one chain is requested
        multi_chain = len(chains_to_process) > 1
        file_fasta_entries = []

        for chain_id in chains_to_process:
            sequence = extract_sequence_from_pdb(pdb_file, chain_id)

            if not sequence:
                print(f"Warning: No sequence found for chain {chain_id} in {filename}")
                continue

            header = f">{base_name}_chain{chain_id}" if multi_chain else f">{base_name}"

            # Format sequence with line breaks every 60 characters
            formatted_sequence = "\n".join(
                sequence[i:i+60] for i in range(0, len(sequence), 60)
            )
            fasta_entry = header + "\n" + formatted_sequence

            file_fasta_entries.append(fasta_entry)
            all_fasta_entries.append(fasta_entry)

            print(f"Processed {filename}, chain {chain_id}: {len(sequence)} residues")

        if file_fasta_entries:
            # Save individual file
            suffix = "_allchains.txt" if multi_chain else ".txt"
            output_file = os.path.join(pdb_dir, base_name + suffix)

            with open(output_file, 'w', encoding="utf-8") as f:
                f.write("\n".join(file_fasta_entries))

            processed_count += 1

    if processed_count > 0:
        all_sequences_file = os.path.join(pdb_dir, "all_sequences.txt")
        with open(all_sequences_file, 'w', encoding="utf-8") as f:
            f.write("\n".join(all_fasta_entries))
        print(f"\nCreated combined FASTA file: {all_sequences_file}")

    print(f"\nProcessed {processed_count} PDB files.")
    # all_fasta_entries is necessarily empty when nothing was processed.
    return True, all_fasta_entries

# Main execution
# Flow: parse the settings, obtain an input directory (Drive mount or upload),
# convert every PDB file found there, then (upload mode only) stage the
# resulting .txt files under /content/fasta_output and offer them for download.
print("=== Enhanced PDB to FASTA Converter ===")
print(f"Input method: {'Google Drive' if INPUT_METHOD == 1 else 'Direct Upload'}")
print(f"Chain selection: {CHAIN_IDS}")

# Parse chain IDs
chains = parse_chain_ids(CHAIN_IDS)

# Setup input directory based on method
if INPUT_METHOD == 1:
    # Mount Google Drive
    print("\nMounting Google Drive...")
    drive.mount('/content/drive')
    pdb_directory = PDB_DIRECTORY
    print(f"Using Google Drive directory: {pdb_directory}")

elif INPUT_METHOD == 2:
    # Handle file upload
    print("\nSetting up file upload...")
    pdb_directory = setup_upload_directory()
    if pdb_directory is None:
        print("Upload failed. Exiting.")
        # exit() is an interactive-shell helper; raise SystemExit explicitly.
        raise SystemExit(1)
    print(f"Using upload directory: {pdb_directory}")

else:
    print("Invalid INPUT_METHOD. Use 1 for Google Drive or 2 for Direct Upload.")
    raise SystemExit(1)

# Run the conversion
conversion_result = convert_pdb_to_fasta(pdb_directory, chains)
# Accept both a (success, entries) tuple and a bare boolean, so a failed
# conversion is reported below instead of raising TypeError on indexing.
if isinstance(conversion_result, tuple):
    conversion_ok = bool(conversion_result[0])
else:
    conversion_ok = bool(conversion_result)

if conversion_ok:
    print("\n✅ Conversion completed successfully!")

    if INPUT_METHOD == 2:
        print("\nPreparing files for download...")

        # Copy files to /content/ directory for reliable downloading
        download_dir = "/content/fasta_output"
        os.makedirs(download_dir, exist_ok=True)

        # Copy all FASTA files to download directory
        fasta_files = glob.glob(os.path.join(pdb_directory, "*.txt"))
        copied_files = []

        for fasta_file in fasta_files:
            filename = os.path.basename(fasta_file)
            dest_path = os.path.join(download_dir, filename)
            shutil.copy2(fasta_file, dest_path)
            copied_files.append(dest_path)
            print(f"Prepared {filename} for download")

        # Try automatic download first
        print("\nAttempting automatic download...")
        download_success = []
        for file_path in copied_files:
            try:
                files.download(file_path)
                download_success.append(True)
                print(f"✓ Initiated download for {os.path.basename(file_path)}")
            except Exception as e:
                # Best effort: one failed download must not stop the others;
                # the manual options printed below remain available.
                download_success.append(False)
                print(f"✗ Error downloading {os.path.basename(file_path)}: {e}")

        # If downloads seem to fail, provide manual options
        if not all(download_success):
            print("\n⚠️  Some automatic downloads may have failed.")

        print("\n" + "="*50)
        print("📁 DOWNLOAD OPTIONS:")
        print("="*50)
        print("If files aren't in your Downloads folder, try these options:")
        print("\n1. 📂 MANUAL DOWNLOAD (Most Reliable):")
        print("   - Look at the file browser panel on the left")
        print("   - Navigate to: /content/fasta_output/")
        print("   - Right-click each file → Download")

        print("\n2. 🔄 RE-RUN DOWNLOAD:")
        print("   Run this code in a new cell:")
        print("   ```python")
        print("   from google.colab import files")
        for file_path in copied_files:
            print(f"   files.download('/content/fasta_output/{os.path.basename(file_path)}')")
        print("   ```")

        print("\n3. 📋 VIEW FILE CONTENTS:")
        print("   Run this to see file contents (for copy/paste):")
        print("   ```python")
        for file_path in copied_files:
            print(f"   with open('/content/fasta_output/{os.path.basename(file_path)}', 'r') as f:")
            print(f"       print('=== {os.path.basename(file_path)} ===')")
            print("       print(f.read())")
            print("       print('\\n')")
        print("   ```")

        print("\n4. 💾 SAVE TO GOOGLE DRIVE:")
        print("   ```python")
        print("   import shutil")
        print("   drive_folder = '/content/drive/MyDrive/FASTA_Output/'")
        print("   import os; os.makedirs(drive_folder, exist_ok=True)")
        for file_path in copied_files:
            print(f"   shutil.copy('/content/fasta_output/{os.path.basename(file_path)}', drive_folder)")
        print("   ```")

        # Don't clean up immediately so users can access files
        print("\n📌 Files are temporarily saved in: /content/fasta_output/")
        print("   (These will be deleted when the Colab session ends)")

        # Clean up original temp directory but keep download directory
        shutil.rmtree(pdb_directory)
        print("\nOriginal temporary files cleaned up.")
else:
    print("\n❌ Conversion failed.")
    if INPUT_METHOD == 2:
        # The upload staging directory is only useful for a successful run;
        # remove it here too so failed runs do not leak temp directories.
        shutil.rmtree(pdb_directory, ignore_errors=True)

=== Enhanced PDB to FASTA Converter ===
Input method: Direct Upload
Chain selection: ALL

Setting up file upload...
Please upload your PDB files:


Saving Elsa_C6_003.pdb to Elsa_C6_003.pdb
Moved Elsa_C6_003.pdb to processing directory
Using upload directory: /tmp/tmpmvx18w_g
Processing 1 PDB files...
Found chains ['A', 'B', 'C', 'D', 'E', 'F'] in Elsa_C6_003.pdb
Processed Elsa_C6_003.pdb, chain A: 90 residues
Processed Elsa_C6_003.pdb, chain B: 90 residues
Processed Elsa_C6_003.pdb, chain C: 90 residues
Processed Elsa_C6_003.pdb, chain D: 90 residues
Processed Elsa_C6_003.pdb, chain E: 90 residues
Processed Elsa_C6_003.pdb, chain F: 90 residues

Created combined FASTA file: /tmp/tmpmvx18w_g/all_sequences.txt

Processed 1 PDB files.

✅ Conversion completed successfully!

Preparing files for download...
Prepared 6_allchains.txt for download
Prepared all_sequences.txt for download

Attempting automatic download...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✓ Initiated download for 6_allchains.txt


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✓ Initiated download for all_sequences.txt

📁 DOWNLOAD OPTIONS:
If files aren't in your Downloads folder, try these options:

1. 📂 MANUAL DOWNLOAD (Most Reliable):
   - Look at the file browser panel on the left
   - Navigate to: /content/fasta_output/
   - Right-click each file → Download

2. 🔄 RE-RUN DOWNLOAD:
   Run this code in a new cell:
   ```python
   from google.colab import files
   files.download('/content/fasta_output/6_allchains.txt')
   files.download('/content/fasta_output/all_sequences.txt')
   ```

3. 📋 VIEW FILE CONTENTS:
   Run this to see file contents (for copy/paste):
   ```python
   with open('/content/fasta_output/6_allchains.txt', 'r') as f:
       print('=== 6_allchains.txt ===')
       print(f.read())
       print('\n')
   with open('/content/fasta_output/all_sequences.txt', 'r') as f:
       print('=== all_sequences.txt ===')
       print(f.read())
       print('\n')
   ```

4. 💾 SAVE TO GOOGLE DRIVE:
   ```python
   import shutil
   drive_folder = '/content