<a href="https://colab.research.google.com/github/eoinleen/PDB-tools/blob/main/robust-input_PBD_to_fasta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# PDB to FASTA Converter for Google Colab
# =================================================
# This script extracts amino acid sequences from PDB files
# and saves them in FASTA format. It is designed to work
# with diverse PDB filename formats and supports Google Colab
# integration by automatically mounting Google Drive.
#
# Features:
# - Extracts sequences from a specified chain (default: A)
# - Handles multiple filename patterns, including:
#   - 'dldesign_####' format
#   - '_bind_####' format
#   - 'mpnn##_model##' format
#   - Fallback: the largest number found in the filename is used
#     when none of the standard formats is detected
# - Outputs individual FASTA files per design
# - Creates a combined 'all_sequences.txt' FASTA file
#
# Requirements:
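# - Biopython is installed by the %pip step below (note: the code shown
#   here parses the PDB text directly and does not import Biopython)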
# - PDB files should be stored in Google Drive or local directory
#
# Usage:
# - Set 'PDB_DIRECTORY' to the path containing PDB files
# - Set 'CHAIN_ID' to the chain you want to extract (default: A)
# - Run the script in Google Colab
# =================================================

# ===== SETTINGS - EDIT THESE =====
# Directory that is searched for "*.pdb" files. The per-design FASTA files
# and the combined "all_sequences.txt" are written into this same directory.
PDB_DIRECTORY = "/content/drive/MyDrive/Evolving_hits_using_ProteinMPNN/20250303-3NOBEK/0_top_binders"
# Chain identifier to extract; it is compared against the single chain-ID
# character of each ATOM record, so it must be exactly one character.
CHAIN_ID = "A"
# ================================

# Install required packages
%pip install -q biopython

# Import necessary libraries
import os
import re
import glob
from google.colab import drive

# Mount Google Drive at /content/drive so that paths below
# /content/drive/MyDrive (such as PDB_DIRECTORY) can be read and written
drive.mount('/content/drive')

# Dictionary to convert three-letter amino acid codes to one-letter codes.
# Only the 20 standard residues are listed; any other residue name is left
# out of the extracted sequence.
three_to_one = {
    'ALA': 'A', 'CYS': 'C', 'ASP': 'D', 'GLU': 'E',
    'PHE': 'F', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I',
    'LYS': 'K', 'LEU': 'L', 'MET': 'M', 'ASN': 'N',
    'PRO': 'P', 'GLN': 'Q', 'ARG': 'R', 'SER': 'S',
    'THR': 'T', 'VAL': 'V', 'TRP': 'W', 'TYR': 'Y'
}


def extract_sequence_from_pdb(pdb_file, chain_id):
    """Extract the amino acid sequence from a specified chain in a PDB file.

    Only ``ATOM`` records are read. Each residue contributes one letter the
    first time it is seen; residues are identified by residue number plus
    insertion code, so e.g. 52 and 52A are kept as two separate residues.

    Args:
        pdb_file: Path of the PDB file to read (opened as UTF-8 text).
        chain_id: Single-character chain identifier to extract.

    Returns:
        The one-letter sequence in file order. Residues whose name is not in
        ``three_to_one`` are skipped. An empty string is returned when the
        chain has no recognised residues.
    """
    letters = []
    seen_residues = set()

    with open(pdb_file, 'r', encoding="utf-8") as f:
        for line in f:
            # A truncated ATOM record has no chain column (index 21);
            # skip it instead of failing with an IndexError.
            if not line.startswith('ATOM') or len(line) < 22:
                continue
            if line[21] != chain_id:
                continue

            res_name = line[17:20].strip()
            # Residue number (columns 23-26) plus insertion code (column 27).
            # Keying on the number alone would merge inserted residues.
            res_key = (line[22:26].strip(), line[26:27].strip())

            if res_key in seen_residues:
                continue
            seen_residues.add(res_key)

            one_letter = three_to_one.get(res_name)
            if one_letter is not None:
                letters.append(one_letter)

    return "".join(letters)


def parse_design_number(filename):
    """Return the design identifier encoded in a PDB filename.

    Lookup order:
      1. the digits following ``dldesign_``
      2. the digits immediately preceding ``_bind``
      3. the numerically largest run of digits anywhere in the name
      4. the filename without its extension, when it holds no digits
    """
    # The explicit naming schemes are tried first, in priority order.
    for pattern in (r'dldesign_(\d+)', r'(\d+)_bind'):
        hit = re.search(pattern, filename)
        if hit is not None:
            return hit.group(1)

    digit_runs = re.findall(r'\d+', filename)
    if not digit_runs:
        stem, _extension = os.path.splitext(filename)
        return stem

    # Compared by integer value but returned as the original text,
    # so leading zeros are preserved.
    return max(digit_runs, key=int)


def parse_sequence_number(filename):
    """Return the sequence tag encoded in a PDB filename, or "" if none.

    A ``_bind_<digits>`` suffix yields just the digits. Otherwise a
    ``mpnn<digits>_model<digits>`` fragment is returned in that same form.
    """
    bind_hit = re.search(r'_bind_(\d+)', filename)
    if bind_hit is not None:
        return bind_hit.group(1)

    match = re.search(r'mpnn(\d+)_model(\d+)', filename)
    if match is None:
        return ""

    return f"mpnn{match.group(1)}_model{match.group(2)}"


def convert_pdb_to_fasta(pdb_dir, chain_id):
    """Convert all PDB files in the directory to FASTA format.

    For every ``*.pdb`` file in ``pdb_dir`` the sequence of ``chain_id`` is
    extracted and written to ``<design>[_<seq>].txt`` in the same directory.
    All converted entries are also collected in ``all_sequences.txt``.
    Sequences are wrapped at 60 characters per line. Files are processed in
    sorted order so the combined file is reproducible between runs.

    Args:
        pdb_dir: Directory containing the PDB files; also the output directory.
        chain_id: Chain identifier passed to ``extract_sequence_from_pdb``.

    Returns:
        False when ``pdb_dir`` is not a directory or contains no ``*.pdb``
        files, True otherwise (even if no file yielded a sequence).
    """
    if not os.path.isdir(pdb_dir):
        print(f"Error: Directory not found: {pdb_dir}")
        return False

    # glob returns files in arbitrary order; sort for reproducible output.
    pdb_files = sorted(glob.glob(os.path.join(pdb_dir, "*.pdb")))
    if not pdb_files:
        print(f"No PDB files found in {pdb_dir}")
        return False

    print(f"Processing {len(pdb_files)} PDB files...")
    all_fasta_entries = []
    # Output file name -> PDB file that produced it, to detect collisions.
    written_by = {}

    for pdb_file in pdb_files:
        filename = os.path.basename(pdb_file)
        design_num = parse_design_number(filename)
        seq_num = parse_sequence_number(filename)
        entry_id = f"{design_num}_{seq_num}" if seq_num else f"{design_num}"

        sequence = extract_sequence_from_pdb(pdb_file, chain_id)
        if not sequence:
            print(f"Warning: No sequence found for {filename}")
            continue

        wrapped = "\n".join(sequence[i:i + 60] for i in range(0, len(sequence), 60))
        fasta_entry = f">{entry_id}\n{wrapped}"
        all_fasta_entries.append(fasta_entry)

        output_name = f"{entry_id}.txt"
        if output_name in written_by:
            # Different PDB names can reduce to the same identifier; the
            # later file still wins, but no longer silently.
            print(
                f"Warning: {filename} and {written_by[output_name]} both map to "
                f"{output_name}; the earlier output is overwritten"
            )
        written_by[output_name] = filename

        output_file = os.path.join(pdb_dir, output_name)
        with open(output_file, 'w', encoding="utf-8") as f:
            # End with a newline so FASTA files can be concatenated safely.
            f.write(fasta_entry + "\n")

        print(f"Processed {filename} -> {os.path.basename(output_file)}")

    processed_count = len(all_fasta_entries)

    if processed_count > 0:
        all_sequences_file = os.path.join(pdb_dir, "all_sequences.txt")
        with open(all_sequences_file, 'w', encoding="utf-8") as f:
            f.write("\n".join(all_fasta_entries) + "\n")
        print(f"\nCreated combined FASTA file: {all_sequences_file}")

    print(f"\nProcessed {processed_count} PDB files.")
    return True


# Main execution
# Echo the active settings first so the cell output records which
# directory and chain this run used
print(f"\nPDB Directory: {PDB_DIRECTORY}")
print(f"Chain ID: {CHAIN_ID}")

# Run the conversion; progress, warnings and errors are printed by
# convert_pdb_to_fasta itself, which returns False if the directory is
# missing or holds no *.pdb files
convert_pdb_to_fasta(PDB_DIRECTORY, CHAIN_ID)
