<a href="https://colab.research.google.com/github/eoinleen/protein-design-final-dir/blob/main/Extract_fasta_from_pdb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# PDB to FASTA Converter for Google Colab
# This notebook extracts amino acid sequences from PDB files and saves them in FASTA format

# ===== SETTINGS - EDIT THESE =====
# Directory containing your PDB files (edit this path)
PDB_DIRECTORY = "/content/drive/MyDrive/Evolving_hits_using_ProteinMPNN/20250303-3NOBEK/0_top_binders"
# Chain to extract (default is 'A')
CHAIN_ID = "A"
# ================================

# Install required packages
!pip install -q biopython

# Import necessary libraries
import os
import re
import glob
from google.colab import drive

# Mount Google Drive
print("Mounting Google Drive...")
drive.mount('/content/drive')

# Dictionary to convert three-letter amino acid codes to one-letter codes
three_to_one = {
    'ALA': 'A', 'CYS': 'C', 'ASP': 'D', 'GLU': 'E',
    'PHE': 'F', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I',
    'LYS': 'K', 'LEU': 'L', 'MET': 'M', 'ASN': 'N',
    'PRO': 'P', 'GLN': 'Q', 'ARG': 'R', 'SER': 'S',
    'THR': 'T', 'VAL': 'V', 'TRP': 'W', 'TYR': 'Y'
}

def extract_sequence_from_pdb(pdb_file, chain_id):
    """Extract the amino acid sequence from a specified chain in a PDB file."""
    sequence = ""
    current_res_num = None

    # Read the PDB file and extract the sequence
    with open(pdb_file, 'r') as f:
        for line in f:
            if line.startswith('ATOM') and line[21] == chain_id:
                # Extract residue information
                res_name = line[17:20].strip()
                res_num = int(line[22:26])

                # Avoid duplicate residues (only add each residue once)
                if res_num != current_res_num:
                    if res_name in three_to_one:
                        sequence += three_to_one[res_name]
                    current_res_num = res_num

    return sequence

def parse_design_number(filename):
    """Parse the design number from the filename."""
    # Extract the first number before '_bind_'
    match = re.search(r'(\d+)_bind_', filename)
    if match:
        return match.group(1)

    # If no match found, return the filename without extension
    return os.path.splitext(os.path.basename(filename))[0]

def parse_sequence_number(filename):
    """Parse the sequence number from the filename."""
    match = re.search(r'dldesign_(\d+)_', filename)
    if match:
        return match.group(1)

    # If no match found, return an empty string
    return ""

def convert_pdb_to_fasta(pdb_dir, chain_id):
    """Convert all PDB files in the directory to FASTA format."""
    # Verify the directory exists
    if not os.path.isdir(pdb_dir):
        print(f"Error: Directory not found: {pdb_dir}")
        return False

    # Get all PDB files in the directory
    pdb_files = glob.glob(os.path.join(pdb_dir, "*.pdb"))

    if not pdb_files:
        print(f"No PDB files found in {pdb_dir}")
        return False

    print(f"Found {len(pdb_files)} PDB files in {pdb_dir}")
    print(f"Extracting sequences from chain {chain_id}...")

    # Create a list to store all FASTA entries
    all_fasta_entries = []

    processed_count = 0
    for pdb_file in pdb_files:
        # Extract filename from path
        filename = os.path.basename(pdb_file)

        # Extract design and sequence numbers
        design_num = parse_design_number(filename)
        seq_num = parse_sequence_number(filename)

        # Generate FASTA header
        if seq_num:
            header = f">{design_num}_{seq_num}"
        else:
            header = f">{design_num}"

        # Extract sequence
        sequence = extract_sequence_from_pdb(pdb_file, chain_id)

        if not sequence:
            print(f"Warning: No sequence found for chain {chain_id} in {filename}")
            continue

        # Format the FASTA entry
        fasta_entry = header + "\n"
        for i in range(0, len(sequence), 60):
            fasta_entry += sequence[i:i+60] + "\n"

        # Add to the list of all entries
        all_fasta_entries.append(fasta_entry)

        # Write to individual FASTA file
        output_file = os.path.join(pdb_dir, f"{design_num}_{seq_num}.txt")
        with open(output_file, 'w') as f:
            f.write(fasta_entry)

        print(f"Processed {filename} -> {os.path.basename(output_file)}")
        processed_count += 1

    # Write all sequences to a single file
    if processed_count > 0:
        all_sequences_file = os.path.join(pdb_dir, "all_sequences.txt")
        with open(all_sequences_file, 'w') as f:
            f.write("".join(all_fasta_entries))
        print(f"\nCreated combined FASTA file: {all_sequences_file}")

    print(f"\nProcessed {processed_count} PDB files.")
    print(f"FASTA files have been saved to {pdb_dir}")
    return True

# Main execution
print(f"\nPDB Directory: {PDB_DIRECTORY}")
print(f"Chain ID: {CHAIN_ID}")

# Ask user if they want to change the default settings
change_settings = input("Do you want to change these settings? (yes/no): ").lower().strip()

if change_settings in ["yes", "y"]:
    # Get directory path
    new_dir = input(f"Enter PDB directory path (current: {PDB_DIRECTORY}): ").strip()
    if new_dir:
        PDB_DIRECTORY = new_dir

    # Get chain ID
    new_chain = input(f"Enter chain ID to extract (current: {CHAIN_ID}): ").strip()
    if new_chain:
        CHAIN_ID = new_chain

# Run the conversion
print(f"\nProcessing PDB files in {PDB_DIRECTORY}, extracting chain {CHAIN_ID}...")
success = convert_pdb_to_fasta(PDB_DIRECTORY, CHAIN_ID)

if success:
    print("\nProcess completed successfully!")
else:
    print("\nFailed to process PDB files. Please check the directory path and try again.")

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

PDB Directory: /content/drive/MyDrive/Evolving_hits_using_ProteinMPNN/20250303-3NOBEK/0_top_binders
Chain ID: A
Do you want to change these settings? (yes/no): no

Processing PDB files in /content/drive/MyDrive/Evolving_hits_using_ProteinMPNN/20250303-3NOBEK/0_top_binders, extracting chain A...
Found 13 PDB files in /content/drive/MyDrive/Evolving_hits_using_ProteinMPNN/20250303-3NOBEK/0_top_binders
Extracting sequences from chain A...
Processed 1_bind_0_dldesign_9965_af2pred.pdb -> 1_9965.txt
Processed 1_bind_0_dldesign_2056_af2pred.pdb -> 1_2056.txt
Processed 1_bind_0_dldesign_5160_af2pred.pdb -> 1_5160.txt
Processed 1_bind_0_dldesign_7384_af2pred.pdb -> 1_7384.txt
Processed 1_bind_0_dldesign_2304_af2pred.pdb -> 1_2304.txt
Processed 1_bind_0_dldesign_7578_af2pred.pdb -> 1_7578.txt
Processed 1_bind_0_dldesign_8480_af2pred.pdb -> 1_8