<a href="https://colab.research.google.com/github/eoinleen/protein-design-final-dir/blob/main/extract_binder_fasta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
"""
RF Diffusion Sequence Extractor
------------------------------
This script extracts designed sequences from RF_diffusion output FASTA files.

Input format:
>design:X n:Y|mpnn:1.234|plddt:0.567|i_ptm:0.123|i_pae:12.345|rmsd:6.789
UBIQUITINSEQUENCE/DESIGNEDSEQUENCE

It is intended for the binder sequences in the design.fasta file written by RF_diffusion.

Output format:
>dX_nY
DESIGNEDSEQUENCE

The script:
1. Reads the input FASTA file
2. Extracts only the designed sequence part (after the '/')
3. Creates concise headers in format dX_nY (e.g., d25_n44)
4. Preserves any alignment dashes in the sequences
5. Writes output to a new FASTA file"""

# First, install the required package:
!pip install biopython

# Then run the script:

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import os

def _first_field_value(tokens, prefix):
    """Return the value of the first token starting with *prefix* (e.g. 'design:').

    Raises ValueError when no token carries that prefix.
    """
    for token in tokens:
        if token.startswith(prefix):
            return token.split(':')[1]
    raise ValueError(f"header has no '{prefix}' field")


def _concise_design_id(description):
    """Build the short 'dX_nY' identifier from a 'design:X n:Y|...' header line."""
    # Only the text before the first '|' holds the design / n tokens; the
    # remaining '|'-separated fields (mpnn, plddt, ...) are ignored.
    tokens = description.split('|')[0].split()
    design_num = _first_field_value(tokens, 'design:')
    n_num = _first_field_value(tokens, 'n:')
    return f"d{design_num}_n{n_num}"  # Will look like "d25_n44"


def extract_designed_sequences(input_file, output_file):
    """Extract designed sequences from RF_diffusion output and save to new FASTA file.

    Each input record is expected to look like::

        >design:X n:Y|mpnn:...|plddt:...
        TARGETSEQUENCE/DESIGNEDSEQUENCE

    For every record, the part of the sequence between the first and second
    '/' (i.e. the second chain) is kept, internal whitespace is removed, and
    the record is renamed to ``dX_nY``. Dashes in the sequence are preserved.
    Records whose header lacks a ``design:`` or ``n:`` token, or whose
    sequence contains no '/', are skipped with a printed warning.

    Progress and summary information is printed to stdout.

    Args:
        input_file: Path to the RF_diffusion design FASTA file.
        output_file: Path of the FASTA file to write. Its parent directory is
            created if needed; a bare filename writes to the current directory.

    Returns:
        The list of extracted ``SeqRecord`` objects, in input order.

    Raises:
        FileNotFoundError: If ``input_file`` does not exist.
        ValueError: If no record could be extracted.
    """
    # Check if input file exists
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input file not found: {input_file}")

    # Create output directory if it doesn't exist. os.path.dirname() returns ''
    # for a bare filename and os.makedirs('') raises FileNotFoundError, so only
    # create a directory when the path actually names one.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    extracted_sequences = []
    design_lengths = set()  # distinct designed-sequence lengths, dashes excluded

    # Parse the FASTA file and extract sequences
    print(f"Processing input file: {input_file}")
    for idx, record in enumerate(SeqIO.parse(input_file, "fasta")):
        try:
            new_id = _concise_design_id(record.description)
        except ValueError as e:
            # Best effort: a malformed header skips this record only.
            print(f"Warning: Could not process sequence {idx}: {e}")
            continue

        full_seq = str(record.seq)
        if '/' not in full_seq:
            print(f"Warning: No '/' found in sequence {record.id}")
            continue

        # Keep the chain after the first '/', dropping any embedded whitespace.
        designed_seq = ''.join(full_seq.split('/')[1].split())

        # Store length (excluding dashes)
        design_lengths.add(len(designed_seq.replace('-', '')))

        extracted_sequences.append(
            SeqRecord(
                Seq(designed_seq),
                id=new_id,
                description="",  # Clear description for clean output
            )
        )

    # Check if we found any sequences
    if not extracted_sequences:
        raise ValueError("No valid sequences found in input file")

    print("Sequence lengths before alignment (excluding dashes):")
    print(f"Unique lengths found: {sorted(design_lengths)}")
    # Reported for the first extracted record only.
    print(f"Raw sequence length (including dashes): {len(extracted_sequences[0].seq)}")

    # Write extracted sequences to output file
    SeqIO.write(extracted_sequences, output_file, "fasta")
    print(f"Extracted {len(extracted_sequences)} sequences")
    print(f"Saved to: {output_file}")

    return extracted_sequences

def print_example_sequences(sequences, num_examples=10):
    """Show the leading records of *sequences* on stdout as FASTA text.

    A banner line states how many records are shown (at most ``num_examples``),
    followed by a 60-character rule; each record is then printed as a
    ``>id`` line, its sequence, and a blank separator line.
    """
    shown = min(num_examples, len(sequences))
    rule = "-" * 60
    print(f"\nFirst {shown} sequences from output:")
    print(rule)
    for entry in sequences[:num_examples]:
        # Header, sequence, then an empty field so each record ends with a blank line.
        print(f">{entry.id}", f"{entry.seq}", "", sep="\n")

if __name__ == "__main__":
    # Driver: extract the designed sequences from one RF_diffusion FASTA file
    # and preview the first few results.
    #
    # No install-on-demand check is needed here: Biopython is imported at the
    # top of this cell/module, so execution only reaches this point when the
    # package is already available.

    # Input and output file paths
    input_file = "/content/drive/MyDrive/Fasta-files/3NOB_90-110/3NOB_90-110_design.fasta"  # Update this path
    output_file = "/content/drive/MyDrive/Fasta-files/3NOB_90-110/extracted_sequences.fasta"

    try:
        sequences = extract_designed_sequences(input_file, output_file)
        print_example_sequences(sequences)  # Print first 10 sequences
    except Exception as e:
        # Top-level boundary: report the failure as a single line rather than
        # a full traceback.
        print(f"Error: {str(e)}")

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85
Processing input file: /content/drive/MyDrive/Fasta-files/3NOB_90-110/3NOB_90-110_design.fasta
Sequence lengths before alignment (excluding dashes):
Unique lengths found: [102]
Raw sequence length (including dashes): 102
Extracted 2048 sequences
Saved to: /content/drive/MyDrive/Fasta-files/3NOB_90-110/extracted_sequences.fasta

First 10 sequences from output:
------------------------------------------------------------
>d0_n0
SLLAALEKAAKEEEAKKVTAEIEAIMSKITGKEVKLKPLSIDELEELKKLEEEVLKETDLETAEAALRELIMKILNKASDGSNKDANETVAIYIVAKIIEER

>d0_n1
SLLEKKEKEEAEEEAKKFTEEIEKILSELTGKKVTA