<a href="https://colab.research.google.com/github/eoinleen/Protein-design-random/blob/main/RFdiff_design_seq_extractor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import os

def extract_designed_sequences(input_file: str, output_file: str) -> "list[SeqRecord]":
    """Extract designed sequences from RF_diffusion output and save to new FASTA file.

    Each record header is expected to begin with space-separated
    ``key:value`` tokens such as ``design:0 n:0``, optionally followed by
    ``|``-separated fields. The record's sequence is expected to contain a
    ``/``; the segment following the first ``/`` (up to the next ``/``, if
    any) is kept as the designed sequence.

    Output records get the ID ``<design>_<n>`` (e.g. ``0_0``) and an empty
    description. If the header has no ``design:`` token the design index
    defaults to ``0``. Records without an ``n:`` token or without a ``/`` in
    the sequence are skipped with a printed warning.

    Args:
        input_file: Path of the FASTA file to read.
        output_file: Path of the FASTA file to write. Its parent directory
            is created when it does not exist yet.

    Returns:
        The list of SeqRecord objects that were written to ``output_file``.

    Raises:
        FileNotFoundError: If ``input_file`` does not exist.
        ValueError: If no record yielded a designed sequence.
    """
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input file not found: {input_file}")

    # A bare filename has an empty dirname and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    extracted_sequences = []

    print(f"Processing input file: {input_file}")
    for idx, record in enumerate(SeqIO.parse(input_file, "fasta")):
        # Tokens before the first '|', e.g. ['design:0', 'n:0'].
        id_parts = record.description.split('|')[0].split()

        n_value = next(
            (p.split(':')[1] for p in id_parts if p.startswith('n:')),
            None,
        )
        if n_value is None:
            print(
                f"Warning: Could not process sequence {idx}: "
                f"no 'n:' field in header {record.description!r}"
            )
            continue

        # Use the real design index so records from different designs do not
        # end up with identical IDs; "0" keeps the previous ID for headers
        # that carry no design token.
        design_value = next(
            (p.split(':')[1] for p in id_parts if p.startswith('design:')),
            "0",
        )
        new_id = f"{design_value}_{n_value}"

        sequence_text = str(record.seq)
        if '/' not in sequence_text:
            print(f"Warning: No '/' found in sequence {record.id}")
            continue

        designed_seq = sequence_text.split('/')[1].strip()
        extracted_sequences.append(
            SeqRecord(
                Seq(designed_seq),
                id=new_id,
                description="",  # Clear description for clean output
            )
        )

    if not extracted_sequences:
        raise ValueError("No valid sequences found in input file")

    SeqIO.write(extracted_sequences, output_file, "fasta")
    print(f"Extracted {len(extracted_sequences)} sequences")
    print(f"Saved to: {output_file}")

    return extracted_sequences

if __name__ == "__main__":
    # Source FASTA produced by the design run, and the destination for the
    # extracted sequences (both under the mounted /content/drive tree).
    input_file = "/content/drive/MyDrive/Fasta-files/3NOB_90-110/3NOB_90-110_design.fasta"  # Update this path
    output_file = "/content/drive/MyDrive/Fasta-files/3NOB_90-110/extracted_sequences.fasta"

    try:
        sequences = extract_designed_sequences(input_file, output_file)

        # Preview the first three extracted records in FASTA layout.
        print("\nFirst few sequences:")
        for seq in sequences[:3]:
            print(f">{seq.id}", seq.seq, sep="\n")
    except Exception as err:
        # Report any failure as a single line rather than a traceback.
        print(f"Error: {str(err)}")

Processing input file: /content/drive/MyDrive/Fasta-files/3NOB_90-110/3NOB_90-110_design.fasta
Extracted 2048 sequences
Saved to: /content/drive/MyDrive/Fasta-files/3NOB_90-110/extracted_sequences.fasta

First few sequences:
>0_0
SLLAALEKAAKEEEAKKVTAEIEAIMSKITGKEVKLKPLSIDELEELKKLEEEVLKETDLETAEAALRELIMKILNKASDGSNKDANETVAIYIVAKIIEER
>0_1
SLLEKKEKEEAEEEAKKFTEEIEKILSELTGKKVTAKPISIEELKELKDLVTTVLEKTNLETAIDKLREKVMELLKKTTDGSDETAFETAAIRIVSKIIDKR
>0_2
AALAAAAAAAAAAAAAALTARIEAVMSEISGKPVTLKPISIAELNELLNLLTNLLEETDLETAEAALREKIMKLLKKASGGENKDINETVAIKLVSEFIDSI
