<a href="https://colab.research.google.com/github/eoinleen/protein-design-final-dir/blob/main/AF2_Seq_Extractor_Formatter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
"""
AF2 Sequence Extractor and Formatter
==================================
Created: January 31, 2025
Author: Claude & User
Version: 1.0

Purpose:
--------
Extracts and formats protein sequences from AF2 output files, designed specifically
for handling dual-sequence entries (sequence1/sequence2) from AF2 scoring data.
Only the second sequence (the part after the '/') is written to the output files.

Input Required:
--------------
- CSV file containing columns:
  * design (column 2): Design number
  * n (column 3): Sequence number within design
  * seq: Full sequence string in format "sequence1/sequence2"

Output Generated:
---------------
1. Individual .fasta files (in /fasta_files/):
   - Named as 'dXnY.fasta' where X=design number, Y=sequence number
   - Each file contains one sequence in FASTA format

2. Individual .txt files (in /txt_files/):
   - Named as 'dXnY.txt'
   - Content identical to FASTA files

3. Combined sequence file:
   - Named 'all_sequences.fasta'
   - Contains all sequences in FASTA format
   - Saved in same directory as input CSV

Usage:
------
1. Mount Google Drive in Colab
2. Update input_csv path to point to your AF2 scores file
3. Run script
"""

import pandas as pd
import os
from google.colab import drive

def extract_and_save_sequences(csv_path: str) -> None:
    """
    Write the second chain of every "sequence1/sequence2" entry to FASTA/TXT files.

    For each row of the CSV, the 'seq' value is split on '/'. Rows that do not
    split into exactly two parts (including rows whose 'seq' cell is empty or
    not text) are reported with a warning and skipped. For every accepted row
    the second part, stripped of surrounding whitespace, is written to:

    - <csv dir>/fasta_files/dXnY.fasta
    - <csv dir>/txt_files/dXnY.txt   (same content as the .fasta file)
    - <csv dir>/all_sequences.fasta  (all accepted records, in CSV row order)

    where X and Y are the row's 'design' and 'n' values. The combined file is
    opened in write mode, so an existing one is overwritten.

    Args:
        csv_path (str): Path to input CSV file containing AF2 scores and sequences.
            Must provide the columns 'design', 'n' and 'seq'.

    Raises:
        KeyError: If one of the required columns is missing from the CSV.
    """
    df = pd.read_csv(csv_path)

    # All outputs live next to the input CSV.
    output_dir = os.path.dirname(csv_path)
    fasta_dir = os.path.join(output_dir, 'fasta_files')
    txt_dir = os.path.join(output_dir, 'txt_files')
    os.makedirs(fasta_dir, exist_ok=True)
    os.makedirs(txt_dir, exist_ok=True)

    combined_fasta = os.path.join(output_dir, 'all_sequences.fasta')

    # Track what was really written so the summary is accurate even when
    # malformed rows are skipped.
    written = 0
    skipped = 0

    with open(combined_fasta, 'w') as combined:
        for design, n, seq in zip(df['design'], df['n'], df['seq']):
            # An empty CSV cell is read by pandas as a float NaN, which has no
            # .split(); treat any non-string value as a malformed entry.
            parts = seq.split('/') if isinstance(seq, str) else []
            if len(parts) != 2:
                print(f"Warning: Unexpected sequence format in design {design}, sequence {n}")
                skipped += 1
                continue

            second_sequence = parts[1].strip()

            # Shortened header in the format dXnY; also used as the file stem.
            header = f"d{design}n{n}"
            record = f">{header}\n{second_sequence}\n"

            # The .fasta and .txt files carry identical content.
            individual_paths = (
                os.path.join(fasta_dir, f"{header}.fasta"),
                os.path.join(txt_dir, f"{header}.txt"),
            )
            for path in individual_paths:
                with open(path, 'w') as f:
                    f.write(record)

            combined.write(record)
            written += 1

    # Print summary of files created
    print("Created:")
    print(f"- {written} individual FASTA files in {fasta_dir}")
    print(f"- {written} individual TXT files in {txt_dir}")
    print(f"- Combined FASTA file with all sequences: {combined_fasta}")
    if skipped:
        print(f"- Skipped {skipped} of {len(df)} rows with unexpected sequence format")

# Main execution: runs only when this file is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    # Mount Google Drive at /content/drive so the CSV path below is reachable
    drive.mount('/content/drive')

    # Set path to input CSV file - UPDATE THIS PATH
    # (output files are created in the same directory as this CSV)
    input_csv = '/content/drive/MyDrive/PDB-files/202501xx/3NOB-70-110-all_pdb/af2_scores.csv'

    # Run sequence extraction and file creation
    extract_and_save_sequences(input_csv)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Created:
- 2048 individual FASTA files in /content/drive/MyDrive/PDB-files/202501xx/3NOB-70-110-all_pdb/fasta_files
- 2048 individual TXT files in /content/drive/MyDrive/PDB-files/202501xx/3NOB-70-110-all_pdb/txt_files
- Combined FASTA file with all sequences: /content/drive/MyDrive/PDB-files/202501xx/3NOB-70-110-all_pdb/all_sequences.fasta
