<a href="https://colab.research.google.com/github/eoinleen/Protein-design-random/blob/main/BindCraft_independent_sort.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
Design Ranking and Consolidation Script for Protein Binder Analysis
----------------------------------------------------------------

This script ranks protein binder designs by their average interface predicted template
modeling (ipTM) score and copies the accepted PDB files into a ranked folder. The
interface PAE metrics are carried into the output table but do not affect the ranking.
It is a standalone adaptation of the ranking code from the BindCraft protein design pipeline.

Original Source:
---------------
Modified from BindCraft Colab notebook:
https://colab.research.google.com/github/martinpacesa/BindCraft/blob/main/notebooks/BindCraft.ipynb

BindCraft Publication:
https://www.biorxiv.org/content/10.1101/2024.09.30.615802v2

Required Directory Structure:
---------------------------
base_directory/
├── Accepted/           # Contains original .pdb files of accepted designs
│   └── Ranked/        # Created if it doesn't exist; stores ranked copies
└── mpnn_scores.csv    # CSV file containing design scores

Required CSV Columns:
-------------------
- Design: Name matching PDB files (before "_model")
- Average_i_pTM: Average interface predicted template modeling (ipTM) score; designs are sorted by this column, highest first
- i_pae_interaction_mean: Mean interface predicted aligned error
- i_pae_interaction_median: Median interface predicted aligned error

Functionality:
-------------
1. Creates necessary directories if they don't exist
2. Cleans existing ranked files
3. Loads and sorts designs by Average_i_pTM score
4. Creates ranked copies of PDB files with format: {rank}_{design_name}_model{number}.pdb
5. Generates final CSV with rankings and statistics

Usage:
------
1. Update base_path to point to your design directory
2. Update mpnn_csv_path to point to your scores CSV file
3. Run the script

Modified by: Claude 3.5 Sonnet (Anthropic)
Date: February 11, 2025
"""

import os
import shutil
import pandas as pd

def setup_directories(base_path):
    """Ensure the Accepted/ and Accepted/Ranked/ folders exist under base_path.

    Returns a dict mapping the keys "Accepted" and "Ranked" to the
    corresponding directory paths.
    """
    accepted_dir = os.path.join(base_path, "Accepted")
    ranked_dir = os.path.join(base_path, "Accepted", "Ranked")

    # exist_ok=True keeps repeated runs from failing on existing folders.
    for folder in (accepted_dir, ranked_dir):
        os.makedirs(folder, exist_ok=True)

    return {"Accepted": accepted_dir, "Ranked": ranked_dir}

def rank_designs(base_path, mpnn_csv_path):
    """Rank accepted designs by Average_i_pTM; write ranked PDB copies and a CSV.

    Rows of the scores CSV are sorted by ``Average_i_pTM`` (highest first).
    Each row whose ``Design`` value equals the part of an accepted PDB
    filename before the last ``_model`` receives the next rank, and that PDB
    is copied to ``Accepted/Ranked/{rank}_{design}_model{number}.pdb``. When
    several PDB files share a design name, only the first one in sorted
    filename order is used. The ranked rows are saved to
    ``final_designs_stats.csv`` inside ``base_path``.

    Args:
        base_path: Directory that holds (or will hold) the ``Accepted`` folder.
        mpnn_csv_path: Path to the CSV file containing the design scores.

    Returns:
        None. If the CSV file does not exist, an error is printed and no
        existing ranked file is removed.

    Raises:
        KeyError: If the CSV lacks one of the required columns. This is
            checked before any existing ranked file is removed.
    """
    # Setup directories
    design_paths = setup_directories(base_path)
    accepted_dir = design_paths["Accepted"]
    ranked_dir = design_paths["Ranked"]

    # Define column labels
    design_labels = ['Design', 'Average_i_pTM', 'i_pae_interaction_mean', 'i_pae_interaction_median']
    final_labels = ['Rank'] + design_labels

    # Load and validate the scores BEFORE cleaning the ranked directory, so a
    # wrong CSV path or malformed CSV cannot wipe the previous results.
    try:
        design_df = pd.read_csv(mpnn_csv_path)
    except FileNotFoundError:
        print(f"Error: Could not find CSV file at {mpnn_csv_path}")
        return

    missing = [label for label in design_labels if label not in design_df.columns]
    if missing:
        raise KeyError(
            f"CSV file {mpnn_csv_path} is missing required column(s): {missing}"
        )

    design_df = design_df.sort_values('Average_i_pTM', ascending=False)

    # Clean ranked directory; leave subdirectories alone (os.remove would
    # raise on them).
    for entry in os.listdir(ranked_dir):
        entry_path = os.path.join(ranked_dir, entry)
        if os.path.isfile(entry_path) or os.path.islink(entry_path):
            os.remove(entry_path)

    # Index accepted binders by design name once, instead of rescanning the
    # file list for every CSV row. Sorting makes the choice deterministic
    # when several models of the same design are present.
    binder_by_design = {}
    for binder in sorted(os.listdir(accepted_dir)):
        if not binder.endswith('.pdb'):
            continue
        binder_name, separator, model = binder.rpartition('_model')
        if not separator:
            print(f"Warning: Couldn't parse filename: {binder}")
            continue
        model_number = model.rsplit('.', 1)[0]
        binder_by_design.setdefault(binder_name, (binder, model_number))

    # Rank designs in score order; rows without an accepted PDB are skipped
    # and do not consume a rank.
    ranked_rows = []
    for record in design_df[design_labels].to_dict('records'):
        match = binder_by_design.get(record['Design'])
        if match is None:
            continue
        binder, model_number = match
        rank = len(ranked_rows) + 1
        ranked_rows.append({'Rank': rank, **record})

        # Copy and rename file
        new_name = f"{rank}_{record['Design']}_model{model_number}.pdb"
        shutil.copyfile(os.path.join(accepted_dir, binder),
                        os.path.join(ranked_dir, new_name))

    # Build the table in one go; concatenating onto an empty frame inside
    # the loop is quadratic and deprecated in recent pandas.
    final_df = pd.DataFrame(ranked_rows, columns=final_labels)

    # Save final rankings
    final_csv_path = os.path.join(base_path, "final_designs_stats.csv")
    final_df.to_csv(final_csv_path, index=False)

    print(f"Ranked {len(ranked_rows)} designs successfully")
    print(f"Results saved to: {final_csv_path}")
    print(f"Ranked PDB files saved to: {ranked_dir}")

if __name__ == "__main__":
    # Example invocation: both paths are placeholders and must be edited
    # to point at a real design directory and scores CSV before running.
    rank_designs(
        base_path="path/to/your/design/directory",  # Change this to your directory
        mpnn_csv_path="path/to/your/mpnn_scores.csv",  # Change this to your CSV file
    )