<a href="https://colab.research.google.com/github/eoinleen/Biophysics-general/blob/main/out_sc-total_analysis_ProteinMPNN_best-binders.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Integrated Protein Binder Analysis & Visualization Script

This script performs comprehensive analysis of protein binding scores from
out.sc files and generates visualizations with transparent backgrounds.

Specifically configured for Google Colab with default paths set to:
- Input: /content/drive/MyDrive/Analysis-text-files-ProteinMPNN/2025031-binder0/out.sc
- Output: /content/drive/MyDrive/Analysis-text-files-ProteinMPNN/2025031-binder0/

For Google Colab usage:
1. First mount your Google Drive:
   from google.colab import drive
   drive.mount('/content/drive')

2. Run the script with default paths:
   !python protein_analysis.py

Author: Claude
"""

import argparse
import os
import re
import subprocess
import tempfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns

# Make figures, axes and saved images render without a background fill
plt.rcParams.update({
    'figure.facecolor': 'none',
    'axes.facecolor': 'none',
    'savefig.facecolor': 'none',
})

def parse_args(argv=None):
    """Parse command line arguments.

    Args:
        argv: Optional list of argument strings (without the program name).
            When omitted, ``sys.argv[1:]`` is used.

    Returns:
        argparse.Namespace with the attributes ``file``, ``output``,
        ``number`` and ``visualize_only``.
    """
    import sys

    if argv is None:
        argv = sys.argv[1:]
        launcher = os.path.basename(sys.argv[0]) if sys.argv else ''
        in_kernel = 'ipykernel' in sys.modules or launcher.endswith('kernel_launcher.py')
        # A notebook kernel is launched as "<launcher> -f <connection>.json".
        # That "-f" collides with our own -f/--file option, so without this
        # step the kernel's connection file would be analysed as a score file.
        if in_kernel and '-f' in argv:
            pos = argv.index('-f')
            if pos + 1 < len(argv) and argv[pos + 1].endswith('.json'):
                del argv[pos:pos + 2]

    parser = argparse.ArgumentParser(description='Analyze and visualize protein binding data')
    parser.add_argument('-f', '--file', default='/content/drive/MyDrive/Analysis-text-files-ProteinMPNN/2025031-binder0/out.sc',
                       help='Input score file (out.sc format) - default is set for the specific project')
    parser.add_argument('-o', '--output', default='/content/drive/MyDrive/Analysis-text-files-ProteinMPNN/2025031-binder0',
                       help='Output directory (default: specific project directory)')
    parser.add_argument('-n', '--number', type=int, default=30, help='Number of top candidates to consider (default: 30)')
    parser.add_argument('-v', '--visualize-only', action='store_true', help='Skip analysis and only visualize existing results')
    return parser.parse_args(argv)

def _write_score_table(handle, header, frame):
    """Write the ``SCORE:`` header line, then one ``SCORE:`` line per row of *frame*."""
    value_cols = [col for col in frame.columns if col != 'description']
    handle.write(f"SCORE: {header}\n")
    for _, row in frame.iterrows():
        values = " ".join(f"{row[col]}" for col in value_cols)
        handle.write(f"SCORE: {values} {row['description']}\n")


def run_analysis(input_file, output_file, top_n):
    """Rank the designs of an out.sc score file and write a text report.

    The report lists the top ``top_n`` designs by plddt_binder (descending),
    pae_interaction (ascending) and binder_aligned_rmsd (ascending), followed
    by the designs present in all three lists, ordered by pae_interaction.

    Args:
        input_file: Path to the score file. Lines of interest start with
            ``SCORE:``; the one containing ``binder_aligned_rmsd`` is taken
            as the header. The last header column receives any trailing
            whitespace-separated text of a row.
        output_file: Path of the text report to write.
        top_n: Number of designs kept per metric. Values larger than the
            number of designs are clamped; negative values are rejected.

    Returns:
        True when the report was written, False when the input is missing,
        unreadable, lacks a usable header, or the report cannot be written.
    """
    print(f"Analyzing protein binding data from {input_file}...")

    if not os.path.exists(input_file):
        print(f"Error: File {input_file} not found.")
        return False

    # DataFrame.head(-n) means "all but the last n", which would silently
    # produce wrong rankings instead of an empty or failing result.
    if top_n < 0:
        print(f"Error: Number of top candidates must not be negative (got {top_n}).")
        return False

    try:
        with open(input_file, 'r') as f:
            lines = f.readlines()
    except (OSError, UnicodeError) as e:
        print(f"Error reading input file: {e}")
        return False

    # The header is the first SCORE: line that names the RMSD column
    header = next(
        (line.replace("SCORE:", "").strip() for line in lines
         if line.startswith("SCORE:") and "binder_aligned_rmsd" in line),
        None,
    )
    if not header:
        print("Error: Could not find header in input file.")
        return False

    columns = header.split()
    required = ['plddt_binder', 'pae_interaction', 'binder_aligned_rmsd', 'description']
    missing = [name for name in required if name not in columns]
    if missing:
        print(f"Error: Header is missing required column(s): {', '.join(missing)}")
        return False

    # Parse the data rows; anything beyond the last column is folded into it
    n_cols = len(columns)
    data = []
    skipped = 0
    for line in lines:
        if not line.startswith("SCORE:") or "binder_aligned_rmsd" in line:
            continue
        fields = line.replace("SCORE:", "").strip().split()
        if len(fields) < n_cols:
            # Truncated rows would otherwise be padded with None and counted
            skipped += 1
            continue
        data.append(fields[:n_cols - 1] + [" ".join(fields[n_cols - 1:])])

    if skipped:
        print(f"Warning: Skipped {skipped} malformed score line(s).")

    total_designs = len(data)
    if top_n > total_designs:
        top_n = total_designs
        print(f"Note: Only {total_designs} designs available, using all of them.")

    df = pd.DataFrame(data, columns=columns)

    numeric_cols = ['binder_aligned_rmsd', 'pae_binder', 'pae_interaction',
                   'pae_target', 'plddt_binder', 'plddt_target',
                   'plddt_total', 'target_aligned_rmsd', 'time']
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # One ranking per metric
    plddt_df = df.sort_values(by='plddt_binder', ascending=False).reset_index(drop=True)
    pae_df = df.sort_values(by='pae_interaction', ascending=True).reset_index(drop=True)
    rmsd_df = df.sort_values(by='binder_aligned_rmsd', ascending=True).reset_index(drop=True)

    # Designs that make the cut in all three rankings
    plddt_designs = set(plddt_df.head(top_n)['description'].tolist())
    pae_designs = set(pae_df.head(top_n)['description'].tolist())
    rmsd_designs = set(rmsd_df.head(top_n)['description'].tolist())
    combined_designs = plddt_designs.intersection(pae_designs, rmsd_designs)

    combined_df = df[df['description'].isin(combined_designs)].sort_values(by='pae_interaction', ascending=True)

    rule = "=============================================\n"
    ranked_sections = (
        (f"TOP {top_n} DESIGNS BY PLDDT_BINDER (HIGHER IS BETTER)\n", plddt_df),
        (f"TOP {top_n} DESIGNS BY PAE_INTERACTION (LOWER IS BETTER)\n", pae_df),
        (f"TOP {top_n} DESIGNS BY BINDER_ALIGNED_RMSD (LOWER IS BETTER)\n", rmsd_df),
    )

    try:
        with open(output_file, 'w') as f:
            f.write(rule)
            f.write("COMPREHENSIVE PROTEIN BINDER ANALYSIS\n")
            f.write(rule)
            f.write(f"Input file: {input_file}\n")
            f.write(f"Total designs analyzed: {total_designs}\n")
            f.write(f"Top candidates considered per metric: {top_n}\n")
            f.write(f"Date of analysis: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(rule)
            f.write("\n")

            for title, ranked in ranked_sections:
                f.write(rule)
                f.write(title)
                f.write(rule)
                _write_score_table(f, header, ranked.head(top_n))
                f.write("\n")

            f.write(rule)
            f.write(f"DESIGNS APPEARING IN ALL THREE TOP {top_n} LISTS\n")
            f.write("RANKED BY PAE_INTERACTION (LOWER IS BETTER)\n")
            f.write(rule)

            if combined_df.empty:
                f.write("No designs found in all three categories.\n")
            else:
                _write_score_table(f, header, combined_df)

                # combined_df is sorted by pae_interaction, so row 0 is the best
                best = combined_df.iloc[0]
                f.write("\n")
                f.write(rule)
                f.write("SUMMARY OF BEST OVERALL DESIGN\n")
                f.write(rule)
                f.write(f"Best overall design: {best['description']}\n")
                f.write(f"PAE_interaction: {best['pae_interaction']} (lower is better)\n")
                f.write(f"PLDDT_binder: {best['plddt_binder']} (higher is better)\n")
                f.write(f"Binder_aligned_RMSD: {best['binder_aligned_rmsd']} (lower is better)\n")
    except OSError as e:
        print(f"Error writing analysis file: {e}")
        return False

    print(f"Analysis complete! Results written to {output_file}")
    print(f"Found {len(combined_designs)} designs that appear in all three top {top_n} lists.")

    return True

def parse_scores_section(section_content):
    """Convert the text of one report section into a DataFrame.

    Returns None when the section is empty, contains the "No designs found"
    marker, or has no ``SCORE:`` line mentioning ``description``. Otherwise
    the first such line supplies the column names and every later ``SCORE:``
    line with at least that many fields becomes a row; surplus fields are
    joined into the last column. Known metric columns are coerced to numbers.
    """
    if not section_content or "No designs found" in section_content:
        return None

    rows = section_content.strip().split("\n")

    # Locate the header: first SCORE: line that names the description column
    header_pos = next(
        (pos for pos, text in enumerate(rows)
         if "SCORE:" in text and "description" in text),
        -1,
    )
    if header_pos == -1:
        return None

    columns = rows[header_pos].replace("SCORE:", "").strip().split()
    n_cols = len(columns)

    # Collect the data rows that follow the header
    records = []
    for text in rows[header_pos + 1:]:
        if not text.strip() or "SCORE:" not in text:
            continue
        fields = text.replace("SCORE:", "").strip().split()
        if len(fields) < n_cols:
            # Too short to fill every column; ignore it
            continue
        records.append(fields[:n_cols - 1] + [" ".join(fields[n_cols - 1:])])

    table = pd.DataFrame(records, columns=columns)

    # Metric columns hold text at this point; turn them into floats
    metric_names = ('binder_aligned_rmsd', 'pae_binder', 'pae_interaction',
                    'pae_target', 'plddt_binder', 'plddt_target',
                    'plddt_total', 'target_aligned_rmsd', 'time')
    for name in metric_names:
        if name in table.columns:
            table[name] = pd.to_numeric(table[name], errors='coerce')

    return table

def extract_section(content, section_title, num_lines=None):
    """Extract the body of one titled section from the analysis report.

    A section heading is a rule line of five or more ``=``, a title line,
    optionally further title lines, and a closing rule line. The body runs
    from the closing rule to the next rule line (or the end of the text).

    Args:
        content: Full text of the analysis file.
        section_title: Regular-expression fragment matched at the start of
            the first title line; the rest of that line may hold anything
            (for example a " (HIGHER IS BETTER)" suffix).
        num_lines: If truthy, keep only the first ``num_lines`` body lines.

    Returns:
        The stripped section body, or None if no such heading exists.
    """
    # Built by concatenation on purpose: inside an f-string "{5,}" is a
    # replacement field and renders as "(5,)", which broke the quantifier.
    rule = r"={5,}"
    pattern = (
        rule + r"\n"
        + section_title + r"[^\n]*\n"
        # extra title lines (e.g. "RANKED BY ...") before the closing rule
        + r"(?:(?!=)[^\n]*\n)*?"
        + rule
    )
    match = re.search(pattern, content)
    if not match:
        return None

    start_idx = match.end()
    next_section = re.search(rule, content[start_idx:])

    if next_section:
        end_idx = start_idx + next_section.start()
        section_content = content[start_idx:end_idx].strip()
    else:
        section_content = content[start_idx:].strip()

    if num_lines:
        section_content = "\n".join(section_content.split("\n")[:num_lines])

    return section_content

def create_correlation_heatmap(combined_df, output_path):
    """Save a lower-triangle heatmap of the metric correlations.

    Does nothing when ``combined_df`` is None or empty. Otherwise the
    pairwise correlation of the known metric columns present in the frame
    is drawn and written to ``output_path`` with a transparent background.
    """
    if combined_df is None or combined_df.empty:
        return

    plt.figure(figsize=(10, 8))

    # Restrict the correlation to metric columns the frame actually has
    candidate_metrics = ('binder_aligned_rmsd', 'pae_binder', 'pae_interaction',
                         'pae_target', 'plddt_binder', 'plddt_target',
                         'plddt_total', 'target_aligned_rmsd', 'time')
    present = [name for name in candidate_metrics if name in combined_df.columns]
    corr_matrix = combined_df[present].copy().corr()

    # Diverging palette: blue -> white -> red
    palette = LinearSegmentedColormap.from_list(
        "custom_diverging", ["#4363d8", "#ffffff", "#e6194B"], N=256
    )

    # Hide the cells strictly above the diagonal (they mirror the lower half)
    hide_upper = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)

    with sns.axes_style("white"):
        sns.heatmap(
            corr_matrix,
            mask=hide_upper,
            cmap=palette,
            vmax=1,
            vmin=-1,
            center=0,
            annot=True,
            fmt=".2f",
            square=True,
            linewidths=.5,
            cbar_kws={"shrink": .7},
        )

    plt.title('Correlation between Binding Metrics', fontsize=16, pad=20)
    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches='tight', transparent=True)
    plt.close()

def create_parallel_coordinates(combined_df, output_path):
    """Save a parallel-coordinates plot with one line per design.

    Does nothing when ``combined_df`` is None or empty, or when it lacks
    either the ``description`` column or every plotted metric. Metrics are
    min-max scaled; plddt columns are flipped so that lower is better on
    every axis.
    """
    if combined_df is None or combined_df.empty:
        return

    wanted = ['binder_aligned_rmsd', 'pae_interaction', 'plddt_binder',
              'plddt_total', 'description']
    present = [name for name in wanted if name in combined_df.columns]

    # Need the label column plus at least one metric to draw anything
    has_metric = bool(set(present) - set(['description']))
    has_label = 'description' in present
    if not (has_metric and has_label):
        return

    plot_df = combined_df[present].copy()

    for name in plot_df.columns:
        if name == 'description':
            continue
        lowest = plot_df[name].min()
        # The small offset keeps the division defined for constant columns
        spread = plot_df[name].max() - lowest + 1e-10
        scaled = (plot_df[name] - lowest) / spread
        # plddt is "higher is better": flip it to match the other metrics
        plot_df[name] = 1 - scaled if name.startswith('plddt') else scaled

    plt.figure(figsize=(12, 8))
    pd.plotting.parallel_coordinates(
        plot_df, 'description',
        colormap=plt.cm.viridis,
        alpha=0.7
    )

    plt.title('Parallel Coordinates Plot of Top Designs', fontsize=16)
    plt.xticks(rotation=30)
    plt.ylabel('Normalized Values (lower is better for all metrics)')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches='tight', transparent=True)
    plt.close()

def create_scatterplot_matrix(combined_df, output_path):
    """Save a scatterplot matrix (pairplot) of the main binding metrics.

    Does nothing when ``combined_df`` is None or empty, or when fewer than
    two of the plotted metric columns are present.

    Args:
        combined_df: DataFrame holding the metric columns and a
            ``description`` column.
        output_path: Destination image path; saved with a transparent
            background.
    """
    if combined_df is None or combined_df.empty:
        return

    metrics = ['binder_aligned_rmsd', 'pae_interaction', 'plddt_binder', 'plddt_total']
    available_metrics = [col for col in metrics if col in combined_df.columns]

    # A scatterplot matrix needs at least two variables
    if len(available_metrics) <= 1:
        return

    scatter_df = combined_df[available_metrics + ['description']].copy()

    # pairplot builds its own figure, so none is opened beforehand: an extra
    # plt.figure() here would never be drawn on and never be closed.
    g = sns.pairplot(scatter_df,
                    vars=available_metrics,
                    diag_kind='kde',
                    plot_kws={'alpha': 0.6, 's': 80, 'edgecolor': 'k', 'linewidth': 0.5},
                    diag_kws={'fill': True, 'alpha': 0.6})

    g.fig.suptitle('Scatterplot Matrix of Binding Metrics', fontsize=16, y=1.02)
    g.fig.tight_layout()

    g.fig.savefig(output_path, dpi=300, bbox_inches='tight', transparent=True)
    # Close exactly the figure that was created rather than "the current one"
    plt.close(g.fig)

def create_ranking_barplots(plddt_df, pae_df, rmsd_df, output_dir):
    """Save one horizontal bar chart per ranking metric.

    For each of the three frames the ten best rows by its metric are drawn
    and written to a fixed file name inside ``output_dir``. A frame that is
    None, empty, or missing its metric column is skipped.
    """
    # (frame, metric column, chart title, bar colour, higher is better, file name)
    chart_specs = (
        (plddt_df, 'plddt_binder', 'Top Designs by PLDDT_binder',
         'green', True, 'top_plddt_binder.png'),
        (pae_df, 'pae_interaction', 'Top Designs by PAE_interaction',
         'blue', False, 'top_pae_interaction.png'),
        (rmsd_df, 'binder_aligned_rmsd', 'Top Designs by Binder_aligned_RMSD',
         'purple', False, 'top_binder_rmsd.png'),
    )

    for frame, metric, title, color, higher_better, filename in chart_specs:
        if frame is None or frame.empty or metric not in frame.columns:
            continue

        # Best rows first; only ten are kept so the chart stays legible
        top_ten = frame.sort_values(by=metric, ascending=not higher_better).head(10)

        plt.figure(figsize=(12, 8))
        bars = plt.barh(top_ten['description'], top_ten[metric], color=color, alpha=0.7)

        # Print each value just past the end of its bar
        for bar in bars:
            bar_len = bar.get_width()
            label_y = bar.get_y() + bar.get_height()/2
            plt.text(bar_len + (bar_len * 0.01), label_y, f'{bar_len:.2f}',
                     ha='left', va='center')

        plt.xlabel(metric)
        plt.ylabel('Design')
        plt.title(title, fontsize=16)

        # Footer telling the reader which direction is good
        if higher_better:
            direction = "Higher is better"
        else:
            direction = "Lower is better"
        plt.figtext(0.5, 0.01, direction, ha='center', fontsize=12)

        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, filename), dpi=300,
                    bbox_inches='tight', transparent=True)
        plt.close()

def visualize_results(analysis_file, output_dir):
    """Build every plot from a previously written analysis report.

    Reads ``analysis_file``, pulls out the three per-metric ranking sections
    and the combined section, parses them, and hands the resulting frames to
    the plotting functions, which save into ``output_dir``.

    Returns:
        False if ``analysis_file`` does not exist, True otherwise.
    """
    print(f"Generating visualizations from {analysis_file}...")

    if not os.path.exists(analysis_file):
        print(f"Error: Analysis file {analysis_file} not found.")
        return False

    with open(analysis_file, 'r') as f:
        content = f.read()

    # Section title patterns, keyed by the ranking they describe
    title_patterns = {
        'plddt': "TOP .* DESIGNS BY PLDDT_BINDER",
        'pae': "TOP .* DESIGNS BY PAE_INTERACTION",
        'rmsd': "TOP .* DESIGNS BY BINDER_ALIGNED_RMSD",
        'combined': "DESIGNS APPEARING IN ALL THREE TOP .* LISTS",
    }
    frames = {
        key: parse_scores_section(extract_section(content, title))
        for key, title in title_patterns.items()
    }
    combined_df = frames['combined']

    corr_output = os.path.join(output_dir, "metric_correlations.png")
    parallel_output = os.path.join(output_dir, "parallel_coordinates.png")
    scatter_output = os.path.join(output_dir, "scatterplot_matrix.png")

    print("Generating plots...")

    # Plots 1-3 are drawn from the designs common to all three rankings
    print("Creating correlation heatmap...")
    create_correlation_heatmap(combined_df, corr_output)

    print("Creating parallel coordinates plot...")
    create_parallel_coordinates(combined_df, parallel_output)

    print("Creating scatterplot matrix...")
    create_scatterplot_matrix(combined_df, scatter_output)

    # Plots 4-6: one bar chart per ranking
    print("Creating ranking barplots...")
    create_ranking_barplots(frames['plddt'], frames['pae'], frames['rmsd'], output_dir)

    print(f"All plots saved to {output_dir}")
    print("Plots generated:")
    print(f"1. {os.path.basename(corr_output)} - Correlation heatmap between metrics")
    print(f"2. {os.path.basename(parallel_output)} - Parallel coordinates plot of top designs")
    print(f"3. {os.path.basename(scatter_output)} - Scatterplot matrix of binding metrics")
    print("4. top_plddt_binder.png - Bar chart of top designs by PLDDT score")
    print("5. top_pae_interaction.png - Bar chart of top designs by PAE interaction")
    print("6. top_binder_rmsd.png - Bar chart of top designs by binder RMSD")

    return True

def main():
    """Run the analysis and then the visualizations.

    Mounts Google Drive when running in Colab and ``/content/drive`` does
    not exist, parses the command line, optionally asks for a replacement
    input path, writes ``0_top_binders_analysis.txt`` into the output
    directory (unless ``--visualize-only`` is given) and finally generates
    the plots from that file.
    """
    # Mount Google Drive when running inside Colab and it is not there yet
    try:
        import google.colab  # noqa: F401 - imported only to detect Colab
        if not os.path.exists('/content/drive'):
            print("Mounting Google Drive...")
            from google.colab import drive
            drive.mount('/content/drive')
            print("Google Drive mounted successfully.")
    except ImportError:
        # Not in Colab, continue normally
        pass

    args = parse_args()

    # The score file is only read by the analysis step, so a missing file
    # is irrelevant (and must not trigger a prompt) with --visualize-only.
    if not args.visualize_only and not os.path.exists(args.file):
        print(f"WARNING: Input file {args.file} not found.")
        try:
            user_input = input("Would you like to specify a different path? (y/n): ")
            if user_input.strip().lower() == 'y':
                args.file = input("Enter the path to your out.sc file: ").strip()
        except EOFError:
            # No stdin attached (e.g. launched non-interactively): keep the
            # original path and let the analysis step report the problem.
            print("No interactive input available; continuing with the original path.")

    output_dir = args.output

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    analysis_file = os.path.join(output_dir, "0_top_binders_analysis.txt")

    print(f"Input file: {args.file}")
    print(f"Output directory: {output_dir}")
    print(f"Analysis file will be saved as: {analysis_file}")

    # Run analysis if not skipped; stop if it fails, since there is then
    # no fresh report to visualize
    if not args.visualize_only:
        success = run_analysis(args.file, analysis_file, args.number)
        if not success:
            return

    visualize_results(analysis_file, output_dir)

# Entry point: run the pipeline only when this module is executed directly
# (__name__ is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()

In [1]:
# Notebook cell: mount Google Drive at /content/drive (Colab only)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
