<a href="https://colab.research.google.com/github/eoinleen/protein-design-final-dir/blob/main/BindCraft_out_analysis_v2_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
"""
Protein Design Analysis Script
=============================

This script analyzes protein design data from BindCraft and AF2-ProteinMPNN, creating both
visualization outputs and detailed analysis files.

Input:
------
- final_design_stats.csv: Raw data file containing protein design statistics
  Located in: /content/drive/MyDrive/BindCraft/UB2-K11/3NOB/Att1-iso/

Outputs:
--------
1. CSV File (protein_analysis_complete.csv):
   - Sorted by iPTM score (descending)
   - Contains metrics and their quality assessments side by side
   - Quality categories: Excellent, Good, Moderate

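2. Visualization (correlation_analysis.pptx):
   - Single slide (A4 landscape dimensions) holding one image with a 3x2 grid
     (3 rows x 2 columns) of correlation plots
   - All metrics plotted against iPTM scores
   - Publication-quality formatting
   - Color-coded points based on iPTM and pLDDT thresholds

3. Additional plots (all_correlation_analyses.pptx, *_correlation_plots.png):
   - One slide per metric, with that metric plotted against the other metrics
   - The PNG behind every slide is also saved in the same directory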

Quality Score Thresholds:
------------------------
iPTM Score:
- Excellent: ≥ 0.80
- Good: ≥ 0.75
- Moderate: < 0.75

Binding Energy (dG):
- Excellent: ≤ -65 kcal/mol
- Good: ≤ -55 kcal/mol
- Moderate: > -55 kcal/mol

pLDDT Score:
- Excellent: ≥ 0.90
- Good: ≥ 0.85
- Moderate: < 0.85

Packing Statistics:
- Excellent: ≥ 0.60
- Good: ≥ 0.50
- Moderate: < 0.50

Shape Complementarity:
- Excellent: ≥ 0.65
- Good: ≥ 0.60
- Moderate: < 0.60

Dependencies:
------------
- python-pptx
- seaborn
- matplotlib
- pandas
- numpy

Usage:
------
1. Mount Google Drive
2. Ensure input CSV exists in specified directory
3. Run script to generate analysis files and visualizations

Author: Claude
Date: February 14, 2025
"""

# Install required packages
!pip install seaborn python-pptx

import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
import numpy as np
from pptx import Presentation
from pptx.util import Inches

# Mount Google Drive so the BindCraft output folder is reachable
drive.mount('/content/drive')

# Folder that holds the input CSV and receives every output file
directory = '/content/drive/MyDrive/BindCraft/UB2-K11/3NOB/Att1-iso/'

# Read the raw design statistics
csv_path = os.path.join(directory, 'final_design_stats.csv')
df = pd.read_csv(csv_path)

# Columns carried into the summary, in the order they appear in the output
columns_to_keep = [
    'Design',
    'Average_i_pTM',
    'Average_dG',
    'Average_pLDDT',
    'Average_PackStat',
    'Average_ShapeComplementarity',
    'Average_n_InterfaceHbonds',
    'Average_dSASA'
]

# Fail early with one message listing every absent column, instead of a
# KeyError for whichever column happens to be touched first further down
missing_columns = [name for name in columns_to_keep if name not in df.columns]
if missing_columns:
    raise KeyError(f"{csv_path} is missing required column(s): {missing_columns}")

# Best iPTM first. A stable sort keeps designs with equal iPTM in their
# original file order, so the Rank assigned later is predictable for ties.
df_sorted = df.sort_values('Average_i_pTM', ascending=False, kind='mergesort')

# Independent copy so columns added later never touch the raw frame
df_summary = df_sorted[columns_to_keep].copy()

# Define color and marker coding function
def assign_style(row):
    """Return the ``(color, marker)`` pair used to draw one design.

    Each of ``row['Average_i_pTM']`` (cut-offs 0.8 / 0.75) and
    ``row['Average_pLDDT']`` (cut-offs 0.9 / 0.85) is placed in a tier:
    2 = at or above the upper cut-off, 1 = at or above the lower cut-off,
    0 = below both (values that compare false everywhere, such as NaN, also
    land in tier 0).

    - both tier 2           -> ('blue', 'o')
    - one tier 2, one tier 1 -> ('purple', 's')
    - both tier 1           -> ('red', 'o')
    - any tier 0            -> ('black', 'x')
    """
    def tier(value, upper, lower):
        if value >= upper:
            return 2
        return 1 if value >= lower else 0

    fallback = ('black', 'x')

    iptm_tier = tier(row['Average_i_pTM'], 0.8, 0.75)
    if iptm_tier == 0:
        # pLDDT is not consulted once iPTM has already ruled the design out
        return fallback

    plddt_tier = tier(row['Average_pLDDT'], 0.9, 0.85)
    if plddt_tier == 0:
        return fallback

    style_by_total = {
        4: ('blue', 'o'),    # very high quality
        3: ('purple', 's'),  # high in one metric
        2: ('red', 'o'),     # medium quality
    }
    return style_by_total[iptm_tier + plddt_tier]

# Add style columns.
# The (color, marker) pairs are built row by row and assigned positionally,
# which also works when the table has no rows (the previous apply/tolist
# round trip could not build the two columns from an empty result).
style_pairs = [assign_style(row) for _, row in df_summary.iterrows()]
df_summary['color'] = [color for color, _marker in style_pairs]
df_summary['marker'] = [marker for _color, marker in style_pairs]

# Scoring functions
def score_dG(x):
    """Grade a binding energy: lower is better.

    Returns 'Excellent' for x <= -65, 'Good' for x <= -55, otherwise 'Moderate'.
    """
    for cutoff, grade in ((-65, 'Excellent'), (-55, 'Good')):
        if x <= cutoff:
            return grade
    return 'Moderate'

def score_pLDDT(x):
    """Grade a pLDDT value: higher is better.

    Returns 'Excellent' for x >= 0.90, 'Good' for x >= 0.85, otherwise 'Moderate'.
    """
    if x >= 0.90:
        return 'Excellent'
    return 'Good' if x >= 0.85 else 'Moderate'

def score_PackStat(x):
    """Grade a packing statistic: higher is better.

    Returns 'Excellent' for x >= 0.60, 'Good' for x >= 0.50, otherwise 'Moderate'.
    """
    grade = 'Moderate'
    if x >= 0.60:
        grade = 'Excellent'
    elif x >= 0.50:
        grade = 'Good'
    return grade

def score_iPTM(x):
    """Grade an iPTM value: higher is better.

    Returns 'Excellent' for x >= 0.80, 'Good' for x >= 0.75, otherwise 'Moderate'.
    """
    graded_cutoffs = ((0.80, 'Excellent'), (0.75, 'Good'))
    # First cut-off that x reaches decides the grade; none reached -> 'Moderate'
    return next((grade for cutoff, grade in graded_cutoffs if x >= cutoff), 'Moderate')

def score_ShapeComp(x):
    """Grade a shape-complementarity value: higher is better.

    Returns 'Excellent' for x >= 0.65, 'Good' for x >= 0.60, otherwise 'Moderate'.
    """
    if x >= 0.65:
        return 'Excellent'
    if x >= 0.60:
        return 'Good'
    return 'Moderate'

# Add each quality label directly to the right of the metric it grades.
# The insert position is looked up from the current column order instead of
# being hard-coded, so changing the columns kept in the summary cannot put a
# label next to the wrong metric.
quality_scorers = [
    ('Average_i_pTM', 'iPTM_Quality', score_iPTM),
    ('Average_dG', 'dG_Quality', score_dG),
    ('Average_pLDDT', 'pLDDT_Quality', score_pLDDT),
    ('Average_PackStat', 'PackStat_Quality', score_PackStat),
    ('Average_ShapeComplementarity', 'ShapeComp_Quality', score_ShapeComp),
]
for source_column, quality_column, scorer in quality_scorers:
    insert_at = list(df_summary.columns).index(source_column) + 1
    df_summary.insert(insert_at, quality_column, df_summary[source_column].apply(scorer))

# Add rank number as the first column: the summary rows come from the
# iPTM-sorted frame, so the row position (starting at 1) is the rank
df_summary.insert(0, 'Rank', range(1, len(df_summary) + 1))

# Save detailed analysis to CSV with rank.
# NOTE(review): the exported file also contains the plotting-only 'color' and
# 'marker' columns — confirm whether they are wanted in the CSV.
output_path = os.path.join(directory, 'protein_analysis_complete.csv')
df_summary.to_csv(output_path, index=False)

# Function to create correlation plots
def create_correlation_plots(x_metric, x_label, df, output_prefix, include_legend=False,
                             output_dir=None):
    """Plot one metric against the other design metrics and save the page as a PNG.

    Draws a 3x2 grid of scatter panels with ``x_metric`` on every x axis and one
    of the remaining metrics on each y axis. Only rows whose ``color`` is
    'blue', 'purple' or 'red' are drawn; every point is labelled with its
    ``Rank``.

    Args:
        x_metric: Column name used for the x axis of every panel.
        x_label: Text used for the x-axis label and in the panel titles.
        df: DataFrame providing the metric columns plus ``color``, ``marker``
            and ``Rank``.
        output_prefix: File-name prefix; the image is written as
            ``<output_prefix>_correlation_plots.png``.
        include_legend: When True, a legend describing the three plotted
            quality groups is placed above the panels.
        output_dir: Folder the PNG is written to. When omitted, the
            module-level ``directory`` is used.

    Returns:
        The path of the saved PNG.
    """
    if output_dir is None:
        # Same destination as before for callers that do not pass output_dir
        output_dir = directory

    base_metrics = [
        ('Average_dG', 'Binding Energy (ΔG)'),
        ('Average_i_pTM', 'iPTM Score'),
        ('Average_pLDDT', 'pLDDT Score'),
        ('Average_PackStat', 'Packing Statistics'),
        ('Average_ShapeComplementarity', 'Shape Complementarity'),
        ('Average_n_InterfaceHbonds', 'Interface H-bonds'),
        ('Average_dSASA', 'Buried Surface Area (Å²)')
    ]

    # Everything except the x metric itself. dG is part of base_metrics, so it
    # is always present here unless it is the x metric; no re-insertion needed.
    metrics_to_plot = [entry for entry in base_metrics if entry[0] != x_metric]

    # One portrait page: 3 rows x 2 columns of panels
    fig, axes = plt.subplots(3, 2, figsize=(11.69, 16.54), dpi=300)
    axes = axes.flatten()

    # Keep only the designs that received one of the three quality colours
    df_filtered = df[df['color'].isin(['blue', 'purple', 'red'])]

    # zip stops at the shorter sequence, so at most six panels are drawn
    for (metric, label), ax in zip(metrics_to_plot, axes):
        for color, marker in [('blue', 'o'), ('purple', 's'), ('red', 'o')]:
            in_group = (df_filtered['color'] == color) & (df_filtered['marker'] == marker)
            group = df_filtered[in_group]
            ax.scatter(group[x_metric], group[metric], c=color, marker=marker, s=30)

            # Label each point with its rank, offset slightly from the marker
            for x, y, rank in zip(group[x_metric], group[metric], group['Rank']):
                ax.annotate(str(rank), (x, y), xytext=(5, 5),
                            textcoords='offset points', fontsize=8)

        # Axis labels and title
        ax.set_xlabel(x_label, fontsize=13, fontweight='bold')
        ax.set_ylabel(label, fontsize=13, fontweight='bold')
        ax.set_title(f'{x_label} vs {label}', fontsize=15, fontweight='bold', pad=10)

        # No grid; inward major and minor ticks
        ax.grid(False)
        ax.tick_params(labelsize=11, direction='in', length=6, width=1)
        ax.minorticks_on()
        ax.tick_params(which='minor', direction='in', length=3, width=1)

        for spine in ax.spines.values():
            spine.set_linewidth(1)

    if include_legend:
        legend_handles = [
            plt.Line2D([], [], marker='o', color='blue', linestyle='None',
                       markersize=6, label='Very high quality (iPTM ≥ 0.8 & pLDDT ≥ 0.9)'),
            plt.Line2D([], [], marker='s', color='purple', linestyle='None',
                       markersize=6, label='High in one metric (iPTM ≥ 0.8 & pLDDT 0.85-0.9 or iPTM 0.75-0.8 & pLDDT ≥ 0.9)'),
            plt.Line2D([], [], marker='o', color='red', linestyle='None',
                       markersize=6, label='Medium quality (iPTM 0.75-0.8 & pLDDT 0.85-0.9)'),
        ]
        fig.legend(handles=legend_handles,
                   loc='upper center', bbox_to_anchor=(0.5, 0.98), ncol=1, fontsize=12)

    # Faint dashed separators in figure coordinates: one vertical, two horizontal
    fig.patch.set_facecolor('white')
    for line_x, line_y in (([0.5, 0.5], [0.1, 0.9]),
                           ([0.1, 0.9], [0.645, 0.645]),
                           ([0.1, 0.9], [0.315, 0.315])):
        fig.add_artist(plt.Line2D(line_x, line_y,
                                  color='#EEEEEE', linewidth=0.25,
                                  linestyle='--', dashes=(3, 3),
                                  transform=fig.transFigure))

    plt.tight_layout()
    # Leave more room at the top when the legend is present
    top_edge = 0.92 if include_legend else 0.95
    plt.subplots_adjust(top=top_edge, hspace=0.4, wspace=0.25)

    plot_path = os.path.join(output_dir, f'{output_prefix}_correlation_plots.png')
    plt.savefig(plot_path, dpi=300, bbox_inches='tight', facecolor='white')
    # Close exactly the figure created here rather than whichever is current
    plt.close(fig)

    return plot_path

# Metrics that each get their own page of correlation plots (used as the x axis)
correlation_metrics = [
    ('Average_dG', 'Binding Energy (ΔG)'),
    ('Average_pLDDT', 'pLDDT Score'),
    ('Average_PackStat', 'Packing Statistics'),
    ('Average_ShapeComplementarity', 'Shape Complementarity'),
    ('Average_n_InterfaceHbonds', 'Interface H-bonds'),
    ('Average_dSASA', 'Buried Surface Area (Å²)')
]

# Create PowerPoint for correlation plots (A4 landscape dimensions)
prs_correlations = Presentation()
prs_correlations.slide_width = Inches(11.69)
prs_correlations.slide_height = Inches(8.27)

# Area available to the picture once a margin is kept on every side
margin = Inches(0.5)
max_width = prs_correlations.slide_width - 2 * margin
max_height = prs_correlations.slide_height - 2 * margin

# Generate all correlation plots
for metric, label in correlation_metrics:
    # Only include legend in the initial plot for 'Average_dG'
    include_legend = (metric == 'Average_dG')
    plot_path = create_correlation_plots(metric, label, df_summary, f'{metric}_vs_all', include_legend)

    # Add to PowerPoint at native size first so the image's own proportions
    # can be read back from the shape
    slide = prs_correlations.slides.add_slide(prs_correlations.slide_layouts[5])
    picture = slide.shapes.add_picture(plot_path, margin, margin)

    # The plot image is taller than it is wide, so fixing only the width made
    # it run off the bottom of the slide. Scale by whichever dimension is the
    # tighter fit, keep the aspect ratio, and centre the result.
    # NOTE(review): on a landscape slide the portrait image ends up fairly
    # small — a portrait slide size may suit these figures better.
    scale = min(max_width / picture.width, max_height / picture.height)
    width = int(picture.width * scale)
    height = int(picture.height * scale)
    left = int((prs_correlations.slide_width - width) / 2)
    top = int((prs_correlations.slide_height - height) / 2)
    picture.left, picture.top, picture.width, picture.height = left, top, width, height

# Save the correlation plots PowerPoint
correlation_pptx_path = os.path.join(directory, 'all_correlation_analyses.pptx')
prs_correlations.save(correlation_pptx_path)

# Metrics shown on the x axis of the combined overview (one panel each);
# every panel uses the iPTM score on the y axis
metrics_to_plot = [
    ('Average_dG', 'Binding Energy (ΔG)'),
    ('Average_pLDDT', 'pLDDT Score'),
    ('Average_PackStat', 'Packing Statistics'),
    ('Average_ShapeComplementarity', 'Shape Complementarity'),
    ('Average_n_InterfaceHbonds', 'Interface H-bonds'),
    ('Average_dSASA', 'Buried Surface Area (Å²)')
]

# Single figure with a 3x2 grid of panels (3 rows, 2 columns) on white
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(11.69, 16.54), dpi=300)
axes = axes.flatten()
fig.patch.set_facecolor('white')

# Faint dashed separators in figure coordinates:
# the vertical one first, then the two horizontal ones
for guide_x, guide_y in (
    ([0.5, 0.5], [0.1, 0.9]),
    ([0.1, 0.9], [0.645, 0.645]),
    ([0.1, 0.9], [0.315, 0.315]),
):
    fig.add_artist(plt.Line2D(guide_x, guide_y,
                              color='#EEEEEE', linewidth=0.25,
                              linestyle='--', dashes=(3, 3),
                              transform=fig.transFigure))

# Legend handles, one per point style
legend_entries = [
    ('o', 'blue', 'Very high quality (iPTM ≥ 0.8 & pLDDT ≥ 0.9)'),
    ('s', 'purple', 'High in one metric (iPTM ≥ 0.8 & pLDDT 0.85-0.9 or iPTM 0.75-0.8 & pLDDT ≥ 0.9)'),
    ('o', 'red', 'Medium quality (iPTM 0.75-0.8 & pLDDT 0.85-0.9)'),
    ('x', 'black', 'Other'),
]
very_high_quality, high_quality, medium_quality, other = [
    plt.Line2D([], [], marker=shape, color=shade, linestyle='None',
               markersize=6, label=text)
    for shape, shade, text in legend_entries
]

point_groups = [('blue', 'o'), ('purple', 's'), ('red', 'o'), ('black', 'x')]
for (metric, label), ax in zip(metrics_to_plot, axes):
    # One scatter call per style group; crosses are drawn slightly larger
    for color, marker in point_groups:
        mask = (df_summary['color'] == color) & (df_summary['marker'] == marker)
        marker_size = 50 if marker == 'x' else 30
        ax.scatter(df_summary.loc[mask, metric], df_summary.loc[mask, 'Average_i_pTM'],
                   c=color, marker=marker, s=marker_size, alpha=0.7)

    # Labels and title
    ax.set_xlabel(label, fontsize=13, fontweight='bold')
    ax.set_ylabel('iPTM Score', fontsize=13, fontweight='bold')
    ax.set_title(label, fontsize=15, fontweight='bold', pad=10)

    # No grid; inward major and minor ticks
    ax.grid(False)
    ax.tick_params(labelsize=11, direction='in', length=6, width=1)
    ax.minorticks_on()
    ax.tick_params(which='minor', direction='in', length=3, width=1)

    # Uniform frame thickness
    for spine in ax.spines.values():
        spine.set_linewidth(1)

# Shared legend above the panels
fig.legend(handles=[very_high_quality, high_quality, medium_quality, other],
           loc='upper center',
           bbox_to_anchor=(0.5, 0.98),
           ncol=2,
           fontsize=12)

plt.tight_layout()
# Reserve space at the top for the legend and keep the panels apart
plt.subplots_adjust(top=0.92, hspace=0.4, wspace=0.25)

# Write the combined figure next to the other outputs
plot_path = os.path.join(directory, 'correlation_plots_combined.png')
plt.savefig(plot_path, dpi=300, bbox_inches='tight', facecolor='white')

# Create PowerPoint with A4 landscape dimensions
prs = Presentation()
prs.slide_width = Inches(11.69)
prs.slide_height = Inches(8.27)

# Add single slide with all plots
slide = prs.slides.add_slide(prs.slide_layouts[5])

# Insert at native size first so the image's own proportions can be read back
margin = Inches(0.5)
picture = slide.shapes.add_picture(plot_path, margin, margin)

# The plot image is taller than it is wide, so fixing only the width made it
# run off the bottom of the slide. Scale by whichever dimension is the tighter
# fit inside the margins, keep the aspect ratio, and centre the result.
# NOTE(review): on a landscape slide the portrait image ends up fairly small —
# a portrait slide size may suit this figure better.
max_width = prs.slide_width - 2 * margin
max_height = prs.slide_height - 2 * margin
scale = min(max_width / picture.width, max_height / picture.height)
width = int(picture.width * scale)
height = int(picture.height * scale)
left = int((prs.slide_width - width) / 2)
top = int((prs.slide_height - height) / 2)
picture.left, picture.top, picture.width, picture.height = left, top, width, height

# Save PowerPoint
pptx_path = os.path.join(directory, 'correlation_analysis.pptx')
prs.save(pptx_path)

# Release the combined figure now that it has been saved and embedded
plt.close(fig)

# Report where the main outputs were written
print("\nAnalysis complete!")
print(f"Complete analysis saved to: {output_path}")
print(f"PowerPoint presentation saved to: {pptx_path}")

# Dataset size, then the number of designs in each plot-style group
print("\nDataset Summary:")
print(f"Total number of designs: {len(df_summary)}")
print("\nDistribution of Categories:")
category_groups = [
    ("Very high quality hits (iPTM ≥ 0.8 & pLDDT ≥ 0.9):", 'blue', 'o'),
    ("High quality hits (high in one metric):", 'purple', 's'),
    ("Medium quality hits:", 'red', 'o'),
    ("Other hits:", 'black', 'x'),
]
for heading, group_color, group_marker in category_groups:
    in_group = (df_summary['color'] == group_color) & (df_summary['marker'] == group_marker)
    print(heading, sum(in_group))

# Number of designs per iPTM quality label
print("\nQuality Distribution:")
print("\niPTM Quality:")
print(df_summary['iPTM_Quality'].value_counts())

Collecting python-pptx
  Downloading python_pptx-1.0.2-py3-none-any.whl.metadata (2.5 kB)
Collecting XlsxWriter>=0.5.7 (from python-pptx)
  Downloading XlsxWriter-3.2.2-py3-none-any.whl.metadata (2.8 kB)
Downloading python_pptx-1.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.8/472.8 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading XlsxWriter-3.2.2-py3-none-any.whl (165 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m165.1/165.1 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: XlsxWriter, python-pptx
Successfully installed XlsxWriter-3.2.2 python-pptx-1.0.2
Mounted at /content/drive

Analysis complete!
Complete analysis saved to: /content/drive/MyDrive/BindCraft/UB2-K11/3NOB/Att1-iso/protein_analysis_complete.csv
PowerPoint presentation saved to: /content/drive/MyDrive/BindCraft/UB2-K11/3NOB/Att1-iso/correlation_analysis.pptx

Dataset Summary:
Total number of designs: 1

In [2]:
# NOTE(review): the first cell already imports `drive` and mounts
# /content/drive, and the recorded output of this cell reports the drive as
# already mounted — this cell looks redundant in a top-to-bottom run; confirm
# before removing.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
