<a href="https://colab.research.google.com/github/eoinleen/protein-design-final-dir/blob/main/BindCraft_out_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
"""
Protein Design Analysis Script
=============================

This script analyzes protein design data from BindCraft and AF2-ProteinMPNN, creating both
visualization outputs and detailed analysis files.

Input:
------
- final_design_stats.csv: Raw data file containing protein design statistics
  Located in: /content/drive/MyDrive/BindCraft/UB2-K11/3NOB/Att1-iso/

Outputs:
--------
1. CSV File (protein_analysis_complete.csv):
   - Sorted by iPTM score (descending)
   - Contains metrics and their quality assessments side by side
   - Quality categories: Excellent, Good, Moderate

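2. Visualization (correlation_analysis.pptx):
   - Single A4 landscape slide holding one figure with a 3x2 grid of
     correlation plots (3 rows x 2 columns); the same figure is also saved
     as correlation_plots_combined.png
   - All metrics plotted against iPTM scores
   - Publication-quality formatting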

Quality Score Thresholds:
------------------------
iPTM Score:
- Excellent: ≥ 0.80
- Good: ≥ 0.75
- Moderate: < 0.75

Binding Energy (dG):
- Excellent: ≤ -65 kcal/mol
- Good: ≤ -55 kcal/mol
- Moderate: > -55 kcal/mol

pLDDT Score:
- Excellent: ≥ 0.90
- Good: ≥ 0.85
- Moderate: < 0.85

Packing Statistics:
- Excellent: ≥ 0.60
- Good: ≥ 0.50
- Moderate: < 0.50

Shape Complementarity:
- Excellent: ≥ 0.65
- Good: ≥ 0.60
- Moderate: < 0.60

Dependencies:
------------
- python-pptx
- seaborn
- matplotlib
- pandas
- numpy

Usage:
------
1. Mount Google Drive
2. Ensure input CSV exists in specified directory
3. Run script to generate analysis files and visualizations

Author: Claude
Date: February 14, 2025
"""

# Install required packages
!pip install seaborn python-pptx

import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
import numpy as np
from pptx import Presentation
from pptx.util import Inches

# Mount Google Drive so the BindCraft output folder is reachable
drive.mount('/content/drive')

# Set the directory path (input CSV is read from here; outputs are written here too)
directory = '/content/drive/MyDrive/BindCraft/UB2-K11/3NOB/Att1-iso/'

# Read and process data
csv_path = os.path.join(directory, 'final_design_stats.csv')
if not os.path.isfile(csv_path):
    # Fail early with the full path so a wrong folder or unmounted Drive is obvious
    raise FileNotFoundError(f"Input CSV not found: {csv_path}")
df = pd.read_csv(csv_path)

# Columns carried into the summary, in output order. Each quality column is
# later inserted directly after the metric it grades, so this order matters.
columns_to_keep = [
    'Design',
    'Average_i_pTM',
    'Average_dG',
    'Average_pLDDT',
    'Average_PackStat',
    'Average_ShapeComplementarity',
    'Average_n_InterfaceHbonds',
    'Average_dSASA'
]

# Validate the schema before sorting/selecting so a renamed or missing column
# is reported by name instead of surfacing as a bare KeyError further down.
missing_columns = [col for col in columns_to_keep if col not in df.columns]
if missing_columns:
    raise KeyError(
        f"{csv_path} is missing required column(s): {missing_columns}"
    )

# Sort by Average_i_pTM, best (highest) first
df_sorted = df.sort_values('Average_i_pTM', ascending=False)

# Create summary dataframe (copy, so later column inserts do not touch df_sorted)
df_summary = df_sorted[columns_to_keep].copy()

# Scoring functions
def score_dG(x):
    """Grade a binding energy (dG) value; more negative is better.

    Returns 'Excellent' when ``x <= -65``, 'Good' when ``x <= -55`` and
    'Moderate' otherwise. A NaN fails both comparisons and is therefore
    graded 'Moderate' as well.
    """
    # Cutoffs are checked from strictest to loosest; the first match wins.
    for cutoff, grade in ((-65, 'Excellent'), (-55, 'Good')):
        if x <= cutoff:
            return grade
    return 'Moderate'

def score_pLDDT(x):
    """Grade a pLDDT value; higher is better.

    Returns 'Excellent' when ``x >= 0.90``, 'Good' when ``x >= 0.85`` and
    'Moderate' otherwise (which includes NaN, since it fails both tests).
    """
    if x >= 0.90:
        return 'Excellent'
    return 'Good' if x >= 0.85 else 'Moderate'

def score_PackStat(x):
    """Grade a packing statistic value; higher is better.

    Returns 'Excellent' when ``x >= 0.60``, 'Good' when ``x >= 0.50`` and
    'Moderate' otherwise (which includes NaN, since it fails both tests).
    """
    # Ordered from strictest to loosest so the first satisfied cutoff decides.
    graded_cutoffs = ((0.60, 'Excellent'), (0.50, 'Good'))
    for minimum, grade in graded_cutoffs:
        if x >= minimum:
            return grade
    return 'Moderate'

def score_iPTM(x):
    """Grade an iPTM score; higher is better.

    Returns 'Excellent' when ``x >= 0.80``, 'Good' when ``x >= 0.75`` and
    'Moderate' otherwise (which includes NaN, since it fails both tests).
    """
    return (
        'Excellent' if x >= 0.80
        else 'Good' if x >= 0.75
        else 'Moderate'
    )

def score_ShapeComp(x):
    """Grade a shape complementarity value; higher is better.

    Returns 'Excellent' when ``x >= 0.65``, 'Good' when ``x >= 0.60`` and
    'Moderate' otherwise (which includes NaN, since it fails both tests).
    """
    if x >= 0.65:
        return 'Excellent'
    if x >= 0.60:
        return 'Good'
    return 'Moderate'

# Place each quality column immediately to the right of the metric it grades.
# Entries are applied in order, so every position already accounts for the
# quality columns inserted before it.
quality_columns = (
    # (insert position, new column, graded metric, scoring function)
    (2, 'iPTM_Quality', 'Average_i_pTM', score_iPTM),
    (4, 'dG_Quality', 'Average_dG', score_dG),
    (6, 'pLDDT_Quality', 'Average_pLDDT', score_pLDDT),
    (8, 'PackStat_Quality', 'Average_PackStat', score_PackStat),
    (10, 'ShapeComp_Quality', 'Average_ShapeComplementarity', score_ShapeComp),
)
for position, quality_name, metric_name, scorer in quality_columns:
    graded = df_summary[metric_name].apply(scorer)
    df_summary.insert(position, quality_name, graded)

# Write the annotated summary into the working directory, without the index column
output_path = os.path.join(directory, 'protein_analysis_complete.csv')
df_summary.to_csv(output_path, index=False)

# Metrics drawn on the x-axis as (CSV column, display label) pairs.
# Every panel uses Average_i_pTM on the y-axis.
metrics_to_plot = [
    ('Average_dG', 'Binding Energy (ΔG)'),
    ('Average_pLDDT', 'pLDDT Score'),
    ('Average_PackStat', 'Packing Statistics'),
    ('Average_ShapeComplementarity', 'Shape Complementarity'),
    ('Average_n_InterfaceHbonds', 'Interface H-bonds'),
    ('Average_dSASA', 'Buried Surface Area (Å²)')
]

# Create a single figure with a 3x2 grid of subplots (3 rows, 2 columns),
# 11.69 x 16.54 inches at 300 dpi. Flattening gives one axes per metric.
fig, axes = plt.subplots(3, 2, figsize=(11.69, 16.54), dpi=300)
axes = axes.flatten()

# zip pairs each metric with its own axes; no running index is needed
for (metric, label), ax in zip(metrics_to_plot, axes):
    # Create scatter plot with black crosses
    ax.plot(df_summary[metric], df_summary['Average_i_pTM'],
            'k+', markersize=4, alpha=0.7)

    # Style the plot (the metric label is used for both the x-axis and the title)
    ax.set_xlabel(label, fontsize=13, fontweight='bold')
    ax.set_ylabel('iPTM Score', fontsize=13, fontweight='bold')
    ax.set_title(label, fontsize=15, fontweight='bold', pad=10)

    # Remove grid lines but keep tick marks, drawn inward
    ax.grid(False)
    ax.tick_params(labelsize=11, direction='in', length=6, width=1)

    # Add minor ticks, shorter than the major ones
    ax.minorticks_on()
    ax.tick_params(which='minor', direction='in', length=3, width=1)

    # Style the spines
    for spine in ax.spines.values():
        spine.set_linewidth(1)

# Operate on the figure object explicitly instead of pyplot's "current figure"
fig.tight_layout()

# Save the combined figure; the PowerPoint export reads this PNG back from disk
plot_path = os.path.join(directory, 'correlation_plots_combined.png')
fig.savefig(plot_path, dpi=300, bbox_inches='tight', facecolor='white')

# Create PowerPoint with A4 dimensions (landscape: wider than tall)
prs = Presentation()
prs.slide_width = Inches(11.69)  # A4 width
prs.slide_height = Inches(8.27)  # A4 height

# Add single slide with all plots.
# left/top/width/height describe the box (0.5 in margin on every side)
# that the picture has to stay inside.
slide = prs.slides.add_slide(prs.slide_layouts[5])
left = Inches(0.5)
top = Inches(0.5)
width = Inches(10.69)
height = Inches(7.27)

# Insert at full box width first; with only `width` given, python-pptx derives
# the height from the image's own aspect ratio.
picture = slide.shapes.add_picture(plot_path, left, top, width=width)

# A figure taller than the box would run off the bottom of the slide. Scale it
# down uniformly so it fits the box height, then centre it horizontally.
if picture.height > height:
    shrink = height / picture.height  # computed before either dimension changes
    picture.width = int(picture.width * shrink)
    picture.height = int(height)
    picture.left = int(left + (width - picture.width) / 2)

# Save PowerPoint
pptx_path = os.path.join(directory, 'correlation_analysis.pptx')
prs.save(pptx_path)

# Release the figure now that it has been written to disk and embedded
plt.close()

# Report the output locations and the dataset size as one text block,
# one entry per printed line.
report_lines = [
    "\nAnalysis complete!",
    f"Complete analysis saved to: {output_path}",
    f"PowerPoint presentation saved to: {pptx_path}",
    # Print summary statistics
    "\nDataset Summary:",
    f"Total number of designs: {len(df_summary)}",
    "\nQuality Distribution:",
    "\niPTM Quality:",
]
print("\n".join(report_lines))

# Count of designs in each iPTM quality category
print(df_summary['iPTM_Quality'].value_counts())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Analysis complete!
Complete analysis saved to: /content/drive/MyDrive/BindCraft/UB2-K11/3NOB/Att1-iso/protein_analysis_complete.csv
PowerPoint presentation saved to: /content/drive/MyDrive/BindCraft/UB2-K11/3NOB/Att1-iso/correlation_analysis.pptx

Dataset Summary:
Total number of designs: 114

Quality Distribution:

iPTM Quality:
iPTM_Quality
Moderate     70
Good         31
Excellent    13
Name: count, dtype: int64


In [15]:
# Standalone Google Drive mount. The analysis cell also imports `drive` and
# mounts '/content/drive' itself, so this cell repeats that step.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
