<a href="https://colab.research.google.com/github/eoinleen/protein-design-final-dir/blob/main/Plotting_PAE_scores.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
"""
iPAE Score Visualization Script
Created: January 31, 2025
Authors: Claude (Anthropic) and User - Collaborative Development

Purpose:
    Creates a visualization of iPAE scores for protein design data, specifically handling
    32 designs with 64 sequences each (2048 total sequences). Generates both interactive HTML
    and static PDF outputs with alternating black/red coloring to distinguish designs.

Input Requirements:
    - CSV file with columns:
        * Column 1: Index
        * design: Design number (0-31)
        * n: Sequence number within each design (0-63)
        * i_pae: iPAE score to be plotted
        * (other columns will be ignored)
    - Total of 2048 rows expected (32 designs × 64 sequences)
    - File should be accessible from Google Drive

Dependencies:
    !pip install -U kaleido plotly

    Python libraries:
    - pandas
    - plotly
    - numpy
    - os
    - google.colab (for Drive mounting)

Usage:
    1. Mount Google Drive in Colab
    2. Update these paths in main():
       input_file = '/content/drive/MyDrive/path_to_your_data/af2_scores.csv'
       output_dir = '/content/drive/MyDrive/path_to_output_directory'
    3. Run the script

Output:
    - Interactive HTML plot with 4 subplots (8 designs each)
    - Static PDF version of the same plot
    - Designs drawn in alternating colors: even-numbered designs in black, odd-numbered in red (all sequences of one design share a color)
    - Hover information showing design number, sequence number, and iPAE score
    - Statistical summary of the data

Notes:
    - Designed for Google Colab environment
    - Plot dimensions optimized for A4 paper size
    - Y-axis fixed at 0-30 range for iPAE scores
"""

# Install required packages
!pip install -U kaleido plotly

import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from google.colab import drive
import os

# Mount Google Drive at /content/drive; the input CSV and the output
# directory used in main() both live under this mount point.
drive.mount('/content/drive')

def process_data(file_path):
    """Load the score table from a CSV file and print a short structure check.

    Args:
        file_path: Path of the CSV file to read. The file must contain a
            'design' column.

    Returns:
        The DataFrame produced by ``pd.read_csv(file_path)``, unmodified.

    Raises:
        KeyError: If the CSV has no 'design' column.
    """
    df = pd.read_csv(file_path)

    design_sizes = df.groupby('design').size()
    # Only the first group's size is reported. A header-only CSV has no
    # groups at all, so report 0 instead of failing on .iloc[0].
    seqs_per_design = design_sizes.iloc[0] if not design_sizes.empty else 0

    # Print data structure for verification
    print(f"Number of unique designs: {df['design'].nunique()}")
    print(f"Sequences per design: {seqs_per_design}")

    return df

def create_ipae_plots(df, output_path):
    """Draw iPAE scores as four stacked bar charts and save them to disk.

    The frame is consumed positionally in four consecutive slices of 512 rows
    (8 designs x 64 sequences in the expected layout), one slice per subplot.
    Within a slice every design becomes one bar trace, coloured black for
    even and red for odd design numbers. The y axis is fixed to 0-30.

    Note that the subplot titles ("Designs 0-7", ...) are fixed strings and
    are not derived from the design numbers actually present in ``df``.

    Args:
        df: DataFrame with 'design', 'n' and 'i_pae' columns. Its index
            supplies the bar x-positions, so it is expected to be numeric
            (e.g. the default RangeIndex produced by ``pd.read_csv``).
        output_path: Destination of the interactive HTML file. A PDF with
            the same stem is written next to it on a best-effort basis.
    """
    # Create 4 subplots (8 designs per subplot)
    fig = make_subplots(
        rows=4,
        cols=1,
        vertical_spacing=0.08,
        subplot_titles=[f"Designs {i*8}-{(i+1)*8-1}" for i in range(4)]
    )

    # Define colors
    colors = ['black', 'red']
    rows_per_subplot = 512  # 8 designs × 64 sequences = 512 rows per subplot

    for i in range(4):
        start_idx = i * rows_per_subplot
        chunk = df.iloc[start_idx:start_idx + rows_per_subplot]

        # Tick label and tick position are collected together, per design,
        # so a label can never end up under another design's bars.
        tick_labels = []
        tick_positions = []

        # sort=False keeps the designs in order of first appearance.
        for design_num, group in chunk.groupby('design', sort=False):
            # int() lets float-typed design ids (e.g. 3.0) index the list.
            color = colors[int(design_num) % 2]

            fig.add_trace(
                go.Bar(
                    x=group.index,
                    y=group['i_pae'],
                    showlegend=False,
                    marker_color=color,
                    width=1,
                    hovertemplate=(
                        "Index: %{x}<br>"
                        "Design: " + str(design_num) + "<br>"
                        "Sequence: %{customdata}<br>"
                        "iPAE: %{y:.4f}<br>"
                    ),
                    customdata=group['n']
                ),
                row=i+1,
                col=1
            )

            tick_labels.append(design_num)
            # Centre of this design's bars, taken from the x-positions that
            # were actually plotted rather than an assumed 64-row group.
            tick_positions.append(float(group.index.to_numpy().mean()))

        # Update axes
        fig.update_yaxes(
            range=[0, 30],
            title_text='iPAE' if i == 1 else None,
            row=i+1,
            col=1
        )

        # Add design number labels
        fig.update_xaxes(
            tickmode='array',
            ticktext=tick_labels,
            tickvals=tick_positions,
            row=i+1,
            col=1,
            title_text='Design Number' if i == 3 else None
        )

    # Update layout
    fig.update_layout(
        title='iPAE Scores by Design Number and Sequence (Scale: 0-30)',
        height=1000,
        width=1200,
        showlegend=False,
        margin=dict(t=50, b=50, r=150, l=50),
        paper_bgcolor='white',
        plot_bgcolor='white'
    )

    # Add statistics
    design_sizes = df.groupby('design').size()
    # Only the first group's size is reported; a frame without any design
    # group has no first element, so fall back to 0.
    seqs_per_design = design_sizes.iloc[0] if not design_sizes.empty else 0
    stats_text = (
        f"Min iPAE: {df['i_pae'].min():.4f}<br>"
        f"Max iPAE: {df['i_pae'].max():.4f}<br>"
        f"Avg iPAE: {df['i_pae'].mean():.4f}<br>"
        f"Total Designs: {df['design'].nunique()}<br>"
        f"Sequences per Design: {seqs_per_design}"
    )

    fig.add_annotation(
        text=stats_text,
        xref="paper", yref="paper",
        x=1.02, y=-0.1,
        showarrow=False,
        font=dict(size=10),
        align="left"
    )

    # Save the figure
    fig.write_html(output_path)

    # Swap only the final extension: str.replace would also rewrite '.html'
    # inside directory names and would leave an extension-less path unchanged.
    pdf_path = os.path.splitext(output_path)[0] + '.pdf'
    try:
        fig.write_image(pdf_path)
    except Exception as e:
        # Static export is best-effort (it depends on the image export
        # backend); the HTML file has already been written at this point.
        print(f"Could not save PDF version due to: {e}")
        print("HTML version still saved successfully")

# Main execution
def main(
    input_file='/content/drive/MyDrive/Fasta-files/af2_scores/af2_results.csv',  # Update this path
    output_dir='/content/drive/MyDrive/Fasta-files/af2_scores/',  # Update this path
):
    """Read the score CSV, build the iPAE figure and report where it was saved.

    Args:
        input_file: Path of the CSV file passed to ``process_data``.
        output_dir: Directory that receives 'ipae_visualization.html'; it is
            created if it does not exist yet.
    """
    os.makedirs(output_dir, exist_ok=True)
    html_output = os.path.join(output_dir, 'ipae_visualization.html')

    df = process_data(input_file)
    create_ipae_plots(df, html_output)

    print(f"Visualization saved to: {html_output}")


if __name__ == "__main__":
    main()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Number of unique designs: 32
Sequences per design: 64
Visualization saved to: /content/drive/MyDrive/Fasta-files/af2_scores/ipae_visualization.html
