<a href="https://colab.research.google.com/github/eoinleen/protein-design-final-dir/blob/main/cp_RFdiffusion_pathway_str_analysis_tool_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
Super Easy Protein Analysis Tool
===============================

What Does This Thing Do?
-----------------------
Takes your PDB files and your AlphaFold2 (AF2) scores and:
1. Measures the buried surface area, hydrogen bonds, hydrophobic contacts, and salt bridges of AF2 outputs from the RFdiffusion -> ProteinMPNN -> AF2 pipeline
2. Makes some nice graphs comparing your results
3. Saves everything in easy-to-use files

REALLY Simple Instructions:
---------------------------------------------------------------
1. Put these files in ONE folder:
   - This code (saved as something.py)
   - Your PDB files (must end in .pdb)
   - Your AF2 scores (must be called 'af2_scores.csv')

2. Change ONE Line in the Code:
   Find this line near the bottom:
   pdb_directory = '/content/drive/MyDrive/PDB-files/all_pdb_2MBO-12'

   Change it to where your files are, like:
   pdb_directory = '/content/drive/MyDrive/YOUR_FOLDER_NAME'

3. Run It:
   - Just click the play button
   - That's literally it

What You'll Get:
--------------
1. CSV files with all your numbers:
   - structure_analysis.csv: Measured parameters from this script
   - combined_analysis.csv: Everything combined with AF2 scores

2. Three pretty graphs showing:
   - How many hydrogen bonds vs. ipTM scores
   - How many hydrophobic contacts vs. ipTM scores
   - How much buried surface area vs. ipTM scores
   Each graph will highlight the top 15 designs!

Requirements:
-----------
1. Your PDB Files:
   - Must be named like: design5_n18.pdb
     (where 5 is the design number and 18 is the variant)
   - Must have two chains that interact

2. Your AF2 Scores File: (Called mpnn_results.csv in the outputs directory from the RFdiffusion colabsheet)
   - Must be named: af2_scores.csv
   - Must have columns: design, n, i_ptm
   - Other columns are fine too!

Common Problems:
--------------
If it says:
- "Drive not mounted": Click the link it shows you
- "No such directory": Check your folder path for typos
- "Error processing [filename]": That PDB file might be broken
- "Can't find output": Look in the same folder as your PDB files

Need More Help?
-------------
Look for these messages when it runs:
- "Processing X PDB files..." (it's working!)
- "Saved results to..." (shows where to find your files)
- Any error messages will tell you what went wrong

Written by: Claude (Anthropic)
Original code from: Claude & Windows Co-pilot, Dr Eoin Leen, University of Leeds
Version: 3.1 (2025)
"""

# Install required packages
!pip install -q biopython pandas freesasa numpy matplotlib seaborn

# Import required libraries
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import files
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Any
from Bio import PDB
from Bio.PDB.PDBIO import PDBIO
from Bio.PDB.Polypeptide import is_aa
from Bio.PDB.Structure import Structure
import freesasa
import numpy as np
from google.colab import drive

class StructureValidationError(Exception):
    """Signals that a file or parsed structure failed a basic PDB sanity check."""

def validate_pdb_file(file_path: str) -> bool:
    """Check that ``file_path`` exists and looks like a PDB text file.

    A file is accepted when its first line contains one of the markers
    ``HEADER``, ``ATOM`` or ``MODEL``, or when any later line starts with an
    ``ATOM``/``HETATM`` coordinate record.  The second rule keeps files that
    open with other records (``REMARK``, ``CRYST1``, ``TITLE``, ...) from
    being rejected even though they carry coordinates.

    Args:
        file_path: Path of the file to check.

    Returns:
        ``True`` when the file passes the checks.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        StructureValidationError: If the file cannot be decoded as text or
            contains no recognisable PDB content.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"PDB file not found: {file_path}")

    first_line_markers = ('HEADER', 'ATOM', 'MODEL')
    coordinate_records = ('ATOM', 'HETATM')
    try:
        with open(file_path, 'r') as f:
            first_line = f.readline()
            looks_valid = any(marker in first_line for marker in first_line_markers)
            if not looks_valid:
                # Fall back to scanning the remaining lines for coordinates.
                looks_valid = any(line.startswith(coordinate_records) for line in f)
    except UnicodeDecodeError as decode_error:
        raise StructureValidationError(
            f"File is not a valid text file: {file_path}"
        ) from decode_error

    if not looks_valid:
        raise StructureValidationError(f"File does not appear to be a valid PDB: {file_path}")
    return True

def safe_structure_load(parser: PDB.PDBParser, file_path: str) -> Optional[Structure]:
    """Parse ``file_path`` with ``parser``, returning ``None`` on any failure.

    The file is first checked with ``validate_pdb_file``.  The parsed
    structure must contain at least one model, and its first model at least
    one chain.  Every exception is reported with ``print`` instead of being
    propagated, and ``None`` is returned in that case.
    """
    try:
        validate_pdb_file(file_path)
        loaded = parser.get_structure('protein', file_path)

        all_models = list(loaded.get_models())
        if len(all_models) == 0:
            raise StructureValidationError("Structure contains no models")

        first_model_chains = list(all_models[0].get_chains())
        if len(first_model_chains) == 0:
            raise StructureValidationError("Structure contains no chains")
    except Exception as load_error:
        print(f"Error loading structure {file_path}: {load_error}")
        return None
    return loaded

def calculate_buried_surface_area(pdb_file: str) -> Tuple[Optional[float], Optional[Dict[str, float]]]:
    """Estimate the surface area buried between the chains of ``pdb_file``.

    The solvent-accessible surface area of the whole file and of every chain
    written out on its own is computed with freesasa.  The buried area is
    ``abs(sum(chain areas) - complex area) / 2``.

    Args:
        pdb_file: Path of the PDB file to analyse.

    Returns:
        ``(buried_surface_area, chain_areas)`` where ``chain_areas`` maps each
        chain id to the area of that chain in isolation.  ``(None, None)`` is
        returned when the structure cannot be loaded, has fewer than two
        chains, has no atoms, or any step raises (the error is printed).
    """
    import tempfile  # local import: only needed for the per-chain scratch files

    parser = PDB.PDBParser(QUIET=True)
    structure = safe_structure_load(parser, pdb_file)
    if structure is None:
        return None, None
    try:
        # Work on the first model only, so that chains sharing an id in later
        # models cannot overwrite each other's entry in ``chain_areas``.
        first_model = next(structure.get_models())
        chains = list(first_model.get_chains())
        if len(chains) < 2:
            print(f"Warning: {pdb_file} has fewer than 2 chains, cannot calculate BSA")
            return None, None
        if not any(True for _ in first_model.get_atoms()):
            print(f"Warning: {pdb_file} has no atoms")
            return None, None

        combined_structure = freesasa.Structure(pdb_file)
        total_area = freesasa.calc(combined_structure).totalArea()

        chain_areas: Dict[str, float] = {}
        io = PDBIO()
        # A private scratch directory avoids name clashes in the working
        # directory (and odd chain ids in file names); it is removed on exit
        # even if freesasa or the writer raises.
        with tempfile.TemporaryDirectory() as scratch_dir:
            for index, chain in enumerate(chains):
                new_structure = PDB.Structure.Structure('temp')
                new_model = PDB.Model.Model(0)
                new_structure.add(new_model)
                new_model.add(chain.copy())
                temp_file = os.path.join(scratch_dir, f"chain_{index}.pdb")
                io.set_structure(new_structure)
                io.save(temp_file)
                chain_result = freesasa.calc(freesasa.Structure(temp_file))
                chain_areas[chain.id] = chain_result.totalArea()

        total_individual_area = sum(chain_areas.values())
        buried_surface_area = abs(total_individual_area - total_area) / 2
        return buried_surface_area, chain_areas
    except Exception as e:
        # Best-effort: one bad file must not abort a batch run.
        print(f"Error calculating buried surface area for {pdb_file}: {str(e)}")
        return None, None

def calculate_hydrogen_bonds(structure: Structure, distance_cutoff: float = 3.5) -> int:
    """Count inter-chain backbone O/N atom pairs closer than ``distance_cutoff``.

    This is a purely distance-based estimate: no angles or hydrogen positions
    are checked, and only the backbone atoms named ``O`` and ``N`` are used.
    Each pair of chains is visited once (lower chain id first), and for every
    pair of amino-acid residues both directions are tested (``O`` of the
    first with ``N`` of the second, and ``N`` of the first with ``O`` of the
    second), so the count does not depend on how the chains are labelled.

    Args:
        structure: Parsed structure whose chains are compared.
        distance_cutoff: Maximum O-N distance counted as a hydrogen bond.

    Returns:
        The number of O/N pairs under the cutoff, or 0 if an unexpected error
        occurs (the error is printed).
    """
    try:
        h_bonds = 0
        chains = list(structure.get_chains())
        for chain1 in chains:
            for chain2 in chains:
                if chain1.id >= chain2.id:
                    continue
                # Filter the partner chain once per chain pair, not once per res1.
                partner_residues = [res for res in chain2.get_residues() if is_aa(res)]
                for res1 in chain1.get_residues():
                    if not is_aa(res1):
                        continue
                    res1_o = res1['O'] if 'O' in res1 else None
                    res1_n = res1['N'] if 'N' in res1 else None
                    for res2 in partner_residues:
                        # O on chain1 paired with N on chain2.
                        if res1_o is not None and 'N' in res2:
                            if res1_o - res2['N'] < distance_cutoff:
                                h_bonds += 1
                        # Reverse direction: N on chain1 paired with O on chain2.
                        if res1_n is not None and 'O' in res2:
                            if res1_n - res2['O'] < distance_cutoff:
                                h_bonds += 1
        return h_bonds
    except Exception as e:
        print(f"Error calculating hydrogen bonds: {str(e)}")
        return 0

def calculate_hydrophobic_contacts(structure: Structure) -> int:
    """Count inter-chain pairs of hydrophobic residues that come within 5.0 of each other.

    Each pair of chains is visited once (lower chain id first).  A residue
    pair is counted when both residues are amino acids whose names are in the
    hydrophobic set below and at least one atom-atom distance between them is
    under 5.0.  An unexpected error is printed and 0 is returned.
    """
    try:
        nonpolar = {'ALA', 'VAL', 'LEU', 'ILE', 'MET', 'PHE', 'TRP', 'PRO'}
        pair_count = 0
        for chain_a in structure.get_chains():
            for chain_b in structure.get_chains():
                if chain_a.id >= chain_b.id:
                    continue
                for res_a in chain_a.get_residues():
                    if not (is_aa(res_a) and res_a.get_resname() in nonpolar):
                        continue
                    for res_b in chain_b.get_residues():
                        if not (is_aa(res_b) and res_b.get_resname() in nonpolar):
                            continue
                        try:
                            # "closest atom pair under 5.0" is the same as
                            # "some atom pair under 5.0".
                            touching = any(
                                atom_a - atom_b < 5.0
                                for atom_a in res_a.get_atoms()
                                for atom_b in res_b.get_atoms()
                            )
                        except Exception:
                            continue
                        if touching:
                            pair_count += 1
        return pair_count
    except Exception as error:
        print(f"Error calculating hydrophobic contacts: {error}")
        return 0

def calculate_salt_bridges(structure: Structure) -> int:
    """Count inter-chain acidic/basic residue pairs that come within 4.0 of each other.

    Each pair of chains is visited once (lower chain id first).  A residue
    pair is counted when one residue is ASP/GLU, the other is LYS/ARG/HIS, and
    at least one atom-atom distance between them is under 4.0.  An unexpected
    error is printed and 0 is returned.

    NOTE(review): every atom of both residues takes part in the distance
    test, not only the charged side-chain atoms -- confirm this is intended.
    """
    try:
        acidic = {'ASP', 'GLU'}
        basic = {'LYS', 'ARG', 'HIS'}
        bridge_count = 0
        for chain_a in structure.get_chains():
            for chain_b in structure.get_chains():
                if chain_a.id >= chain_b.id:
                    continue
                for res_a in chain_a.get_residues():
                    if not is_aa(res_a):
                        continue
                    name_a = res_a.get_resname()
                    # Pick the set the partner residue must belong to.
                    if name_a in acidic:
                        counterpart = basic
                    elif name_a in basic:
                        counterpart = acidic
                    else:
                        continue
                    for res_b in chain_b.get_residues():
                        if not is_aa(res_b):
                            continue
                        if res_b.get_resname() not in counterpart:
                            continue
                        try:
                            close_enough = any(
                                atom_a - atom_b < 4.0
                                for atom_a in res_a.get_atoms()
                                for atom_b in res_b.get_atoms()
                            )
                        except Exception:
                            continue
                        if close_enough:
                            bridge_count += 1
        return bridge_count
    except Exception as error:
        print(f"Error calculating salt bridges: {error}")
        return 0

def save_results_as_df(results: List[Dict[str, Any]], pdb_directory: str, output_file: str = "structure_analysis.csv") -> pd.DataFrame:
    """Tabulate per-file results, write them to CSV and try to download the file.

    The design and variant numbers are parsed from file names shaped like
    ``design5_n18.pdb``.  Entries whose name cannot be parsed, or which lack
    a required key, are reported and skipped.  A missing (``None``) buried
    surface area is stored as 0.

    Args:
        results: One dict per analysed file with the keys ``file_name``,
            ``buried_surface_area``, ``hydrogen_bonds``,
            ``hydrophobic_contacts`` and ``salt_bridges``.
        pdb_directory: Directory the CSV is written to.
        output_file: Name of the CSV file inside ``pdb_directory``.

    Returns:
        The table sorted by ``design`` then ``n``.  When no entry could be
        used, the table is empty but still has all six columns.
    """
    columns = [
        'design',
        'n',
        'buried_surface_area',
        'hydrogen_bonds',
        'hydrophobic_contacts',
        'salt_bridges',
    ]
    analysis_data = []
    for result in results:
        filename = result['file_name']
        # Strip only a trailing extension; never touch ".pdb" inside the name.
        if filename.endswith('.pdb'):
            filename = filename[:-len('.pdb')]
        try:
            design_num = int(filename.split('design')[1].split('_')[0])
            variant_num = int(filename.split('_n')[1])
            analysis_data.append({
                'design': design_num,
                'n': variant_num,
                'buried_surface_area': result['buried_surface_area'] or 0,
                'hydrogen_bonds': result['hydrogen_bonds'],
                'hydrophobic_contacts': result['hydrophobic_contacts'],
                'salt_bridges': result['salt_bridges'],
            })
        except (IndexError, ValueError, KeyError, TypeError) as e:
            print(f"Error parsing filename {filename}: {str(e)}")
            continue

    # Passing the column names explicitly keeps the sort below from failing
    # with a KeyError when no entry survived.
    df = pd.DataFrame(analysis_data, columns=columns)
    df = df.sort_values(['design', 'n']).reset_index(drop=True)
    full_output_path = os.path.join(pdb_directory, output_file)
    df.to_csv(full_output_path, index=False)
    print(f"Saved structure analysis results to {full_output_path}")
    try:
        # Best-effort browser download; any failure here is only reported.
        files.download(full_output_path)
        print(f"Downloaded {output_file} to your computer")
    except Exception as e:
        print(f"Note: Could not auto-download file: {str(e)}")
    return df

def merge_with_af2_scores(structure_df: pd.DataFrame, af2_scores_file: str) -> pd.DataFrame:
    """Left-join the structure metrics onto the AF2 score table.

    The CSV at ``af2_scores_file`` is the left side of the join, so every row
    of the score table is kept; rows without a matching ``(design, n)`` entry
    in ``structure_df`` get missing values.  The result is sorted by
    ``design`` then ``n`` with a fresh index.
    """
    join_keys = ['design', 'n']
    scores = pd.read_csv(af2_scores_file)
    combined = scores.merge(structure_df, how='left', on=join_keys)
    combined = combined.sort_values(join_keys)
    return combined.reset_index(drop=True)

def make_scatter_plots(df: pd.DataFrame, pdb_directory: str) -> None:
    """Save one scatter plot of ipTM score against each structural metric.

    For each of ``hydrogen_bonds``, ``hydrophobic_contacts`` and
    ``buried_surface_area`` a PNG named ``plot_<metric>.png`` is written to
    ``pdb_directory``.  The metric is on the x axis and ``i_ptm`` on the y
    axis.  The 15 rows with the highest ``i_ptm`` are labelled with
    ``(design,n)``.

    Args:
        df: Table containing ``design``, ``n``, ``i_ptm`` and the three
            metric columns.
        pdb_directory: Directory the PNG files are written to.
    """
    metrics = {
        'hydrogen_bonds': 'Number of Hydrogen Bonds',
        'hydrophobic_contacts': 'Number of Hydrophobic Contacts',
        'buried_surface_area': 'Buried Surface Area (Å²)'
    }
    # The top-15 selection does not depend on the metric, so compute it once.
    top_15 = df.nlargest(15, 'i_ptm')
    for metric, axis_label in metrics.items():
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            ax.scatter(df[metric], df['i_ptm'], alpha=0.5)
            ax.set_xlabel(axis_label)
            ax.set_ylabel('ipTM Score')
            ax.set_title(f'ipTM Score vs {axis_label}')
            for _, row in top_15.iterrows():
                # Rows with missing values have no finite position (or no
                # label) to annotate, so skip them.
                if pd.isna(row[metric]) or pd.isna(row['design']) or pd.isna(row['n']):
                    continue
                ax.annotate(f'({int(row["design"])},{int(row["n"])})',
                            (row[metric], row['i_ptm']))
            ax.grid(True, alpha=0.3)
            fig.savefig(os.path.join(pdb_directory, f'plot_{metric}.png'))
        finally:
            # Always release the figure, even if plotting or saving failed.
            plt.close(fig)

def process_multiple_pdb_files(pdb_directory: str, af2_scores_file: Optional[str] = None) -> pd.DataFrame:
    """Analyse every ``.pdb`` file in a directory and save the results.

    For each file the buried surface area, hydrogen bonds, hydrophobic
    contacts and salt bridges are computed.  Files that cannot be loaded or
    that raise during analysis are reported and skipped.  The results are
    written to ``structure_analysis.csv``.  When ``af2_scores_file`` is given
    and exists, they are also merged with the AF2 scores, written to
    ``combined_analysis.csv``, plotted, and offered for download.

    Args:
        pdb_directory: Directory containing the PDB files; outputs are
            written here as well.
        af2_scores_file: Optional path of the AF2 score CSV.

    Returns:
        The merged table when AF2 scores were used, otherwise the structure
        table.  An empty DataFrame is returned when the directory holds no
        ``.pdb`` files or none of them could be analysed.

    Raises:
        FileNotFoundError: If ``pdb_directory`` does not exist.
    """
    if not os.path.exists(pdb_directory):
        raise FileNotFoundError(f"Directory not found: {pdb_directory}")

    # Sorted so the processing order (and the log) is reproducible;
    # os.listdir gives no ordering guarantee.
    pdb_files = sorted(f for f in os.listdir(pdb_directory) if f.endswith('.pdb'))
    if not pdb_files:
        print(f"Warning: No PDB files found in {pdb_directory}")
        return pd.DataFrame()

    print(f"Processing {len(pdb_files)} PDB files...")
    parser = PDB.PDBParser(QUIET=True)
    results = []
    for file_name in pdb_files:
        pdb_file = os.path.join(pdb_directory, file_name)
        print(f"Processing {file_name}...")
        try:
            structure = safe_structure_load(parser, pdb_file)
            if not structure:
                continue
            buried_surface_area, chain_areas = calculate_buried_surface_area(pdb_file)
            h_bonds = calculate_hydrogen_bonds(structure)
            hydrophobic = calculate_hydrophobic_contacts(structure)
            salt_bridges = calculate_salt_bridges(structure)
            results.append({
                'file_name': file_name,
                'buried_surface_area': buried_surface_area,
                'hydrogen_bonds': h_bonds,
                'hydrophobic_contacts': hydrophobic,
                'salt_bridges': salt_bridges,
                'chain_areas': chain_areas
            })
        except Exception as e:
            # Best-effort batch run: report the file and move on.
            print(f"Error processing {file_name}: {str(e)}")
            continue

    if not results:
        # Nothing to tabulate; stop here rather than handing an empty list on.
        print(f"Warning: None of the {len(pdb_files)} PDB files in {pdb_directory} could be analysed")
        return pd.DataFrame()

    structure_df = save_results_as_df(results, pdb_directory, "structure_analysis.csv")
    if not (af2_scores_file and os.path.exists(af2_scores_file)):
        return structure_df

    print(f"Merging with AF2 scores from {af2_scores_file}")
    final_df = merge_with_af2_scores(structure_df, af2_scores_file)
    combined_file = os.path.join(pdb_directory, "combined_analysis.csv")
    final_df.to_csv(combined_file, index=False)
    print(f"Saved combined results to {combined_file}")
    print("\nGenerating plots...")
    make_scatter_plots(final_df, pdb_directory)
    try:
        # Best-effort browser downloads; any failure here is only reported.
        files.download(combined_file)
        print("Downloaded combined_analysis.csv to your computer")
        for metric in ['hydrogen_bonds', 'hydrophobic_contacts', 'buried_surface_area']:
            plot_file = f'plot_{metric}.png'
            files.download(os.path.join(pdb_directory, plot_file))
            print(f"Downloaded {plot_file}")
    except Exception as e:
        print(f"Note: Could not auto-download files: {str(e)}")
    return final_df

# Notebook entry point: mount Drive, locate the inputs, run the analysis.
try:
    # A failed mount is reported but does not stop the run.
    try:
        drive.mount('/content/drive')
    except Exception as mount_error:
        print(f"Warning: Drive mounting failed: {mount_error}")
        print("Proceeding with local filesystem access only")

    # Folder holding the PDB files and, optionally, af2_scores.csv.
    pdb_directory = '/content/drive/MyDrive/PDB-files/all_pdb_2MBO-12'

    # Without the scores file only the structure analysis is produced.
    af2_scores_path = os.path.join(pdb_directory, 'af2_scores.csv')
    if not os.path.exists(af2_scores_path):
        print("No AF2 scores file found - will generate structure analysis only")
        af2_scores_path = None

    for banner_line in (
        "Starting analysis...",
        f"Processing PDB files from: {pdb_directory}",
        "Results will be saved to the same directory",
    ):
        print(banner_line)

    results_df = process_multiple_pdb_files(pdb_directory, af2_scores_path)

except Exception as fatal_error:
    print(f"Fatal error: {fatal_error}")
    sys.exit(1)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.1/270.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m47.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for freesasa (setup.py) ... [?25l[?25hdone
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Starting analysis...
Processing PDB files from: /content/drive/MyDrive/PDB-files/all_pdb_2MBO-12
Results will be saved to the same directory
Processing 2048 PDB files...
Processing design18_n48.pdb...
Processing design10_n46.pdb...
Processing design25_n63.pdb...
Processing design3_n41.pdb...
Processing design9_n41.pdb...
Processing design5_n7.pdb...
Processing design0_n14.pdb...
Processing design14_n8.pdb...
Processing design17_n55.pdb...
Processing design29_n7.pdb...
Processing design7_n55.pdb...
Pr

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded structure_analysis.csv to your computer
Merging with AF2 scores from /content/drive/MyDrive/PDB-files/all_pdb_2MBO-12/af2_scores.csv
Saved combined results to /content/drive/MyDrive/PDB-files/all_pdb_2MBO-12/combined_analysis.csv

Generating plots...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded combined_analysis.csv to your computer


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded plot_hydrogen_bonds.png


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded plot_hydrophobic_contacts.png


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded plot_buried_surface_area.png


In [None]:
# Stand-alone cell: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
