<a href="https://colab.research.google.com/github/eoinleen/Protein-design-random/blob/main/20250117_pdb_param_inc_VDW_nrg_too_high.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# Import required libraries
import os
import sys
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Any
from Bio import PDB
import freesasa
import numpy as np
from google.colab import drive

class StructureValidationError(Exception):
    """Custom exception for structure validation failures."""
    pass

def validate_pdb_file(file_path: str) -> bool:
    """
    Validate that a file exists and has basic PDB format.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"PDB file not found: {file_path}")

    try:
        with open(file_path, 'r') as f:
            first_line = f.readline()
            if not any(marker in first_line for marker in ['HEADER', 'ATOM', 'MODEL']):
                raise StructureValidationError(f"File does not appear to be a valid PDB: {file_path}")
    except UnicodeDecodeError:
        raise StructureValidationError(f"File is not a valid text file: {file_path}")

    return True

def safe_structure_load(parser: PDB.PDBParser, file_path: str) -> Optional[PDB.Structure]:
    """
    Safely load a PDB structure with validation.
    """
    try:
        validate_pdb_file(file_path)
        structure = parser.get_structure('protein', file_path)

        # Validate structure has at least one model and chain
        models = list(structure.get_models())
        if not models:
            raise StructureValidationError("Structure contains no models")

        chains = list(models[0].get_chains())
        if not chains:
            raise StructureValidationError("Structure contains no chains")

        return structure

    except Exception as e:
        print(f"Error loading structure {file_path}: {str(e)}")
        return None

def calculate_buried_surface_area(pdb_file: str) -> Tuple[Optional[float], Optional[Dict[str, float]]]:
    """
    Calculate buried surface area between chains in a PDB structure.
    """
    parser = PDB.PDBParser(QUIET=True)
    structure = safe_structure_load(parser, pdb_file)

    if not structure:
        return None, None

    try:
        chains = list(structure.get_chains())
        if len(chains) < 2:
            print(f"Warning: {pdb_file} has fewer than 2 chains, cannot calculate BSA")
            return None, None

        # Calculate surface area for complete structure
        combined_structure = freesasa.Structure(pdb_file)
        result = freesasa.calc(combined_structure)
        total_area = result.totalArea()

        # Calculate individual chain areas
        chain_areas: Dict[str, float] = {}
        io = PDB.PDBIO()
        temp_files: List[str] = []

        try:
            for chain in chains:
                # Create a new structure with just this chain
                new_structure = PDB.Structure.Structure('temp')
                new_model = PDB.Model.Model(0)
                new_structure.add(new_model)
                new_chain = chain.copy()
                new_model.add(new_chain)

                # Save temporary file
                temp_file = f"temp_chain_{chain.id}.pdb"
                temp_files.append(temp_file)
                io.set_structure(new_structure)
                io.save(temp_file)

                # Calculate area
                chain_structure = freesasa.Structure(temp_file)
                chain_result = freesasa.calc(chain_structure)
                chain_areas[chain.id] = chain_result.totalArea()

        finally:
            # Clean up temp files
            for temp_file in temp_files:
                try:
                    if os.path.exists(temp_file):
                        os.remove(temp_file)
                except Exception as e:
                    print(f"Warning: Failed to remove temporary file {temp_file}: {str(e)}")

        # Calculate buried surface area
        total_individual_area = sum(chain_areas.values())
        buried_surface_area = abs(total_individual_area - total_area)

        return buried_surface_area, chain_areas

    except Exception as e:
        print(f"Error calculating buried surface area for {pdb_file}: {str(e)}")
        return None, None

def calculate_vdw_energy(structure: PDB.Structure) -> Optional[float]:
    """
    Calculate van der Waals energy between chains in the structure, explicitly in kcal/mol.
    """
    try:
        epsilon = 0.1  # kcal/mol
        sigma = 3.4    # Angstroms

        vdw_energy = 0.0
        chains = list(structure.get_chains())

        # Calculate VDW energy between all chain pairs
        for i, chain1 in enumerate(chains):
            for chain2 in chains[i+1:]:  # Only calculate each pair once
                atoms1 = [atom for atom in chain1.get_atoms()]
                atoms2 = [atom for atom in chain2.get_atoms()]

                for atom1 in atoms1:
                    for atom2 in atoms2:
                        try:
                            distance = np.linalg.norm(atom1.coord - atom2.coord)
                            if distance > 0 and distance < 10.0:  # Only consider interactions within 10Å
                                vdw_energy += 4 * epsilon * ((sigma / distance)**12 - (sigma / distance)**6)
                        except Exception as e:
                            print(f"Warning: Error calculating VDW interaction: {str(e)}")
                            continue

        print(f"Van der Waals energy: {vdw_energy:.2f} kcal/mol")  # Explicit output in kcal/mol
        return vdw_energy

    except Exception as e:
        print(f"Error calculating VDW energy: {str(e)}")
        return None

def calculate_hydrogen_bonds(structure: PDB.Structure) -> int:
    """
    Calculate hydrogen bonds between chains using distance and angle criteria.
    """
    try:
        h_bonds = []
        for chain1 in structure.get_chains():
            for chain2 in structure.get_chains():
                if chain1.id != chain2.id:
                    for res1 in chain1.get_residues():
                        if not is_aa(res1):
                            continue
                        for res2 in chain2.get_residues():
                            if not is_aa(res2):
                                continue
                            try:
                                # Check for backbone-backbone H-bonds
                                if 'O' in res1 and 'N' in res2:
                                    distance = res1['O'] - res2['N']
                                    if distance < 3.5:  # Standard H-bond distance cutoff
                                        h_bonds.append((res1, res2))
                            except KeyError:
                                continue  # Skip if atoms not found
        return len(h_bonds)
    except Exception as e:
        print(f"Error calculating hydrogen bonds: {str(e)}")
        return 0

def calculate_hydrophobic_contacts(structure: PDB.Structure) -> int:
    """
    Calculate hydrophobic contacts between chains.
    """
    try:
        hydrophobic_residues = {'ALA', 'VAL', 'LEU', 'ILE', 'MET', 'PHE', 'TRP', 'PRO'}
        contacts = []

        for chain1 in structure.get_chains():
            for chain2 in structure.get_chains():
                if chain1.id >= chain2.id:
                    continue

                for res1 in chain1.get_residues():
                    if not is_aa(res1) or res1.get_resname() not in hydrophobic_residues:
                        continue

                    for res2 in chain2.get_residues():
                        if not is_aa(res2) or res2.get_resname() not in hydrophobic_residues:
                            continue

                        try:
                            min_distance = float('inf')
                            for atom1 in res1.get_atoms():
                                for atom2 in res2.get_atoms():
                                    distance = atom1 - atom2
                                    min_distance = min(min_distance, distance)

                            if min_distance < 5.0:
                                contacts.append((res1, res2))
                        except Exception:
                            continue

        return len(contacts)
    except Exception as e:
        print(f"Error calculating hydrophobic contacts: {str(e)}")
        return 0

def calculate_salt_bridges(structure: PDB.Structure) -> int:
    """
    Calculate salt bridges between chains.
    """
    try:
        acidic = {'ASP', 'GLU'}
        basic = {'LYS', 'ARG', 'HIS'}
        salt_bridges = []

        for chain1 in structure.get_chains():
            for chain2 in structure.get_chains():
                if chain1.id >= chain2.id:
                    continue

                for res1 in chain1.get_residues():
                    if not is_aa(res1):
                        continue
                    res1_name = res1.get_resname()

                    for res2 in chain2.get_residues():
                        if not is_aa(res2):
                            continue
                        res2_name = res2.get_resname()

                        if (res1_name in acidic and res2_name in basic) or \
                           (res1_name in basic and res2_name in acidic):
                            try:
                                min_distance = float('inf')
                                for atom1 in res1.get_atoms():
                                    for atom2 in res2.get_atoms():
                                        distance = atom1 - atom2
                                        min_distance = min(min_distance, distance)

                                if min_distance < 4.0:
                                    salt_bridges.append((res1, res2))
                            except Exception:
                                continue

        return len(salt_bridges)
    except Exception as e:
        print(f"Error calculating salt bridges: {str(e)}")
        return 0

def print_summary_report(results: List[Dict[str, Any]]) -> None:
    """
    Print a formatted summary report of all results.
    """
    if not results:
        print("No results to display")
        return

    print("\nSummary Report:")
    print(f"{'PDB File':<30} {'Buried Surface Area (Å²)':<25} {'H-Bonds':<12} {'Hydrophobic':<12} {'Salt Bridges':<12} {'VDW Energy (kcal/mol)':<12}")
    print("="*100)

    for result in results:
        try:
            bsa = f"{result['buried_surface_area']:.2f}" if result['buried_surface_area'] else "N/A"
            vdw = f"{result['vdw_energy']:.2f}" if result['vdw_energy'] else "N/A"
            print(f"{result['file_name']:<30} {bsa:<25} {result['hydrogen_bonds']:<12} "
                  f"{result['hydrophobic_contacts']:<12} {result['salt_bridges']:<12} {vdw:<12}")
        except KeyError as e:
            print(f"Error displaying result for {result.get('file_name', 'unknown')}: Missing data {str(e)}")

def process_multiple_pdb_files(pdb_directory: str) -> List[Dict[str, Any]]:
    """
    Process multiple PDB files and analyze their structures.
    """
    if not os.path.exists(pdb_directory):
        raise FileNotFoundError(f"Directory not found: {pdb_directory}")

    results = []
    parser = PDB.PDBParser(QUIET=True)

    # Get list of PDB files
    pdb_files = [f for f in os.listdir(pdb_directory) if f.endswith('.pdb')]
    if not pdb_files:
        print(f"Warning: No PDB files found in {pdb_directory}")
        return results

    for file_name in pdb_files:
        pdb_file = os.path.join(pdb_directory, file_name)
        print(f"\nProcessing {file_name}")

        try:
            structure = safe_structure_load(parser, pdb_file)
            if not structure:
                continue

            buried_surface_area, chain_areas = calculate_buried_surface_area(pdb_file)
            vdw_energy = calculate_vdw_energy(structure)
            h_bonds = calculate_hydrogen_bonds(structure)
            hydrophobic = calculate_hydrophobic_contacts(structure)
            salt_bridges = calculate_salt_bridges(structure)

            results.append({
                'file_name': file_name,
                'buried_surface_area': buried_surface_area,
                'vdw_energy': vdw_energy,
                'hydrogen_bonds': h_bonds,
                'hydrophobic_contacts': hydrophobic,
                'salt_bridges': salt_bridges,
                'chain_areas': chain_areas
            })

        except Exception as e:
            print(f"Error processing {file_name}: {str(e)}")
            continue

    return results

# Main execution
try:
    drive.mount('/content/drive')

    pdb_directory = '/content/drive/MyDrive/PDB-files/all_pdb-2MBO-no-hot'

    print("Starting analysis...")
    results = process_multiple_pdb_files(pdb_directory)

    if results:
        print_summary_report(results)
    else:
        print("No results to display")

except Exception as e:
    print(f"Fatal error: {str(e)}")
    sys.exit(1)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Starting analysis...

Processing design0_n16.pdb
Van der Waals energy: 197353922.33 kcal/mol

Processing design0_n1.pdb
Van der Waals energy: 106546551.24 kcal/mol

Processing design0_n13.pdb
Van der Waals energy: 4525192545.27 kcal/mol

Processing design0_n10.pdb
Van der Waals energy: 1710704.96 kcal/mol

Processing design0_n12.pdb
Van der Waals energy: 758596.49 kcal/mol

Processing design0_n15.pdb
Van der Waals energy: 53091.06 kcal/mol

Processing design1_n1.pdb
Van der Waals energy: 1983392428.93 kcal/mol

Processing design0_n0.pdb
Van der Waals energy: 628830.78 kcal/mol

Processing design1_n11.pdb
Van der Waals energy: 2378412308962.11 kcal/mol

Processing design0_n14.pdb
Van der Waals energy: 611.03 kcal/mol

Processing design0_n9.pdb
Van der Waals energy: 1262.17 kcal/mol

Processing design0_n11.pdb
Van der Waals energy: 43594474026.28 kcal/mol

Proc

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
