<a href="https://colab.research.google.com/github/eoinleen/PDB-tools/blob/main/Polar-and-charged-analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Install BioPython if not already installed
import sys
try:
    from Bio.PDB import PDBParser
except ModuleNotFoundError:
    print("Installing BioPython...")
    !pip install biopython
    from Bio.PDB import PDBParser

import os
import glob

def analyze_pdb_residues(pdb_file, chain_id='A'):
    """
    Analyze a PDB file to count charged, polar, and specific charged residue types
    in the specified chain.

    Parameters:
    -----------
    pdb_file : str
        Path to the PDB file
    chain_id : str
        ID of the chain to analyze (default: 'A')

    Returns:
    --------
    dict
        Dictionary with counts of different residue types
    """
    # Define residue classifications
    positively_charged = ['ARG', 'LYS', 'HIS']
    negatively_charged = ['ASP', 'GLU']
    polar_uncharged = ['SER', 'THR', 'ASN', 'GLN', 'TYR']

    # Initialize counters
    counts = {
        'filename': os.path.basename(pdb_file),
        'charged_residues': 0,
        'polar_residues': 0,
        'positively_charged_residues': 0,
        'negatively_charged_residues': 0
    }

    # Parse PDB file
    parser = PDBParser(QUIET=True)
    try:
        structure = parser.get_structure('protein', pdb_file)

        # Check if the specified chain exists
        chain_exists = False
        for model in structure:
            if chain_id in model:
                chain_exists = True
                chain = model[chain_id]

                # Count residues by type
                for residue in chain:
                    res_name = residue.get_resname()

                    # Skip non-standard residues and water
                    if res_name in ['HOH', 'WAT'] or len(res_name) < 3:
                        continue

                    # Count positively charged residues
                    if res_name in positively_charged:
                        counts['positively_charged_residues'] += 1
                        counts['charged_residues'] += 1

                    # Count negatively charged residues
                    elif res_name in negatively_charged:
                        counts['negatively_charged_residues'] += 1
                        counts['charged_residues'] += 1

                    # Count polar uncharged residues
                    elif res_name in polar_uncharged:
                        counts['polar_residues'] += 1

                break  # Only analyze the first model that contains the chain

        if not chain_exists:
            print(f"Warning: Chain {chain_id} not found in {pdb_file}")

    except Exception as e:
        print(f"Error processing {pdb_file}: {e}")

    return counts

def process_pdb_directory(directory_path, chain_id='A', output_filename=None):
    """
    Process all PDB files in a directory and output results to a text file.

    Parameters:
    -----------
    directory_path : str
        Path to the directory containing PDB files
    chain_id : str
        ID of the chain to analyze (default: 'A')
    output_filename : str, optional
        Name of the output file. If None, will default to 'residue_analysis.txt'
        in the same directory

    Returns:
    --------
    str
        Path to the output file
    """
    # Set default output filename if not provided
    if output_filename is None:
        output_filename = os.path.join(directory_path, 'residue_analysis.txt')
    else:
        # If only a filename is provided, put it in the directory
        if os.path.dirname(output_filename) == '':
            output_filename = os.path.join(directory_path, output_filename)

    # Find all PDB files in the directory
    pdb_files = glob.glob(os.path.join(directory_path, '*.pdb'))

    if not pdb_files:
        print(f"No PDB files found in {directory_path}")
        return None

    print(f"Found {len(pdb_files)} PDB files to process")

    # Analyze each PDB file
    results = []
    for pdb_file in pdb_files:
        print(f"Processing {os.path.basename(pdb_file)}...")
        result = analyze_pdb_residues(pdb_file, chain_id)
        results.append(result)

    # Write results to output file
    with open(output_filename, 'w') as f:
        # Write header
        f.write("PDB_File\tCharged_Residues\tPolar_Residues\tPositively_Charged_Residues\tNegatively_Charged_Residues\n")

        # Write data rows
        for result in results:
            f.write(f"{result['filename']}\t{result['charged_residues']}\t{result['polar_residues']}\t"
                    f"{result['positively_charged_residues']}\t{result['negatively_charged_residues']}\n")

    print(f"Results written to {output_filename}")
    return output_filename

# Mount Google Drive (if necessary)
# Uncomment these lines if your PDB files are in Google Drive
# from google.colab import drive
# drive.mount('/content/drive')

# CONFIGURATION - MODIFY THESE VALUES
# =================================
# Path to directory containing PDB files
pdb_directory = '/content/drive/MyDrive/Evolving_hits_using_ProteinMPNN/20250303-3NOBEK/0_top_binders'  # Change this to your directory path

# Chain ID to analyze
chain_id = 'A'  # Change this if you want to analyze a different chain

# Output filename (optional - will default to 'residue_analysis.txt' in the same directory)
output_file = None
# =================================

# Run the analysis
process_pdb_directory(pdb_directory, chain_id, output_file)

Found 13 PDB files to process
Processing 1_bind_0_dldesign_9965_af2pred.pdb...
Processing 1_bind_0_dldesign_2056_af2pred.pdb...
Processing 1_bind_0_dldesign_5160_af2pred.pdb...
Processing 1_bind_0_dldesign_7384_af2pred.pdb...
Processing 1_bind_0_dldesign_2304_af2pred.pdb...
Processing 1_bind_0_dldesign_7578_af2pred.pdb...
Processing 1_bind_0_dldesign_8480_af2pred.pdb...
Processing 1_bind_0_dldesign_2777_af2pred.pdb...
Processing 1_bind_0_dldesign_1829_af2pred.pdb...
Processing 1_bind_0_dldesign_5024_af2pred.pdb...
Processing 1_bind_0_dldesign_2795_af2pred.pdb...
Processing 1_bind_0_dldesign_1708_af2pred.pdb...
Processing 1_bind_0_dldesign_0_af2pred.pdb...
Results written to /content/drive/MyDrive/Evolving_hits_using_ProteinMPNN/20250303-3NOBEK/0_top_binders/residue_analysis.txt


'/content/drive/MyDrive/Evolving_hits_using_ProteinMPNN/20250303-3NOBEK/0_top_binders/residue_analysis.txt'