<a href="https://colab.research.google.com/github/eoinleen/PDB-tools/blob/main/Copy_of_PDB_renum_v5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!/usr/bin/env python3
"""
PDB Residue Renumbering Script for Google Colab
Supports both Google Drive files and direct file upload.
Fetches PDB file, renumbers residues in specified chain,
and saves with 'renum_' prefix.
"""

import os
import re
from google.colab import drive, files
from typing import Optional, List, Tuple

# ============================================================================
# CONFIGURATION SECTION - MODIFY THESE VALUES
# These module-level constants are read by get_input_method() and main().
# ============================================================================
TARGET_CHAIN = "B"           # Chain ID to renumber; compared exactly (case-sensitive) with PDB column 22
NEW_FIRST_RESIDUE = 141        # Number given to the first renumbered residue; each new residue after it gets +1
START_FROM_RESIDUE = 128    # Original residue number at which renumbering begins (None = start from first residue)
                             # Example: 50 keeps the chain's residues that appear before residue 50 in the
                             # file unchanged and renumbers from residue 50 onwards

# INPUT METHOD - Choose one of the following:
# Option 1: Use file from Google Drive
USE_DRIVE_FILE = False  # Set to True to use Google Drive file (only honoured when USE_UPLOAD is False)
PDB_FILENAME = "/PDB-files/20250606_mysm1/linking_both_Ub_chains.pdb" # Path to PDB file, relative to /content/drive/MyDrive (leading '/' is stripped)

# Option 2: Upload file directly (default)
USE_UPLOAD = True  # Set to True to upload file directly to Colab; wins if both flags are True, and is the fallback if both are False
# ============================================================================

def get_input_method():
    """
    Pick the input source from the USE_UPLOAD / USE_DRIVE_FILE flags.

    Returns:
        ("drive", PDB_FILENAME) only when USE_DRIVE_FILE is set and
        USE_UPLOAD is not; ("upload", None) in every other case. A warning
        is printed when the flags are contradictory (both set) or when
        neither is set, since upload is then used as the fallback.
    """
    # The Drive route is the only one that does not end in an upload.
    if USE_DRIVE_FILE and not USE_UPLOAD:
        return "drive", PDB_FILENAME

    if USE_UPLOAD and USE_DRIVE_FILE:
        print("⚠ Warning: Both USE_UPLOAD and USE_DRIVE_FILE are True")
        print("Defaulting to file upload method")
    elif not USE_UPLOAD:
        print("⚠ Warning: Neither upload nor drive option is enabled")
        print("Defaulting to file upload method")

    return "upload", None

def upload_pdb_file() -> Optional[str]:
    """
    Upload a PDB file directly to the Colab environment.

    Opens the Colab upload dialog, takes the first uploaded file and, if its
    name does not end in '.pdb' (case-insensitive), asks for confirmation
    before continuing.

    Returns:
        Path of the uploaded file on the Colab filesystem, or None when
        nothing was uploaded, the user declined a non-.pdb file, the file
        cannot be found afterwards, or any error occurred.
    """
    print("📁 Please select a PDB file to upload...")
    print("(A file dialog should appear in your browser)")

    try:
        uploaded = files.upload()

        if not uploaded:
            print("✗ No file was uploaded")
            return None

        # Only one file is processed; say so instead of silently dropping the rest.
        filenames = list(uploaded)
        filename = filenames[0]
        if len(filenames) > 1:
            print(f"⚠ Warning: {len(filenames)} files were uploaded; only '{filename}' will be processed")

        # Check if it's a PDB file
        if not filename.lower().endswith('.pdb'):
            print(f"⚠ Warning: File '{filename}' doesn't have .pdb extension")
            response = input("Continue anyway? (y/n): ").lower().strip()
            if response != 'y':
                return None

        # The upload lands in the current working directory, which is
        # '/content' only while the notebook has not changed directory.
        # Look there first and keep '/content' as a fallback.
        candidates = [os.path.abspath(filename), os.path.join('/content', filename)]
        for file_path in candidates:
            if os.path.exists(file_path):
                print(f"✓ File uploaded successfully: {filename}")
                print(f"✓ File size: {len(uploaded[filename])} bytes")
                return file_path

        print(f"✗ Upload failed - file not found at: {candidates[0]}")
        return None

    except Exception as e:
        # Best-effort boundary around the Colab widget: report and give up.
        print(f"✗ Error during file upload: {e}")
        return None

def mount_drive():
    """
    Make Google Drive available under /content/drive.

    Returns True when /content/drive/MyDrive exists (either already, or
    after mounting), False when the mount did not expose MyDrive or any
    exception was raised along the way.
    """
    mount_point = '/content/drive'
    my_drive = '/content/drive/MyDrive'

    try:
        # An existing MyDrive folder means a previous cell already mounted it.
        if os.path.exists(my_drive):
            print("✓ Google Drive already mounted")
            return True

        # The mount point has to exist before mounting.
        os.makedirs(mount_point, exist_ok=True)
        drive.mount(mount_point, force_remount=True)

        # Trust the filesystem rather than the mount call for the verdict.
        mounted = os.path.exists(my_drive)
        if mounted:
            print("✓ Google Drive mounted successfully")
        else:
            print("✗ Google Drive mount failed - MyDrive not accessible")
        return mounted

    except Exception as err:
        print(f"✗ Error mounting Google Drive: {err}")
        for hint in (
            "Try running this in a separate cell first:",
            "from google.colab import drive",
            "drive.mount('/content/drive')",
        ):
            print(hint)
        return False

def find_pdb_file(filename: str) -> Optional[str]:
    """
    Resolve a Drive-relative PDB path to an absolute path.

    The path is joined onto /content/drive/MyDrive after removing any
    leading slashes. When the file is missing, the '.pdb' files of its
    directory are listed as a hint (or the directory is reported missing).

    Returns:
        The absolute path if it exists, otherwise None.
    """
    drive_root = "/content/drive/MyDrive"
    relative_name = filename.lstrip('/')
    candidate = os.path.join(drive_root, relative_name)

    if not os.path.exists(drive_root):
        print(f"✗ Drive root not found: {drive_root}")
        return None

    if os.path.exists(candidate):
        print(f"✓ Found PDB file: {candidate}")
        return candidate

    # Not found: explain as precisely as possible what is there instead.
    print(f"✗ PDB file not found at: {candidate}")
    parent = os.path.dirname(candidate)
    if not os.path.exists(parent):
        print(f"Directory does not exist: {parent}")
        return None

    pdb_names = [entry for entry in os.listdir(parent) if entry.endswith('.pdb')]
    if pdb_names:
        print(f"Available PDB files in that directory: {pdb_names}")
    else:
        print("No PDB files found in that directory")
    return None

# Residue numbers occupy PDB columns 23-26, i.e. exactly four characters.
_RESSEQ_MIN = -999
_RESSEQ_MAX = 9999


def parse_pdb_line(line: str) -> Optional[dict]:
    """
    Parse a PDB ATOM/HETATM line and extract relevant information.

    Fields are sliced from the fixed PDB columns; every value is returned as
    a string ('alt_loc', 'chain_id' and 'insertion_code' unstripped, the rest
    stripped) and the untouched input is kept under 'full_line'.

    Returns:
        Dictionary with the parsed fields, or None when the line is not an
        ATOM/HETATM record or its residue-number field (columns 23-26) is
        not an integer. The latter case also prints a warning.
    """
    if not line.startswith(("ATOM  ", "HETATM")):
        return None

    # String slicing never raises, so a truncated or garbled record has to be
    # detected explicitly; otherwise callers crash on int(residue_number).
    residue_number = line[22:26].strip()
    try:
        int(residue_number)
    except ValueError:
        print(f"⚠ Warning: Malformed PDB line: {line.strip()}")
        return None

    return {
        'record_type': line[0:6].strip(),
        'atom_number': line[6:11].strip(),
        'atom_name': line[12:16].strip(),
        'alt_loc': line[16:17],
        'residue_name': line[17:20].strip(),
        'chain_id': line[21:22],
        'residue_number': residue_number,
        'insertion_code': line[26:27],
        'x': line[30:38].strip(),
        'y': line[38:46].strip(),
        'z': line[46:54].strip(),
        'occupancy': line[54:60].strip(),
        'temp_factor': line[60:66].strip(),
        'element': line[76:78].strip(),
        'charge': line[78:80].strip(),
        'full_line': line
    }


def parse_ter_line(line: str) -> Optional[dict]:
    """
    Parse a PDB TER line and extract relevant information.

    Only lines starting with 'TER' followed by three spaces are recognised,
    so a bare 'TER' line yields None. Fields missing from a short record
    come back as empty strings.

    Returns:
        Dictionary with the parsed fields, or None if not such a TER line.
    """
    if not line.startswith("TER   "):
        return None

    # No try/except: slicing a str cannot raise, short lines give ''.
    return {
        'record_type': line[0:6].strip(),
        'serial_number': line[6:11].strip(),
        'residue_name': line[17:20].strip(),
        'chain_id': line[21:22],
        'residue_number': line[22:26].strip(),
        'insertion_code': line[26:27],
        'full_line': line
    }


def _residue_key(residue_number: str, insertion_code: str) -> Optional[Tuple[int, str]]:
    """Return (number, insertion code) identifying a residue, or None if the number is not an integer."""
    try:
        # The insertion code is stripped so that ' ', '' and a trailing
        # newline (short TER records) all denote "no insertion code".
        return int(residue_number), insertion_code.strip()
    except ValueError:
        return None


def _set_residue_number(line: str, number: int) -> str:
    """Return `line` with columns 23-26 replaced by `number`, keeping its line ending."""
    body = line.rstrip("\r\n")
    line_ending = line[len(body):]
    # Pad short records so the number always lands in columns 23-26 and the
    # line ending is never swallowed by the slice.
    body = body.ljust(26)
    return f"{body[:22]}{number:>4}{body[26:]}{line_ending}"


def _chain_residue_numbers(parsed_lines: List[Optional[dict]], target_chain: str) -> List[int]:
    """Return the sorted residue numbers of `target_chain`, one entry per unique residue."""
    seen = set()
    numbers = []
    for parsed in parsed_lines:
        if parsed is None or parsed['chain_id'] != target_chain:
            continue
        key = _residue_key(parsed['residue_number'], parsed['insertion_code'])
        if key not in seen:
            seen.add(key)
            numbers.append(key[0])
    numbers.sort()
    return numbers


def _renumber_linked_record(line: str, target_chain: str, residue_mapping: dict, last_assigned: int) -> str:
    """Renumber a TER/ANISOU record of the target chain; return any other line unchanged."""
    ter = parse_ter_line(line)
    if ter is not None:
        chain_id, res_num, icode = ter['chain_id'], ter['residue_number'], ter['insertion_code']
    elif line.startswith("ANISOU"):
        # ANISOU shares the ATOM column layout for chain / residue fields.
        chain_id, res_num, icode = line[21:22], line[22:26], line[26:27]
    else:
        return line

    # Nothing renumbered yet (or another chain): keep the record as-is.
    if not residue_mapping or chain_id != target_chain:
        return line

    key = _residue_key(res_num, icode)
    if key in residue_mapping:
        return _set_residue_number(line, residue_mapping[key])
    if ter is not None:
        # TER does not name a renumbered residue: fall back to the highest
        # number handed out so far, the chain's most recently added residue.
        return _set_residue_number(line, last_assigned)
    return line


def renumber_residues(input_path: str, output_path: str, target_chain: str, new_start: int, start_from: Optional[int] = None) -> bool:
    """
    Renumber residues in specified chain of PDB file.

    Residues of `target_chain` receive consecutive numbers starting at
    `new_start`, in order of first appearance in the file. Atoms of the same
    original residue (number + insertion code) share one new number. With
    `start_from`, atoms of the chain that appear in the file before the first
    atom of that residue keep their numbering. TER and ANISOU records of the
    chain follow the residue they refer to. All other lines are copied
    unchanged. Nothing is written when the run fails.

    Args:
        input_path: Path to input PDB file
        output_path: Path for output PDB file
        target_chain: Chain identifier to renumber
        new_start: New starting residue number
        start_from: Original residue number to start renumbering from (None = start from first)

    Returns:
        True if successful, False otherwise (unreadable input, chain or
        start residue not found, a new number that does not fit the
        four-column residue field, or a write error).
    """
    try:
        with open(input_path, 'r') as infile:
            lines = infile.readlines()
    except FileNotFoundError:
        print(f"✗ Input file not found: {input_path}")
        return False
    except Exception as e:
        print(f"✗ Error reading input file: {e}")
        return False

    # Parse once: both passes share the result and malformed-line warnings
    # are printed a single time.
    parsed_lines = [parse_pdb_line(line) for line in lines]

    # First pass: report the chain's residues and validate the start position
    if start_from is not None:
        residue_numbers = _chain_residue_numbers(parsed_lines, target_chain)
        if not residue_numbers:
            print(f"✗ No residues found in chain '{target_chain}'")
            return False

        print(f"Found {len(residue_numbers)} unique residues in chain '{target_chain}'")
        print(f"Residue range: {residue_numbers[0]} to {residue_numbers[-1]}")
        if len(residue_numbers) > 10:
            print(f"First 5: {residue_numbers[:5]}")
            print(f"Last 5: {residue_numbers[-5:]}")
        else:
            print(f"All residues: {residue_numbers}")

        if start_from not in residue_numbers:
            print(f"✗ Start residue {start_from} not found in chain '{target_chain}'")
            return False

        print(f"Will start renumbering from residue {start_from} in chain '{target_chain}'")

    print(f"Processing PDB file with {len(lines)} lines...")

    residue_mapping = {}          # (old number, insertion code) -> new number
    next_number = new_start
    processed_lines = []
    atoms_processed = 0
    atoms_renumbered = 0
    renumbering_started = start_from is None  # Start immediately if no specific residue specified

    for line, parsed in zip(lines, parsed_lines):
        if parsed is None:
            # TER / ANISOU follow their residue; everything else is copied.
            processed_lines.append(
                _renumber_linked_record(line, target_chain, residue_mapping, next_number - 1)
            )
            continue

        atoms_processed += 1

        # Only process atoms in target chain
        if parsed['chain_id'] != target_chain:
            processed_lines.append(line)
            continue

        key = _residue_key(parsed['residue_number'], parsed['insertion_code'])

        # Atoms seen before the start residue keep their original numbering
        if not renumbering_started:
            if key[0] != start_from:
                processed_lines.append(line)
                continue
            renumbering_started = True
            print(f"✓ Started renumbering from residue {start_from}")

        if key not in residue_mapping:
            # A number wider than four characters would shift every later
            # column of the record and corrupt the file.
            if not _RESSEQ_MIN <= next_number <= _RESSEQ_MAX:
                print(f"✗ New residue number {next_number} does not fit the 4-column PDB residue field")
                return False
            residue_mapping[key] = next_number
            next_number += 1

        processed_lines.append(_set_residue_number(line, residue_mapping[key]))
        atoms_renumbered += 1

    # Write output file
    try:
        with open(output_path, 'w') as outfile:
            outfile.writelines(processed_lines)

        print(f"✓ Successfully processed {atoms_processed} atoms")
        print(f"✓ Renumbered {atoms_renumbered} atoms in chain '{target_chain}'")
        if start_from is not None:
            print(f"✓ Started renumbering from original residue {start_from}")
        print(f"✓ Mapped {len(residue_mapping)} unique residues")
        print(f"✓ Output written to: {output_path}")
        return True

    except Exception as e:
        print(f"✗ Error writing output file: {e}")
        return False

def download_result(file_path: str):
    """
    Offer the result file as a browser download (best effort).

    Any failure is reported together with the location of the file on the
    Colab filesystem; nothing is raised and nothing is returned.
    """
    try:
        files.download(file_path)
        shown_name = os.path.basename(file_path)
        print(f"✓ Download initiated for: {shown_name}")
    except Exception as err:
        # The file itself is still on disk, so point the user at it.
        print(f"⚠ Could not initiate download: {err}")
        print(f"File is available at: {file_path}")

def main():
    """
    Run the whole workflow: show the configuration, obtain the input PDB
    (Google Drive or upload), renumber it and report the outcome.

    The output is written next to the input with a 'renum_' prefix. For
    uploaded files a browser download of the result is started as well.
    Returns None in every case; failures are reported by printing.
    """
    banner = "=" * 60

    print(banner)
    print("PDB Residue Renumbering Script")
    print(banner)
    print("Configuration:")
    print(f"  Target chain: '{TARGET_CHAIN}'")
    print(f"  New first residue: {NEW_FIRST_RESIDUE}")
    if START_FROM_RESIDUE is None:
        print("  Start renumbering from: first residue in chain")
    else:
        print(f"  Start renumbering from: residue {START_FROM_RESIDUE}")

    # Determine input method
    method, drive_path = get_input_method()
    from_drive = method == "drive"

    if from_drive:
        print("  Input method: Google Drive")
        print(f"  PDB filename: '{drive_path}'")
    else:
        print("  Input method: Direct file upload")

    print("-" * 60)

    # Step 1: obtain the input file
    if from_drive:
        if not mount_drive():
            return
        pdb_path = find_pdb_file(drive_path)
        if pdb_path is None:
            for hint in (
                "Please check that:",
                "1. The filename is correct",
                "2. The file is in your Google Drive",
                "3. Google Drive is properly mounted",
            ):
                print(hint)
            return
    else:
        pdb_path = upload_pdb_file()
        if pdb_path is None:
            print("File upload failed or cancelled.")
            return

    # Step 2: output goes next to the input, with a 'renum_' prefix
    directory, original_filename = os.path.split(pdb_path)
    output_filename = f"renum_{original_filename}"
    output_path = os.path.join(directory, output_filename)

    print(f"Output will be saved as: {output_path}")

    # Step 3: renumber
    success = renumber_residues(
        pdb_path,
        output_path,
        TARGET_CHAIN,
        NEW_FIRST_RESIDUE,
        START_FROM_RESIDUE,
    )

    print(banner)
    if not success:
        print("✗ RENUMBERING FAILED!")
        print("Please check the error messages above.")
        print(banner)
        return

    print("✓ RENUMBERING COMPLETED SUCCESSFULLY!")
    if from_drive:
        print(f"✓ Check your Google Drive for: {output_filename}")
    else:
        print(f"✓ Renumbered file saved as: {output_filename}")
        print("✓ Initiating download...")
        download_result(output_path)
    print(banner)

if __name__ == "__main__":
    main()

PDB Residue Renumbering Script
Configuration:
  Target chain: 'B'
  New first residue: 141
  Start renumbering from: residue 128
  Input method: Direct file upload
------------------------------------------------------------
📁 Please select a PDB file to upload...
(A file dialog should appear in your browser)


Saving renum_merged_MYSM1_Ub2_fixed_5.pdb to renum_merged_MYSM1_Ub2_fixed_5.pdb
✓ File uploaded successfully: renum_merged_MYSM1_Ub2_fixed_5.pdb
✓ File size: 254253 bytes
Output will be saved as: /content/renum_renum_merged_MYSM1_Ub2_fixed_5.pdb
Found 151 unique residues in chain 'B'
Residue range: 1 to 202
First 5: [1, 2, 3, 4, 5]
Last 5: [198, 199, 200, 201, 202]
Will start renumbering from residue 128 in chain 'B'
Processing PDB file with 3220 lines...
✓ Started renumbering from residue 128
✓ Successfully processed 3218 atoms
✓ Renumbered 592 atoms in chain 'B'
✓ Started renumbering from original residue 128
✓ Mapped 75 unique residues
✓ Output written to: /content/renum_renum_merged_MYSM1_Ub2_fixed_5.pdb
✓ RENUMBERING COMPLETED SUCCESSFULLY!
✓ Renumbered file saved as: renum_renum_merged_MYSM1_Ub2_fixed_5.pdb
✓ Initiating download...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✓ Download initiated for: renum_renum_merged_MYSM1_Ub2_fixed_5.pdb
