<a href="https://colab.research.google.com/github/eoinleen/PDB-tools/blob/main/Fix-frankenstein_pdb_files.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
"""
===============================================================================
 PDB File Cleaner and Fixer for Google Colab
===============================================================================

 This script is designed to clean and fix PDB (Protein Data Bank) structure
 files using Google Colab. It performs several common formatting corrections
 that are often needed when preparing PDB files for molecular modeling,
 visualization, or further computational analysis.

 FEATURES:
 ------------------------------------------------------------------------------
 - Accepts input either from Google Drive or direct file upload.
 - Renumbers atoms sequentially for consistency.
 - Fills in missing chain identifiers with the most recently seen chain ID
   (default 'A' until a chain ID is encountered).
 - Preserves residue numbering and detects any residue gaps.
 - Formats ATOM/HETATM lines to standard PDB formatting.
 - Outputs a cleaned, corrected PDB file.
 - In upload mode, automatically downloads the fixed file.

 HOW TO USE:
 ------------------------------------------------------------------------------
 1. Option A (Google Drive):
    - Set `INPUT_FILE_PATH` and `OUTPUT_FILE_PATH` to your file paths in Drive.
    - Run the script.

 2. Option B (Local Upload):
    - Leave both paths empty.
    - Run the script and upload your `.pdb` file when prompted.

 OUTPUT:
 ------------------------------------------------------------------------------
 - A cleaned and corrected PDB file saved either to Google Drive or downloaded.
 - Warnings for missing chains or residue number gaps.
 - Up to 10-line preview of the resulting file.

 EXAMPLE PATHS (Google Drive):
 ------------------------------------------------------------------------------
 INPUT_FILE_PATH = "/content/drive/My Drive/input_file.pdb"
 OUTPUT_FILE_PATH = "/content/drive/My Drive/input_file_fixed.pdb"

 AUTHOR:
 ------------------------------------------------------------------------------
 Created by ENL & Claude.AI for use in Google Colab when working with
 structural biology data, protein design, or molecular modeling workflows.

===============================================================================
"""


from google.colab import files
from google.colab import drive
import os

# ===== EDIT THESE PATHS =====
# main() uses Google Drive mode only when BOTH paths are non-empty.
# Leave them empty to upload a file from your computer instead.
INPUT_FILE_PATH = ""  # PDB file to read, e.g. "/content/drive/My Drive/my_spliced_file.pdb"
OUTPUT_FILE_PATH = "" # Where to write the fixed file, e.g. "/content/drive/My Drive/my_fixed_file.pdb"
# ============================

def fix_pdb_file(input_file, output_file):
    """Rewrite a PDB file with sequential serial numbers and filled-in chain IDs.

    ATOM/HETATM records are re-emitted in fixed-column PDB layout with a fresh
    serial number. A blank chain ID is replaced by the most recently seen
    chain ID ('A' until one has been seen). TER records are regenerated from
    the preceding atom and consume the next serial number. Every other record
    is copied through unchanged, so serial numbers referenced by records such
    as CONECT or ANISOU are NOT updated.

    Residue numbers are preserved. A warning is printed whenever the residue
    number jumps by more than one between consecutive atoms of the same chain
    segment; the comparison is restarted after each TER record and whenever
    the chain ID changes, so a new chain does not trigger a false warning.

    Args:
        input_file: Path of the PDB file to read.
        output_file: Path the corrected file is written to.

    Returns:
        ``output_file``, unchanged, for convenient chaining.
    """
    print(f"Reading: {input_file}")

    with open(input_file, 'r') as f:
        lines = f.readlines()

    fixed_lines = []
    serial = 1            # next serial number; ATOM, HETATM and TER share it
    atom_count = 0        # ATOM/HETATM records only (TER is not an atom)
    chain_id = 'A'        # chain used when a record has a blank chain column
    last_atom = None      # (res_name, chain, res_num field) of the latest atom
    prev_res_num = None   # previous residue number for gap detection
    prev_chain = None

    for line in lines:
        if line.startswith(('ATOM', 'HETATM')):
            # Pad to the full 80-column record so every slice below has its
            # nominal width even when trailing fields were stripped.
            padded = line.rstrip('\r\n').ljust(80)

            record_type = padded[0:6]
            atom_name = padded[12:16]
            alt_loc = padded[16:17]
            res_name = padded[17:20]
            chain = padded[21:22].strip()
            res_num = padded[22:26]
            icode = padded[26:27]
            x = padded[30:38]
            y = padded[38:46]
            z = padded[46:54]
            occupancy = padded[54:60]
            temp_factor = padded[60:66]
            element = padded[76:78].strip()
            charge = padded[78:80].strip()

            # Inherit the last seen chain when the column is blank.
            if chain:
                chain_id = chain
            else:
                chain = chain_id

            # A malformed residue field should not abort the whole file:
            # warn, keep the field verbatim and skip gap detection for it.
            try:
                current_res_num = int(res_num)
            except ValueError:
                print(f"  Warning: Unreadable residue number {res_num!r} at atom {serial}")
                current_res_num = None

            # Compare against None explicitly: residue number 0 is valid.
            if (
                current_res_num is not None
                and prev_res_num is not None
                and chain == prev_chain
                and current_res_num - prev_res_num > 1
            ):
                print(f"  Warning: Gap between residues {prev_res_num} and {current_res_num}")
            prev_res_num = current_res_num
            prev_chain = chain

            fixed_line = (
                f"{record_type}{serial:5d} "
                f"{atom_name}{alt_loc}{res_name} "
                f"{chain}{res_num}{icode}   "
                f"{x}{y}{z}{occupancy}{temp_factor}"
            )
            # Element (cols 77-78) and charge (cols 79-80) are right-justified.
            if element:
                fixed_line = f"{fixed_line:<76}{element:>2}"
            if charge:
                fixed_line = f"{fixed_line:<78}{charge:>2}"

            fixed_lines.append(fixed_line.rstrip() + '\n')

            res_num_field = res_num if current_res_num is None else f"{current_res_num:4d}"
            last_atom = (res_name, chain, res_num_field)
            serial += 1
            atom_count += 1

        elif line.startswith('TER'):
            if last_atom is None:
                # No preceding atom to describe: keep the record as it is.
                fixed_lines.append(line)
            else:
                ter_res_name, ter_chain, ter_res_num = last_atom
                fixed_lines.append(
                    f"TER   {serial:5d}      {ter_res_name} {ter_chain}{ter_res_num}\n"
                )
                serial += 1
            # A chain has ended; numbering of the next one is independent.
            prev_res_num = None
        else:
            fixed_lines.append(line)

    # Write fixed file
    with open(output_file, 'w') as f:
        f.writelines(fixed_lines)

    print(f"✓ Fixed {atom_count} atoms")
    print(f"✓ Saved to: {output_file}")

    return output_file

# Main execution
def main():
    """Run the fixer in Google Drive mode or in upload mode.

    Drive mode is used when both INPUT_FILE_PATH and OUTPUT_FILE_PATH are
    non-empty: Drive is mounted, the input is checked for existence and the
    fixed file is written to OUTPUT_FILE_PATH.

    Otherwise the user is prompted to upload one or more files. Each upload
    is fixed, written next to the upload with a ``_fixed.pdb`` suffix, offered
    for download, and its first lines are previewed.
    """
    if INPUT_FILE_PATH and OUTPUT_FILE_PATH:
        # Use Google Drive paths
        print("Using Google Drive files...")
        drive.mount('/content/drive')

        if not os.path.exists(INPUT_FILE_PATH):
            print(f"❌ Error: File not found at {INPUT_FILE_PATH}")
            return

        fix_pdb_file(INPUT_FILE_PATH, OUTPUT_FILE_PATH)
        print(f"\n✓ Fixed file saved to Google Drive: {OUTPUT_FILE_PATH}")
        return

    if INPUT_FILE_PATH or OUTPUT_FILE_PATH:
        # Only one path was filled in; say why the upload prompt appears.
        print("Note: both INPUT_FILE_PATH and OUTPUT_FILE_PATH must be set "
              "to use Google Drive; falling back to upload.")

    # Upload from computer
    print("📁 Please upload your PDB file...")
    uploaded = files.upload()

    for filename in uploaded:
        print(f"\nProcessing: {filename}")

        # Swap only a trailing '.pdb' extension (any case) for '_fixed.pdb';
        # str.replace would also rewrite '.pdb' occurring mid-name.
        stem, ext = os.path.splitext(filename)
        if ext.lower() == '.pdb':
            output_filename = stem + '_fixed.pdb'
        else:
            output_filename = filename + '_fixed.pdb'

        # Fix the file
        fix_pdb_file(filename, output_filename)

        # Auto-download the fixed file
        print(f"\n📥 Downloading: {output_filename}")
        files.download(output_filename)

        # Show the first 10 lines, with an ellipsis if the file is longer
        print("\nPreview of fixed file:")
        print("-" * 50)
        with open(output_filename, 'r') as f:
            for i, line in enumerate(f):
                if i >= 10:
                    print("...")
                    break
                print(line.rstrip())

# Run the script: call main() only when this code is executed directly
# (the guard is false when the file is imported as a module).
if __name__ == "__main__":
    main()

📁 Please upload your PDB file...


Saving 5lrv_evo_v3_renumbered-mod1.pdb to 5lrv_evo_v3_renumbered-mod1.pdb

Processing: 5lrv_evo_v3_renumbered-mod1.pdb
Reading: 5lrv_evo_v3_renumbered-mod1.pdb
✓ Fixed 3652 atoms
✓ Saved to: 5lrv_evo_v3_renumbered-mod1_fixed.pdb

📥 Downloading: 5lrv_evo_v3_renumbered-mod1_fixed.pdb


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Preview of fixed file:
--------------------------------------------------
CRYST1   92.030   56.520   91.750  90.00  90.00  90.00 P 21 21 21    1
ATOM      1  N   MET A 128     -31.631 -14.694 -14.024  1.00 69.97           N
ATOM      2  CA  MET A 128     -30.905 -15.689 -14.805  1.00 71.82           C
ATOM      3  C   MET A 128     -29.969 -15.021 -15.808  1.00 70.93           C
ATOM      4  O   MET A 128     -29.482 -15.663 -16.739  1.00 70.88           O
ATOM      5  CB  MET A 128     -31.882 -16.618 -15.530  1.00 75.28           C
ATOM      6  N   GLU A 129     -29.725 -13.728 -15.609  1.00 69.68           N
ATOM      7  CA  GLU A 129     -28.844 -12.965 -16.487  1.00 66.21           C
ATOM      8  C   GLU A 129     -27.842 -12.146 -15.679  1.00 62.90           C
ATOM      9  O   GLU A 129     -28.208 -11.502 -14.695  1.00 63.29           O
...
