<a href="https://colab.research.google.com/github/eoinleen/PDB-tools/blob/main/Ubiqutin_chain_merge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
"""
PDB Chain Merger - Continuous Numbering Version
===============================================

What does this do?
-----------------
Merges multiple chains into a single chain with continuous residue numbering.
Perfect for combining multiple copies of the same protein (like Ubiquitin)
into one chain while maintaining a clear gap in numbering.

Example:
- Chain A: Your main protein (unchanged)
- Chain B: First Ub copy (residues 1-76)
- Chain C: Second Ub copy (residues 1-76) → becomes part of chain B (residues 127-202)

Result: Chain A unchanged, single chain B with residues 1-76, then 127-202

What to change:
--------------
1. File path: Update input_pdb_file to your PDB file location
2. Gap size: Change gap_size=50 to the number of unused residue numbers you want between merged chains
3. Target chain: Change target_chain='B' if you want a different chain letter
4. Chains to merge: Update chains_to_merge=['B', 'C'] for different chains

Created by: Claude (Anthropic) - Modified for continuous numbering
Version: 2.0
"""


def clean_pdb_name(filename):
    """
    Build a sanitised ``.pdb`` filename from ``filename``.

    The extension (whatever it is) is dropped, every character of the
    remaining stem that is not alphanumeric is replaced by an underscore,
    and ``.pdb`` is appended.
    """
    stem, _extension = os.path.splitext(filename)

    sanitised = []
    for char in stem:
        # Anything other than a letter or digit becomes "_".
        sanitised.append(char if char.isalnum() else '_')

    return "".join(sanitised) + ".pdb"

def merge_chains_in_pdb(input_file, target_chain='B', chains_to_merge=['B', 'C'], gap_size=50):
    """
    Merge several chains of a PDB file into one chain without overlapping residue numbers.

    ATOM, HETATM and TER records whose chain ID is in ``chains_to_merge`` are
    relabelled as ``target_chain``. Records that already belong to
    ``target_chain`` keep their residue numbers. Every other merged chain is
    shifted by its own offset, assigned in the order the chains are listed:
    the highest residue number used so far plus ``gap_size``. With two chains
    this is the classic "max of target + gap" offset; with three or more,
    later chains are stacked after earlier ones instead of colliding.

    For ATOM/HETATM records, columns 73-76 (segment ID) are overwritten with
    ``target_chain``; records too short to hold that field are padded with
    spaces to 80 columns. The original line ending of every record is kept.

    Args:
        input_file: Path to input PDB file
        target_chain: The chain identifier to use for merged chains
        chains_to_merge: List of chains to merge into target_chain
        gap_size: Number of residues to add between chains (default 50)

    Returns:
        The list of output lines (modified and untouched), or None when the
        file cannot be read or when renumbering would exceed the 4-column
        residue field (residue numbers above 9999).
    """
    # NOTE: the list default above is only read, never mutated, so sharing it
    # between calls is harmless; it is kept to leave the signature unchanged.

    # Read once; both the scan and the rewrite work on the same in-memory lines.
    try:
        with open(input_file, 'r') as file:
            lines = file.readlines()
    except (OSError, UnicodeError) as e:
        print(f"Error reading file: {e}")
        return None

    merge_set = set(chains_to_merge)

    # Scan: highest residue number per relevant chain (never below 0).
    max_residue = {}
    for line in lines:
        if line.startswith(("ATOM", "HETATM")) and len(line) > 26:
            chain_id = line[21]
            if chain_id == target_chain or chain_id in merge_set:
                try:
                    residue_num = int(line[22:26].strip())
                except ValueError:
                    continue
                max_residue[chain_id] = max(max_residue.get(chain_id, 0), residue_num)

    highest = max_residue.get(target_chain, 0)
    print(f"Highest residue number in chain {target_chain}: {highest}")

    # Give each non-target chain its own offset so that chains do not overlap.
    offsets = {}
    for chain in chains_to_merge:
        if chain == target_chain or chain in offsets:
            continue
        offsets[chain] = highest + gap_size
        print(f"Will add {offsets[chain]} to residue numbers from chain {chain}")
        if chain in max_residue:
            # Only chains that actually contain residues consume numbering space.
            highest = offsets[chain] + max_residue[chain]

    # The residue field is 4 columns wide; a wider number would shift every
    # following column and corrupt the record, so refuse instead.
    if highest > 9999:
        print(f"Error: renumbering would reach residue {highest}, "
              f"which does not fit the 4-column PDB residue field (max 9999)")
        return None

    # Rewrite: relabel and renumber the merged chains.
    modified_lines = []
    modified_count = 0
    for line in lines:
        is_atom = line.startswith(("ATOM", "HETATM"))
        if len(line) < 22 or not (is_atom or line.startswith("TER")):
            modified_lines.append(line)
            continue

        chain_id = line[21]
        if chain_id not in merge_set:
            modified_lines.append(line)
            continue

        modified_count += 1

        # Split off the line ending so padding/slicing can never drop it.
        body = line.rstrip('\r\n')
        eol = line[len(body):]

        if is_atom and len(body) < 76:
            # Make room for the segment ID field (columns 73-76).
            body = body.rstrip().ljust(80)

        if chain_id == target_chain:
            residue_str = body[22:26]  # Keep original numbering for target chain
        else:
            residue_str = _shift_residue_field(body[22:26], offsets.get(chain_id, 0))

        if is_atom:
            # Change chain ID, residue number, and segment ID
            new_line = (body[:21] + target_chain + residue_str +
                        body[26:72] + target_chain.ljust(4) + body[76:] + eol)
        else:
            # TER records: change chain ID and residue number only
            new_line = body[:21] + target_chain + residue_str + body[26:] + eol
        modified_lines.append(new_line)

    print(f"Processed {len(lines)} lines")
    print(f"Modified {modified_count} ATOM/HETATM/TER records")
    print(f"Merged chains {chains_to_merge} into chain {target_chain}")
    return modified_lines


def _shift_residue_field(field, offset):
    """Return the residue-number field shifted by ``offset`` (right-aligned, width 4); an unparseable field is returned unchanged."""
    try:
        return f"{int(field.strip()) + offset:>4}"
    except ValueError:
        return field

def process_pdb_file_merge(input_file, target_chain='B', chains_to_merge=['B', 'C'], gap_size=50):
    """
    Merge chains of ``input_file`` and save the result next to it.

    The output is written to the input's directory as ``merged_<clean name>``,
    where the clean name comes from ``clean_pdb_name``. After saving, the
    output is read back and, if the merged chain has more than 10 atom
    records, its first and last five are printed as a quick check.

    Returns:
        True when the merged file was written and previewed; False when the
        input is missing, the merge produced nothing, or saving/previewing
        raised an exception.
    """
    if not os.path.exists(input_file):
        print(f"Error: Input file not found: {input_file}")
        return False

    folder, filename = os.path.split(input_file)
    output_file = os.path.join(folder, "merged_" + clean_pdb_name(filename))

    print(f"Processing: {filename}")
    print(f"Input path: {input_file}")
    print(f"Output path: {output_file}")

    merged_lines = merge_chains_in_pdb(input_file, target_chain, chains_to_merge, gap_size)
    if not merged_lines:
        # None (merge failed) and an empty result are both reported as failure.
        return False

    def _show(records):
        # One summary line per atom record: chain, residue number, atom name.
        for record in records:
            print(f"Chain: {record[21]}, Residue: {record[22:26].strip()}, Atom: {record[12:16].strip()}")

    try:
        with open(output_file, 'w') as out_handle:
            out_handle.writelines(merged_lines)
        print(f"\nSuccessfully saved modified PDB to: {output_file}")

        # Print sample lines showing the renumbering, read back from disk.
        print("\nSample lines from modified file showing renumbering:")
        with open(output_file, 'r') as in_handle:
            merged_chain_records = [
                record for record in in_handle
                if record.startswith(("ATOM", "HETATM")) and record[21] == target_chain
            ]

        # Only preview when there is enough data for distinct head and tail.
        if len(merged_chain_records) > 10:
            print("First few residues:")
            _show(merged_chain_records[:5])
            print("...")
            print("Last few residues:")
            _show(merged_chain_records[-5:])
    except Exception as e:
        print(f"Error saving file: {e}")
        return False
    return True

# Example usage:
# Point this at the PDB file you want to process.
input_pdb_file = "/content/drive/MyDrive/PDB-files/20250606_mysm1/MYSM1_Ub2_fixed.pdb"

# Merge chains B and C into a single chain B.
# Chain B keeps residues 1-76; chain C is shifted by 76 + 50 = 126 and becomes 127-202.
success = process_pdb_file_merge(
    input_pdb_file,
    target_chain='B',
    chains_to_merge=['B', 'C'],
    gap_size=50,
)

if not success:
    print("\nFailed to process PDB file.")
else:
    for message in (
        "\nChain merging completed successfully!",
        "Result: Chain A unchanged, chains B and C merged into chain B with continuous numbering",
        "Original chain B: residues 1-76",
        "Original chain C: residues now renumbered starting from 127 (76+50+1)",
    ):
        print(message)

Processing: MYSM1_Ub2_fixed.pdb
Input path: /content/drive/MyDrive/PDB-files/20250606_mysm1/MYSM1_Ub2_fixed.pdb
Output path: /content/drive/MyDrive/PDB-files/20250606_mysm1/merged_MYSM1_Ub2_fixed.pdb
Highest residue number in chain B: 76
Will add 126 to residue numbers from chains other than B
Processed 3222 lines
Modified 1195 ATOM/HETATM/TER records
Merged chains ['B', 'C'] into chain B

Successfully saved modified PDB to: /content/drive/MyDrive/PDB-files/20250606_mysm1/merged_MYSM1_Ub2_fixed.pdb

Sample lines from modified file showing renumbering:
First few residues:
Chain: B, Residue: 1, Atom: N
Chain: B, Residue: 1, Atom: CA
Chain: B, Residue: 1, Atom: C
Chain: B, Residue: 1, Atom: O
Chain: B, Residue: 1, Atom: CB
...
Last few residues:
Chain: B, Residue: 201, Atom: O
Chain: B, Residue: 202, Atom: O
Chain: B, Residue: 202, Atom: N
Chain: B, Residue: 202, Atom: CA
Chain: B, Residue: 202, Atom: C

Chain merging completed successfully!
Result: Chain A unchanged, chains B and C merge

In [7]:
# Mount Google Drive at /content/drive (Colab only); the example input path
# used above lives under this directory.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
