<a href="https://colab.research.google.com/github/eoinleen/PDB-tools/blob/main/pdb_analyser_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import matplotlib.pyplot as plt

# Upload the PDB file
from google.colab import files
uploaded = files.upload()
filename = list(uploaded.keys())[0]

# Read all lines from the file
with open(filename, 'r') as f:
    lines = f.readlines()

# Parse ATOM and HETATM records into a DataFrame
def parse_pdb_atoms(lines):
    atoms = []
    for i, line in enumerate(lines):
        if line.startswith(('ATOM', 'HETATM')) and len(line) >= 54:
            try:
                atom = {
                    'record_type': line[0:6].strip(),
                    'atom_number': int(line[6:11]),
                    'atom_name': line[12:16].strip(),
                    'residue_name': line[17:20].strip(),
                    'chain_id': line[21].strip(),
                    'residue_number': int(line[22:26]),
                    'x': float(line[30:38]),
                    'y': float(line[38:46]),
                    'z': float(line[46:54]),
                    'line_number': i + 1
                }
                atoms.append(atom)
            except ValueError:
                continue
    return pd.DataFrame(atoms)

# Parse atom records
df = parse_pdb_atoms(lines)

# Locate TER and END records
ter_lines = [i + 1 for i, line in enumerate(lines) if line.startswith('TER')]
has_end = any(line.startswith('END') for line in lines)

# Atom number continuity check
df['atom_diff'] = df['atom_number'].diff()
non_seq_atoms = df[df['atom_diff'] != 1].copy()

# Summary
print(f"✅ Total chains: {df['chain_id'].nunique()}")
print(f"✅ Total ATOM/HETATM records: {len(df)}")
print(f"✅ TER records at lines: {ter_lines}")
print(f"✅ END record present: {'Yes' if has_end else 'No'}")
print(f"✅ Atom numbers sequential: {'Yes' if non_seq_atoms.empty else 'No'}")

# Show atom number discontinuities if any
if not non_seq_atoms.empty:
    print(f"\n❌ Found {len(non_seq_atoms)} breaks in atom numbering:\n")
    display(non_seq_atoms[['atom_number', 'atom_diff', 'line_number', 'chain_id', 'residue_number', 'atom_name']].head(10))

# Residue number check per chain (fixed index handling)
for chain_id, group in df.groupby('chain_id'):
    print(f"\n--- Chain {chain_id or '(blank)'} ---")
    residues = group['residue_number'].drop_duplicates().sort_values().reset_index(drop=True)
    res_diffs = residues.diff().dropna()
    gaps = [(residues[i-1], residues[i]) for i in range(1, len(residues)) if res_diffs[i] > 1]
    print(f"Residues in chain: {len(residues)}")
    if gaps:
        print(f"❌ Gaps in residue numbering at: {gaps}")
    else:
        print("✅ Residue numbers sequential (no gaps)")

# Optional: Renumber atoms sequentially and write new file
def renumber_atoms(df):
    df = df.copy()
    df['new_atom_number'] = range(1, len(df) + 1)

    def format_pdb_line(row):
        return f"{row['record_type']:<6}{row['new_atom_number']:>5} {row['atom_name']:^4}{row['residue_name']:>4} {row['chain_id']}{row['residue_number']:>4}    {row['x']:8.3f}{row['y']:8.3f}{row['z']:8.3f}"

    with open("renumbered.pdb", "w") as out:
        for _, row in df.iterrows():
            out.write(format_pdb_line(row) + "\n")
    print("\n🛠️ Renumbered PDB written to: renumbered.pdb")

# User prompt
if not non_seq_atoms.empty:
    print("\n⚠️ Atom numbers are not sequential.")
    print("If you'd like to renumber the atoms, run the following in a new cell:")
    print("```python\nrenumber_atoms(df)\n```")


Saving PDB_plus_scar_B_sorted_fixed_v4_1-coot-1.pdb to PDB_plus_scar_B_sorted_fixed_v4_1-coot-1 (3).pdb
✅ Total chains: 2
✅ Total ATOM/HETATM records: 3474
✅ TER records at lines: [2026]
✅ END record present: Yes
✅ Atom numbers sequential: No

❌ Found 2 breaks in atom numbering:



Unnamed: 0,atom_number,atom_diff,line_number,chain_id,residue_number,atom_name
0,1,,1,A,1,N
2025,2027,2.0,2027,B,1,N



--- Chain A ---
Residues in chain: 254
✅ Residue numbers sequential (no gaps)

--- Chain B ---
Residues in chain: 215
✅ Residue numbers sequential (no gaps)

⚠️ Atom numbers are not sequential.
If you'd like to renumber the atoms, run the following in a new cell:
```python
renumber_atoms(df)
```
