In [1]:
%cd ~/REVIVAL2
%load_ext autoreload
%autoreload 2
%load_ext blackcellmagic

/disk2/fli/REVIVAL2


In [133]:
import os
import re
import numpy as np
from Bio.PDB import PDBParser, MMCIFParser, PDBIO


def get_atom_with_variations(residue, atom_name):
    """
    Retrieve an atom from a residue, tolerating underscore differences in the atom name.

    The following spellings are tried in order, skipping duplicates:
      1. ``atom_name`` exactly as given,
      2. ``atom_name`` with every underscore removed (``C_5`` -> ``C5``),
      3. an underscore inserted after the first character (``C5`` -> ``C_5``),
      4. an underscore inserted between the leading letters and the first digit
         (``BR1`` -> ``BR_1``), which covers element symbols longer than one letter.

    Args:
        residue: A Bio.PDB.Residue object (anything indexable by atom name that
            raises KeyError for a missing name).
        atom_name (str): The atom name to search for.

    Returns:
        Atom object if found.

    Raises:
        KeyError: If no spelling matches, including when ``atom_name`` is empty.
    """
    without_underscores = atom_name.replace("_", "")
    candidates = [atom_name, without_underscores]

    # Guard the slicing: an empty name must end in KeyError, not IndexError.
    if atom_name:
        candidates.append(f"{atom_name[0]}_{atom_name[1:]}")

    # Split "<letters><digit...>" so multi-letter prefixes are handled as well.
    letters_then_index = re.fullmatch(r"([A-Za-z]+)(\d.*)", without_underscores)
    if letters_then_index:
        candidates.append(f"{letters_then_index.group(1)}_{letters_then_index.group(2)}")

    # dict.fromkeys de-duplicates while preserving the lookup order above.
    for candidate in dict.fromkeys(candidates):
        try:
            return residue[candidate]
        except KeyError:
            continue
    raise KeyError(f"Atom {atom_name} or its variations not found in residue {residue}.")


# Match residues dynamically
def find_residue_by_id(chain, target_res_id):
    """
    Return the first residue in a chain whose id matches ``target_res_id``.

    Args:
        chain: Object exposing ``get_residues()`` whose residues carry an ``id``
            of the form (hetero field, sequence number, insertion code).
        target_res_id: One of
            - a sequence number: compared with ``residue.id[1]`` only, so the
              hetero field and insertion code are ignored;
            - a 2-tuple ``(sequence number, insertion code)``: compared with
              ``residue.id[1:3]``;
            - a 3-tuple: compared with the full ``residue.id``.
            Tuple items are compared exactly (no whitespace normalization).

    Returns:
        The first matching residue, in the chain's iteration order.

    Raises:
        ValueError: If a tuple of any other length is given, or if no residue matches.
    """
    if isinstance(target_res_id, tuple):
        if len(target_res_id) not in (2, 3):
            raise ValueError(
                f"Residue ID tuple must have 2 or 3 items, got {target_res_id!r}."
            )
        # 3 items -> whole id; 2 items -> drop the leading hetero field.
        first_field = 3 - len(target_res_id)

        def comparable(res_id):
            return tuple(res_id[first_field:3])

    else:

        def comparable(res_id):
            return res_id[1]

    for residue in chain.get_residues():
        if comparable(residue.id) == target_res_id:
            return residue
    raise ValueError(f"Residue with ID {target_res_id} not found in chain.")


def clean_pdb(input_file, output_file):
    """
    Round-trip a PDB file through Biopython so it is rewritten by PDBIO.

    The file is parsed with a quiet PDBParser, the parsed structure is printed,
    and the structure is then saved to ``output_file``.

    Args:
        input_file (str): Path to the input PDB file.
        output_file (str): Path to save the cleaned PDB file.
    """
    parsed = PDBParser(QUIET=True).get_structure("protein", input_file)
    print(parsed)

    writer = PDBIO()
    writer.set_structure(parsed)
    writer.save(output_file)

    
def replace_residue_names_auto(
    input_file,
    output_file,
    residue_prefix="LIG",
    new_residue="LIG",
    keep_other_records=False,
):
    """
    Detect over-long residue names such as ``LIG_B`` in a PDB file and rename them.

    For every ATOM/HETATM line, the five characters at ``line[17:22]`` are read as
    the residue name. Names of the form ``<residue_prefix>_<one word character>``
    are collected, printed, and then replaced by ``new_residue`` in that field only.

    Args:
        input_file (str): Path to the input PDB file.
        output_file (str): Path to save the modified PDB file.
        residue_prefix (str): Literal prefix of residue names to replace (e.g., "LIG").
            It is escaped before being used in the regular expression.
        new_residue (str): New residue name to replace with. It is inserted as-is,
            so a name shorter than five characters shortens the line.
        keep_other_records (bool): When False (default, the historical behaviour)
            only ATOM/HETATM lines are written and every other record is dropped.
            When True, all other lines are copied through unchanged.
    """
    coordinate_records = ("ATOM", "HETATM")
    # re.escape keeps characters such as "." or "+" in the prefix literal.
    pattern = re.compile(f"^{re.escape(residue_prefix)}_\\w$")

    with open(input_file, "r") as infile:
        lines = infile.readlines()

    # First pass: collect the distinct long residue names present in the file.
    detected_residues = {
        line[17:22]
        for line in lines
        if line.startswith(coordinate_records) and pattern.match(line[17:22])
    }

    print(f"Detected residues to replace: {detected_residues}")

    # Second pass: rewrite only the residue-name field of matching lines.
    with open(output_file, "w") as outfile:
        for line in lines:
            if line.startswith(coordinate_records):
                if line[17:22] in detected_residues:
                    # Slice instead of str.replace so no other field can be touched.
                    line = line[:17] + new_residue + line[22:]
                outfile.write(line)
            elif keep_other_records:
                outfile.write(line)


def get_covalent_neighbors(atom, residue, bond_distance_cutoff=1.6):
    """
    List the atoms of ``residue`` that lie within bonding distance of ``atom``.

    Bonding is decided purely by Euclidean distance between coordinates; only
    atoms yielded by ``residue.get_atoms()`` are considered, and any atom that
    compares equal to ``atom`` is skipped.

    Args:
        atom: Biopython Atom object for the input atom.
        residue: Biopython Residue object containing the atom.
        bond_distance_cutoff (float): Maximum distance for covalent bonds in Ångströms
            (inclusive).

    Returns:
        List: Neighboring atoms covalently bonded to the input atom, in the
        residue's iteration order.
    """
    origin = np.array(atom.coord)
    return [
        candidate
        for candidate in residue.get_atoms()
        if candidate != atom
        and np.linalg.norm(origin - np.array(candidate.coord)) <= bond_distance_cutoff
    ]



def _unit_vector(vector, description):
    """Return ``vector`` scaled to unit length; raise ValueError if it has (near-)zero length."""
    length = np.linalg.norm(vector)
    if not np.isfinite(length) or length < 1e-6:
        raise ValueError(
            f"Cannot place hydrogen: {description} has zero length (degenerate geometry)."
        )
    # Out-of-place division also works when the coordinates are integer arrays.
    return vector / length


def calculate_hydrogen_position(atom, neighbors, bond_length=1.0):
    """
    Calculate the coordinates of a hydrogen atom based on the input atom's geometry.

    The placement direction depends only on the number of neighbors:
      - 1 neighbor: straight away from the neighbor (linear),
      - 2 neighbors: opposite to the sum of the two unit bond vectors (in-plane),
      - 3 neighbors: away from the centroid of the neighbor coordinates.
        TODO: the three-neighbor case is still untested on real structures.

    Args:
        atom: Biopython Atom object for the input atom.
        neighbors: List of neighboring Biopython Atom objects.
        bond_length (float): Bond length for the hydrogen atom in Ångströms.

    Returns:
        np.ndarray: Hydrogen atom coordinates.

    Raises:
        ValueError: If the number of neighbors is not 1, 2 or 3, or if the
            geometry is degenerate (a neighbor coincides with the atom, two
            neighbors are exactly opposite, or the atom sits on the centroid of
            three neighbors) so that no direction can be defined.
    """
    atom_coord = np.array(atom.coord)
    neighbor_coords = [np.array(neighbor.coord) for neighbor in neighbors]

    if len(neighbor_coords) == 1:  # SP hybridization (linear)
        direction = _unit_vector(atom_coord - neighbor_coords[0], "the bond vector")

    elif len(neighbor_coords) == 2:  # SP2 hybridization (planar)
        v1 = _unit_vector(neighbor_coords[0] - atom_coord, "the first bond vector")
        v2 = _unit_vector(neighbor_coords[1] - atom_coord, "the second bond vector")
        # Opposite direction within the plane spanned by the two bonds.
        direction = _unit_vector(-(v1 + v2), "the bisector of the two bonds")

    elif len(neighbor_coords) == 3:  # SP3 hybridization (tetrahedral)
        centroid = np.mean(neighbor_coords, axis=0)
        direction = _unit_vector(atom_coord - centroid, "the vector from the neighbor centroid")

    else:
        raise ValueError(f"Unsupported geometry for atom with {len(neighbors)} neighbors.")

    return atom_coord + direction * bond_length

def measure_bond_distance(
    structure_file,
    chain_id_1,
    res_id_1,
    atom_name_1,
    chain_id_2,
    res_id_2,
    atom_name_2,
    add_hydrogen_to_1=False,
    add_hydrogen_to_2=False,
):
    """
    Measure the distance between two atoms, or between hydrogens placed on them,
    in the first model of a PDB or CIF file.

    The chains, residues and atoms that were located (and, when a hydrogen is
    requested, the neighbor list used to place it) are printed as diagnostics.

    Args:
        structure_file (str): Path to the structure; the extension must be
            ``pdb`` or ``cif`` (case-insensitive).
        chain_id_1 (str): Chain ID where the first atom is located.
        res_id_1: Residue identifier for the first atom, handed to
            ``find_residue_by_id`` (which compares it with the residue
            sequence number).
        atom_name_1 (str): Name of the first atom; underscore variations are
            resolved by ``get_atom_with_variations``.
        chain_id_2 (str): Chain ID where the second atom is located.
        res_id_2: Residue identifier for the second atom (same rules as ``res_id_1``).
        atom_name_2 (str): Name of the second atom.
        add_hydrogen_to_1 (bool): Measure from a hydrogen placed on atom 1
            instead of from atom 1 itself.
        add_hydrogen_to_2 (bool): Measure from a hydrogen placed on atom 2
            instead of from atom 2 itself.

    Returns:
        float: Distance between the two end points in angstroms.

    Raises:
        ValueError: If the file extension is neither ``pdb`` nor ``cif``.
    """
    parser_by_extension = {"pdb": PDBParser, "cif": MMCIFParser}
    extension = os.path.splitext(structure_file)[1][1:].lower()
    if extension not in parser_by_extension:
        raise ValueError("Unsupported file format. Use 'pdb' or 'cif'.")

    structure = parser_by_extension[extension](QUIET=True).get_structure(
        "protein", structure_file
    )

    # Everything is looked up in the first model of the structure.
    chain_1, chain_2 = structure[0][chain_id_1], structure[0][chain_id_2]
    print(chain_1, chain_2)

    residue_1 = find_residue_by_id(chain_1, res_id_1)
    residue_2 = find_residue_by_id(chain_2, res_id_2)
    print(residue_1, residue_2)

    atom_1 = get_atom_with_variations(residue_1, atom_name_1)
    atom_2 = get_atom_with_variations(residue_2, atom_name_2)
    print(atom_1, atom_2)

    def _end_point(atom, residue, use_hydrogen):
        """Coordinate to measure from: the atom itself or a hydrogen placed on it."""
        if not use_hydrogen:
            return atom.coord
        bonded = get_covalent_neighbors(atom, residue)
        print(bonded)
        return calculate_hydrogen_position(atom, bonded)

    coord_1 = _end_point(atom_1, residue_1, add_hydrogen_to_1)
    coord_2 = _end_point(atom_2, residue_2, add_hydrogen_to_2)

    return np.linalg.norm(coord_1 - coord_2)

In [50]:
from REVIVAL.global_param import LIB_INFO_DICT

In [51]:
# Display the atom pairs configured under "cofactor-distances" for the PfTrpB-4bromo entry.
LIB_INFO_DICT["PfTrpB-4bromo"]["cofactor-distances"]

{'C-C': (('B', 1, 'LIG', 'C_5', False), ('B', 1, 'LIG', 'C_14', False)),
 'GLU-NH_1': (('A', 104, 'GLU', 'OE1', False), ('B', 1, 'LIG', 'N_1', True)),
 'GLU-NH_2': (('A', 104, 'GLU', 'OE2', False), ('B', 1, 'LIG', 'N_1', True))}

In [127]:
# Select the "C-C" pair and unpack each side's five fields into kernel-level variables.
# The last field of each side (atom_h_*) is later passed as the add-hydrogen flag.
atom1_info, atom2_info = LIB_INFO_DICT["PfTrpB-4bromo"]["cofactor-distances"]["C-C"]
chain_id_1, res_id_1,res_name_1, atom_name_1, atom_h_1 = atom1_info
chain_id_2, res_id_2, res_name_2, atom_name_2, atom_h_2 = atom2_info

In [128]:
# Measure the distance for whichever atom pair was unpacked most recently in the kernel.
# NOTE(review): this depends on shared kernel variables; run the matching unpacking cell first.
measure_bond_distance(
    structure_file="/disk2/fli/af3_inference/outputs/pftrpb-4bromo_joint-all/pftrpb-4bromo_joint-all_model.cif", 
    chain_id_1=chain_id_1, 
    res_id_1=res_id_1, 
    atom_name_1=atom_name_1, 
    chain_id_2=chain_id_2, 
    res_id_2=res_id_2, 
    atom_name_2=atom_name_2, 
    add_hydrogen_to_1=atom_h_1,
    add_hydrogen_to_2=atom_h_2
    )

<Chain id=B> <Chain id=B>
<Residue LIG_B het=H_LIG_B resseq=1 icode= > <Residue LIG_B het=H_LIG_B resseq=1 icode= >
<Atom C5> <Atom C14>


3.1300173

In [136]:
# Select the "GLU-NH_1" pair and unpack each side's five fields into kernel-level variables.
atom1_info, atom2_info = LIB_INFO_DICT["PfTrpB-4bromo"]["cofactor-distances"]["GLU-NH_1"]
chain_id_1, res_id_1,res_name_1, atom_name_1, atom_h_1 = atom1_info
chain_id_2, res_id_2, res_name_2, atom_name_2, atom_h_2 = atom2_info
# Uncomment to measure to the second atom itself rather than to a hydrogen placed on it:
# atom_h_2 = False

In [137]:
# Measure the distance for whichever atom pair was unpacked most recently in the kernel.
# NOTE(review): execution counts in this notebook are out of order; re-run the
# unpacking cell directly above before trusting this value.
measure_bond_distance(
    structure_file="/disk2/fli/af3_inference/outputs/pftrpb-4bromo_joint-all/pftrpb-4bromo_joint-all_model.cif", 
    chain_id_1=chain_id_1, 
    res_id_1=res_id_1, 
    atom_name_1=atom_name_1, 
    chain_id_2=chain_id_2, 
    res_id_2=res_id_2, 
    atom_name_2=atom_name_2, 
    add_hydrogen_to_1=atom_h_1,
    add_hydrogen_to_2=atom_h_2
    )

<Chain id=A> <Chain id=B>
<Residue GLU het=  resseq=104 icode= > <Residue LIG_B het=H_LIG_B resseq=1 icode= >
<Atom OE1> <Atom N1>
[<Atom C3>, <Atom C6>]


3.6268325

In [134]:
# Select the "GLU-NH_2" pair and unpack each side's five fields into kernel-level variables.
atom1_info, atom2_info = LIB_INFO_DICT["PfTrpB-4bromo"]["cofactor-distances"]["GLU-NH_2"]
chain_id_1, res_id_1,res_name_1, atom_name_1, atom_h_1 = atom1_info
chain_id_2, res_id_2, res_name_2, atom_name_2, atom_h_2 = atom2_info
# Uncomment to measure to the second atom itself rather than to a hydrogen placed on it:
# atom_h_2 = False

In [135]:
# Measure the distance for whichever atom pair was unpacked most recently in the kernel.
# NOTE(review): execution counts in this notebook are out of order; re-run the
# unpacking cell directly above before trusting this value.
measure_bond_distance(
    structure_file="/disk2/fli/af3_inference/outputs/pftrpb-4bromo_joint-all/pftrpb-4bromo_joint-all_model.cif", 
    chain_id_1=chain_id_1, 
    res_id_1=res_id_1, 
    atom_name_1=atom_name_1, 
    chain_id_2=chain_id_2, 
    res_id_2=res_id_2, 
    atom_name_2=atom_name_2, 
    add_hydrogen_to_1=atom_h_1,
    add_hydrogen_to_2=atom_h_2
    )

<Chain id=A> <Chain id=B>
<Residue GLU het=  resseq=104 icode= > <Residue LIG_B het=H_LIG_B resseq=1 icode= >
<Atom OE2> <Atom N1>
[<Atom C3>, <Atom C6>]


1.4908736