# Protonate IL SMILES

For simplicity, LNPDB assigns one representative +1e protonated state per ionizable lipid. To select the nitrogen for protonation for each ionizable lipid, the following rule-based decision tree was applied: if the lipid contained only a single nitrogen, that nitrogen was protonated. If multiple nitrogens were present, protonation was prioritized by nitrogen class, from highest to lowest priority: tertiary amine, secondary amine, primary amine, imidazole, pyridine, tertiary aromatic amine, secondary aromatic amine, and primary aromatic amine. Groups comprising amide or sulfonamide structures were excluded, and quaternary nitrogens were not considered protonatable. If multiple candidates of the same class were found, the nitrogen closest to the molecular periphery of the ionizable lipid head was selected; this is defined as the nitrogen with the greatest graph eccentricity, i.e., the longest shortest-path graph distance to a terminal atom. Identifying the tails and the headgroup of an ionizable lipid is intuitively trivial; however, restraints on crossing between molecular substructures (e.g., tail to headgroup to tail) are necessary, which requires a path invalidation protocol. The validity of a nitrogen–periphery path is constrained by the enclosing periphery–periphery path: specifically, a nitrogen–periphery path is considered chemically “valid” if the number of nitrogen atoms it contains does not exceed roughly half of the maximum nitrogen content of any enclosing periphery–periphery path (in the implementation below the limit is ⌊N_max/2⌋ + 1). This invalidates paths that traverse an unrealistically large fraction of the molecule’s atoms, thereby excluding spurious or highly delocalized protonation sites. For specific cases within the KZ_2016 dataset involving ionizable lipids with four or more nitrogens, the most centrally located candidate (the one with the lowest average squared graph distance to all other heavy atoms) was chosen instead, to mitigate peculiarities with regard to tail amines.
The selected nitrogen was then protonated by adding a new bond to a hydrogen atom and assigning a +1 formal charge, which was translated back into SMILES accordingly. Future research is warranted to explore more accurate, dynamic protonation conditions.

In [3]:
import pandas as pd
from collections import deque
from rdkit import Chem
from rdkit.Chem import AllChem
import math
from tqdm import tqdm

# if KZ dataset, use AA centrality
def select_center_priority(mol, nitrogens):
    """Order candidate nitrogens from most central to least central.

    Centrality of a nitrogen is the mean of the squared shortest-path
    distances (counted in bonds, walking over non-hydrogen atoms only) from
    that nitrogen to every other non-hydrogen atom of ``mol``. A lower value
    means a more central atom.

    Args:
        mol: Molecule object exposing ``GetAtoms()`` and ``GetAtomWithIdx()``.
        nitrogens: Candidate atoms (objects exposing ``GetIdx()``).

    Returns:
        A new list holding the same atoms sorted by ascending score. Atoms
        with equal scores keep their input order (the sort is stable). An
        empty input yields an empty list. Heavy atoms that cannot be reached
        contribute an infinite distance, so such a candidate scores
        ``math.inf``; the same score is used when there is no other heavy atom.
    """
    if not nitrogens:
        return []

    heavy_atoms = [a.GetIdx() for a in mol.GetAtoms() if a.GetSymbol() != "H"]

    def distances_from(start_idx):
        """Run one BFS over heavy atoms; return {atom index: bond distance}."""
        dist = {start_idx: 0}
        q = deque([start_idx])
        while q:
            cur = q.popleft()
            for nbr in mol.GetAtomWithIdx(cur).GetNeighbors():
                nidx = nbr.GetIdx()
                if nbr.GetSymbol() != "H" and nidx not in dist:
                    dist[nidx] = dist[cur] + 1
                    q.append(nidx)
        return dist

    scores = []
    for atom in nitrogens:
        idx = atom.GetIdx()
        # A single traversal per candidate yields every distance at once,
        # instead of restarting a BFS for each (candidate, atom) pair.
        reachable = distances_from(idx)
        dists = [reachable.get(j, math.inf) for j in heavy_atoms if j != idx]
        if not dists:
            avg_sq = math.inf
        else:
            avg_sq = sum(d**2 for d in dists) / len(dists)
        scores.append((avg_sq, atom))

    # Lower score = more central
    scores.sort(key=lambda x: x[0])
    return [atom for _, atom in scores]

def detect_amine_atoms(smiles):
    """Find protonation-site candidates in a SMILES string and label their class.

    The search proceeds through these tiers and returns at the first tier
    that produces candidates:

    1. Amines with no aromatic neighbour (amide and sulfonamide nitrogens
       excluded): the highest populated class among tertiary > secondary >
       primary.
    2. Imidazole ring nitrogen (two-connected, hydrogen-free aromatic N).
    3. Pyridine ring nitrogen.
    4. Amines bound to aromatic atoms, again tertiary > secondary > primary.
    5. Any remaining ``N`` passing the amide/sulfonamide/aromatic-neighbour
       filters ("non-resonance"), then the same without the
       aromatic-neighbour filter ("non-amide").
    6. Every nitrogen in the molecule, labelled "none".

    Whenever a tier yields more than one candidate (and always for tiers 2,
    3, 5 and 6) the candidates are passed through
    ``select_nitrogen_with_path_invalidation`` and its result is returned.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        A ``(atoms, mol, group_type)`` tuple: the candidate atoms, the parsed
        RDKit molecule, and one of "tertiary", "secondary", "primary",
        "imidazole", "pyridine", "non-resonance", "non-amide" or "none".

    Raises:
        ValueError: If ``smiles`` cannot be parsed.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f"Invalid SMILES: {smiles}")

    excl_amide = "!$([NX3][CX3](=O))"
    excl_sulfonamide = "!$([NX3][SX4](=O)(=O))"
    excl_aromatic_neighbor = "!$([N]a)"  # exclude amines directly bound to aromatic atoms
    excl_quaternary = "!$([NX4+;H0])"

    aliphatic_filters = f"{excl_amide};{excl_sulfonamide};{excl_aromatic_neighbor};{excl_quaternary}"
    aromatic_ok_filters = f"{excl_amide};{excl_sulfonamide};{excl_quaternary}"

    def find_atoms(smarts):
        """Return all atoms matched by *smarts*, de-duplicated by index."""
        seen = set()
        found = []
        for match in mol.GetSubstructMatches(Chem.MolFromSmarts(smarts)):
            for i in match:
                if i not in seen:
                    seen.add(i)
                    found.append(mol.GetAtomWithIdx(i))
        return found

    def first_atoms(smarts):
        """Return the atom mapped to the first query atom of every match."""
        matches = mol.GetSubstructMatches(Chem.MolFromSmarts(smarts))
        return [mol.GetAtomWithIdx(match[0]) for match in matches]

    def pick_highest_class(prim, sec, ter):
        """Return the result tuple for the highest populated class, or None."""
        for label, group in (("tertiary", ter), ("secondary", sec), ("primary", prim)):
            if not group:
                continue
            if len(group) > 1:
                return select_nitrogen_with_path_invalidation(mol, group), mol, label
            return group, mol, label
        return None

    # --- Tier 1: amines without an aromatic neighbour ---
    picked = pick_highest_class(
        find_atoms(f"[NX3;H2;{aliphatic_filters}]"),
        find_atoms(f"[NX3;H1;{aliphatic_filters}]"),
        find_atoms(f"[NX3;H0;{aliphatic_filters}]"),
    )
    if picked is not None:
        return picked

    # --- Tier 2: imidazole sp2 nitrogen ---
    # X2 restricts the first query atom to the two-connected (pyridine-type)
    # nitrogen. A bare [nH0] also matches the N-substituted ring nitrogen, so
    # match[0] could point at the wrong atom for N-alkyl imidazoles.
    imidazole_atoms = first_atoms("[nX2;H0]1cncc1")
    if imidazole_atoms:
        return select_nitrogen_with_path_invalidation(mol, imidazole_atoms), mol, "imidazole"

    # --- Tier 3: pyridine nitrogen ---
    pyridine_atoms = first_atoms("n1ccccc1")
    if pyridine_atoms:
        return select_nitrogen_with_path_invalidation(mol, pyridine_atoms), mol, "pyridine"

    # --- Tier 4: non-amide amines bound to aromatic atoms ---
    picked = pick_highest_class(
        find_atoms(f"[NX3;H2;{aromatic_ok_filters}]"),
        find_atoms(f"[NX3;H1;{aromatic_ok_filters}]"),
        find_atoms(f"[NX3;H0;{aromatic_ok_filters}]"),
    )
    if picked is not None:
        return picked

    # --- Tier 5: any other nitrogen that survives the exclusion filters ---
    non_resonance_atoms = find_atoms(f"[N;{aliphatic_filters}]")
    if non_resonance_atoms:
        print("Using non-resonance")
        return select_nitrogen_with_path_invalidation(mol, non_resonance_atoms), mol, "non-resonance"

    non_amide_atoms = find_atoms(f"[N;{aromatic_ok_filters}]")
    if non_amide_atoms:
        print("Using non-amide")
        return select_nitrogen_with_path_invalidation(mol, non_amide_atoms), mol, "non-amide"

    # --- Tier 6: nothing qualified; hand back every nitrogen, flagged "none" ---
    print("Selecting all nitrogens")
    every_nitrogen = [atom for atom in mol.GetAtoms() if atom.GetSymbol() == "N"]
    return select_nitrogen_with_path_invalidation(mol, every_nitrogen), mol, "none"

from collections import defaultdict
from rdkit import Chem

def select_nitrogen_with_path_invalidation(mol, nitrogens):
    """Rank candidate nitrogens by their longest valid nitrogen→periphery path.

    Periphery atoms are non-hydrogen atoms with exactly one non-hydrogen
    neighbour. For each candidate, the shortest paths to every periphery
    atom are examined from longest to shortest. A path is accepted as valid
    when the number of candidate nitrogens on it is at most
    ``n_max // 2 + 1``, where ``n_max`` is the larger of that count and the
    highest candidate-nitrogen count of any periphery–periphery shortest path
    that contains the whole nitrogen→periphery path. The candidate's score is
    the bond length of its first valid path, or of its longest path when none
    is valid. Only the nitrogens passed in ``nitrogens`` are counted.

    NOTE(review): the accompanying description speaks of "half" of the
    maximum nitrogen content; the code allows ``n_max // 2 + 1`` — confirm
    which is intended.

    Args:
        mol: Molecule exposing ``GetAtoms()`` and ``GetAtomWithIdx()``.
        nitrogens: List of candidate atoms.

    Returns:
        A list of atoms: the best-scoring candidate first (ties keep the
        earlier candidate), followed by the remaining candidates in their
        input order. If the molecule has no periphery atoms, or no candidate
        reaches the periphery, the candidates are returned in input order.
    """
    candidate_idxs = [n.GetIdx() for n in nitrogens]
    # Set for O(1) membership tests while counting nitrogens along paths.
    counted_idxs = set(candidate_idxs)

    # Identify peripheral atoms (non-H, degree 1 heavy atoms)
    endpoints = []
    for atom in mol.GetAtoms():
        if atom.GetSymbol() != "H":
            heavy_nbrs = [nbr for nbr in atom.GetNeighbors() if nbr.GetSymbol() != "H"]
            if len(heavy_nbrs) == 1:
                endpoints.append(atom.GetIdx())

    if not endpoints:
        # Purely cyclic molecule: no ranking is possible. Return a list (not a
        # bare atom) so the return type is the same on every path.
        return list(nitrogens)

    # Precompute all periphery–periphery shortest paths + nitrogen counts
    pp_paths = []
    for i, ep1 in enumerate(endpoints):
        for ep2 in endpoints[i + 1:]:
            path = bfs_path(mol, ep1, ep2)  # shortest path
            if path:
                pp_paths.append({
                    "path": path,
                    "path_set": set(path),
                    "n_count": sum(1 for idx in path if idx in counted_idxs),
                })

    best_nitrogen = None
    best_score = -1

    # Evaluate each considered nitrogen
    for n_idx in candidate_idxs:
        # Collect all shortest N→periphery paths
        np_paths = []
        for ep_idx in endpoints:
            path = bfs_path(mol, n_idx, ep_idx)
            if path:
                np_paths.append(path)

        if not np_paths:
            continue  # No paths from this nitrogen to periphery

        # Sort nitrogen→periphery paths by length (longest first)
        np_paths.sort(key=len, reverse=True)

        nitrogen_best_score = -1

        # Check paths in descending length order
        for path in np_paths:
            path_atoms = set(path)
            nitrogens_in_path = sum(1 for idx in path if idx in counted_idxs)

            # Find enclosing periphery–periphery path with max nitrogens
            n_max = nitrogens_in_path
            for pp in pp_paths:
                if path_atoms.issubset(pp["path_set"]) and pp["n_count"] > n_max:
                    n_max = pp["n_count"]

            limit = n_max // 2 + 1

            if nitrogens_in_path <= limit:
                # Valid path: score is its length in bonds
                nitrogen_best_score = len(path) - 1
                break  # stop at first valid path

        # Fallback: longest invalid path if no valid found
        if nitrogen_best_score == -1:
            nitrogen_best_score = len(np_paths[0]) - 1

        # Update global best nitrogen (strict '>' keeps the earlier candidate on ties)
        if nitrogen_best_score > best_score:
            best_score = nitrogen_best_score
            best_nitrogen = mol.GetAtomWithIdx(n_idx)

    # Final fallback if no nitrogen selected
    if best_nitrogen is None:
        return nitrogens

    best_idx = best_nitrogen.GetIdx()
    return [best_nitrogen] + [nitrogen for nitrogen in nitrogens if nitrogen.GetIdx() != best_idx]

def bfs_path(mol, start_idx, target_idx, forbidden=None):
    """Return a shortest heavy-atom path between two atoms, or None.

    Breadth-first search over the molecular graph. Hydrogen neighbours are
    never traversed. Atoms in ``forbidden`` may not appear as internal nodes
    of the path; the start atom is never checked against it and the target
    is always allowed.

    Args:
        mol: Molecule exposing ``GetAtomWithIdx()``.
        start_idx: Index of the atom the path starts from.
        target_idx: Index of the atom the path must end on.
        forbidden: Optional collection of atom indices to avoid.

    Returns:
        List of atom indices from ``start_idx`` to ``target_idx`` inclusive
        (``[start_idx]`` when both are equal), or ``None`` if the target
        cannot be reached.
    """
    if forbidden is None:
        forbidden = set()
    if start_idx == target_idx:
        return [start_idx]

    # parent doubles as the visited set. Each atom is expanded at most once,
    # so the search stays linear even on ring systems, where enumerating
    # every simple path would blow up.
    parent = {start_idx: None}
    q = deque([start_idx])
    while q:
        cur = q.popleft()
        for nbr in mol.GetAtomWithIdx(cur).GetNeighbors():
            nidx = nbr.GetIdx()
            if nbr.GetSymbol() == "H":
                continue
            # allow target even if it's in forbidden
            if nidx in forbidden and nidx != target_idx:
                continue
            if nidx in parent:
                continue
            parent[nidx] = cur
            if nidx == target_idx:
                # Walk the parent links back to the start to rebuild the path.
                path = [nidx]
                while parent[path[-1]] is not None:
                    path.append(parent[path[-1]])
                path.reverse()
                return path
            q.append(nidx)
    return None

def protonate_nitrogen(mol, atom, group_type):
    """Return an editable, sanitized copy of ``mol`` with ``atom`` protonated.

    Behaviour depends on ``group_type``:

    - "imidazole" / "pyridine": a hydrogen atom is bonded to the nitrogen
      only if it currently carries no hydrogens; the formal charge is set to
      +1 and the atom is flagged aromatic.
    - "none": nothing is modified; a notice with the SMILES is printed.
    - anything else (e.g. "tertiary", "secondary", "primary"): a hydrogen
      atom is bonded to the nitrogen and its formal charge is set to +1.
      For "non-resonance" / "non-amide" a notice is printed first.

    Args:
        mol: RDKit molecule to copy.
        atom: Atom of ``mol`` to protonate (only its index is used).
        group_type: Class label of the selected nitrogen.

    Returns:
        The modified ``Chem.RWMol`` after ``Chem.SanitizeMol``.
    """
    rw_mol = Chem.RWMol(mol)
    n_idx = atom.GetIdx()
    target = rw_mol.GetAtomWithIdx(n_idx)

    def attach_hydrogen():
        """Append a hydrogen atom and single-bond it to the target nitrogen."""
        rw_mol.AddAtom(Chem.Atom(1))
        new_h_idx = rw_mol.GetNumAtoms() - 1
        rw_mol.AddBond(n_idx, new_h_idx, Chem.BondType.SINGLE)

    if group_type in {"imidazole", "pyridine"}:
        # Only add a hydrogen when the ring nitrogen has none yet.
        if target.GetTotalNumHs() == 0:
            attach_hydrogen()
        target.SetFormalCharge(1)
        # Keep the ring nitrogen flagged as aromatic.
        target.SetIsAromatic(True)
    elif group_type == "none":
        smiles = Chem.MolToSmiles(mol)
        print(f"UNKNOWN GROUP {group_type}, SMILES NOT PROTONATED: {smiles}")
    else:
        if group_type in {"non-resonance", "non-amide"}:
            smiles = Chem.MolToSmiles(mol)
            print(f"SPECIAL GROUP {group_type}, SMILES WERE PROTONATED: {smiles}")
        # Primary / secondary / tertiary amines and the special groups.
        attach_hydrogen()
        target.SetFormalCharge(1)

    Chem.SanitizeMol(rw_mol)
    return rw_mol

def process_csv(input_csv, output_csv):
    """Protonate every ``IL_SMILES`` entry of a CSV and write the result.

    For each row the candidate nitrogens are detected, the first candidate is
    protonated and the resulting SMILES is stored in a new
    ``IL_protonated_SMILES`` column. Rows whose group type is "none" keep
    their original SMILES. For rows whose ``Experiment_ID`` is "KZ_2016"
    (case-insensitive, surrounding whitespace ignored) and whose molecule has
    four or more nitrogens, the candidates are re-ordered with
    ``select_center_priority`` before the first one is taken.

    A row that raises during processing is reported on stdout and keeps its
    original SMILES, so one bad entry does not abort the run.

    Args:
        input_csv: Path of the CSV to read.
        output_csv: Path the augmented CSV is written to (without the index).

    Raises:
        ValueError: If the ``IL_SMILES`` or ``Experiment_ID`` column is
            missing.
    """
    df = pd.read_csv(input_csv)

    if 'IL_SMILES' not in df.columns:
        raise ValueError("CSV must have a 'IL_SMILES' column.")
    # Fail fast. Without this check the lookup below raises inside the
    # per-row try-block, and every row is silently left unprotonated.
    if 'Experiment_ID' not in df.columns:
        raise ValueError("CSV must have a 'Experiment_ID' column.")

    protonated_smiles_list = []

    # Pair the two columns directly, so the lookup does not rely on the
    # DataFrame index labels matching the row position.
    rows = zip(tqdm(df['IL_SMILES']), df['Experiment_ID'])
    for index, (smi, raw_exp_id) in enumerate(rows):
        try:
            atoms, mol, group_type = detect_amine_atoms(smi)
            if group_type == "none":
                protonated_smiles_list.append(smi)
                continue

            exp_id = str(raw_exp_id).strip()
            n_nitrogens = sum(1 for a in mol.GetAtoms() if a.GetSymbol() == "N")
            if exp_id.upper() == "KZ_2016" and n_nitrogens >= 4:
                atoms = select_center_priority(mol, atoms)

            selected_atom = atoms[0]
            prot_mol = protonate_nitrogen(mol, selected_atom, group_type)
            prot_smi = Chem.MolToSmiles(prot_mol, canonical=False)
            protonated_smiles_list.append(prot_smi)

        except Exception as e:
            # Per-row best effort: log and fall back to the input SMILES.
            print(f"Error processing {index}:{smi}: {e}")
            protonated_smiles_list.append(smi)

    df['IL_protonated_SMILES'] = protonated_smiles_list
    df.to_csv(output_csv, index=False)
    print(f"Saved output to {output_csv}")

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Read the input CSV, protonate its IL_SMILES entries and write the result to unique_protonated.csv.
process_csv("unique_ionizablelipids_lnpdb_v22 (1).csv", "unique_protonated.csv")