# 01 Molecular features


In [1]:
import pyarrow.dataset as ds
import pyarrow.compute as pc

In [2]:
# Open the training set (a parquet file) as a pyarrow dataset.
path_train_data = "../../../data/train.parquet"
data_train = ds.dataset(path_train_data, format="parquet")

In [3]:
from rdkit import Chem
from rdkit.Chem import AllChem

In [4]:
def clean_smiles(smiles):
    """Return the canonical SMILES of the largest fragment of ``smiles``.

    Every ``[Dy]`` atom token is stripped from the string, the result is
    parsed with RDKit, explicit hydrogens are removed, and only the largest
    connected fragment (by atom count) is kept.

    Args:
        smiles: SMILES string, optionally containing ``[Dy]`` tokens.

    Returns:
        Canonical SMILES string of the largest fragment.

    Raises:
        ValueError: If RDKit cannot parse the string once ``[Dy]`` is removed.
    """
    # Strip the [Dy] token. A [Dy] sitting alone in a branch, "([Dy])", is
    # removed together with its parentheses: deleting only the atom would
    # leave an empty branch "()", which is not valid SMILES.
    stripped = smiles.replace("([Dy])", "").replace("[Dy]", "")

    mol = Chem.MolFromSmiles(stripped)
    if mol is None:
        # Report the caller's original input so the bad row can be located.
        raise ValueError(f"Invalid SMILES string: {smiles!r}")

    # Remove explicit hydrogens before splitting into fragments.
    mol = Chem.RemoveHs(mol)

    # Drop salts / counter-ions by keeping only the largest connected
    # fragment; `default=mol` covers a molecule with no fragments at all.
    fragments = Chem.GetMolFrags(mol, asMols=True)
    largest_fragment = max(fragments, default=mol, key=lambda m: m.GetNumAtoms())

    # No 2D coordinates are computed here: the SMILES writer does not use
    # them, so generating a layout would only cost time.
    return Chem.MolToSmiles(largest_fragment, canonical=True)

In [5]:
def get_mol(smiles, random_seed=42, max_iters=200):
    """Build an RDKit molecule with explicit hydrogens and an MMFF-optimised 3D conformer.

    Args:
        smiles: SMILES string to parse.
        random_seed: Seed passed to the conformer embedding so that repeated
            runs produce the same starting coordinates.
        max_iters: Maximum number of MMFF optimisation iterations.

    Returns:
        The molecule with hydrogens added and one embedded conformer.

    Raises:
        ValueError: If the SMILES cannot be parsed or no conformer can be
            embedded.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f"Invalid SMILES string: {smiles!r}")

    # Hydrogens must be explicit before generating 3D coordinates.
    mol = Chem.AddHs(mol)

    # EmbedMolecule returns the new conformer id, or -1 when embedding fails.
    # Without this check the optimiser below would fail on a molecule that
    # has no conformer, with a much less helpful error.
    if AllChem.EmbedMolecule(mol, randomSeed=random_seed) < 0:
        raise ValueError(f"Could not embed a 3D conformer for SMILES: {smiles!r}")

    # The optimiser's status code (converged / not converged / force field
    # unavailable) is deliberately not treated as an error: the embedded
    # geometry is returned either way, as before.
    AllChem.MMFFOptimizeMolecule(mol, maxIters=max_iters)
    return mol

In [6]:
# One scanner per class label, selected by the value of the "binds" column.
bind_scanner = data_train.scanner(filter=pc.field("binds") == 1)
no_bind_scanner = data_train.scanner(filter=pc.field("binds") == 0)

In [7]:
# Take the first row matching binds == 1 as a worked example.
example_row_binds = bind_scanner.head(1)

In [8]:
# Clean the example row's SMILES, then build its RDKit molecule with get_mol.
smi = clean_smiles(example_row_binds["molecule_smiles"][0].as_py())
mol_example = get_mol(smi)

### MACCS keys

https://pubs.acs.org/doi/10.1021/ci010132r

In [9]:
from rdkit.Chem import MACCSkeys

In [10]:
# MACCS keys fingerprint of the example molecule.
maccs = MACCSkeys.GenMACCSKeys(mol_example)

### Morgan fingerprint

https://pubs.acs.org/doi/10.1021/ci100050t

https://batistalab.com/classes/CHEM584/ci100050t.pdf

In [11]:
# Morgan fingerprint of the example molecule as a bit vector (radius 3, 248 bits).
# NOTE(review): nBits=248 is an unusual length — confirm it is not a typo for 2048.
# NOTE(review): mol_example comes from get_mol, which adds explicit hydrogens —
# confirm the fingerprint is meant to be computed on the H-added molecule.
ecfp_example = AllChem.GetMorganFingerprintAsBitVect(mol_example, radius=3, nBits=248)
print([bit for bit in ecfp_example])

[1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0]
