In [95]:
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem.rdMolDescriptors import GetMorganFingerprint
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
import os
import numpy as np
import polars as pl
from tqdm import tqdm

In [2]:
test_df = pl.scan_csv('/home/dangnh36/datasets/competitions/leash_belka/processed/test_v4.csv').select(
        pl.col('molecule'),
#         pl.col('bb1', 'bb2', 'bb3').cast(pl.UInt16),
        # pl.col('BRD4', 'HSA', 'sEH').cast(pl.UInt8),
    ).collect()
print(test_df.estimated_size('gb'), 'GB')
test_df

0.06128192972391844 GB


molecule
str
"""C#CCCC[C@H](Nc…"
"""C#CCCC[C@H](Nc…"
"""C#CCCC[C@H](Nc…"
"""C#CCCC[C@H](Nc…"
"""C#CCCC[C@H](Nc…"
…
"""Cn1ncc2cc(Nc3n…"
"""[N-]=[N+]=NCCC…"
"""COC(=O)c1ccnc(…"
"""COC1CCC(CCNc2n…"


## Sanitize

In [93]:
for _smiles in test_df[1000:2000:10, 'molecule']:
    print('\t', _smiles)
    smiles = Chem.MolToSmiles(Chem.MolFromSmiles(_smiles), doRandom = True)
    assert Chem.MolToSmiles(Chem.MolFromSmiles(smiles)) == _smiles
    print('\t', smiles)
    
    mol = Chem.MolFromSmiles(smiles, sanitize=True)
    smiles1 = Chem.MolToSmiles(mol, isomericSmiles = True, canonical = False, kekuleSmiles = True,
                              allBondsExplicit = False, allHsExplicit = False)
    print('\t', smiles1)
    
    mol = Chem.MolFromSmiles(smiles, sanitize=False)
    
    flags = Chem.SanitizeFlags.SANITIZE_ALL ^ Chem.SanitizeFlags.SANITIZE_CLEANUP
    Chem.SanitizeMol(mol, flags, catchErrors=True)

#     Chem.SanitizeMol(mol)

    Chem.Kekulize(mol, clearAromaticFlags=True)
    smiles2 = Chem.MolToSmiles(mol, isomericSmiles = True, canonical = False, kekuleSmiles = True,
                              allBondsExplicit = False, allHsExplicit = False)
    print('\t', smiles2)
    
    # get atom list and shuffle
    mol = Chem.MolFromSmiles(smiles)
    ans = list(range(mol.GetNumAtoms()))
    np.random.shuffle(ans)
    # re-order the molecule
    smiles3 = Chem.MolToSmiles(Chem.RenumberAtoms(mol, ans), canonical=False)
    print('\t', smiles3)
    
        
    print('\t', Chem.MolToSmiles(Chem.MolFromSmiles(smiles3)))
    assert smiles1 == smiles2
    assert Chem.MolToSmiles(Chem.MolFromSmiles(smiles3)) == _smiles
    
    print('\n')

	 C#CC[C@@H](CC(=O)N[Dy])Nc1nc(NCC2CCCC2)nc(Nc2nc(C)cc(C)n2)n1
	 Cc1cc(nc(Nc2nc(nc(n2)N[C@@H](CC#C)CC(N[Dy])=O)NCC2CCCC2)n1)C
	 CC1=CC(C)=NC(NC2=NC(NCC3CCCC3)=NC(N[C@@H](CC#C)CC(N[Dy])=O)=N2)=N1
	 CC1=CC(C)=NC(NC2=NC(NCC3CCCC3)=NC(N[C@@H](CC#C)CC(N[Dy])=O)=N2)=N1
	 C1CCCC1CNc1nc(Nc2nc(C)cc(C)n2)nc(N[C@@H](CC#C)CC(N[Dy])=O)n1
	 C#CC[C@@H](CC(=O)N[Dy])Nc1nc(NCC2CCCC2)nc(Nc2nc(C)cc(C)n2)n1


	 C#CC[C@@H](CC(=O)N[Dy])Nc1nc(NCc2cnc(F)cc2C)nc(Nc2ncccc2C(=O)OCC)n1
	 N(C(C[C@@H](Nc1nc(nc(n1)Nc1c(C(OCC)=O)cccn1)NCc1cnc(F)cc1C)CC#C)=O)[Dy]
	 N(C(C[C@@H](NC1=NC(NCC2=CN=C(F)C=C2C)=NC(NC2=C(C(OCC)=O)C=CC=N2)=N1)CC#C)=O)[Dy]
	 N(C(C[C@@H](NC1=NC(NCC2=CN=C(F)C=C2C)=NC(NC2=C(C(OCC)=O)C=CC=N2)=N1)CC#C)=O)[Dy]
	 c1(C)c(CNc2nc(N[C@H](CC(=O)N[Dy])CC#C)nc(Nc3ncccc3C(=O)OCC)n2)cnc(F)c1
	 C#CC[C@@H](CC(=O)N[Dy])Nc1nc(NCc2cnc(F)cc2C)nc(Nc2ncccc2C(=O)OCC)n1


	 C#CC[C@@H](CC(=O)N[Dy])Nc1nc(NCCNS(=O)(=O)c2ccn(C)c2)nc(Nc2cnc(Br)c(C)c2)n1
	 c1cn(C)cc1S(=O)(=O)NCCNc1nc(nc(N[C@@H](CC#C)CC(N[Dy])=O)n1)Nc1cc(C)c(nc1)

	 N1=NC(C)=C(CNC2=NC(NC3=CC=C(Br)C(F)=N3)=NC(N[C@H](CC(N[Dy])=O)CC#C)=N2)S1
	 n1c(F)c(Br)ccc1Nc1nc(N[C@@H](CC#C)CC(=O)N[Dy])nc(NCc2c(C)nns2)n1
	 C#CC[C@@H](CC(=O)N[Dy])Nc1nc(NCc2snnc2C)nc(Nc2ccc(Br)c(F)n2)n1


	 C#CC[C@@H](CC(=O)N[Dy])Nc1nc(Nc2ccc(C(=O)N3CCOCC3)cc2)nc(Nc2c(C)cc(Cl)nc2Cl)n1
	 c1(ccc(C(=O)N2CCOCC2)cc1)Nc1nc(nc(Nc2c(nc(Cl)cc2C)Cl)n1)N[C@H](CC(N[Dy])=O)CC#C
	 C1(NC2=NC(N[C@H](CC(N[Dy])=O)CC#C)=NC(NC3=C(Cl)N=C(Cl)C=C3C)=N2)=CC=C(C(=O)N2CCOCC2)C=C1
	 C1(NC2=NC(N[C@H](CC(N[Dy])=O)CC#C)=NC(NC3=C(Cl)N=C(Cl)C=C3C)=N2)=CC=C(C(=O)N2CCOCC2)C=C1
	 c1c(C(=O)N2CCOCC2)ccc(Nc2nc(Nc3c(C)cc(Cl)nc3Cl)nc(N[C@@H](CC#C)CC(N[Dy])=O)n2)c1
	 C#CC[C@@H](CC(=O)N[Dy])Nc1nc(Nc2ccc(C(=O)N3CCOCC3)cc2)nc(Nc2c(C)cc(Cl)nc2Cl)n1


	 C#CC[C@@H](CC(=O)N[Dy])Nc1nc(NCC(O)CS(C)=O)nc(Nc2ccc(CC3COC(=O)N3)cc2)n1
	 O=S(C)CC(O)CNc1nc(Nc2ccc(CC3NC(OC3)=O)cc2)nc(N[C@@H](CC#C)CC(=O)N[Dy])n1
	 O=S(C)CC(O)CNC1=NC(NC2=CC=C(CC3NC(=O)OC3)C=C2)=NC(N[C@@H](CC#C)CC(=O)N[Dy])=N1
	 O=S(C)CC(O)CNC1=NC(NC2=CC=C(CC3NC(=O)OC3)C=C2)

In [153]:
def shuffle_mol(smiles):
    mol = Chem.MolFromSmiles(smiles)
    ans = list(range(mol.GetNumAtoms()))
    np.random.shuffle(ans)
    # re-order the molecule
    smiles2 = Chem.MolToSmiles(Chem.RenumberAtoms(mol, ans), canonical=False)
    assert Chem.MolToSmiles(Chem.MolFromSmiles(smiles2)) == smiles
    return smiles2

In [154]:
l1 = []
l2 = []
smiles = test_df[1234, 'molecule']
for i in tqdm(range(100_000)):
    l1.append(shuffle_mol(smiles))

for i in tqdm(range(100_000)):
    l2.append(Chem.MolToSmiles(Chem.MolFromSmiles(smiles), doRandom = True, canonical = False))
    

100%|█████████████████████████████████| 100000/100000 [02:18<00:00, 724.25it/s]
100%|████████████████████████████████| 100000/100000 [00:54<00:00, 1837.97it/s]


In [150]:
len(set(l1)), len(set(l2)), set(l1) == set(l2)

(4019, 8989, False)

In [151]:
len(set(l2).intersection(set(l1)))

679

In [123]:
print(l1[:5])
print(l2[:5])

['c1(CNc2nc(Nc3nc(OCc4ccccc4)c4nc[nH]c4n3)nc(N[C@@H](CC#C)CC(N[Dy])=O)n2)c[nH]nc1', '[nH]1ncc(CNc2nc(Nc3nc4[nH]cnc4c(OCc4ccccc4)n3)nc(N[C@@H](CC#C)CC(N[Dy])=O)n2)c1', '[C@@H](CC#C)(Nc1nc(Nc2nc3[nH]cnc3c(OCc3ccccc3)n2)nc(NCc2c[nH]nc2)n1)CC(N[Dy])=O', 'c1(N[C@H](CC(=O)N[Dy])CC#C)nc(NCc2c[nH]nc2)nc(Nc2nc3c(nc[nH]3)c(OCc3ccccc3)n2)n1', 'n1[nH]cc(CNc2nc(N[C@@H](CC#C)CC(N[Dy])=O)nc(Nc3nc(OCc4ccccc4)c4nc[nH]c4n3)n2)c1']
['c1(N[C@@H](CC#C)CC(N[Dy])=O)nc(nc(n1)NCc1cn[nH]c1)Nc1nc(OCc2ccccc2)c2c([nH]cn2)n1', 'N(c1nc2c(nc[nH]2)c(OCc2ccccc2)n1)c1nc(nc(n1)NCc1c[nH]nc1)N[C@H](CC(N[Dy])=O)CC#C', 'N(C(C[C@@H](Nc1nc(NCc2cn[nH]c2)nc(n1)Nc1nc(c2c([nH]cn2)n1)OCc1ccccc1)CC#C)=O)[Dy]', 'O=C(N[Dy])C[C@@H](Nc1nc(nc(n1)Nc1nc(c2nc[nH]c2n1)OCc1ccccc1)NCc1cn[nH]c1)CC#C', 'O=C(C[C@@H](Nc1nc(Nc2nc(c3c(n2)[nH]cn3)OCc2ccccc2)nc(NCc2c[nH]nc2)n1)CC#C)N[Dy]']


In [5]:
for smiles in test_df[:20, 'molecule']:
    print('\t', smiles)
#     mol = Chem.MolFromSmiles(smiles)
    mol = Chem.MolFromSmiles(Chem.MolToSmiles(mol))
    Chem.Kekulize(mol, clearAromaticFlags=True)
    smiles = Chem.MolToSmiles(mol, kekuleSmiles=True, canonical=True)
    print('\t', smiles)
    print('\n')

	 C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ccc(C=C)cc2)n1)C(=O)N[Dy]
	 C#CCCC[C@H](NC1=NC(NC2=CC=C(C=C)C=C2)=NC(NC2=CC=CC3=C2NN=C3)=N1)C(=O)N[Dy]


	 C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ncnc3c2ncn3CC(C)O)n1)C(=O)N[Dy]
	 C#CCCC[C@H](NC1=NC(NC2=CC=C(C=C)C=C2)=NC(NC2=CC=CC3=C2NN=C3)=N1)C(=O)N[Dy]


	 C#CCCC[C@H](Nc1nc(NCC2(O)CCCC2(C)C)nc(Nc2ccc(C=C)cc2)n1)C(=O)N[Dy]
	 C#CCCC[C@H](NC1=NC(NC2=CC=C(C=C)C=C2)=NC(NC2=CC=CC3=C2NN=C3)=N1)C(=O)N[Dy]


	 C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2sc(Cl)cc2C(=O)OC)n1)C(=O)N[Dy]
	 C#CCCC[C@H](NC1=NC(NC2=CC=C(C=C)C=C2)=NC(NC2=CC=CC3=C2NN=C3)=N1)C(=O)N[Dy]


	 C#CCCC[C@H](Nc1nc(NCC2CCC(SC)CC2)nc(Nc2ccc(C=C)cc2)n1)C(=O)N[Dy]
	 C#CCCC[C@H](NC1=NC(NC2=CC=C(C=C)C=C2)=NC(NC2=CC=CC3=C2NN=C3)=N1)C(=O)N[Dy]


	 C#CCCC[C@H](Nc1nc(NCc2ccc(C)cc2N2CCCC2)nc(Nc2ccc(C=C)cc2)n1)C(=O)N[Dy]
	 C#CCCC[C@H](NC1=NC(NC2=CC=C(C=C)C=C2)=NC(NC2=CC=CC3=C2NN=C3)=N1)C(=O)N[Dy]


	 C#CCCC[C@H](Nc1nc(NCCc2ccc(OCC(=O)Nc3cccc(C)c3)cc2)nc(Nc2ccc(C=C)cc2)n1)C(=O)N[Dy]
	 C#CCCC[C@H](NC1=NC

In [None]:
# def kekulize(smiles):
    

def standardise(self, smiles: str, canonicalise: Optional[bool] = None) -> Optional[str]:
        """
        Standardise a SMILES string if valid (canonical + kekulized)

        Args:
            smiles: SMILES string
            canonicalise: optional flag to override `self.canonicalise`

        Returns: standard version the SMILES if valid, None otherwise

        """
        try:
            mol = Chem.MolFromSmiles(smiles, sanitize=False)
        except Exception as e:
            # invalid?
            logging.warning(f'Chem.MolFromSmiles failed smiles="{smiles}" error={e}')
            return None

        if mol is None:
            # invalid?
            logging.warning(f'Chem.MolFromSmiles failed smiles="{smiles}"')
            return None

        flags = Chem.SanitizeFlags.SANITIZE_ALL ^ Chem.SanitizeFlags.SANITIZE_CLEANUP
        Chem.SanitizeMol(mol, flags, catchErrors=True)

        if self.canonicalise or canonicalise:
            # bug where permuted smiles are not canonicalised to the same form.
            # This is fixed by round tripping SMILES
            mol = Chem.MolFromSmiles(Chem.MolToSmiles(mol))
            if mol is None:
                logging.warning(f'Chem.MolFromSmiles failed after sanitization smiles="{smiles}"')
                return None

        try:
            Chem.Kekulize(mol, clearAromaticFlags=True)
            smiles = Chem.MolToSmiles(mol, kekuleSmiles=True, canonical=self.canonicalise or canonicalise)
        except (ValueError, RuntimeError):
            logging.warning(f'SMILES failed Kekulization! {smiles}')
            return None

        return smiles