## Introduction

In this notebook, I generate Morgan fingerprints from SMILES strings using RDKit. These binary molecular descriptors capture substructural patterns and are commonly used in cheminformatics tasks.

Main steps:
- Load SMILES from the preprocessed ECBD compound dataframe.
- Use RDKit to compute 256-bit Morgan fingerprints with radius 2.
- Store the resulting binary vectors for each compound as a new dataframe.

In [20]:
import os
import glob
from datetime import date

import numpy as np
import pandas as pd
from tqdm import tqdm

from rdkit import Chem
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator

In [21]:
# paths to data
# directory that is searched for the input *_ecbd.csv file and that receives the output CSV
save_path = "result/"

# feature generation parameters
# length of each Morgan fingerprint: passed as fpSize to the generator and used
# to name the fp_<i> columns
features_count = 256

We use 256 features because this is roughly the average number of columns we get from the other sources (cell_painting, chemical data). With more columns the data would become noisy; with fewer, the fingerprints would capture too little of the structural environments of the atoms.

In [22]:
# loading our initial dataset
# glob pattern: any file under save_path whose name ends in "<digit>_ecbd.csv"
file_ecbd_type = '/*[0-9]_' + 'ecbd.csv'
files_ecbd = glob.glob(save_path + file_ecbd_type)

# fail early with a readable message; otherwise max() below raises
# "ValueError: max() arg is an empty sequence" when nothing matches
if not files_ecbd:
    raise FileNotFoundError(
        f"No file matching {save_path + file_ecbd_type!r} was found; "
        "make sure the preprocessed ECBD table has been saved there."
    )

# gets latest file
# NOTE: os.path.getctime is the creation time on Windows but the time of the
# last metadata change on Unix, so "latest" is platform dependent
max_file_ecbd = max(files_ecbd, key=os.path.getctime)

# load file
df_ecbd = pd.read_csv(max_file_ecbd)
df_ecbd

Unnamed: 0,Metadata_EOS,Metadata_pdid,Metadata_name,Metadata_smiles,Metadata_inchi,Metadata_inchikey,probe,experimental probe,calculated probe,available,...,moa_potentiator,moa_stabiliser,moa_stimulator,moa_substrate,drug_status_approved,drug_status_experimental,drug_status_investigational,drug_status_nutraceutical,drug_status_vet_approved,drug_status_withdrawn
0,EOS101163,PD000002,I-BRD9,CCn1cc(-c2cccc(C(F)(F)F)c2)c2sc(C(=N)NC3CCS(=O...,InChI=1S/C22H22F3N3O3S2/c1-2-28-12-17(13-4-3-5...,WRUWGLUCNBMGPS-UHFFFAOYSA-N,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,EOS101593,PD000003,UNC0638,COc1cc2c(NC3CCN(C(C)C)CC3)nc(C3CCCCC3)nc2cc1OC...,InChI=1S/C30H47N5O2/c1-22(2)35-17-12-24(13-18-...,QOECJCJVIMVJGX-UHFFFAOYSA-N,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,EOS101154,PD000005,UNC1215,O=C(c1ccc(C(=O)N2CCC(N3CCCC3)CC2)c(Nc2ccccc2)c...,InChI=1S/C32H43N5O2/c38-31(36-20-12-27(13-21-3...,PQOOIERVZAXHBP-UHFFFAOYSA-N,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,EOS101601,PD000007,IOX1,O=C(O)c1ccc(O)c2ncccc12,InChI=1S/C10H7NO3/c12-8-4-3-7(10(13)14)6-2-1-5...,JGRPKOGHYBAVMW-UHFFFAOYSA-N,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,EOS101116,PD000008,IOX2,O=C(O)CNC(=O)c1c(O)c2ccccc2n(Cc2ccccc2)c1=O,InChI=1S/C19H16N2O5/c22-15(23)10-20-18(25)16-1...,CAOSCCRYLYQBES-UHFFFAOYSA-N,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2459,EOS102392,PD164271,PD164271,N#CCc1ccccn1,"InChI=1S/C7H6N2/c8-5-4-7-3-1-2-6-9-7/h1-3,6H,4H2",UKVQBONVSSLJBB-UHFFFAOYSA-N,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2460,EOS102394,PD164272,ALPHA-HEXACHLOROCYCLOHEXANE,Cl[C@H]1[C@H](Cl)[C@H](Cl)[C@@H](Cl)[C@H](Cl)[...,InChI=1S/C6H6Cl6/c7-1-2(8)4(10)6(12)5(11)3(1)9...,JLYXXMFPNIAWKQ-LKPKBOIGSA-N,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2461,EOS102396,PD164274,PD164274,O=C(O)/C=C1/CCCc2ccccc2[C@@H]1O,InChI=1S/C13H14O3/c14-12(15)8-10-6-3-5-9-4-1-2...,UADPGHINQMWEAG-CHOZFAJLSA-N,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2462,EOS102400,PD164275,PD164275,CC1CCC2C(=O)OC(=O)C2C1,InChI=1S/C9H12O3/c1-5-2-3-6-7(4-5)9(11)12-8(6)...,FKBMTBAXDISZGN-UHFFFAOYSA-N,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# initializes the generator
# Morgan fingerprint generator with radius 2 and features_count bits; built once
# here and reused for every molecule by smiles_to_fp_modern
morgan_gen = GetMorganGenerator(radius=2, fpSize=features_count)

In [24]:
# this function generates the fingerprints
def _zero_fingerprint():
    """Return an all-zero vector with the same length as a real fingerprint.

    The length is read from a fingerprint of methane ("C") so it always matches
    the configuration of the module-level ``morgan_gen``.
    """
    n_bits = morgan_gen.GetFingerprint(Chem.MolFromSmiles("C")).GetNumBits()
    # integer dtype: a float fallback row would make np.vstack promote the
    # whole fingerprint matrix to float (0.0 / 1.0 instead of 0 / 1)
    return np.zeros(n_bits, dtype=int)


def smiles_to_fp_modern(smiles):
    """Convert a SMILES string into a Morgan fingerprint array.

    Parameters
    ----------
    smiles : str
        SMILES representation of one compound.

    Returns
    -------
    numpy.ndarray
        1-D array of 0/1 values produced by ``morgan_gen``. If the SMILES
        cannot be parsed, or anything else fails (e.g. a missing / non-string
        value), an all-zero vector of the same length is returned instead, so
        the caller always gets one row per compound.
    """
    try:
        # convert SMILES to RDKit molecule object
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # unparseable SMILES: return the fallback directly instead of
            # calling GetFingerprint(None) and relying on the except branch
            return _zero_fingerprint()
        # round-trip through the canonical SMILES (kept from the original
        # implementation so existing fingerprints are reproduced)
        canonical_smiles = Chem.MolToSmiles(mol, canonical=True)
        mol_canon = Chem.MolFromSmiles(canonical_smiles)
        if mol_canon is None:
            return _zero_fingerprint()
        # generate the Morgan fingerprint
        fp = morgan_gen.GetFingerprint(mol_canon)
        return np.array(fp)
    except Exception:
        # best-effort fallback; `Exception` (not a bare except) so that
        # KeyboardInterrupt / SystemExit still stop a long-running apply
        return _zero_fingerprint()

In [25]:
# register progress_apply on pandas objects, then fingerprint every SMILES
tqdm.pandas()
smiles_column = df_ecbd['Metadata_smiles']
fingerprints = smiles_column.progress_apply(smiles_to_fp_modern)

# one column per fingerprint bit, named fp_0 ... fp_<features_count - 1>
fp_column_names = [f'fp_{i}' for i in range(features_count)]

# stack the per-compound vectors into a 2-D matrix that shares df_ecbd's index
fp_matrix = np.vstack(fingerprints)
df_fingerprint = pd.DataFrame(fp_matrix, index=df_ecbd.index, columns=fp_column_names)
df_fingerprint

100%|██████████| 2464/2464 [00:02<00:00, 875.97it/s]


Unnamed: 0,fp_0,fp_1,fp_2,fp_3,fp_4,fp_5,fp_6,fp_7,fp_8,fp_9,...,fp_246,fp_247,fp_248,fp_249,fp_250,fp_251,fp_252,fp_253,fp_254,fp_255
0,0,0,0,0,0,0,1,0,0,0,...,1,1,1,0,0,1,0,0,0,0
1,1,1,1,0,1,0,0,0,1,0,...,0,0,0,1,0,1,0,0,0,1
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,1,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2459,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2460,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2461,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2462,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [26]:
# put the fp_* columns next to the original compound columns (column-wise concat)
frames_to_join = [df_ecbd, df_fingerprint]
df_ecbd_fp = pd.concat(frames_to_join, axis="columns")
df_ecbd_fp

Unnamed: 0,Metadata_EOS,Metadata_pdid,Metadata_name,Metadata_smiles,Metadata_inchi,Metadata_inchikey,probe,experimental probe,calculated probe,available,...,fp_246,fp_247,fp_248,fp_249,fp_250,fp_251,fp_252,fp_253,fp_254,fp_255
0,EOS101163,PD000002,I-BRD9,CCn1cc(-c2cccc(C(F)(F)F)c2)c2sc(C(=N)NC3CCS(=O...,InChI=1S/C22H22F3N3O3S2/c1-2-28-12-17(13-4-3-5...,WRUWGLUCNBMGPS-UHFFFAOYSA-N,1,1,1,1,...,1,1,1,0,0,1,0,0,0,0
1,EOS101593,PD000003,UNC0638,COc1cc2c(NC3CCN(C(C)C)CC3)nc(C3CCCCC3)nc2cc1OC...,InChI=1S/C30H47N5O2/c1-22(2)35-17-12-24(13-18-...,QOECJCJVIMVJGX-UHFFFAOYSA-N,1,1,1,1,...,0,0,0,1,0,1,0,0,0,1
2,EOS101154,PD000005,UNC1215,O=C(c1ccc(C(=O)N2CCC(N3CCCC3)CC2)c(Nc2ccccc2)c...,InChI=1S/C32H43N5O2/c38-31(36-20-12-27(13-21-3...,PQOOIERVZAXHBP-UHFFFAOYSA-N,1,1,1,1,...,0,0,0,0,0,1,1,0,1,0
3,EOS101601,PD000007,IOX1,O=C(O)c1ccc(O)c2ncccc12,InChI=1S/C10H7NO3/c12-8-4-3-7(10(13)14)6-2-1-5...,JGRPKOGHYBAVMW-UHFFFAOYSA-N,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,EOS101116,PD000008,IOX2,O=C(O)CNC(=O)c1c(O)c2ccccc2n(Cc2ccccc2)c1=O,InChI=1S/C19H16N2O5/c22-15(23)10-20-18(25)16-1...,CAOSCCRYLYQBES-UHFFFAOYSA-N,1,1,0,1,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2459,EOS102392,PD164271,PD164271,N#CCc1ccccn1,"InChI=1S/C7H6N2/c8-5-4-7-3-1-2-6-9-7/h1-3,6H,4H2",UKVQBONVSSLJBB-UHFFFAOYSA-N,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2460,EOS102394,PD164272,ALPHA-HEXACHLOROCYCLOHEXANE,Cl[C@H]1[C@H](Cl)[C@H](Cl)[C@@H](Cl)[C@H](Cl)[...,InChI=1S/C6H6Cl6/c7-1-2(8)4(10)6(12)5(11)3(1)9...,JLYXXMFPNIAWKQ-LKPKBOIGSA-N,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2461,EOS102396,PD164274,PD164274,O=C(O)/C=C1/CCCc2ccccc2[C@@H]1O,InChI=1S/C13H14O3/c14-12(15)8-10-6-3-5-9-4-1-2...,UADPGHINQMWEAG-CHOZFAJLSA-N,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2462,EOS102400,PD164275,PD164275,CC1CCC2C(=O)OC(=O)C2C1,InChI=1S/C9H12O3/c1-5-2-3-6-7(4-5)9(11)12-8(6)...,FKBMTBAXDISZGN-UHFFFAOYSA-N,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0


In [27]:
# write the combined table to <save_path><today's ISO date>_ecbd_fp.csv, without the row index
filename = f"{save_path}{date.today().isoformat()}_ecbd_fp.csv"
df_ecbd_fp.to_csv(filename, index=False)

# Summary
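-	Computed 256-bit Morgan fingerprints (radius 2) for all compounds with valid SMILES; SMILES that cannot be parsed receive an all-zero vector.
-	Created a feature matrix (`fp_0` … `fp_255`) aligned row-by-row with the compound dataframe.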
-	Output: a ready-to-use molecular feature table for downstream model input.