In [1]:
# install rdkit
pip install rdkit

Note: you may need to restart the kernel to use updated packages.


In [2]:
#install data progress bar library
pip install tqdm

Note: you may need to restart the kernel to use updated packages.


In [3]:
# import the cheminformatics and data-handling modules used in this notebook
import molvs
import pandas as pd
import numpy as np
from rdkit import Chem
import tempfile
import os
from rdkit.Chem import AllChem
from rdkit.Chem import PandasTools
from rdkit.Chem import Draw
from rdkit.Chem import MolStandardize
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.Chem import MACCSkeys
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from tqdm import tqdm
from rdkit.Chem.Draw import IPythonConsole
IPythonConsole.drawOptions.comicMode=True
import rdkit
print(rdkit.__version__)

2022.09.5


In [4]:
# Load the dataset that contains the replicase standardized SMILES.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where that file exists.
df = pd.read_csv('C:/Users/ogbue/Replicase_stand_smi_data.csv')
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,molecule_chembl_id,canonical_smiles,pchembl_value,bioactivity_class,standardized_smiles
0,CHEMBL194398,CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@@H...,5.94,low activity,CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@@H...
1,CHEMBL393608,CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@H]...,5.3,low activity,CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@H]...
2,CHEMBL238216,CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@H]...,5.48,low activity,CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@H]...
3,CHEMBL235873,CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@H]...,4.82,low activity,CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@H]...
4,CHEMBL397154,CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@H]...,5.0,low activity,CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@H]...


In [5]:
# Generate one RDKit molecule per standardized SMILES string.
stand_smi = df["standardized_smiles"].tolist()
stand_mol = [Chem.MolFromSmiles(smi) for smi in stand_smi]
# Assign the list directly: values are placed by position, so the result does
# not depend on df's index matching a freshly built 0..n-1 index.
df['standardized_molecule'] = stand_mol

In [6]:
#generate the maccs keys chemical fingerprints for substructure search
def generate_MACCSfpts(data):
    """Compute the MACCS keys fingerprint of every molecule in ``data``.

    Parameters
    ----------
    data : iterable of RDKit molecules
        Iterated once, wrapped in a tqdm progress bar.

    Returns
    -------
    numpy.ndarray
        Array built from the per-molecule fingerprints, in input order.
    """
    return np.array([MACCSkeys.GenMACCSKeys(molecule) for molecule in tqdm(data)])
maccs_fpts = generate_MACCSfpts(df['standardized_molecule'])
# one column per fingerprint position, labelled Col_A_1 .. Col_A_n
maccskeys_fingerprints = pd.DataFrame(
    maccs_fpts,
    columns=['Col_A_{}'.format(pos) for pos in range(1, maccs_fpts.shape[1] + 1)],
)
maccskeys_fingerprints

100%|█████████████████████████████████████████████████████████████████████████████| 1087/1087 [00:01<00:00, 742.61it/s]


Unnamed: 0,Col_A_1,Col_A_2,Col_A_3,Col_A_4,Col_A_5,Col_A_6,Col_A_7,Col_A_8,Col_A_9,Col_A_10,...,Col_A_158,Col_A_159,Col_A_160,Col_A_161,Col_A_162,Col_A_163,Col_A_164,Col_A_165,Col_A_166,Col_A_167
0,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
1,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,0,1,1,0
2,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
3,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
4,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1082,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,0,1,1,1,1
1083,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,0,1,1,1,0
1084,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,0,1,1,1,1
1085,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,0,1,1,1,0


In [7]:
#path based fingerprints - rdkfingerprints, Daylight-like
def generate_RDKfpts(data):
    """Compute the RDKit path-based fingerprint of every molecule in ``data``.

    Each fingerprint is generated with ``maxPath=5``, ``fpSize=2048`` and
    ``nBitsPerHash=2``.

    Parameters
    ----------
    data : iterable of RDKit molecules
        Iterated once, wrapped in a tqdm progress bar.

    Returns
    -------
    numpy.ndarray
        Array built from the per-molecule fingerprints, in input order.
    """
    return np.array([
        AllChem.RDKFingerprint(molecule, maxPath=5, fpSize=2048, nBitsPerHash=2)
        for molecule in tqdm(data)
    ])
RDK_fpts = generate_RDKfpts(df['standardized_molecule'])
# wrap the array in a DataFrame with columns labelled Col_B_1 .. Col_B_n
RDK_fingerprints = pd.DataFrame(
    RDK_fpts,
    columns=['Col_B_{}'.format(pos) for pos in range(1, RDK_fpts.shape[1] + 1)],
)

100%|████████████████████████████████████████████████████████████████████████████| 1087/1087 [00:00<00:00, 2308.80it/s]


In [8]:
## topological fingerprints - atom pair
def generate_APfpts(data):
    """Compute the hashed atom-pair bit-vector fingerprint of every molecule.

    Each fingerprint is generated with ``nBits=2048``.

    Parameters
    ----------
    data : iterable of RDKit molecules
        Iterated once, wrapped in a tqdm progress bar.

    Returns
    -------
    numpy.ndarray
        Array built from the per-molecule fingerprints, in input order.
    """
    return np.array([
        rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(molecule, nBits=2048)
        for molecule in tqdm(data)
    ])

AP_fpts = generate_APfpts(df['standardized_molecule'])
# wrap the array in a DataFrame with columns labelled Col_C_1 .. Col_C_n
AP_fingerprints = pd.DataFrame(AP_fpts, columns=['Col_C_{}'.format(i + 1)
                                  for i in range(AP_fpts.shape[1])])
# Only the last expression of a cell is displayed, so show the shape alone.
AP_fingerprints.shape

100%|████████████████████████████████████████████████████████████████████████████| 1087/1087 [00:00<00:00, 7030.09it/s]


(1087, 2048)

In [9]:
## topological fingerprints - topological torsion
def generate_TTfpts(data):
    """Compute the hashed topological-torsion bit-vector fingerprint of every molecule.

    Each fingerprint is generated with ``nBits=2048``.

    Parameters
    ----------
    data : iterable of RDKit molecules
        Iterated once, wrapped in a tqdm progress bar.

    Returns
    -------
    numpy.ndarray
        Array built from the per-molecule fingerprints, in input order.
    """
    return np.array([
        rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(molecule, nBits=2048)
        for molecule in tqdm(data)
    ])

TT_fpts = generate_TTfpts(df['standardized_molecule'])
# wrap the array in a DataFrame with columns labelled Col_D_1 .. Col_D_n
TT_fingerprints = pd.DataFrame(TT_fpts, columns=['Col_D_{}'.format(i + 1)
                                  for i in range(TT_fpts.shape[1])])
# Only the last expression of a cell is displayed, so show the shape alone.
TT_fingerprints.shape

100%|████████████████████████████████████████████████████████████████████████████| 1087/1087 [00:00<00:00, 5970.42it/s]


(1087, 2048)

In [10]:
## extended-connectivity fingerprints (ECFP) - Morgan fingerprints
def generate_Morganfpts(data, radius):
    """Compute the Morgan bit-vector fingerprint of every molecule in ``data``.

    Each fingerprint is generated with ``nBits=2048``.

    Parameters
    ----------
    data : iterable of RDKit molecules
        Iterated once, wrapped in a tqdm progress bar.
    radius : int
        Radius passed to ``AllChem.GetMorganFingerprintAsBitVect``.

    Returns
    -------
    numpy.ndarray
        Array built from the per-molecule fingerprints, in input order.
    """
    return np.array([
        AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=2048)
        for molecule in tqdm(data)
    ])

# radius 2 Morgan fingerprints
ECFP4 = generate_Morganfpts(df['standardized_molecule'], 2)
# wrap the array in a DataFrame with columns labelled Col_E_1 .. Col_E_n
ECFP4_fingerprints = pd.DataFrame(ECFP4, columns=['Col_E_{}'.format(i + 1)
                                  for i in range(ECFP4.shape[1])])
# Only the last expression of a cell is displayed, so show the shape alone.
ECFP4_fingerprints.shape

100%|███████████████████████████████████████████████████████████████████████████| 1087/1087 [00:00<00:00, 11099.47it/s]


(1087, 2048)

In [11]:
## feature-based connectivity fingerprints (FCFP) - Morgan fingerprints with useFeatures=True
def generate_Featurefpts(data, radius):
    """Compute the feature-based Morgan bit-vector fingerprint of every molecule.

    Each fingerprint is generated with ``useFeatures=True`` and ``nBits=2048``.

    Parameters
    ----------
    data : iterable of RDKit molecules
        Iterated once, wrapped in a tqdm progress bar.
    radius : int
        Radius passed to ``AllChem.GetMorganFingerprintAsBitVect``.

    Returns
    -------
    numpy.ndarray
        Array built from the per-molecule fingerprints, in input order.
    """
    return np.array([
        AllChem.GetMorganFingerprintAsBitVect(molecule, radius, useFeatures=True, nBits=2048)
        for molecule in tqdm(data)
    ])

# Feature-based Morgan fingerprints (radius 2). This must go through
# generate_Featurefpts so that useFeatures=True is applied; calling
# generate_Morganfpts here would just reproduce the ECFP4 bits.
FCFP4 = generate_Featurefpts(df['standardized_molecule'], 2)
# wrap the array in a DataFrame with columns labelled Col_F_1 .. Col_F_n
FCFP4_fingerprints = pd.DataFrame(FCFP4, columns=['Col_F_{}'.format(i + 1)
                                  for i in range(FCFP4.shape[1])])
# Only the last expression of a cell is displayed, so show the shape alone.
FCFP4_fingerprints.shape

100%|████████████████████████████████████████████████████████████████████████████| 1087/1087 [00:00<00:00, 9772.15it/s]


(1087, 2048)

In [12]:
#calculate the RDKit descriptors
def RDKit_descriptors(smiles):
    """Compute every descriptor listed in ``Descriptors._descList`` for each SMILES.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings; each one is parsed with ``Chem.MolFromSmiles``.

    Returns
    -------
    tuple
        ``(mol_descriptors, desc_names)``: a list holding one
        ``CalcDescriptors`` result per input SMILES (input order), and the
        descriptor names reported by the calculator.
    """
    # parse everything first, then compute (same order of work as before)
    parsed_mols = [Chem.MolFromSmiles(smi) for smi in smiles]
    wanted_names = [entry[0] for entry in Descriptors._descList]
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(wanted_names)
    # explicit hydrogens are added to each molecule before the calculation
    mol_descriptors = [
        calc.CalcDescriptors(Chem.AddHs(parsed)) for parsed in parsed_mols
    ]
    return mol_descriptors, calc.GetDescriptorNames()
# Compute the descriptors for every standardized SMILES string.
mol_descriptors, desc_names = RDKit_descriptors(stand_smi)
# One row per SMILES, one column per descriptor name returned by the calculator.
df_with_208descriptors = pd.DataFrame(mol_descriptors, columns=desc_names)
# Display the descriptor table as the cell output.
df_with_208descriptors

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,15.578289,-6.051165,15.578289,1.177661,0.214429,580.682000,540.362,580.289700,226,0,...,0,0,0,0,0,0,0,0,0,0
1,15.321838,-6.228222,15.321838,1.209786,0.200115,547.653000,506.325,547.300599,216,0,...,0,0,0,0,0,0,0,0,0,0
2,15.451414,-5.821998,15.451414,1.156920,0.192647,581.670000,542.358,581.284949,226,0,...,0,0,0,0,0,0,0,0,0,0
3,15.688578,-6.305191,15.688578,1.251911,0.110032,659.781000,610.389,659.353028,260,0,...,0,0,0,0,0,0,0,0,0,0
4,15.559002,-6.572998,15.559002,1.304776,0.105334,625.764000,574.356,625.368678,250,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1082,14.991201,-7.626566,14.991201,0.000000,0.161561,579.692000,537.356,579.259031,216,0,...,0,0,0,0,0,0,0,0,0,0
1083,14.707674,-5.890763,14.707674,1.094124,0.353786,477.602000,438.290,477.283886,190,0,...,0,0,0,0,0,0,0,0,0,0
1084,15.031361,-7.621745,15.031361,0.000000,0.127831,581.664000,541.344,581.238295,216,0,...,0,0,0,0,0,0,0,0,0,0
1085,14.707674,-5.890763,14.707674,1.094124,0.352307,479.614204,438.290,479.296439,190,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
#generate the topological pharmacophore atom triplets fingerprints
#first convert the standardized smiles to an sdf file and generate file path
def toSDF(smiles_list):
    """Write the molecules parsed from ``smiles_list`` to a temporary SDF file.

    A fresh temporary directory is created on every call and the file is
    written there as ``temp.sdf``. The directory is not removed, because the
    returned path is meant to be used after the function returns.

    Parameters
    ----------
    smiles_list : iterable of str
        SMILES strings. Entries that RDKit cannot parse are skipped, so the
        SDF may contain fewer records than ``smiles_list`` has entries.

    Returns
    -------
    str
        Path of the generated SDF file.
    """
    temp_dir = tempfile.mkdtemp()
    sdf_path = os.path.join(temp_dir, "temp.sdf")
    w = Chem.SDWriter(sdf_path)

    try:
        for smiles in smiles_list:
            mol = Chem.MolFromSmiles(smiles)
            if mol is not None:
                # give each record 2D coordinates before writing it
                AllChem.Compute2DCoords(mol)
                w.write(mol)
    finally:
        # close (and flush) the writer even if parsing or writing fails
        w.close()

    return sdf_path
# Generate the SDF file from the standardized SMILES using toSDF.
sdf_file_path = toSDF(stand_smi)
# Print the location so it can be copied for the external fingerprint script.
print("Generated SDF file path:", sdf_file_path)

Generated SDF file path: C:\Users\ogbue\AppData\Local\Temp\tmpn9usxpj0\temp.sdf


In [14]:
#now copy the sdf_file path
#Download and install strawberry perl from https://strawberryperl.com/ to use perl script
#run the following command on the terminal to generate a csv file replicaseTPATF.csv containing the tpatf.
# perl TopologicalPharmacophoreAtomTripletsFingerprints.pl --AtomTripletsSetSizeToUse FixedSize -v ValuesString -r replicaseTPATF -o sdf_file_path

#open the csv file and put the fingerprints into array
file_path = "C:/Users/ogbue/OneDrive/Desktop/github_documents/replicaseTPATF.csv"

# Only lines containing "Cmpd" carry a fingerprint. On those lines the sixth
# ';'-separated field is a quoted, whitespace-separated list of integers.
with open(file_path, 'r') as f:
    all_features = [
        # split() tolerates repeated spaces and the trailing newline
        [int(value) for value in line.split(';')[5].replace('"', '').split()]
        for line in f  # stream the file instead of loading it all at once
        if "Cmpd" in line
    ]

# one row per matching line, one column per fingerprint value
all_features_array = np.array(all_features)
all_features_array.shape

(1087, 2692)

In [15]:
# Display the parsed fingerprint matrix (one row per parsed "Cmpd" line of the CSV).
all_features_array

array([[18, 36, 12, ...,  0,  0,  0],
       [10,  9, 12, ...,  0,  0,  0],
       [18, 36, 12, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]])

In [16]:
# wrap the parsed matrix in a DataFrame with columns labelled Col_G_1 .. Col_G_n
TPAT_fingerprints = pd.DataFrame(
    all_features_array,
    columns=['Col_G_{}'.format(pos) for pos in range(1, all_features_array.shape[1] + 1)],
)
TPAT_fingerprints.head()

Unnamed: 0,Col_G_1,Col_G_2,Col_G_3,Col_G_4,Col_G_5,Col_G_6,Col_G_7,Col_G_8,Col_G_9,Col_G_10,...,Col_G_2683,Col_G_2684,Col_G_2685,Col_G_2686,Col_G_2687,Col_G_2688,Col_G_2689,Col_G_2690,Col_G_2691,Col_G_2692
0,18,36,12,0,0,0,33,8,0,0,...,0,0,0,0,0,0,0,0,0,0
1,10,9,12,0,0,0,2,8,0,0,...,0,0,0,0,0,0,0,0,0,0
2,18,36,12,0,0,0,32,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,8,27,0,0,0,0,30,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Concatenate the source table, the fingerprint tables and the descriptor
# table side by side (column-wise).
# Note: FCFP4_fingerprints is not part of this list.
result = pd.concat(
    [
        df,
        maccskeys_fingerprints,
        RDK_fingerprints,
        AP_fingerprints,
        TT_fingerprints,
        ECFP4_fingerprints,
        TPAT_fingerprints,
        df_with_208descriptors,
    ],
    axis=1,
)
result.shape

(1087, 11265)

In [22]:
# Step 1: flag every row that has a missing value (NaN/None) in any column.
# Blank strings are not treated as missing by this check.
rows_with_nan_or_blank = result.isna().any(axis=1)

# Step 2: drop the flagged rows by index label, producing a new DataFrame.
result_df = result.drop(
    index=rows_with_nan_or_blank[rows_with_nan_or_blank].index
)

In [24]:
# Shape of the table after the rows with missing values were removed.
result_df.shape

(1028, 11265)

In [25]:
# Write the cleaned feature table to a CSV file in the current working
# directory, without the index.
# NOTE(review): the 'standardized_molecule' column holds RDKit Mol objects, so
# its CSV representation is presumably not reloadable. Confirm whether it
# should be dropped before export.
result_df.to_csv('features_column.csv', index=False)