In [1]:
import sys, os
import numpy as np
import pandas as pd
import rdkit
import random
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem import AllChem, Descriptors
from rdkit.ML.Cluster import Butina
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator

### Calculate 1D and 2D molecular descriptors

In [2]:
Total_descriptors = [x[0] for x in Descriptors._descList]
len(Total_descriptors)

208

In [3]:
DescCalc = MolecularDescriptorCalculator(Total_descriptors)

def GetRDKitDescriptors(smile):
# Function for the calculation of ligand descriptors
    mol = Chem.MolFromSmiles(smile)
    mol.UpdatePropertyCache(strict=False)
    Chem.GetSymmSSSR(mol)
    return DescCalc.CalcDescriptors(mol)

In [4]:
df = pd.read_csv('dataset/train_set.csv')
df.head(2)

Unnamed: 0,logBB,BBclass,Check,smiles
0,0.43,1,2,CN1CCC[C@@H](c2nc3ccccc3n2Cc2ccc(F)cc2)C1
1,-0.13,0,2,C=CCn1c2c(c3cc(C(=O)N4CCC(C)CC4)ccc31)CN(CC1CC...


In [5]:
Features = []

for i in df.smiles.tolist():
    Features.append(GetRDKitDescriptors(i))
    
ss = pd.DataFrame(Features, columns=Total_descriptors)
df = pd.concat([df,ss], axis=1, ignore_index=False)

In [6]:
df.head(2)

Unnamed: 0,logBB,BBclass,Check,smiles,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,0.43,1,2,CN1CCC[C@@H](c2nc3ccccc3n2Cc2ccc(F)cc2)C1,13.206916,-0.191326,13.206916,0.191326,0.725377,323.415,...,0,0,0,0,0,0,0,0,0,0
1,-0.13,0,2,C=CCn1c2c(c3cc(C(=O)N4CCC(C)CC4)ccc31)CN(CC1CC...,13.161295,0.203395,13.161295,0.203395,0.703853,391.559,...,0,0,0,0,0,0,0,0,0,0


In [7]:
df.to_csv('dataset/feature_set/train_set.csv', index=False)

Similar for test set.

In [8]:
df = pd.read_csv('dataset/test_set.csv')

Features = []

for i in df.smiles.tolist():
    Features.append(GetRDKitDescriptors(i))
    
ss = pd.DataFrame(Features, columns=Total_descriptors)
df = pd.concat([df,ss], axis=1, ignore_index=False)
df.to_csv('dataset/feature_set/test_set.csv', index=False)

### Calculate_MorganFingerprint

In [2]:
df = pd.read_csv('dataset/feature_set/train_set.csv')
df.head(2)

Unnamed: 0,logBB,BBclass,Check,smiles,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,0.43,1,2,CN1CCC[C@@H](c2nc3ccccc3n2Cc2ccc(F)cc2)C1,13.206916,-0.191326,13.206916,0.191326,0.725377,323.415,...,0,0,0,0,0,0,0,0,0,0
1,-0.13,0,2,C=CCn1c2c(c3cc(C(=O)N4CCC(C)CC4)ccc31)CN(CC1CC...,13.161295,0.203395,13.161295,0.203395,0.703853,391.559,...,0,0,0,0,0,0,0,0,0,0


In [4]:
def Cal_fp(smile):
    fp = AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smile),3)
    #fp = AllChem.GetHashedAtomPairFingerprintAsBitVect(Chem.MolFromSmiles(smile))
    #fp = AllChem.GetHashedTopologicalTorsionFingerprintAsBitVect(Chem.MolFromSmiles(smile))
    #fp =  AllChem.GetMACCSKeysFingerprint(Chem.MolFromSmiles(smile))
    return list(map(int, fp.ToBitString()))

columns = ['MFP_%d'%(i+1) for i in range(167)]

In [5]:
Features = []
for i in df.smiles.tolist():
    Features.append(Cal_fp(i))
ss = pd.DataFrame(Features, columns=columns)
df = pd.concat([df,ss], axis=1, ignore_index=False)

In [6]:
df.head(2)

Unnamed: 0,logBB,BBclass,Check,smiles,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,...,MFP_158,MFP_159,MFP_160,MFP_161,MFP_162,MFP_163,MFP_164,MFP_165,MFP_166,MFP_167
0,0.43,1,2,CN1CCC[C@@H](c2nc3ccccc3n2Cc2ccc(F)cc2)C1,13.206916,-0.191326,13.206916,0.191326,0.725377,323.415,...,0,1,0,1,1,1,1,0,1,0
1,-0.13,0,2,C=CCn1c2c(c3cc(C(=O)N4CCC(C)CC4)ccc31)CN(CC1CC...,13.161295,0.203395,13.161295,0.203395,0.703853,391.559,...,0,1,0,1,1,1,1,1,1,0


In [7]:
df.to_csv('dataset/feature_set_all/train_set.csv', index=False)

Similar for test set.

In [8]:
df = pd.read_csv('dataset/feature_set/test_set.csv')

Features = []
for i in df.smiles.tolist():
    Features.append(Cal_fp(i))
ss = pd.DataFrame(Features, columns=columns)
df = pd.concat([df,ss], axis=1, ignore_index=False)

df.to_csv('dataset/feature_set_all/test_set.csv', index=False)