In [1]:
import sys, os
import numpy as np
import pandas as pd
import rdkit
import random
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem import AllChem, Descriptors, SaltRemover
from rdkit.ML.Cluster import Butina
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
import warnings
# Silence every warning for the rest of the session (the filter is process-wide).
warnings.filterwarnings('ignore')
# Module-level SaltRemover instance; Check_smiles uses it to strip salts from molecules.
remover = SaltRemover.SaltRemover()

### Calculate 1D and 2D molecular descriptors

In [2]:
# Collect the name (first element) of every entry in RDKit's descriptor
# registry, then display how many descriptors there are.
Total_descriptors = [desc_entry[0] for desc_entry in Descriptors._descList]
len(Total_descriptors)

209

In [3]:
# Shared calculator that evaluates every descriptor named in Total_descriptors.
DescCalc = MolecularDescriptorCalculator(Total_descriptors)

def GetRDKitDescriptors(smile):
    """Calculate the RDKit descriptors of a ligand given as a SMILES string.

    Parameters
    ----------
    smile : str
        SMILES string of the molecule.

    Returns
    -------
    Sequence of descriptor values produced by ``DescCalc.CalcDescriptors``
    for the descriptors named in ``Total_descriptors``.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smile`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        # MolFromSmiles returns None on a parse failure; fail with the
        # offending SMILES instead of an opaque AttributeError on None.
        raise ValueError('Could not parse SMILES: %r' % (smile,))
    # Refresh cached atom properties without strict valence checking.
    mol.UpdatePropertyCache(strict=False)
    # Return value is unused; presumably called so that ring information is
    # initialised before the descriptors are computed -- TODO confirm.
    Chem.GetSymmSSSR(mol)
    return DescCalc.CalcDescriptors(mol)

In [4]:
def GetLarge(string):
    '''
    Return the longest '.'-separated component of a SMILES string.

    When several components share the maximum length, the one that
    appears first in the string is returned.
    '''
    return max(string.split('.'), key=len)

def Check_smiles(smiles):
    """Validate a SMILES string and return a cleaned version of it.

    The longest '.'-separated component is parsed with RDKit, salts are
    stripped with the module-level ``remover``, and the longest component
    of the re-generated SMILES is returned.

    Parameters
    ----------
    smiles : str
        Raw SMILES string (may contain several '.'-separated components).

    Returns
    -------
    str or float
        The cleaned SMILES, or ``np.nan`` when the input cannot be parsed
        or any processing step raises (e.g. the input is not a string).
    """
    try:
        mol = Chem.MolFromSmiles(GetLarge(smiles))
        if mol is None:
            # ``np.nan`` instead of ``np.NaN``: the latter alias was removed in NumPy 2.0.
            return np.nan
        mol = remover.StripMol(mol, dontRemoveEverything=True)
        return GetLarge(Chem.MolToSmiles(mol))
    except Exception:
        # Best-effort cleaning: any failure marks the entry as missing.
        # ``Exception`` (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        return np.nan

In [5]:
# Load the training split and replace each raw SMILES with its cleaned form;
# entries that Check_smiles cannot process come back as NaN.
df = pd.read_csv('/home/cyanguestc/work/archive/old_examples/geo-gcn/data/molecules/bbbp_train.csv')
df['smiles'] = df['smiles'].map(Check_smiles)
df.head(3)

Unnamed: 0,smiles,y
0,CC(C)NCC(O)COc1cccc2ccccc12,1
1,CC(C)(C)OC(=O)CCCc1ccc(N(CCCl)CCCl)cc1,1
2,CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23,1


In [6]:
# Row count before entries with a missing SMILES are dropped.
len(df)

1631

In [7]:
# Keep only the rows whose cleaned SMILES is present, then show the new row count.
df = df[df['smiles'].notnull()]
len(df)

1631

In [8]:
# Compute the descriptor vector of every remaining molecule.
Features = [GetRDKitDescriptors(smi) for smi in df['smiles'].tolist()]

# Build the descriptor frame on df's own index. The null filter applied to df
# can leave gaps in df.index, and pd.concat(axis=1) aligns rows by index label;
# a fresh 0..n-1 index here would attach descriptors to the wrong molecules
# and introduce spurious all-NaN rows.
ss = pd.DataFrame(Features, columns=Total_descriptors, index=df.index)
df = pd.concat([df, ss], axis=1)

In [9]:
# Preview the first two rows, now including the appended descriptor columns.
df.head(2)

Unnamed: 0,smiles,y,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,CC(C)NCC(O)COc1cccc2ccccc12,1,9.843954,9.843954,0.299333,-0.498733,0.837506,259.349,238.181,259.157229,...,0,0,0,0,0,0,0,0,0,0
1,CC(C)(C)OC(=O)CCCc1ccc(N(CCCl)CCCl)cc1,1,11.682268,11.682268,0.134704,-0.409691,0.474821,360.325,333.109,359.141884,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Save the training set with descriptors; index=False leaves the DataFrame index out of the CSV.
df.to_csv('data_curation/train_set.csv', index=False)

Repeat the same descriptor calculation for the validation set.

In [11]:
# Load the validation split and clean its SMILES.
df = pd.read_csv('/home/cyanguestc/work/archive/old_examples/geo-gcn/data/molecules/bbbp_val.csv')
df['smiles'] = df['smiles'].map(Check_smiles)
# Drop molecules whose SMILES could not be cleaned, as is done for the
# training set: Check_smiles marks them as NaN, which GetRDKitDescriptors
# cannot process. Resetting the index keeps it contiguous for the concat below.
df = df.loc[df['smiles'].notnull()].reset_index(drop=True)

# Compute the descriptor vector of every molecule.
Features = [GetRDKitDescriptors(smi) for smi in df['smiles'].tolist()]

# Share df's index so that pd.concat(axis=1) pairs each molecule with its own descriptors.
ss = pd.DataFrame(Features, columns=Total_descriptors, index=df.index)
df = pd.concat([df, ss], axis=1)
df.to_csv('data_curation/val_set.csv', index=False)

Repeat the same descriptor calculation for the test set.

In [12]:
# Load the test split and clean its SMILES.
df = pd.read_csv('/home/cyanguestc/work/archive/old_examples/geo-gcn/data/molecules/bbbp_test.csv')
df['smiles'] = df['smiles'].map(Check_smiles)
# Drop molecules whose SMILES could not be cleaned, as is done for the
# training set: Check_smiles marks them as NaN, which GetRDKitDescriptors
# cannot process. Resetting the index keeps it contiguous for the concat below.
df = df.loc[df['smiles'].notnull()].reset_index(drop=True)

# Compute the descriptor vector of every molecule.
Features = [GetRDKitDescriptors(smi) for smi in df['smiles'].tolist()]

# Share df's index so that pd.concat(axis=1) pairs each molecule with its own descriptors.
ss = pd.DataFrame(Features, columns=Total_descriptors, index=df.index)
df = pd.concat([df, ss], axis=1)
df.to_csv('data_curation/test_set.csv', index=False)

### Calculate Morgan fingerprints

In [13]:
# Reload the training set with descriptors that was written to data_curation/train_set.csv.
df = pd.read_csv('data_curation/train_set.csv')
# Preview the first two rows.
df.head(2)

Unnamed: 0,smiles,y,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,CC(C)NCC(O)COc1cccc2ccccc12,1,9.843954,9.843954,0.299333,-0.498733,0.837506,259.349,238.181,259.157229,...,0,0,0,0,0,0,0,0,0,0
1,CC(C)(C)OC(=O)CCCc1ccc(N(CCCl)CCCl)cc1,1,11.682268,11.682268,0.134704,-0.409691,0.474821,360.325,333.109,359.141884,...,0,0,0,0,0,0,0,0,0,0


In [14]:
def Cal_fp(smile, radius=3, nBits=2048):
    """Compute the Morgan fingerprint of a molecule as a list of 0/1 integers.

    Parameters
    ----------
    smile : str
        SMILES string of the molecule.
    radius : int, optional
        Morgan fingerprint radius (default 3, the value used so far).
    nBits : int, optional
        Length of the bit vector (default 2048, matching the 2048 ``MFP_``
        column labels these fingerprints are stored under).

    Returns
    -------
    list of int
        ``nBits`` values, each 0 or 1, in bit order.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smile`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        # MolFromSmiles returns None on a parse failure; report the offending SMILES.
        raise ValueError('Could not parse SMILES: %r' % (smile,))
    # Pass nBits explicitly so the vector length never depends on the library default.
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)
    # Alternative fingerprint types, kept for reference:
    #fp = AllChem.GetHashedAtomPairFingerprintAsBitVect(Chem.MolFromSmiles(smile))
    #fp = AllChem.GetHashedTopologicalTorsionFingerprintAsBitVect(Chem.MolFromSmiles(smile))
    #fp =  AllChem.GetMACCSKeysFingerprint(Chem.MolFromSmiles(smile))
    return list(map(int, fp.ToBitString()))

# One column label per fingerprint bit, numbered from 1: MFP_1 ... MFP_2048.
columns = ['MFP_%d' % bit_no for bit_no in range(1, 2049)]

In [15]:
# Fingerprint every molecule and append the bit columns to the descriptor table.
Features = [Cal_fp(smi) for smi in df.smiles.tolist()]
ss = pd.DataFrame(Features, columns=columns)
df = pd.concat([df, ss], axis=1)

In [16]:
# Preview the first two rows, now including the appended MFP_ fingerprint columns.
df.head(2)

Unnamed: 0,smiles,y,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,...,MFP_2039,MFP_2040,MFP_2041,MFP_2042,MFP_2043,MFP_2044,MFP_2045,MFP_2046,MFP_2047,MFP_2048
0,CC(C)NCC(O)COc1cccc2ccccc12,1,9.843954,9.843954,0.299333,-0.498733,0.837506,259.349,238.181,259.157229,...,1,0,0,0,0,0,0,0,0,0
1,CC(C)(C)OC(=O)CCCc1ccc(N(CCCl)CCCl)cc1,1,11.682268,11.682268,0.134704,-0.409691,0.474821,360.325,333.109,359.141884,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Save the training set with descriptors and fingerprints; index=False leaves the DataFrame index out of the CSV.
df.to_csv('data_curation/train_set_all.csv', index=False)

Repeat the same fingerprint calculation for the validation set.

In [18]:
# Reload the validation set with descriptors, append one column per
# fingerprint bit, and write the combined table.
df = pd.read_csv('data_curation/val_set.csv')

Features = [Cal_fp(smi) for smi in df.smiles.tolist()]
ss = pd.DataFrame(Features, columns=columns)
df = pd.concat([df, ss], axis=1)

df.to_csv('data_curation/val_set_all.csv', index=False)

Repeat the same fingerprint calculation for the test set.

In [19]:
# Reload the test set with descriptors, append one column per
# fingerprint bit, and write the combined table.
df = pd.read_csv('data_curation/test_set.csv')

Features = [Cal_fp(smi) for smi in df.smiles.tolist()]
ss = pd.DataFrame(Features, columns=columns)
df = pd.concat([df, ss], axis=1)

df.to_csv('data_curation/test_set_all.csv', index=False)