In [1]:
import sys, os
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, rdFMCS, Descriptors, PandasTools,SaltRemover
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
import matplotlib.pyplot as plt
%matplotlib inline
remover = SaltRemover.SaltRemover()

### 1. Check SMILES and remove duplicates from the dataset

In [2]:
# Load the raw BBBP table and display its first two rows as a quick sanity check.
df = pd.read_csv('dataset/MoleculeNet/BBBP.csv')
df.head(2)

Unnamed: 0,num,name,BBclass,smiles
0,1,Propanolol,1,[Cl].CC(C)NCC(O)COc1cccc2ccccc12
1,2,Terbutylchlorambucil,1,C(=O)(OC(C)(C)C)CCCc1ccc(cc1)N(CCCl)CCCl


In [3]:
def GetLarge(string):
    """Return the longest '.'-separated component of ``string``.

    Length is measured in characters. When several components share the
    maximum length, the one that appears first in the input is returned.
    """
    fragments = string.split('.')
    # max() returns the first maximal item, which is the same element a
    # stable descending sort would place at index 0.
    return max(fragments, key=len)

In [4]:
# Validate every SMILES and build a cleaned version of it.
#
# Check codes (one per input row, aligned with new_smile):
#   2 - RDKit parsed the largest fragment and a cleaned SMILES was produced
#   1 - Chem.MolFromSmiles returned None for the largest fragment
#   0 - an exception was raised while processing the entry
# new_smile holds the cleaned SMILES for code 2 and the placeholder 0 otherwise.
Check = []
new_smile = []
for raw_smiles in df.smiles.tolist():
    try:
        mol = Chem.MolFromSmiles(GetLarge(raw_smiles))
        if mol is None:
            Check.append(1)
            new_smile.append(0)
        else:
            # Finish all fallible work before recording the result, so that a
            # failure here cannot leave a stray code in Check and break the
            # one-entry-per-row alignment of the two lists.
            mol = remover.StripMol(mol, dontRemoveEverything=True)
            cleaned_smiles = GetLarge(Chem.MolToSmiles(mol))
            Check.append(2)
            new_smile.append(cleaned_smiles)
    except Exception:
        # Exception (not a bare except) keeps KeyboardInterrupt/SystemExit
        # working while still tolerating malformed entries.
        Check.append(0)
        new_smile.append(0)

RDKit ERROR: [23:28:29] Explicit valence for atom # 1 N, 4, is greater than permitted
RDKit ERROR: [23:28:29] Explicit valence for atom # 6 N, 4, is greater than permitted
RDKit ERROR: [23:28:29] Explicit valence for atom # 6 N, 4, is greater than permitted
RDKit ERROR: [23:28:29] Explicit valence for atom # 11 N, 4, is greater than permitted
RDKit ERROR: [23:28:29] Explicit valence for atom # 12 N, 4, is greater than permitted
RDKit ERROR: [23:28:29] Explicit valence for atom # 5 N, 4, is greater than permitted
RDKit ERROR: [23:28:29] Explicit valence for atom # 5 N, 4, is greater than permitted
RDKit ERROR: [23:28:29] Explicit valence for atom # 5 N, 4, is greater than permitted
RDKit ERROR: [23:28:29] Explicit valence for atom # 5 N, 4, is greater than permitted
RDKit ERROR: [23:28:29] Explicit valence for atom # 5 N, 4, is greater than permitted
RDKit ERROR: [23:28:29] Explicit valence for atom # 5 N, 4, is greater than permitted


In [5]:
# Attach the validation code and the cleaned SMILES, keep only the columns
# used further on, and expose the cleaned SMILES under the name 'smiles'.
kept_columns = ['num', 'name', 'BBclass', 'Check', 'new_smile']
df = (
    df.assign(Check=Check, new_smile=new_smile)
      .loc[:, kept_columns]
      .rename(columns={"new_smile": "smiles"})
)
df.head(2)

Unnamed: 0,num,name,BBclass,Check,smiles
0,1,Propanolol,1,2,CC(C)NCC(O)COc1cccc2ccccc12
1,2,Terbutylchlorambucil,1,2,CC(C)(C)OC(=O)CCCc1ccc(N(CCCl)CCCl)cc1


In [6]:
# (total number of rows, number of rows whose Check code is 2)
len(df), len(df.loc[df['Check']==2])

(2050, 2039)

In [7]:
# Keep only the rows with Check == 2, the code recorded when RDKit returned a molecule.
df = df.loc[df['Check']==2]

In [10]:
def Check_duplicate_label(df):
    """Return the SMILES that occur more than once with conflicting labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'smiles' column and a 'BBclass' label column.

    Returns
    -------
    list
        The 'smiles' values whose duplicated rows carry more than one
        distinct 'BBclass' value, in order of first appearance.
    """
    # keep=False marks every member of a duplicate group. The default
    # (keep='first') leaves out the first occurrence, so a pair labelled
    # 1 and 0 would collapse to a single label and never be reported.
    duplicate = df[df.duplicated(['smiles'], keep=False)]

    labels_by_smiles = {}
    for smiles, label in zip(duplicate['smiles'], duplicate['BBclass']):
        labels_by_smiles.setdefault(smiles, set()).add(label)

    return [smiles for smiles, labels in labels_by_smiles.items() if len(labels) > 1]

In [11]:
# Number of SMILES that Check_duplicate_label reports as having conflicting BBclass labels.
inconsistent = Check_duplicate_label(df)
len(inconsistent)

0

In [12]:
# Collapse rows sharing the same SMILES; drop_duplicates keeps the first occurrence by default.
df = df.drop_duplicates('smiles')
print ('After drop duplicates: ',len(df))

After drop duplicates:  1967


#### Remove overlap with the test set

In [13]:
# Drop every training row whose SMILES also appears in the 'smiles' column of the test set.
# The comparison is an exact string match.
# NOTE(review): this presumes test_set.csv SMILES were cleaned/canonicalized the same way — confirm.
ss = pd.read_csv('dataset/test_set.csv')
df = df.loc[~df['smiles'].isin(ss.smiles.tolist())]
print ('After removing overlap with test set : ',len(df))

After removing overlap with test set :  1954


In [14]:
# Persist the cleaned training set without the index; the descriptor step re-reads this file.
df.to_csv('dataset/train_set_2.csv',index=False)

### 2. Calculate 1D and 2D molecular descriptors

In [15]:
from rdkit.ML.Cluster import Butina
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator

# The first element of each Descriptors._descList entry is used as the descriptor name.
Total_descriptors = [entry[0] for entry in Descriptors._descList]
len(Total_descriptors)

208

In [16]:
DescCalc = MolecularDescriptorCalculator(Total_descriptors)

def GetRDKitDescriptors(smile):
    """Calculate the descriptors configured in ``DescCalc`` for one SMILES.

    Parameters
    ----------
    smile : str
        SMILES string of the molecule.

    Returns
    -------
    The result of ``DescCalc.CalcDescriptors`` for the parsed molecule
    (used by this notebook as one value per name in ``Total_descriptors``).

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smile`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail with a
        # message naming the input instead of an AttributeError further down.
        raise ValueError('Could not parse SMILES: %r' % (smile,))
    mol.UpdatePropertyCache(strict=False)
    Chem.GetSymmSSSR(mol)
    return DescCalc.CalcDescriptors(mol)

In [17]:
df = pd.read_csv('dataset/train_set_2.csv')

# One descriptor record per molecule, in the same row order as df.
Features = [GetRDKitDescriptors(smiles) for smiles in df.smiles.tolist()]

# Place the descriptor columns to the right of the original columns, then save.
ss = pd.DataFrame(Features, columns=Total_descriptors)
df = pd.concat([df, ss], axis=1)
df.to_csv('dataset/feature_set/train_set_2.csv', index=False)

### 3. Calculate Morgan fingerprints

In [18]:
# Fingerprint settings shared by Cal_fp and the column names below, so the
# number of bits and the number of MFP_* columns cannot drift apart.
MFP_RADIUS = 3
MFP_NBITS = 2048

def Cal_fp(smile, radius=MFP_RADIUS, nBits=MFP_NBITS):
    """Return the Morgan fingerprint of a SMILES string as a list of 0/1 ints.

    Parameters
    ----------
    smile : str
        SMILES string of the molecule.
    radius : int, optional
        Morgan radius passed to RDKit (default 3).
    nBits : int, optional
        Length of the bit vector (default 2048, RDKit's own default).

    Returns
    -------
    list of int
        One entry per character of the fingerprint's bit string.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smile`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        raise ValueError('Could not parse SMILES: %r' % (smile,))
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)

    return list(map(int, fp.ToBitString()))

columns = ['MFP_%d'%(i+1) for i in range(MFP_NBITS)]

In [19]:
df = pd.read_csv('dataset/feature_set/train_set_2.csv')

# One fingerprint bit list per molecule, in the same row order as df.
Features = [Cal_fp(smiles) for smiles in df.smiles.tolist()]
ss = pd.DataFrame(Features, columns=columns)

# Place the fingerprint columns to the right of the existing columns.
df = pd.concat([df, ss], axis=1)

df.to_csv('dataset/feature_set_all/train_set_2.csv', index=False)