In [1]:
import sys, os
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, rdFMCS, Descriptors, PandasTools,SaltRemover
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
import matplotlib.pyplot as plt
%matplotlib inline
remover = SaltRemover.SaltRemover()

In [3]:
# Read dataset/train_set.csv and keep only the BBclass, Check and smiles
# columns (in that order); preview the first two rows.
df1 = pd.read_csv('dataset/train_set.csv')[['BBclass', 'Check', 'smiles']]
df1.head(2)

Unnamed: 0,BBclass,Check,smiles
0,1,2,CN1CCC[C@@H](c2nc3ccccc3n2Cc2ccc(F)cc2)C1
1,0,2,C=CCn1c2c(c3cc(C(=O)N4CCC(C)CC4)ccc31)CN(CC1CC...


In [4]:
# Read dataset/test_set.csv and keep only the BBclass, Check and smiles
# columns (in that order); preview the first two rows.
df2 = pd.read_csv('dataset/test_set.csv')[['BBclass', 'Check', 'smiles']]
df2.head(2)

Unnamed: 0,BBclass,Check,smiles
0,1,2,Cc1nnc2n1-c1ccc(Cl)cc1C(c1ccccc1)=NC2
1,1,2,Oc1ncnc2[nH]ncc12


In [5]:
# Stack both frames row-wise and renumber the index from 0 so that rows
# coming from df2 do not reuse df1's index labels.
df = pd.concat(objs=[df1, df2], axis=0, ignore_index=True)
df.shape[0]

4463

### 1. Remove duplicates from the dataset

In [7]:
def Check_duplicate_label(df):
    """Return the SMILES that are repeated in ``df`` with conflicting labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'smiles' column and a 'BBclass' label column.

    Returns
    -------
    list
        Every SMILES value that occurs more than once in ``df`` and is
        associated with more than one distinct 'BBclass' value, ordered by
        first appearance. Empty when all repeated SMILES agree on their label.
    """
    # keep=False marks *every* occurrence of a repeated SMILES. With the
    # default keep='first' the first occurrence is excluded, so its label is
    # never compared with the later ones and a conflicting pair goes unnoticed.
    duplicate = df[df.duplicated(['smiles'], keep=False)]

    # Collect the set of labels seen for each repeated SMILES.
    dup_dict = {}
    for smiles, label in zip(duplicate['smiles'], duplicate['BBclass']):
        dup_dict.setdefault(smiles, set()).add(label)

    inconsistent = [smiles for smiles, labels in dup_dict.items() if len(labels) > 1]

    return inconsistent

In [8]:
# List the SMILES flagged by Check_duplicate_label on the combined frame and
# show how many there are.
inconsistent = Check_duplicate_label(df)
len(inconsistent)

0

In [9]:
# Collapse rows that share the same SMILES string (pandas keeps the first
# occurrence by default) and report the remaining row count.
df = df.drop_duplicates(subset='smiles')
print('After drop duplicates: ', len(df))

After drop duplicates:  4463


In [10]:
# Drop every row whose SMILES string also appears (exact string match) in
# dataset/train_set_2.csv, then report how many rows are left.
ss = pd.read_csv('dataset/train_set_2.csv')
df = df.loc[~df['smiles'].isin(ss['smiles'].tolist())]
print('After removing overlap with the train set : ', len(df))

After removing overlap with the train set :  2563


In [11]:
# Persist the filtered frame without its index column; the descriptor cell
# below reads this file back in.
df.to_csv('dataset/test_set_combined.csv',index=False)

### 2. Calculate 1D and 2D molecular descriptors

In [12]:
from rdkit.ML.Cluster import Butina
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator

# The first element of each Descriptors._descList entry is used as the
# descriptor name; collect them all and show how many there are.
Total_descriptors = [entry[0] for entry in Descriptors._descList]
len(Total_descriptors)

208

In [13]:
# One shared calculator configured with every name in Total_descriptors.
DescCalc = MolecularDescriptorCalculator(Total_descriptors)

def GetRDKitDescriptors(smile):
    """Calculate the ligand descriptors for a single SMILES string.

    Parameters
    ----------
    smile : str
        SMILES representation of the molecule.

    Returns
    -------
    The result of ``DescCalc.CalcDescriptors`` for the parsed molecule; the
    caller builds one DataFrame column per name in ``Total_descriptors`` from it.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smile`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smile)
    # MolFromSmiles signals a parse failure by returning None; fail here with
    # the offending input instead of an opaque AttributeError on the next line.
    if mol is None:
        raise ValueError('Could not parse SMILES: %r' % (smile,))
    mol.UpdatePropertyCache(strict=False)
    # Ring perception is run before the descriptor calculation.
    Chem.GetSymmSSSR(mol)
    return DescCalc.CalcDescriptors(mol)

In [14]:
df = pd.read_csv('dataset/test_set_combined.csv')

# One descriptor row per molecule, in the same order as the rows of df.
Features = [GetRDKitDescriptors(smile) for smile in df['smiles'].tolist()]

# Append the descriptor columns next to the original columns; both frames
# carry the default 0..n-1 index, so rows line up positionally.
ss = pd.DataFrame(Features, columns=Total_descriptors)
df = pd.concat([df, ss], axis=1, ignore_index=False)
df.to_csv('dataset/feature_set/test_set_combined.csv', index=False)

### 3. Calculate Morgan fingerprints

In [15]:
# Fingerprint length shared by Cal_fp's default and the column names below,
# so the two cannot drift apart.
MFP_NBITS = 2048

def Cal_fp(smile, radius=3, n_bits=MFP_NBITS):
    """Return the Morgan fingerprint of a SMILES string as a list of 0/1 ints.

    Parameters
    ----------
    smile : str
        SMILES representation of the molecule.
    radius : int, optional
        Morgan radius passed to RDKit (default 3).
    n_bits : int, optional
        Length of the bit vector (default ``MFP_NBITS``).

    Returns
    -------
    list of int
        ``n_bits`` integers, each 0 or 1, in bit order.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smile`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smile)
    # MolFromSmiles returns None on a parse failure; report the bad input
    # rather than letting the fingerprint call fail on a None argument.
    if mol is None:
        raise ValueError('Could not parse SMILES: %r' % (smile,))
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)

    # ToBitString yields a string of '0'/'1' characters; convert each to int.
    return list(map(int, fp.ToBitString()))

# Column names MFP_1 .. MFP_<MFP_NBITS>, one per fingerprint bit.
columns = ['MFP_%d'%(i+1) for i in range(MFP_NBITS)]

In [16]:
df = pd.read_csv('dataset/feature_set/test_set_combined.csv')

# One fingerprint row per molecule, in the same order as the rows of df.
Features = [Cal_fp(smile) for smile in df['smiles'].tolist()]

# Append the fingerprint bit columns next to the descriptor columns.
ss = pd.DataFrame(Features, columns=columns)
df = pd.concat([df, ss], axis=1, ignore_index=False)

df.to_csv('dataset/feature_set_all/test_set_combined.csv', index=False)