In [1]:
import sys, os
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, rdFMCS, Descriptors, PandasTools,SaltRemover
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
import matplotlib.pyplot as plt
%matplotlib inline
# Salt stripper built with no arguments (RDKit's defaults); used later when cleaning SMILES.
remover = SaltRemover.SaltRemover()

### 1. Check SMILES and remove duplicates for the test set

In [2]:
def GetLarge(string):
    """Return the longest '.'-separated piece of `string`.

    Length is measured in characters. When several pieces tie for the
    longest, the one that appears first in `string` is returned.
    """
    fragments = string.split('.')
    return max(fragments, key=len)

In [3]:
def Check_duplicate_label(df):
    """Find SMILES that occur more than once with conflicting `BBclass` labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``smiles`` and ``BBclass``.

    Returns
    -------
    list
        The SMILES strings that appear on at least two rows and whose rows do
        not all share the same ``BBclass`` value, in order of first appearance.
    """
    # keep=False flags *every* row of a repeated SMILES. The default
    # (keep='first') leaves the first occurrence out, so a molecule listed
    # twice with labels 0 and 1 would contribute only one label and never be
    # reported as inconsistent.
    duplicate = df[df.duplicated(['smiles'], keep=False)]

    labels_by_smiles = {}
    for smi, label in zip(duplicate['smiles'], duplicate['BBclass']):
        labels_by_smiles.setdefault(smi, set()).add(label)

    return [smi for smi, labels in labels_by_smiles.items() if len(labels) > 1]

In [4]:
# Load the external set; the preview below shows the columns `smiles`, `logBB` and `BBclass`.
df = pd.read_csv('dataset/LightBBB_dataset/y_indices_external.csv')
# Preview the first two rows.
df.head(2)

Unnamed: 0,smiles,logBB,BBclass
0,CCCN(CCC)CCc1ccc(c2c1CC(N2)=C)O,,1
1,c12c(C(c3ccccc3)=NCc3n1c(nn3)C)cc(Cl)cc2,,1


In [5]:
# Validate and clean every SMILES in `df`. Two parallel lists are built with
# exactly one entry per input row:
#   Check[i]     - 0: an exception was raised while processing the row
#                  1: RDKit could not parse the SMILES (MolFromSmiles returned None)
#                  2: the SMILES was parsed and cleaned successfully
#   new_smile[i] - the cleaned SMILES when Check[i] == 2, otherwise the placeholder 0
Check = []
new_smile = []
for raw_smiles in df.smiles.tolist():
    try:
        # Parse only the longest '.'-separated fragment of the input.
        mol = Chem.MolFromSmiles(GetLarge(raw_smiles))
        if mol is None:
            status, cleaned = 1, 0
        else:
            mol = remover.StripMol(mol, dontRemoveEverything=True)
            cleaned = GetLarge(Chem.MolToSmiles(mol))
            status = 2
    except Exception:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop the loop.
        status, cleaned = 0, 0
    # Append once, after all fallible work, so a failure part-way through a
    # row can never leave `Check` and `new_smile` with different lengths.
    Check.append(status)
    new_smile.append(cleaned)

RDKit ERROR: [00:25:16] Explicit valence for atom # 12 N, 4, is greater than permitted
RDKit ERROR: [00:25:16] Explicit valence for atom # 4 O, 3, is greater than permitted
RDKit ERROR: [00:25:16] Explicit valence for atom # 13 O, 3, is greater than permitted
RDKit ERROR: [00:25:16] Explicit valence for atom # 14 N, 4, is greater than permitted


In [6]:
# Attach the per-row status and cleaned SMILES, keep only the columns used
# afterwards, and let the cleaned SMILES take over the `smiles` column name.
df = (
    df.assign(
        Check=pd.Series(Check, index=df.index),
        new_smile=pd.Series(new_smile, index=df.index),
    )[['logBB', 'BBclass', 'Check', 'new_smile']]
    .rename(columns={"new_smile": "smiles"})
)
print ('Before check smiles:', len(df))
# Only rows whose status code is 2 are kept.
df = df[df['Check'].eq(2)]
print ('After remove invalid smiles:', len(df))

Before check smiles: 74
After remove invalid smiles: 70


In [7]:
# SMILES reported by Check_duplicate_label as having conflicting BBclass labels.
inconsistent = Check_duplicate_label(df)
len(inconsistent)

0

In [8]:
# Keep one row per SMILES (drop_duplicates retains the first occurrence by default).
df = df.drop_duplicates('smiles')
len(df)

70

Remove molecules that also appear in `train_set_2.csv`

In [9]:
# Drop every test row whose SMILES string also occurs in the training set.
# The comparison is an exact string match on the `smiles` columns.
ss = pd.read_csv('dataset/train_set_2.csv')
df = df[~df['smiles'].isin(ss['smiles'])]
len(df)

44

In [10]:
# Save the filtered test set; the row index is not written.
df.to_csv('dataset/test_set_2.csv',index=False)

### 2. Calculate 1D and 2D molecular descriptors

In [11]:
from rdkit.ML.Cluster import Butina
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator

# Take the first element of every entry in RDKit's descriptor list; these values
# are used below to build the calculator and as feature column names.
# NOTE(review): `_descList` is an underscore-prefixed attribute — confirm it is stable
# across the RDKit versions this notebook targets.
Total_descriptors = [x[0] for x in Descriptors._descList]
len(Total_descriptors)

208

In [12]:
# One calculator instance, built once and reused by GetRDKitDescriptors for every molecule.
DescCalc = MolecularDescriptorCalculator(Total_descriptors)

def GetRDKitDescriptors(smile):
    """Calculate the RDKit descriptors of one molecule given as a SMILES string.

    Parameters
    ----------
    smile : str
        SMILES string of the molecule.

    Returns
    -------
    The result of ``DescCalc.CalcDescriptors`` for the parsed molecule.

    Raises
    ------
    ValueError
        If RDKit cannot parse `smile`.
    """
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with the offending SMILES instead of an AttributeError further down.
        raise ValueError('Invalid SMILES: %r' % (smile,))
    mol.UpdatePropertyCache(strict=False)
    Chem.GetSymmSSSR(mol)
    return DescCalc.CalcDescriptors(mol)

In [13]:
# Compute the descriptor vector of every molecule in the saved test set and
# write it out next to the original columns.
df = pd.read_csv('dataset/test_set_2.csv')

Features = [GetRDKitDescriptors(smi) for smi in df.smiles.tolist()]

ss = pd.DataFrame(Features, columns=Total_descriptors)
df = pd.concat([df, ss], axis=1)
df.to_csv('dataset/feature_set/test_set_2.csv', index=False)

### 3. Calculate Morgan fingerprints

In [14]:
# Morgan fingerprint settings, shared by `Cal_fp` and the feature column names
# so that the fingerprint length and the number of columns cannot drift apart.
MORGAN_RADIUS = 3
MORGAN_NBITS = 2048

def Cal_fp(smile, radius=MORGAN_RADIUS, nBits=MORGAN_NBITS):
    """Return the Morgan fingerprint of a SMILES string as a list of 0/1 ints.

    Parameters
    ----------
    smile : str
        SMILES string of the molecule.
    radius : int, optional
        Morgan radius passed to RDKit (default 3, the value used originally).
    nBits : int, optional
        Length of the bit vector (default 2048, matching `columns`).

    Returns
    -------
    list of int
        One entry per fingerprint bit, each 0 or 1.

    Raises
    ------
    ValueError
        If RDKit cannot parse `smile`.
    """
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        # MolFromSmiles returns None on a parse failure; report the SMILES
        # rather than letting the fingerprint call fail on a None molecule.
        raise ValueError('Invalid SMILES: %r' % (smile,))
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)

    return list(map(int, fp.ToBitString()))

# Column names MFP_1 ... MFP_<MORGAN_NBITS>, one per fingerprint bit.
columns = ['MFP_%d' % (i + 1) for i in range(MORGAN_NBITS)]

In [15]:
# Append one fingerprint column per bit to the descriptor table and save the
# combined feature set.
df = pd.read_csv('dataset/feature_set/test_set_2.csv')

Features = [Cal_fp(smi) for smi in df.smiles.tolist()]
ss = pd.DataFrame(Features, columns=columns)
df = pd.concat([df, ss], axis=1)

df.to_csv('dataset/feature_set_all/test_set_2.csv', index=False)