In [1]:
import sys, os
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, rdFMCS, Descriptors, PandasTools,SaltRemover
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
import matplotlib.pyplot as plt
%matplotlib inline
# Shared salt-stripping helper, constructed with no arguments (library defaults).
remover = SaltRemover.SaltRemover()

In [2]:
# Load the B3DB classification table (tab-separated, header on the first row)
# and preview its first two rows.
df = pd.read_table('dataset/B3DB_dataset/B3DB_classification.tsv', header=0)
df.iloc[:2]

Unnamed: 0,NO.,compound_name,IUPAC_name,SMILES,CID,logBB,BBB+/BBB-,Inchi,threshold,reference,group,comments
0,1,sulphasalazine,2-hydroxy-5-[[4-(pyridin-2-ylsulfamoyl)phenyl]...,O=C(O)c1cc(N=Nc2ccc(S(=O)(=O)Nc3ccccn3)cc2)ccc1O,5339.0,-2.69,BBB-,InChI=1S/C18H14N4O5S/c23-16-9-6-13(11-15(16)18...,,R2|R2|R25|R46|,A,
1,2,moxalactam,7-[[2-carboxy-2-(4-hydroxyphenyl)acetyl]amino]...,COC1(NC(=O)C(C(=O)O)c2ccc(O)cc2)C(=O)N2C(C(=O)...,3889.0,-2.52,BBB-,InChI=1S/C20H20N6O9S/c1-25-19(22-23-24-25)36-8...,,R25|,A,


In [3]:
# Number of rows in the loaded table.
len(df)

7807

In [4]:
# Binary label: 'BBB-' is encoded as 0, every other value as 1.
df['BBclass'] = (df['BBB+/BBB-'] != 'BBB-').astype('int64')
df['BBclass'].value_counts()

1    4956
0    2851
Name: BBclass, dtype: int64

In [5]:
# How many rows carry a non-null logBB value.
int(df['logBB'].notna().sum())

1058

In [6]:
# Keep only the structure column (renamed to lowercase 'smiles') and the two targets.
df = df[['SMILES', 'logBB', 'BBclass']].rename(columns={"SMILES": "smiles"})

### 1. Check SMILES and remove duplicates for the test set

In [7]:
def GetLarge(string):
    """Return the longest dot-separated component of ``string``.

    Components are compared by their character count; when several share the
    maximum length, the one that appears first is returned.
    """
    return max(string.split('.'), key=len)

In [8]:
def Check_duplicate_label(df):
    """Find SMILES that occur more than once with conflicting class labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'smiles' column and a 'BBclass' column.

    Returns
    -------
    list
        The SMILES values, in order of first appearance, whose rows carry
        more than one distinct 'BBclass' value. Empty when every repeated
        SMILES is labelled consistently.
    """
    # keep=False flags every member of a duplicate group, including the first
    # occurrence. With the default keep='first' the first row is left out, so
    # a SMILES seen exactly twice with opposite labels would never be caught.
    duplicate = df[df.duplicated(['smiles'], keep=False)]

    dup_dict = {}
    for smiles, label in zip(duplicate['smiles'], duplicate['BBclass']):
        dup_dict.setdefault(smiles, set()).add(label)

    return [smiles for smiles, labels in dup_dict.items() if len(labels) > 1]

In [9]:
# Validate every SMILES and rebuild it from the parsed, salt-stripped molecule.
# Check codes: 0 = an exception was raised while processing the row,
#              1 = Chem.MolFromSmiles returned None,
#              2 = processed successfully.
# Rows that do not reach code 2 get the placeholder 0 in new_smile.
Check = []
new_smile = []
for raw_smiles in df.smiles.tolist():
    try:
        mol = Chem.MolFromSmiles(GetLarge(raw_smiles))
        if mol is None:
            status, cleaned = 1, 0
        else:
            mol = remover.StripMol(mol, dontRemoveEverything=True)
            cleaned = GetLarge(Chem.MolToSmiles(mol))
            status = 2
    except Exception:
        # Catch ordinary errors only, so Ctrl-C / SystemExit still stop the loop.
        status, cleaned = 0, 0
    # Append exactly once per row. Recording the status before the stripping
    # step could leave Check one entry longer than new_smile if that step raised.
    Check.append(status)
    new_smile.append(cleaned)

In [10]:
# Attach the validation results, replace the raw SMILES column with the
# cleaned one, then keep only the rows whose Check code is 2.
df = (
    df.assign(Check=Check, new_smile=new_smile)
      [['logBB', 'BBclass', 'Check', 'new_smile']]
      .rename(columns={"new_smile": "smiles"})
)
print ('Before check smiles:', len(df))
df = df[df['Check'] == 2]
print ('After remove invalid smiles:', len(df))

Before check smiles: 7807
After remove invalid smiles: 7807


In [11]:
# SMILES that Check_duplicate_label reports as having more than one distinct BBclass.
inconsistent = Check_duplicate_label(df)
# Display how many such SMILES were found.
len(inconsistent)

0

In [12]:
# Keep the first row for each distinct SMILES and show the remaining row count.
df = df.loc[~df.duplicated('smiles')]
df.shape[0]

7804

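Remove every compound whose SMILES already appears in `train_set_2.csv` or `train_set.csv`, so that this test set does not overlap with the training data.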

In [13]:
# Filter out rows whose SMILES string is present in either training file,
# printing the number of rows left after each filter.
s1 = pd.read_csv('dataset/train_set_2.csv')
df = df[~df['smiles'].isin(s1['smiles'])]
print(len(df))
s2 = pd.read_csv('dataset/train_set.csv')
df = df[~df['smiles'].isin(s2['smiles'])]
print(len(df))

6083
3806


In [14]:
# Subset of rows that have a non-null logBB value, and its size.
df1 = df.dropna(subset=['logBB'])
df1.shape[0]

313

In [15]:
# Write the logBB subset to disk; index=False leaves the row index out of the file.
df1.to_csv('dataset/test_set_6.csv',index=False)

### 2. Calculate 1D and 2D molecular descriptors

In [16]:
from rdkit.ML.Cluster import Butina
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator

# Collect the name (first element) of every entry in RDKit's descriptor list.
Total_descriptors = [name for name, _ in Descriptors._descList]
len(Total_descriptors)

208

In [17]:
# One calculator shared by every call, configured with all descriptor names.
DescCalc = MolecularDescriptorCalculator(Total_descriptors)

def GetRDKitDescriptors(smile):
    """Calculate the ligand descriptors in ``Total_descriptors`` for one SMILES.

    Parameters
    ----------
    smile : str
        SMILES string of the molecule.

    Returns
    -------
    The result of ``DescCalc.CalcDescriptors`` for the parsed molecule
    (presumably one value per name in ``Total_descriptors``, in that order).

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smile`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        # Fail with the offending input instead of an AttributeError on None.
        raise ValueError('RDKit could not parse SMILES: %r' % (smile,))
    # Refresh cached properties and ring information before the calculation.
    mol.UpdatePropertyCache(strict=False)
    Chem.GetSymmSSSR(mol)
    return DescCalc.CalcDescriptors(mol)

In [18]:
# Reload the saved test set, compute the RDKit descriptors for each SMILES,
# append them as extra columns, and save the combined table.
df = pd.read_csv('dataset/test_set_6.csv')

Features = [GetRDKitDescriptors(smi) for smi in df.smiles.tolist()]

ss = pd.DataFrame(Features, columns=Total_descriptors)
df = pd.concat([df, ss], axis=1, ignore_index=False)
df.to_csv('dataset/feature_set/test_set_6.csv', index=False)

### 3. Calculate Morgan fingerprints

In [19]:
# Fingerprint length. It drives both the bit vector size requested in Cal_fp
# and the number of MFP_* column names, so the two cannot drift apart.
N_BITS = 2048

def Cal_fp(smile, radius=3, n_bits=N_BITS):
    """Return the Morgan fingerprint of a SMILES string as a list of 0/1 ints.

    Parameters
    ----------
    smile : str
        SMILES string of the molecule.
    radius : int, optional
        Morgan radius passed to RDKit (default 3).
    n_bits : int, optional
        Length of the bit vector (default ``N_BITS``).

    Returns
    -------
    list of int
        One 0/1 entry per character of the fingerprint's bit string.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smile`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        raise ValueError('RDKit could not parse SMILES: %r' % (smile,))
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)

    return list(map(int, fp.ToBitString()))

columns = ['MFP_%d' % (i + 1) for i in range(N_BITS)]

In [20]:
# Reload the descriptor table, compute the fingerprint bits for each SMILES,
# append them as MFP_* columns, and save the full feature table.
df = pd.read_csv('dataset/feature_set/test_set_6.csv')

Features = [Cal_fp(smi) for smi in df.smiles.tolist()]
ss = pd.DataFrame(Features, columns=columns)
df = pd.concat([df, ss], axis=1, ignore_index=False)

df.to_csv('dataset/feature_set_all/test_set_6.csv', index=False)

In [26]:
# Class balance of the final test set.
df['BBclass'].value_counts()

1    263
0     50
Name: BBclass, dtype: int64