In [1]:
import sys, os
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, rdFMCS, Descriptors, PandasTools,SaltRemover
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
import matplotlib.pyplot as plt
%matplotlib inline
remover = SaltRemover.SaltRemover()

### 1. Check_smiles_and_remove_duplicates_for_train_set

In [2]:
df = pd.read_csv('dataset/LightBBB_dataset/y_test_indices.csv')
df.head(2)

Unnamed: 0,smiles,logBB,BBclass
0,c1cc(F)ccc1Cn(c(c23)cccc2)c(n3)[C@@H]4CCCN(C)C4,0.43,1
1,CC1CCN(CC1)C(=O)c(c2)ccc3n(CC=C)c(c4c23)CCN(C4...,-0.13,0


In [3]:
def GetLarge(string):
    List = string.split('.')
    List = sorted(List, key=lambda x:len(x), reverse=True)
    return List[0]

In [4]:
Check = []
new_smile = []
for i in df.smiles.tolist():
    try:
        mol = Chem.MolFromSmiles(GetLarge(i))
        if mol is None:
            Check.append(1)
            new_smile.append(0)
        else:
            Check.append(2)
            mol = remover.StripMol(mol, dontRemoveEverything=True)
            new_smile.append(GetLarge(Chem.MolToSmiles(mol)))
    except:
        Check.append(0)
        new_smile.append(0)

RDKit ERROR: [15:38:14] Explicit valence for atom # 1 N, 4, is greater than permitted
RDKit ERROR: [15:38:14] Explicit valence for atom # 6 N, 4, is greater than permitted
RDKit ERROR: [15:38:14] Explicit valence for atom # 6 N, 4, is greater than permitted
RDKit ERROR: [15:38:14] Explicit valence for atom # 11 N, 4, is greater than permitted
RDKit ERROR: [15:38:14] Explicit valence for atom # 12 N, 4, is greater than permitted
RDKit ERROR: [15:38:14] Explicit valence for atom # 5 N, 4, is greater than permitted
RDKit ERROR: [15:38:14] Explicit valence for atom # 5 N, 4, is greater than permitted
RDKit ERROR: [15:38:14] Explicit valence for atom # 5 N, 4, is greater than permitted
RDKit ERROR: [15:38:14] Explicit valence for atom # 5 N, 4, is greater than permitted
RDKit ERROR: [15:38:14] Explicit valence for atom # 5 N, 4, is greater than permitted
RDKit ERROR: [15:38:14] Explicit valence for atom # 5 N, 4, is greater than permitted
RDKit ERROR: [15:38:15] Explicit valence for atom # 

In [5]:
df['Check'] = pd.Series(Check, index=df.index)
df['new_smile'] = pd.Series(new_smile, index=df.index)
df = df[['logBB','BBclass','Check','new_smile']]
df = df.rename(columns={"new_smile": "smiles"})
df.head(2)

Unnamed: 0,logBB,BBclass,Check,smiles
0,0.43,1,2,CN1CCC[C@@H](c2nc3ccccc3n2Cc2ccc(F)cc2)C1
1,-0.13,0,2,C=CCn1c2c(c3cc(C(=O)N4CCC(C)CC4)ccc31)CN(CC1CC...


In [6]:
len(df), len(df.loc[df['Check']==2])

(7162, 7141)

In [7]:
df = df.loc[df['Check']==2]

In [8]:
def Check_duplicate_label(df):
    duplicate = df[df.duplicated(['smiles'])]
    dup_dict = {}
    
    for i,j in zip(duplicate['smiles'],duplicate['BBclass']):
        if i not in dup_dict.keys():
            dup_dict[i] = set()
            dup_dict[i].add(j)
        else:
            dup_dict[i].add(j)
            
    inconsistent = [i for i, j in dup_dict.items() if len(j) > 1 ]
    
    return inconsistent

In [9]:
inconsistent = Check_duplicate_label(df)
len(inconsistent)

77

for example

In [10]:
df.loc[df['smiles']==inconsistent[-1]]

Unnamed: 0,logBB,BBclass,Check,smiles
1791,,0,2,CCCCc1nc(Cl)c(CO)n1Cc1ccc(-c2ccccc2-c2nnn[nH]2...
5222,,1,2,CCCCc1nc(Cl)c(CO)n1Cc1ccc(-c2ccccc2-c2nnn[nH]2...
6220,,0,2,CCCCc1nc(Cl)c(CO)n1Cc1ccc(-c2ccccc2-c2nnn[nH]2...


In [11]:
df.loc[df['smiles']==inconsistent[-2]]

Unnamed: 0,logBB,BBclass,Check,smiles
2416,,1,2,C[C@H]1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)...
4743,,1,2,C[C@H]1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)...
6216,,0,2,C[C@H]1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)...


So, drop duplicates and inconsistent cases

In [12]:
df = df.drop_duplicates('smiles')
print ('After drop duplicates: ',len(df))
df = df.loc[~df['smiles'].isin(inconsistent)]
print ('After drop inconsistent cases: ',len(df))

After drop duplicates:  4507
After drop inconsistent cases:  4430


In [13]:
df.to_csv('dataset/train_set.csv',index=False)

### 2. Similar curation for test set

In [14]:
df = pd.read_csv('dataset/LightBBB_dataset/y_indices_external.csv')
df.head(2)

Unnamed: 0,smiles,logBB,BBclass
0,CCCN(CCC)CCc1ccc(c2c1CC(N2)=C)O,,1
1,c12c(C(c3ccccc3)=NCc3n1c(nn3)C)cc(Cl)cc2,,1


In [15]:
Check = []
new_smile = []
for i in df.smiles.tolist():
    try:
        mol = Chem.MolFromSmiles(GetLarge(i))
        if mol is None:
            Check.append(1)
            new_smile.append(0)
        else:
            Check.append(2)
            mol = remover.StripMol(mol, dontRemoveEverything=True)
            new_smile.append(GetLarge(Chem.MolToSmiles(mol)))
    except:
        Check.append(0)
        new_smile.append(0)

RDKit ERROR: [15:38:31] Explicit valence for atom # 12 N, 4, is greater than permitted
RDKit ERROR: [15:38:31] Explicit valence for atom # 4 O, 3, is greater than permitted
RDKit ERROR: [15:38:31] Explicit valence for atom # 13 O, 3, is greater than permitted
RDKit ERROR: [15:38:31] Explicit valence for atom # 14 N, 4, is greater than permitted


In [16]:
df['Check'] = pd.Series(Check, index=df.index)
df['new_smile'] = pd.Series(new_smile, index=df.index)
df = df[['logBB','BBclass','Check','new_smile']]
df = df.rename(columns={"new_smile": "smiles"})
print ('Before check smiles:', len(df))
df = df.loc[df['Check']==2]
print ('After remove invalid smiles:', len(df))

Before check smiles: 74
After remove invalid smiles: 70


In [17]:
inconsistent = Check_duplicate_label(df)
len(inconsistent)

0

In [18]:
df = df.drop_duplicates('smiles')
len(df)

70

In [19]:
df.head(2)

Unnamed: 0,logBB,BBclass,Check,smiles
0,,1,2,C=C1Cc2c(CCN(CCC)CCC)ccc(O)c2N1
1,,1,2,Cc1nnc2n1-c1ccc(Cl)cc1C(c1ccccc1)=NC2


remove overlaps with train set

In [20]:
ss = pd.read_csv('dataset/train_set.csv')
df = df.loc[~df['smiles'].isin(ss.smiles.tolist())]
len(df)

33

In [21]:
df.to_csv('dataset/test_set.csv',index=False)