In [16]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from hurry.filesize import size
from rdkit import DataStructs
from rdkit.Chem import Draw, MolFromSmiles, AllChem
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [3]:
# Alternative input kept for reference (product SMILES live in the 'prod' column there):
# df_train = pd.read_csv('template_matching/MIT_train.txt')
# df_test = pd.read_csv('template_matching/MIT_test.txt')
# df_train['mol'] = df_train['prod'].apply(lambda x: MolFromSmiles(x))
# df_test['mol'] = df_test['prod'].apply(lambda x: MolFromSmiles(x))

# Read the pre-split USPTO_15k target files; each provides a 'SMILES' column.
df_train = pd.read_csv('~/ml_physics/MolecularTransformer2/data/data/USPTO_15k/clean-traintgt.txt')
df_test = pd.read_csv('~/ml_physics/MolecularTransformer2/data/data/USPTO_15k/clean-testtgt.txt')

# Parse every SMILES string into an RDKit molecule, stored alongside the raw string.
df_train['mol'] = [MolFromSmiles(smiles) for smiles in df_train['SMILES']]
df_test['mol'] = [MolFromSmiles(smiles) for smiles in df_test['SMILES']]

# Morgan bit-vector fingerprints (radius 3, 1024 bits), in the same order as the frame rows.
fprints_train = [
    AllChem.GetMorganFingerprintAsBitVect(molecule, radius=3, nBits=1024)
    for molecule in df_train['mol']
]
fprints_test = [
    AllChem.GetMorganFingerprintAsBitVect(molecule, radius=3, nBits=1024)
    for molecule in df_test['mol']
]

In [20]:
# Pairwise fingerprint similarity between the two sets:
# S_matrix[i, j] compares training fingerprint i with test fingerprint j.
# A test molecule counts as "bad" when any training molecule exceeds the cutoff.
similarity_cutoff = 0.6

S_matrix = np.empty((len(fprints_train), len(fprints_test)))
bad_index_count = np.zeros(len(fprints_test))  # 1 where test molecule j has a too-similar training molecule
bad_indices = []  # (train position, test position) of every pair above the cutoff

for i, fp_train in enumerate(tqdm(fprints_train)):
    # One bulk call per training fingerprint replaces the inner Python loop and
    # avoids computing every similarity twice. Tanimoto is the default metric
    # of DataStructs.FingerprintSimilarity, so the values are unchanged.
    row = np.asarray(DataStructs.BulkTanimotoSimilarity(fp_train, fprints_test), dtype=float)
    S_matrix[i] = row

    hits = np.flatnonzero(row > similarity_cutoff)
    bad_index_count[hits] = 1
    bad_indices.extend((i, int(j)) for j in hits)

print('S_matrix size: {}'.format(size(S_matrix.nbytes)))

100%|██████████| 9236/9236 [02:20<00:00, 65.97it/s]

S_matrix size: 183M





In [21]:
# Share of test molecules flagged as having at least one too-similar training molecule.
print('bad_indices: {:.1f}%'.format(100*np.sum(bad_index_count)/len(bad_index_count)))

print('\nExamples:')
# random.sample raises ValueError when asked for more items than the population
# holds, so cap the number of examples at the number of flagged pairs.
n_examples = min(100, len(bad_indices))
for i, j in random.sample(bad_indices, n_examples):
    # (i, j) are positions in the fingerprint lists, so look rows up by position
    # rather than by index label.
    print(df_train['SMILES'].iloc[i], df_test['SMILES'].iloc[j])

bad_indices: 37.1%

Examples:
Cc1nn(C)cc1-n1c(=O)n(C)c2cnc3ccc(-c4cnc5ccccc5c4)cc3c21 Cc1ncc(-c2ccc3ncc4c(c3c2)n(-c2cn(C)nc2C)c(=O)n4C)cc1N(C)C
O=C(NCc1cn(-c2ccccc2)c2cc(Cl)ccc2c1=O)N1CCOCC1 O=C(NCc1cn(-c2ccccc2)c2cc(Cl)ccc2c1=O)N1CCC(c2ccccc2)CC1
COCCCON1C(=O)c2ccccc2C1=O COCCON1C(=O)c2ccccc2C1=O
Nc1ncnn2c(-c3cccc(CNC4CCC4)c3)cc(-c3ccc4cn(Cc5ccccc5)nc4c3)c12 Nc1ncnn2c(-c3ccc(CN4CCC(F)(F)C4)cc3)cc(-c3ccc4cn(Cc5ccccc5)nc4c3)c12
CN(C(=O)N(C)C1CN(C(=O)CC#N)CC1c1ccc(F)cc1)c1cc(C(F)(F)F)cc(C(F)(F)F)c1 CN(C(=O)N(C)C1CN(C(=O)C2CC(F)(F)C2)CC1c1ccc(F)cc1)c1cc(C(F)(F)F)cc(C(F)(F)F)c1
CC(C)(C)c1cc2cc(NC(=O)C3(c4ccc5c(c4)OCO5)CC3)cc(C(N)=O)c2[nH]1 COC(=O)c1cc(NC(=O)C2(c3ccc4c(c3)OCO4)CC2)cc2cc(C(C)(C)C)[nH]c12
CC(=O)Nc1cc(-c2ccc3ncc4c(c3c2)n(-c2cn(C)nc2C)c(=O)n4C)cnc1Cl Cc1nn(C)cc1-n1c(=O)n(C)c2cnc3ccc(-c4cnc5cc[nH]c5c4)cc3c21
COC(=O)CCC(C(N)=O)N1Cc2c(OCc3ccc(CN4CCC(OC)CC4)cc3)cccc2C1=O COC(=O)CCC(C(N)=O)N1Cc2c(OCc3ccc(Cn4cnnc4)cc3)cccc2C1=O
CCOc1cc(C(C)(C)C)ncc1C1=NC(C)(c2ccc(Cl)cc2)C(C)(c2ccc(Cl

In [None]:
# Heat map of the similarity matrix (rows: training set, columns: test set).
# matplotlib.pyplot is already imported as plt in the notebook's import cell.
fig = plt.figure(figsize=(50, 50))
# Draw into the figure created above; a hard-coded fignum=1 would target a
# different figure whenever figure 1 is not the one just created.
plt.matshow(S_matrix, fignum=fig.number)
plt.xlabel('test molecule index')
plt.ylabel('train molecule index')
plt.title('Fingerprint similarity: train vs. test')
plt.colorbar(fraction=0.046, pad=0.04);

In [44]:
# Read the unsplit USPTO_15k target file; it provides a 'SMILES' column.
df = pd.read_csv('~/ml_physics/MolecularTransformer2/data/data/USPTO_15k/tgt.txt')

# Parse every SMILES string into an RDKit molecule.
df['mol'] = [MolFromSmiles(smiles) for smiles in df['SMILES']]

# Morgan bit-vector fingerprints (radius 3, 1024 bits), one per row of df.
fprints = [
    AllChem.GetMorganFingerprintAsBitVect(molecule, radius=3, nBits=1024)
    for molecule in df['mol']
]

In [56]:
# Random 80/20 split of the full dataset. The seed is fixed so that the split,
# and every number derived from it below, is reproducible on re-run.
# train_test_split is imported in the notebook's import cell.
SPLIT_SEED = 42
df_train, df_test = train_test_split(df, test_size=0.20, random_state=SPLIT_SEED)

# Morgan bit-vector fingerprints (radius 3, 1024 bits), in the row order of each split.
fprints_train = [AllChem.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=1024) for mol in df_train['mol'].values]
fprints_test = [AllChem.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=1024) for mol in df_test['mol'].values]

# S_matrix[i, j] is the similarity of training fingerprint i to test fingerprint j.
S_matrix = np.empty((len(fprints_train), len(fprints_test)))
for i, fp_train in enumerate(tqdm(fprints_train)):
    # One bulk call fills a whole row. Tanimoto is the default metric of
    # DataStructs.FingerprintSimilarity, so the values match the pairwise loop.
    S_matrix[i] = DataStructs.BulkTanimotoSimilarity(fp_train, fprints_test)

100%|██████████| 9472/9472 [01:09<00:00, 136.70it/s]


In [61]:
# Similarity cutoff used to decide whether a test molecule is too close to the training set.
sim = 0.5

# For every test column: does any training row exceed the cutoff?
exceeds_cutoff = (S_matrix > sim).any(axis=0)
flagged_percentage = 100 * np.count_nonzero(exceeds_cutoff) / len(S_matrix[0])

print('Percentage of test set with similarity > {}: {:.1f}%'.format(sim, flagged_percentage))

Percentage of test set with similarity > 0.5: 52.3%


In [62]:
print('Original train/test ratio: {:.1f}%/{:.1f}%'.format(100*len(df_train)/len(df), 100*len(df_test)/len(df)))

# Boolean mask over test rows (by position): True where at least one training
# molecule has similarity above `sim`. Computed once and reused for both halves.
too_similar = np.any(S_matrix > sim, axis=0)

# Move the too-similar test rows into the training set and keep the rest as the new test set.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_train_new = pd.concat([df_train, df_test.iloc[too_similar]])
df_test_new = df_test.iloc[~too_similar]

print('Post-split train/test ratio: {:.1f}%/{:.1f}%'.format(100*len(df_train_new)/len(df), 100*len(df_test_new)/len(df)))

Original train/test ratio: 80.0%/20.0%
Post-split train/test ratio: 90.5%/9.5%
