In [26]:
import pandas as pd
import numpy as np

# Read the order-results table and the clustering table (both gzipped CSVs).
res_df, cluster_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../datasets/ams_order_results.csv.gz',
                     '../datasets/clustering.csv.gz')
)

In [27]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.SaltRemover import SaltRemover
from rdkit.Chem.FilterCatalog import *

# Morgan fingerprint settings; they match the hard-coded column name
# '1024 MorganFP Radius 2' written below.
FP_radius=2
FP_size=1024
saltRemover = SaltRemover(defnFilename='../datasets/raw/Salts.txt')

# Parse every SMILES string. Chem.MolFromSmiles returns None (instead of
# raising) for strings it cannot parse -- including the literal 'nan' that
# astype(str) produces for missing values -- so fail early with a clear message
# rather than letting StripMol choke on a None further down.
rdkit_mols = res_df['rdkit SMILES'].astype(str).apply(Chem.MolFromSmiles)
unparsed = rdkit_mols.isnull()
if unparsed.any():
    raise ValueError(
        'Could not parse {} SMILES string(s); first offending index labels: {}'.format(
            int(unparsed.sum()), unparsed[unparsed].index.tolist()[:10]))

# Strip salts, then overwrite the SMILES column with the SMILES of the stripped
# molecules and store the Morgan fingerprint as a '0'/'1' bit string.
rdkit_mols = rdkit_mols.apply(saltRemover.StripMol)
res_df['rdkit SMILES'] = rdkit_mols.apply(Chem.MolToSmiles)
res_df['1024 MorganFP Radius 2'] = rdkit_mols.apply(
    lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol,
                                                      radius=FP_radius,
                                                      nBits=FP_size).ToBitString())

# Build a filter catalog containing the PAINS A, B and C sets.
params = FilterCatalogParams()
for pains_set in (FilterCatalogParams.FilterCatalogs.PAINS_A,
                  FilterCatalogParams.FilterCatalogs.PAINS_B,
                  FilterCatalogParams.FilterCatalogs.PAINS_C):
    params.AddCatalog(pains_set)
pains_catalog = FilterCatalog(params)

# 'PAINS Filter' is 1 when the molecule matches NO PAINS pattern, 0 otherwise.
res_df['PAINS Filter'] = rdkit_mols.apply(lambda mol: not pains_catalog.HasMatch(mol)).astype(int)

# NOTE: this overwrites the input file that res_df was loaded from.
res_df.to_csv('../datasets/ams_order_results.csv.gz', index=False)

In [3]:
# Give res_df the identifier columns under the names the clustering table uses.
for target_col, source_col in (('Molecule ID', 'Structure_ID'), ('SMSSF ID', 'SMSF')):
    res_df[target_col] = res_df[source_col]
res_df['Index ID'] = np.arange(len(res_df))

# Drop the last 1041 rows of the clustering table and append res_df restricted
# to the clustering table's columns.
# NOTE(review): 1041 is hard-coded; presumably it is the number of previously
# appended result rows -- confirm before re-running, this step is not idempotent.
kept_rows = cluster_df.iloc[:-1041, :]
new_rows = res_df[list(kept_rows.columns)]
cluster_df = pd.concat([kept_rows, new_rows])

# Renumber 'Index ID' over the combined table and overwrite the clustering file.
cluster_df['Index ID'] = np.arange(len(cluster_df))
cluster_df.to_csv('../datasets/clustering.csv.gz', index=False)

In [16]:
import pandas as pd
import numpy as np

# Reload the order-results table from disk (replaces any in-memory res_df).
res_df = pd.read_csv('../datasets/ams_order_results.csv.gz')

In [17]:
# Per-replicate activity flags: 1 when the replicate value is >= 50.0.
res_df['Replicate1 Filter >= 50.0'] = (res_df['Replicate1'] >= 50.0).astype(int)
res_df['Replicate2 Filter >= 50.0'] = (res_df['Replicate2'] >= 50.0).astype(int)

# A compound is a hit only if both replicate flags and the PAINS flag are set.
res_df['Hit'] = (res_df['Replicate1 Filter >= 50.0'].astype(bool) & res_df['Replicate2 Filter >= 50.0'].astype(bool) & res_df['PAINS Filter'].astype(bool)).astype(int)

# Move the filter/label columns to the end BY NAME. The previous positional
# `columns[:-3] + [...]` depended on where the columns happened to sit: with the
# three new columns freshly appended it removed 'Hit' from the frame and
# selected 'PAINS Filter' twice.
trailing_cols = ['Replicate1 Filter >= 50.0', 'Replicate2 Filter >= 50.0', 'PAINS Filter', 'Hit']
leading_cols = [col for col in res_df.columns if col not in trailing_cols]
res_df = res_df[leading_cols + trailing_cols]

# NOTE: this overwrites the file res_df was loaded from.
res_df.to_csv('../datasets/ams_order_results.csv.gz', index=False)

In [8]:
import pandas as pd
import numpy as np

# Reload the results and make the filter/label columns the last four columns,
# in a fixed order.
res_df = pd.read_csv('../datasets/ams_order_results.csv.gz')

# Select by name rather than with a positional `columns[:-4]` slice, which
# silently drops or duplicates columns whenever the last four columns are not
# exactly these four.
trailing_cols = ['Replicate1 Filter >= 50.0', 'Replicate2 Filter >= 50.0', 'PAINS Filter', 'Hit']
leading_cols = [col for col in res_df.columns if col not in trailing_cols]
res_df = res_df[leading_cols + trailing_cols]

# NOTE: this overwrites the file res_df was loaded from.
res_df.to_csv('../datasets/ams_order_results.csv.gz', index=False)

In [1]:
import pandas as pd
import numpy as np
from scipy.spatial import distance

# Load the results, the training fold and the clustering table.
res_df = pd.read_csv('../datasets/ams_order_results.csv.gz')
train_df = pd.read_csv('../datasets/folds/training_df_single_fold_with_clustering.csv.gz')
cluster_df = pd.read_csv('../datasets/clustering.csv.gz')

# Decode each fingerprint bit string into a row of 0.0/1.0 values: view the
# ASCII bytes as uint8 and subtract the code of '0'. np.frombuffer replaces the
# deprecated binary mode of np.fromstring and yields the same bytes.
X = np.vstack([np.frombuffer(fp.encode('ascii'), dtype='u1') - ord('0')
               for fp in cluster_df['1024 MorganFP Radius 2']]).astype(float)

  import sys


In [29]:
def _load_cluster_vectors(threshold, n_rows):
    """Open the read-only int32 cluster-assignment and cluster-leader vectors
    stored for one threshold (given as a string such as '0.2').

    Returns a (assignment, leader) pair of np.memmap arrays of shape (n_rows,).
    """
    path_template = '../datasets/prospective_clustering/cluster_{}_vector_{}.dat'
    # 'assigment' (sic) is the spelling used by the files on disk.
    return tuple(np.memmap(path_template.format(kind, threshold),
                           mode='r', dtype='int32', shape=(n_rows,))
                 for kind in ('assigment', 'leader_idx'))


# The vectors have one entry per row of cluster_df; the training rows come
# first and the res_df rows are the trailing block.
n_total = cluster_df.shape[0]
n_train = n_total - res_df.shape[0]
if n_train != train_df.shape[0]:
    raise ValueError('expected {} training rows ahead of the {} result rows, but train_df has {}'.format(
        n_train, res_df.shape[0], train_df.shape[0]))

# Names kept because later cells use cluster_2 / cluster_3 / cluster_4 directly.
cluster_2, cluster_2l = _load_cluster_vectors('0.2', n_total)
cluster_3, cluster_3l = _load_cluster_vectors('0.3', n_total)
cluster_4, cluster_4l = _load_cluster_vectors('0.4', n_total)

# Split every vector at an explicit index. The former `[:-n]` / `[-n:]` slices
# are equivalent for n > 0 but select the wrong rows when n == 0.
for threshold, ids, leaders in (('0.2', cluster_2, cluster_2l),
                                ('0.3', cluster_3, cluster_3l),
                                ('0.4', cluster_4, cluster_4l)):
    train_df['BT_{} ID'.format(threshold)] = ids[:n_train]
    res_df['BT_{} ID'.format(threshold)] = ids[n_train:]
    train_df['BT_{} Leader'.format(threshold)] = leaders[:n_train]
    res_df['BT_{} Leader'.format(threshold)] = leaders[n_train:]

# NOTE: both writes overwrite the files the frames were loaded from.
train_df.to_csv('../datasets/folds/training_df_single_fold_with_clustering.csv.gz', index=False)
res_df.to_csv('../datasets/ams_order_results.csv.gz', index=False)

In [52]:
# Number of distinct cluster ids in each of the three assignment vectors.
tuple(np.unique(labels).shape for labels in (cluster_2, cluster_3, cluster_4))

((339827,), (199697,), (88390,))

In [49]:
# Spot check: Jaccard distance between fingerprint rows 135150 and 136369 of X.
distance.jaccard(X[135150], X[136369])

0.11290322580645161

In [50]:
# Rows of res_df flagged as hits, and the distinct 'BT_0.2 ID' values among them.
hit_mask = res_df['Hit'] == 1
phits = res_df[hit_mask]
uhits = phits['BT_0.2 ID'].unique()

In [51]:
# Shape of the hit rows and of the array of distinct 'BT_0.2 ID' values among them.
phits.shape, uhits.shape

((412, 33), (351,))

In [59]:
# Training rows whose 'PriA-SSB AS Activity' equals 1.
# NOTE(review): this cell is a bare assignment and displays nothing; any output
# shown beneath it is left over from an earlier version of the cell.
thits = train_df[train_df['PriA-SSB AS Activity'] == 1]

(351,)

In [4]:
import numpy as np

import os

# Open the 428324 x 428324 float16 dissimilarity matrix for READING AND WRITING.
# The following cells assign into u_mat and call u_mat.flush(), which fails on a
# memmap opened with mode='r' (read-only). Use 'r+' when the file already exists
# so its contents are preserved, and 'w+' only to create it the first time.
u_mat_path = '../clustering/tmp/dissimilarity_matrix_428324_428324.dat'
u_mat = np.memmap(u_mat_path,
                  dtype='float16',
                  mode='r+' if os.path.exists(u_mat_path) else 'w+',
                  shape=(428324, 428324))

In [7]:
# Split dis_mat into four quadrants around row/column 427283.
# NOTE(review): dis_mat is not defined in any visible cell -- it must already
# exist in the kernel for this cell to run.
n_prev = 427283
A, B = dis_mat[:n_prev, :n_prev], dis_mat[:n_prev, n_prev:]
C, D = dis_mat[n_prev:, :n_prev], dis_mat[n_prev:, n_prev:]
tuple(quadrant.shape for quadrant in (A, B, C, D))

((427283, 427283), (427283, 1024), (1024, 427283), (1024, 1024))

In [8]:
# Copy quadrant A into the same top-left corner of u_mat and flush the memmap.
u_mat[:427283,:427283] = A
u_mat.flush()

In [9]:
# Copy quadrant B into u_mat shifted 17 columns to the right, which leaves
# columns 427283..427299 of these rows untouched; then flush the memmap.
u_mat[:427283,427283+17:] = B
u_mat.flush()

In [10]:
# Copy quadrant C into u_mat shifted 17 rows down, which leaves rows
# 427283..427299 of these columns untouched; then flush the memmap.
u_mat[427283+17:,:427283] = C
u_mat.flush()

In [11]:
# Copy quadrant D into the bottom-right corner of u_mat, shifted 17 rows and
# 17 columns from where it sat in dis_mat; then flush the memmap.
u_mat[427283+17:,427283+17:] = D
u_mat.flush()

In [12]:
def tanimoto_dissimilarity(X, Y, X_batch_size=50, Y_batch_size=50):
    """Pairwise Tanimoto dissimilarity (1 - similarity) between binary fingerprints.

    Parameters
    ----------
    X, Y : np.ndarray
        0/1 fingerprint arrays of shape (n_X, n_features) and (n_Y, n_features).
        A 1-D array is treated as a single fingerprint.
    X_batch_size, Y_batch_size : int
        Number of rows of X / Y processed per matrix product.

    Returns
    -------
    np.ndarray
        Flat array of length n_X * n_Y in row-major order, so that
        ``result.reshape(n_X, n_Y)[i, j]`` is the dissimilarity between
        ``X[i]`` and ``Y[j]``, whatever the batch sizes. Empty when either
        input has no rows. A pair of all-zero fingerprints gives 0/0, i.e. nan.
    """
    n_features = X.shape[-1]
    if X.ndim == 1:
        X = X.reshape(-1, n_features)
    if Y.ndim == 1:
        Y = Y.reshape(-1, n_features)

    # Nothing to compare: np.hstack below cannot concatenate an empty list.
    if X.shape[0] == 0 or Y.shape[0] == 0:
        return np.empty(0, dtype=float)

    row_blocks = []
    for X_start_idx in range(0, X.shape[0], X_batch_size):
        X_batch = X[X_start_idx:X_start_idx + X_batch_size, :]
        col_blocks = []
        for Y_start_idx in range(0, Y.shape[0], Y_batch_size):
            Y_batch = Y[Y_start_idx:Y_start_idx + Y_batch_size, :]

            # adapted from: https://github.com/deepchem/deepchem/blob/2531eca8564c1dc68910d791b0bcd91fd586afb9/deepchem/trans/transformers.py#L752
            # numerator[i, j]: bits set in both fingerprints (|x AND y|)
            numerator = np.dot(X_batch, Y_batch.T)
            # denominator[i, j]: n_features minus bits unset in both (|x OR y|)
            denominator = n_features - np.dot(1 - X_batch, (1 - Y_batch).T)

            col_blocks.append(numerator / denominator)
        # Join the Y batches side by side BEFORE flattening, so each X row keeps
        # all of its n_Y values contiguous. Flattening every (X batch, Y batch)
        # tile separately produced tile order, which cannot be reshaped to
        # (n_X, n_Y) once Y spans more than one batch.
        row_blocks.append(np.hstack(col_blocks).ravel())
    tan_sim = np.hstack(row_blocks)
    return 1.0 - tan_sim

# Dissimilarity of every row of X against every row of Y, arranged as a
# (17, 428324) array: one row per Y fingerprint.
# NOTE(review): Y is not defined in any visible cell -- it must already exist
# in the kernel (with 17 rows) for this cell to run.
k = tanimoto_dissimilarity(X, Y).reshape(428324, 17).T

In [13]:
# Fill the 17 rows 427283..427299 of u_mat with k, then flush the memmap.
u_mat[427283:427283+17,:] = k
u_mat.flush()

In [14]:
# Fill the matching 17 columns 427283..427299 of u_mat with the transpose of k,
# then flush the memmap.
u_mat[:,427283:427283+17] = k.T
u_mat.flush()

In [15]:
# Drop the notebook's reference to the memory-mapped matrix.
del u_mat