In [1]:
import pandas as pd
import glob
import numpy as np
def tanimoto_dissimilarity(X, Y, X_batch_size=50, Y_batch_size=50):
    """Pairwise Tanimoto (Jaccard) dissimilarity between binary fingerprints.

    The computation is done block-by-block so that only an
    ``X_batch_size x Y_batch_size`` similarity block is materialised at a time.

    Parameters
    ----------
    X, Y : numpy.ndarray
        Arrays of 0/1 values with the features on the last axis. A 1-D array
        is treated as a single fingerprint. Pass a floating-point (or wide
        integer) dtype: the bit counts are obtained with ``np.dot``, which
        keeps the input dtype, so narrow integer dtypes can overflow.
    X_batch_size, Y_batch_size : int, optional
        Number of rows of ``X`` / ``Y`` handled per block. Must be >= 1.

    Returns
    -------
    numpy.ndarray
        1-D array with ``X.shape[0] * Y.shape[0]`` dissimilarities (empty if
        either input has no rows). Values are concatenated block after block
        (outer loop over X batches, inner loop over Y batches), each block
        flattened row-major. This coincides with the row-major order of the
        full ``X x Y`` matrix when ``Y`` fits in a single Y batch or when ``X``
        has a single row.

    Raises
    ------
    ValueError
        If a batch size is smaller than 1.

    Notes
    -----
    Two all-zero fingerprints have an empty union (0/0). They are treated as
    identical (dissimilarity 0.0), matching ``scipy.spatial.distance.jaccard``,
    instead of producing NaN.
    """
    if X_batch_size < 1 or Y_batch_size < 1:
        raise ValueError('X_batch_size and Y_batch_size must be >= 1.')

    n_features = X.shape[-1]
    if X.ndim == 1:
        X = X.reshape(-1, n_features)
    if Y.ndim == 1:
        Y = Y.reshape(-1, n_features)

    tan_sim = []
    # Stepping with range() visits exactly the non-empty batches, including
    # when the row count is an exact multiple of the batch size.
    for X_start_idx in range(0, X.shape[0], X_batch_size):
        X_batch = X[X_start_idx:X_start_idx + X_batch_size, :]
        for Y_start_idx in range(0, Y.shape[0], Y_batch_size):
            Y_batch = Y[Y_start_idx:Y_start_idx + Y_batch_size, :]

            # adapted from: https://github.com/deepchem/deepchem/blob/2531eca8564c1dc68910d791b0bcd91fd586afb9/deepchem/trans/transformers.py#L752
            # |A & B| for every (x, y) pair in the block.
            numerator = np.dot(X_batch, Y_batch.T).flatten()
            # |A | B| = n_features - (number of positions where both bits are 0).
            denominator = n_features - np.dot(1 - X_batch, (1 - Y_batch).T).flatten()

            # denominator == 0 only when both fingerprints are all zeros, in
            # which case the numerator is 0 as well; silence the 0/0 warning
            # and define the similarity of two empty sets as 1.
            with np.errstate(divide='ignore', invalid='ignore'):
                batch_sim = numerator / denominator
            batch_sim[denominator == 0] = 1.0

            tan_sim.append(batch_sim)

    if not tan_sim:
        # No rows in X or in Y: there are no pairs to compare.
        return np.empty(0, dtype=float)
    return 1.0 - np.hstack(tan_sim)

# Count the shard files with glob, then rebuild their names in numeric order
# (assumes the shards are numbered contiguously starting at 0 -- TODO confirm).
num_files = len(glob.glob('../datasets/lc_clusters_cv_96/unlabeled_*.csv'))
csv_files_list = ['../datasets/lc_clusters_cv_96/unlabeled_{}.csv'.format(i) for i in range(num_files)]

# Shards are stacked in file order; the per-file row index is kept as-is, so
# the resulting index is not unique across shards.
df = pd.concat([pd.read_csv(f) for f in csv_files_list])

# Decode every fingerprint string into a row of 0.0/1.0 values: take the raw
# ASCII codes of its characters and subtract ord('0').
# np.frombuffer on the encoded bytes replaces the deprecated binary mode of
# np.fromstring and yields the same uint8 codes.
X_train = np.vstack([np.frombuffer(x.encode('ascii'), dtype='u1') - ord('0')
                     for x in df['Morgan FP_2_1024']]).astype(float)



In [2]:
# Rebuild the fingerprint matrix from df so this cell can be re-run on its own
# once df is loaded. np.frombuffer on the encoded bytes replaces the deprecated
# binary mode of np.fromstring and yields the same uint8 character codes.
X_train = np.vstack([np.frombuffer(fp.encode('ascii'), dtype='u1') - ord('0')
                     for fp in df['Morgan FP_2_1024']]).astype(float)


def _open_cluster_vector(stem, cutoff, n_rows):
    """Memory-map (read-only) the int32 vector '<stem>_<cutoff>.dat' with n_rows entries."""
    return np.memmap('../datasets/clustering/{}_{}.dat'.format(stem, cutoff),
                     mode='r', dtype='int32', shape=(n_rows,))


def _frame_with_index_id(vector, column):
    """Two-column frame: `vector` under `column`, its 0-based position under 'Index ID'."""
    return pd.DataFrame(data=np.vstack([vector, np.arange(vector.shape[0])]).T,
                        columns=[column, 'Index ID'])


# One entry per row of df is expected in every vector file.
n_rows = df.shape[0]

# Cluster assignment vectors, one per cutoff encoded in the file name.
c2 = _open_cluster_vector('cluster_assigment_vector', '0.2', n_rows)
c3 = _open_cluster_vector('cluster_assigment_vector', '0.3', n_rows)
c4 = _open_cluster_vector('cluster_assigment_vector', '0.4', n_rows)

# Precomputed pairwise matrix; the shape is fixed by the file name rather than
# derived from df.
dissimilarity_matrix = np.memmap('../datasets/dissimilarity_matrix_94857_94857.dat',
                                 shape=(94857, 94857), mode='r', dtype='float16')

c2_df = _frame_with_index_id(c2, 'Cluster_0.2')
c3_df = _frame_with_index_id(c3, 'Cluster_0.3')
c4_df = _frame_with_index_id(c4, 'Cluster_0.4')

# Cluster leader index vectors for the same three cutoffs.
cl2 = _open_cluster_vector('cluster_leader_idx_vector', '0.2', n_rows)
cl3 = _open_cluster_vector('cluster_leader_idx_vector', '0.3', n_rows)
cl4 = _open_cluster_vector('cluster_leader_idx_vector', '0.4', n_rows)

cl2_df = _frame_with_index_id(cl2, 'Cluster_0.2_leader_idx')
cl3_df = _frame_with_index_id(cl3, 'Cluster_0.3_leader_idx')
cl4_df = _frame_with_index_id(cl4, 'Cluster_0.4_leader_idx')

  if __name__ == '__main__':


In [3]:
# For each cutoff: the distinct cluster ids (u*) and how many rows carry each
# id (cc*).
(u2, cc2), (u3, cc3), (u4, cc4) = [
    np.unique(assignments, return_counts=True) for assignments in (c2, c3, c4)
]

In [4]:
# Number of distinct clusters per cutoff, followed by the number of clusters
# that contain exactly one row.
tuple(ids.shape for ids in (u2, u3, u4)) + tuple(np.where(counts == 1)[0].shape for counts in (cc2, cc3, cc4))

((87178,), (64171,), (29044,), (82152,), (51996,), (15646,))

In [24]:
import scipy.spatial.distance

# Spot check: for a sample of rows whose cluster in c3 has a single member,
# the closest other fingerprint must be at Jaccard distance >= 0.3.
singleton_clusters = u3[cc3 == 1]
# np.isin replaces the deprecated np.in1d; c3 is 1-D so the result is the same.
h_list = np.where(np.isin(c3, singleton_clusters))[0][2100:2200]
# Map the selected 'Index ID' values to row positions in df (and therefore in
# X_train, which was built from df row by row).
h_list = np.flatnonzero(df['Index ID'].isin(h_list).values)
for h in h_list:
    # One vectorised call gives the distance from row h to every row; this
    # replaces a Python-level loop of per-pair jaccard() calls.
    dists = scipy.spatial.distance.cdist(X_train[h:h + 1], X_train, metric='jaccard')[0]
    # Exclude the row's distance to itself.
    others = np.delete(dists, h)
    mint = 1000
    if others.size and others.min() < mint:
        mint = others.min()

    print(h, mint)
    assert(mint >= 0.3)

1211 0.4461538461538462
1986 0.41333333333333333
2040 0.5588235294117647
3670 0.4444444444444444
3690 0.5490196078431373
4164 0.3770491803278688
4323 0.32786885245901637
5084 0.36923076923076925
6256 0.44
8047 0.3684210526315789
9829 0.45454545454545453
9932 0.37037037037037035
10974 0.30612244897959184
11560 0.3939393939393939
11733 0.37735849056603776
12239 0.3888888888888889
12474 0.4642857142857143
12947 0.39705882352941174
13095 0.38461538461538464
13456 0.3888888888888889
14109 0.3898305084745763
14283 0.32608695652173914
14648 0.4927536231884058
14810 0.37254901960784315
15272 0.5072463768115942
15841 0.3793103448275862
15856 0.3382352941176471
15860 0.3559322033898305
19190 0.3548387096774194
19353 0.391304347826087
21071 0.4
21538 0.3333333333333333
22576 0.4230769230769231
22818 0.4444444444444444
24269 0.42857142857142855
24692 0.3492063492063492
25006 0.3333333333333333
26082 0.35714285714285715
26576 0.5686274509803921
26765 0.47692307692307695
28209 0.3611111111111111
285

In [5]:
import pandas as pd
import glob
import numpy as np

new_fmt = '../datasets/lc_clusters_cv_96_new/unlabeled_{}.csv'
num_files = len(glob.glob('../datasets/lc_clusters_cv_96/unlabeled_*.csv'))
csv_files_list = ['../datasets/lc_clusters_cv_96/unlabeled_{}.csv'.format(i) for i in range(num_files)]

# For every shard, replace the three cluster columns with the values from
# c2_df / c3_df / c4_df (joined on 'Index ID') and write the result under the
# same shard number in the new directory.
for i, f in enumerate(csv_files_list):
    df = pd.read_csv(f)
    merge_df = df
    for cluster_col, cluster_df in (('Cluster_0.2', c2_df),
                                    ('Cluster_0.3', c3_df),
                                    ('Cluster_0.4', c4_df)):
        merge_df = pd.merge(merge_df.drop(cluster_col, axis=1), cluster_df, how='inner', on='Index ID')
        # The inner join must neither drop, duplicate nor reorder rows.
        assert np.array_equal(df['Index ID'].values, merge_df['Index ID'].values)
    merge_df.to_csv(new_fmt.format(i), index=False)

In [13]:
import pandas as pd
import glob
import numpy as np

new_fmt = '../datasets/lc_clusters_cv_96_new/unlabeled_{}.csv'
num_files = len(glob.glob('../datasets/lc_clusters_cv_96/unlabeled_*.csv'))
csv_files_list = ['../datasets/lc_clusters_cv_96/unlabeled_{}.csv'.format(i) for i in range(num_files)]

# For every shard, append the leader-index columns from cl2_df / cl3_df /
# cl4_df (joined on 'Index ID') and write the result under the same shard
# number in the new directory.
# NOTE(review): this reads the shards from lc_clusters_cv_96 but writes to
# lc_clusters_cv_96_new, the same destination used by the loop that refreshes
# the Cluster_0.x columns -- confirm that overwriting those files with data
# derived from the original shards is intended.
for i, f in enumerate(csv_files_list):
    df = pd.read_csv(f)
    merge_df = df
    for leader_df in (cl2_df, cl3_df, cl4_df):
        merge_df = pd.merge(merge_df, leader_df, how='inner', on='Index ID')
        # The inner join must neither drop, duplicate nor reorder rows.
        assert np.array_equal(df['Index ID'].values, merge_df['Index ID'].values)
    merge_df.to_csv(new_fmt.format(i), index=False)

In [14]:
import pandas as pd
import glob
import numpy as np

new_fmt = '../datasets/lc_clusters_cv_96_new/unlabeled_{}.csv'

# Count the rewritten shards, then rebuild their names in numeric order.
num_files = len(glob.glob(new_fmt.format('*')))
csv_files_list = list(map(new_fmt.format, range(num_files)))

# Stack all rewritten shards into one frame, in shard order.
shard_frames = [pd.read_csv(shard_path) for shard_path in csv_files_list]
df = pd.concat(shard_frames)

In [15]:
# Persist the combined frame as a single gzip-compressed CSV, without the
# row index.
df.to_csv(
    '../datasets/all_data.csv.gz',
    index=False,
    compression='gzip',
)

In [20]:
# Show every row assigned to cluster 3333 in the 'Cluster_0.2' column.
df.loc[df['Cluster_0.2'].eq(3333)]

Unnamed: 0,Index ID,Molecule,Murcko Scaffold ID,rdkit SMILES,Morgan FP_2_1024,PriA-SSB Activity,PriA-SSB % inhibition,rdkit_BT_Cluster_0.3,Cluster_0.2,Cluster_0.3,Cluster_0.4,Cluster_0.2_leader_idx,Cluster_0.3_leader_idx,Cluster_0.4_leader_idx
63,79440,SMSSF-0600364,8575,CCOc1ccc(-c2cc(C(=O)N3CCC(Cc4ccccc4)CC3)no2)cc1,0000000000000000000000000000010001000000001000...,0,-7.278879,9866,3333,8084,8442,22331,22331,79440
5,22331,SMSSF-0054313,8575,COc1ccc(-c2cc(C(=O)N3CCC(Cc4ccccc4)CC3)no2)cc1,0000000000000000000000000000010001000000001000...,0,-10.66485,9866,3333,8084,8442,22331,22331,79440
