In [5]:
import pandas as pd
import numpy as np
import glob

# Read every 'unlabeled_*.csv' split under lc_clusters_cv_96 and stack them into one frame.
split_paths = glob.glob('../datasets/lc_clusters_cv_96/unlabeled_*.csv')
data_df = pd.concat([pd.read_csv(path) for path in split_paths])

# Note: the first field prints the full (rows, columns) shape, not just the row count.
n_actives = data_df['PriA-SSB Activity'].sum()
print('Cpds #: {}. Actives #: {}'.format(data_df.shape, n_actives))

# u: distinct values of the 'Cluster_0.4' column; c: number of rows carrying each value.
u, c = np.unique(data_df['Cluster_0.4'], return_counts=True)
n_clusters = u.shape[0]

# A singleton is a cluster value that occurs on exactly one row.
is_singleton = c == 1
singleton_ids = u[is_singleton]
n_singletons = singleton_ids.shape[0]

# Sum the activity column over the rows that belong to singleton clusters.
singleton_rows = data_df[data_df['Cluster_0.4'].isin(singleton_ids)]
n_singletons_with_hits = singleton_rows['PriA-SSB Activity'].sum()

summary_fmt = 'Clusters #: {}. Singletons #: {}. Singletons with hits #: {}'
print(summary_fmt.format(n_clusters, n_singletons, n_singletons_with_hits))

Cpds #: (94857, 14). Actives #: 133
Clusters #: 29044. Singletons #: 15646. Singletons with hits #: 23


---
# Stratifying LC1234

In [1]:
import pandas as pd
import numpy as np
import os
import glob

# --- Configuration -----------------------------------------------------------
n_folds = 10
random_seed = 20191101
np.random.seed(random_seed)  # seeds NumPy's global RNG for the permutations drawn later

split_size = 96  # target number of rows per split

dataset_dir = '../datasets/lc_clusters_cv/'
output_dir = '../datasets/lc_clusters_cv_96/'
cluster_col_name = 'Cluster_0.3'
task_col_name = 'PriA-SSB Activity'

# --- Load --------------------------------------------------------------------
# glob.glob() returns paths in a filesystem-dependent order. The row positions
# computed below depend on the concatenation order, so the paths are sorted to
# make the seeded shuffling reproducible across machines.
# NOTE: if the previous (unsorted) order differed on your machine, re-running
# will produce different split files than an earlier run did.
input_paths = sorted(glob.glob(dataset_dir+'/*'))
data_df = pd.concat([pd.read_csv(x) for x in input_paths])

# Positional (0-based) row numbers, not index labels: pd.concat keeps each
# file's own index here, so labels may repeat across files.
active_indices = np.where(data_df[task_col_name] == 1)[0]
inactive_indices = np.where(data_df[task_col_name] == 0)[0]

print('Total molecules: {}, Total active: {}, Total inactive: {}, Total clusters: {}.'.format(data_df.shape[0], 
                                                                        active_indices.shape[0], 
                                                                        inactive_indices.shape[0],
                                                                        np.unique(data_df[cluster_col_name]).shape[0]))

Total molecules: 94857, Total active: 133, Total inactive: 94724, Total clusters: 68243.


In [2]:
# Drop the first column by position and rebind data_df to the result.
# NOTE(review): presumably this removes a saved-index column carried in the
# input CSVs -- confirm against the files.
# Not idempotent: every re-run of this cell drops one more column, so run it
# exactly once after the load cell.
data_df = data_df.iloc[:,1:]

In [3]:
# Partition data_df into files of split_size rows (the last file takes the
# remainder). Shuffled actives are handed out actives_per_split at a time
# starting from split 0 until they run out; the rest of each split is filled
# with shuffled inactives. Each split is shuffled once more and written to
# output_dir as 'unlabeled_<split_i>.csv'.

# Ceiling division: a row count that is an exact multiple of split_size must
# not produce an extra, empty trailing split (floor + 1 would).
num_splits = -(-data_df.shape[0] // split_size)
# max(..., 1) only guards the empty-frame case, where num_splits is 0.
actives_per_split = int(np.ceil(active_indices.shape[0]/max(num_splits, 1)))
# Not read below in this cell; kept in case other cells rely on it.
inactives_per_split = split_size - actives_per_split
rnd_active_idx = np.random.permutation(active_indices)
rnd_inactive_idx = np.random.permutation(inactive_indices)

# to_csv() does not create missing directories.
os.makedirs(output_dir, exist_ok=True)

active_i, inactive_i = 0, 0  # read cursors into the two shuffled index arrays
total_mols, total_actives, total_inactives = 0, 0, 0
for split_i in range(num_splits):
    curr_split_size = split_size
    if split_i == (num_splits-1):
        # The final split holds whatever is left after the full-sized ones.
        curr_split_size = data_df.shape[0] - split_size*(num_splits-1)

    # Slicing past the end simply yields fewer (or zero) actives once they are used up.
    split_active_idx = rnd_active_idx[active_i:active_i+actives_per_split]
    active_i += split_active_idx.shape[0]

    # Remaining room in this split goes to inactives (never negative).
    n_inactive_slots = max(curr_split_size - split_active_idx.shape[0], 0)
    split_inactive_idx = rnd_inactive_idx[inactive_i:inactive_i+n_inactive_slots]
    inactive_i += split_inactive_idx.shape[0]

    # Shuffle so the actives are not always the first rows of the file.
    curr_indices = np.random.permutation(np.concatenate([split_active_idx, split_inactive_idx]))
    curr_split_df = data_df.iloc[curr_indices,:]

    split_mols = curr_split_df.shape[0]
    split_actives = int((curr_split_df[task_col_name] == 1).sum())
    split_inactives = int((curr_split_df[task_col_name] == 0).sum())
    total_mols += split_mols
    total_actives += split_actives
    total_inactives += split_inactives
    curr_split_df.to_csv(output_dir+'/unlabeled_{}.csv'.format(split_i), 
                         index=False)
    print('Split {}: Total molecules: {}, Total active: {}, Total inactive: {}'.format(split_i, split_mols, 
                                                                                       split_actives, 
                                                                                       split_inactives))

print('Total molecules: {}, Total active: {}, Total inactive: {}.'.format(total_mols, total_actives, total_inactives))

Split 0: Total molecules: 96, Total active: 1, Total inactive: 95
Split 1: Total molecules: 96, Total active: 1, Total inactive: 95
Split 2: Total molecules: 96, Total active: 1, Total inactive: 95
Split 3: Total molecules: 96, Total active: 1, Total inactive: 95
Split 4: Total molecules: 96, Total active: 1, Total inactive: 95
Split 5: Total molecules: 96, Total active: 1, Total inactive: 95
Split 6: Total molecules: 96, Total active: 1, Total inactive: 95
Split 7: Total molecules: 96, Total active: 1, Total inactive: 95
Split 8: Total molecules: 96, Total active: 1, Total inactive: 95
Split 9: Total molecules: 96, Total active: 1, Total inactive: 95
Split 10: Total molecules: 96, Total active: 1, Total inactive: 95
Split 11: Total molecules: 96, Total active: 1, Total inactive: 95
Split 12: Total molecules: 96, Total active: 1, Total inactive: 95
Split 13: Total molecules: 96, Total active: 1, Total inactive: 95
Split 14: Total molecules: 96, Total active: 1, Total inactive: 95
Split

In [30]:
# Display unlabeled_df with its current index moved into a regular column.
# The result is only shown, not assigned, so unlabeled_df itself is unchanged.
# NOTE(review): unlabeled_df is not defined in the visible cells -- confirm
# which cell creates it so this still works after Restart & Run All.
unlabeled_df.reset_index()

Unnamed: 0,index,Index ID,Molecule,Murcko Scaffold ID,rdkit SMILES,Morgan FP_2_1024,PriA-SSB Activity,PriA-SSB % inhibition,Cluster_0.3
0,0,3599,SMSSF-0032901,11002,CCc1ccc(S(=O)(=O)N2CCN(c3ccc4nnc(C)n4n3)CC2)cc1,0000000000000000000000000000000011000000000000...,0,-4.695041,184
1,1,91664,SMSSF-0610354,16575,CC(C)c1ccc(OCC(=O)NC2CCN(c3ccccn3)C2)cc1,0100000000000000000000000000000001000000010000...,0,-5.624436,17564
2,2,37201,SMSSF-0056669,3285,CCC(C)N[S-](=O)([O-])c1ccc2oc(=O)ccc2c1,0110000000000000000000100000000001010000000000...,0,-4.398591,49864
3,3,47794,SMSSF-0036191,17180,Cc1ccc(C(=O)Cn2cnc3c(cnn3C3CCS(=O)(=O)C3)c2=O)...,0000000000000000000000000000000001000000000100...,0,11.418715,44584
4,4,7419,SMSSF-0052376,21384,CN(C)C(CNC(=O)C(=O)Nc1ccc(OC(F)(F)F)cc1)c1cccs1,0100000000000010000000000000000001000000000000...,0,6.238975,65767
5,5,45710,SMSSF-0025915,23164,COc1cccc(-c2ccc(=O)n(Cc3nc(-c4ccc(SC)cc4)no3)n...,0100000000000001000000000000000001000000000000...,0,-15.342849,304
6,6,61788,SMSSF-0033953,5816,CC(=O)c1ccc(NC(=O)CSc2ccc(NC(=O)c3cccs3)nn2)cc1,0000000000000010000000000000000001000000010100...,0,-10.451976,343
7,7,56216,SMSSF-0544500,10109,Cc1noc(C)c1S(=O)(=O)N1CCN(C(=O)c2cnn3ccccc23)CC1,0001000000000000000000000000000001000000100101...,0,16.300820,38945
8,8,73146,SMSSF-0595682,13365,Cc1cccc(CC(=O)N2CCN=C2SCc2ccccc2Cl)c1,0000000000000001000000000000000101000000001000...,0,-0.227492,5430
9,9,38239,SMSSF-0061881,20088,CCCCCOc1ccc(C(=O)Nc2sc3c(c2C#N)CCC3)cc1,0100100000100100000000000000000001000000001000...,0,-8.436282,13817
