In [1]:
import pandas as pd
import numpy as np
import os

# Fingerprint settings; both values appear in the fingerprint column name
# ('<FP_size> MorganFP Radius <FP_radius>') used further down.
FP_size = 1024
FP_radius = 2

# Where outputs are written and how many folds to build.
output_dir = '../datasets/'
n_folds = 10

# Seed numpy's global RNG once so the permutations in later cells repeat.
random_seed = 20181210
np.random.seed(random_seed)

# Load the gzip-compressed master dataframe.
master_df_file = '../datasets/master_df/master_mlpcn_lc_2018_10_12.csv.gz'
master_df = pd.read_csv(master_df_file, compression='gzip')

# step 1: remove retests
# Rows whose plate name contains the literal text 'CP' are dropped.
# na=False makes a missing plate name count as "not a retest" instead of
# yielding a NaN mask that cannot be inverted with ~.
retest_rows = master_df['Plate Name'].str.contains('CP', regex=False, na=False)
# .copy() detaches the filtered frame so the column assignment in step 2
# writes to an independent frame rather than to a slice of the loaded one
# (avoids pandas' SettingWithCopy warning).
master_df = master_df[~retest_rows].copy()

# step 2: standardize library ids
library_id_dict = {'LC-3': 'LC3', 'Life Chemicals 1': 'LC1',
                   'Life Chemical 2': 'LC2', 'LC-4': 'LC4',
                   '2011 MLPCN': 'MLPCN', '2011MLPCN': 'MLPCN'}
# Fail fast (KeyError, as before) when a raw id has no canonical name, but
# report every offending value at once instead of only the first one hit.
unknown_library_ids = [lid for lid in pd.unique(master_df['Library ID'])
                       if lid not in library_id_dict]
if unknown_library_ids:
    raise KeyError('Unmapped Library ID value(s): {}'.format(unknown_library_ids))
# Single vectorized lookup replaces the per-id boolean-mask loop.
master_df['Library ID'] = master_df['Library ID'].map(library_id_dict)
    
# step 3: drop unneeded columns
# Only the identifiers, structure/fingerprint, raw inhibition value, activity
# label and the three filter columns are carried forward.
columns_to_keep = [
    'Molecule ID',
    'Duplicate ID',
    'SMSSF ID',
    'Library ID',
    'rdkit SMILES',
    '{} MorganFP Radius {}'.format(FP_size, FP_radius),
    'PriA-SSB AS % inhibition',
    'PriA-SSB AS Activity',
    'Primary Filter',
    'Retest Filter',
    'PAINS Filter',
]
master_df = master_df[columns_to_keep]

# step 4: group by Molecule ID and calculate median of primary screens
def _join_unique_sorted(values):
    """Join the distinct non-missing entries of a Series as 'a, b, c'.

    The entries are sorted so the result does not depend on set iteration
    order, which for strings can differ from one interpreter run to the next.
    Missing entries are skipped because str.join cannot handle them.
    """
    return ', '.join(sorted(set(values.dropna().astype(str))))


def grouping_logic(grouped_df):
    """Collapse all rows of one molecule into a single summary record.

    Parameters
    ----------
    grouped_df : pandas.DataFrame
        The rows of one group (this function is passed to
        ``groupby('Molecule ID').apply``). Must contain at least one row.

    Returns
    -------
    pandas.Series
        - 'SMSSF ID', 'Library ID': distinct values of the group, sorted and
          joined with ', '.
        - 'PriA-SSB AS % inhibition (Primary Median)': median of the group's
          'PriA-SSB AS % inhibition' values.
        - every other field: the value from the group's first row.

    Raises
    ------
    IndexError
        If ``grouped_df`` has no rows (``.iloc[0]`` on an empty column).
    """
    # Fingerprint column name is derived from the module-level settings.
    fp_col = '{} MorganFP Radius {}'.format(FP_size, FP_radius)
    first_row = grouped_df.iloc[0]
    return pd.Series({
        'SMSSF ID': _join_unique_sorted(grouped_df['SMSSF ID']),
        'Library ID': _join_unique_sorted(grouped_df['Library ID']),
        'rdkit SMILES': first_row['rdkit SMILES'],
        fp_col: first_row[fp_col],
        'PriA-SSB AS % inhibition (Primary Median)': grouped_df['PriA-SSB AS % inhibition'].median(),
        'PriA-SSB AS Activity': first_row['PriA-SSB AS Activity'],
        'Primary Filter': first_row['Primary Filter'],
        'Retest Filter': first_row['Retest Filter'],
        'PAINS Filter': first_row['PAINS Filter'],
    })
    
# One summary row per molecule: apply grouping_logic to each Molecule ID group.
grouped_by_molid = master_df.groupby(by='Molecule ID')
res_df = grouped_by_molid.apply(grouping_logic)

# The group key is the index of the result; copy it into a regular column,
# put the columns in their final order, then discard the index.
res_df['Molecule ID'] = res_df.index
ordered_columns = [
    'Molecule ID', 'SMSSF ID', 'Library ID',
    'rdkit SMILES', '{} MorganFP Radius {}'.format(FP_size, FP_radius),
    'PriA-SSB AS % inhibition (Primary Median)', 'PriA-SSB AS Activity',
    'Primary Filter', 'Retest Filter', 'PAINS Filter',
]
res_df = res_df[ordered_columns]
res_df = res_df.reset_index(level=0, drop=True)

# Store the activity label and the three filter flags as floats.
flag_columns = ['PriA-SSB AS Activity', 'Primary Filter',
                'Retest Filter', 'PAINS Filter']
res_df[flag_columns] = res_df[flag_columns].astype(float)

# save current df
# Create the destination first: to_csv does not create missing directories,
# so a fresh checkout would otherwise fail here.
single_fold_dir = os.path.join(output_dir, 'folds')
os.makedirs(single_fold_dir, exist_ok=True)
res_df.to_csv(os.path.join(single_fold_dir, 'training_df_single_fold.csv.gz'),
              index=False,
              compression='gzip')

# Summary counts; a row is active when its label is 1 and inactive when 0.
n_total = res_df.shape[0]
n_active = int((res_df['PriA-SSB AS Activity'] == 1).sum())
n_inactive = int((res_df['PriA-SSB AS Activity'] == 0).sum())
print('Total molecules: {}, Total active: {}, Total inactive: {}'.format(n_total,
                                                                        n_active,
                                                                        n_inactive))

Total molecules: 427300, Total active: 100, Total inactive: 427200


## Group by Library ID and then Stratify

In [2]:
# step 5: separate by library id, then stratify sample by library id
# fold_lists[k] is a pair of lists: (active row labels, inactive row labels).
fold_lists = [([], []) for _ in range(n_folds)]
# running number of actives already placed in each fold
fold_actives_counts = np.zeros(n_folds, dtype=int)
for library_id in pd.unique(res_df['Library ID']):
    matching_lids = res_df['Library ID'] == library_id
    curr_df = res_df[matching_lids]
    active_mask = curr_df['PriA-SSB AS Activity'] == 1
    inactive_mask = curr_df['PriA-SSB AS Activity'] == 0
    # Shuffle actives first, then inactives: the order of the two draws from
    # the global RNG determines the resulting folds.
    active_inds = np.array_split(
        np.random.permutation(np.array(curr_df[active_mask].index)), n_folds)
    inactive_inds = np.array_split(
        np.random.permutation(np.array(curr_df[inactive_mask].index)), n_folds)
    # Folds ordered by how many actives they hold so far (fewest first).
    # array_split puts any larger chunks first, so those go to the folds
    # that are currently shortest on actives.
    folds_by_active_count = np.argsort(fold_actives_counts)
    for active_chunk, inactive_chunk, fl_i in zip(active_inds, inactive_inds,
                                                  folds_by_active_count):
        fold_lists[fl_i][0].extend(active_chunk)
        fold_lists[fl_i][1].extend(inactive_chunk)
        fold_actives_counts[fl_i] += len(active_chunk)

print('Stratify folding by library id')
# Create the destination first: to_csv does not create missing directories.
library_strat_dir = os.path.join(output_dir, 'folds', 'library_strat_folds')
os.makedirs(library_strat_dir, exist_ok=True)

total_mols = 0
total_actives = 0
total_inactives = 0
for i in range(n_folds):
    # fold_lists holds index *labels* collected from .index, so rows are
    # selected by label (.loc). Positional .iloc would only pick the same
    # rows while res_df keeps a default 0..n-1 index.
    fold_df = res_df.loc[fold_lists[i][0]+fold_lists[i][1],:]
    fold_mols = fold_df.shape[0]
    fold_actives = fold_df[fold_df['PriA-SSB AS Activity'] == 1].shape[0]
    fold_inactives = fold_df[fold_df['PriA-SSB AS Activity'] == 0].shape[0]
    total_mols += fold_mols
    total_actives += fold_actives
    total_inactives += fold_inactives

    fold_df.to_csv(os.path.join(library_strat_dir, 'fold_{}.csv'.format(i)),
                   index=False)
    print('Fold {}: Total molecules: {}, Total active: {}, Total inactive: {}'.format(i, fold_mols,
                                                                                    fold_actives,
                                                                                    fold_inactives))
print('Total molecules: {}, Total active: {}, Total inactive: {}'.format(total_mols,
                                                                         total_actives,
                                                                         total_inactives))

# Every molecule must land in exactly one fold; separate asserts so a failure
# says which total is off.
assert total_mols == res_df.shape[0], 'fold sizes do not add up to the full dataset'
assert total_actives == res_df[res_df['PriA-SSB AS Activity'] == 1].shape[0], \
    'active counts do not add up to the full dataset'
assert total_inactives == res_df[res_df['PriA-SSB AS Activity'] == 0].shape[0], \
    'inactive counts do not add up to the full dataset'

Stratify folding by library id
Fold 0: Total molecules: 42735, Total active: 10, Total inactive: 42725
Fold 1: Total molecules: 42737, Total active: 10, Total inactive: 42727
Fold 2: Total molecules: 42734, Total active: 10, Total inactive: 42724
Fold 3: Total molecules: 42730, Total active: 10, Total inactive: 42720
Fold 4: Total molecules: 42729, Total active: 10, Total inactive: 42719
Fold 5: Total molecules: 42727, Total active: 10, Total inactive: 42717
Fold 6: Total molecules: 42728, Total active: 10, Total inactive: 42718
Fold 7: Total molecules: 42727, Total active: 10, Total inactive: 42717
Fold 8: Total molecules: 42728, Total active: 10, Total inactive: 42718
Fold 9: Total molecules: 42725, Total active: 10, Total inactive: 42715
Total molecules: 427300, Total active: 100, Total inactive: 427200


## Directly Stratify

In [3]:
# step 6: stratify on activity only (library id is not used here)
# fold_lists[k] is a pair of lists: (active row labels, inactive row labels).
fold_lists = [([], []) for _ in range(n_folds)]
active_mask = res_df['PriA-SSB AS Activity'] == 1
inactive_mask = res_df['PriA-SSB AS Activity'] == 0
# Shuffle actives first, then inactives (same draw order from the global RNG),
# and cut each shuffled label array into n_folds chunks.
active_inds = np.array_split(
    np.random.permutation(np.array(res_df[active_mask].index)), n_folds)
inactive_inds = np.array_split(
    np.random.permutation(np.array(res_df[inactive_mask].index)), n_folds)
# Chunk k of each class goes to fold k.
for (fold_active_labels, fold_inactive_labels), active_chunk, inactive_chunk in zip(
        fold_lists, active_inds, inactive_inds):
    fold_active_labels.extend(active_chunk)
    fold_inactive_labels.extend(inactive_chunk)

print('Plain stratify folding')
# Create the destination first: to_csv does not create missing directories.
strat_only_dir = os.path.join(output_dir, 'folds', 'strat_only_folds')
os.makedirs(strat_only_dir, exist_ok=True)

total_mols = 0
total_actives = 0
total_inactives = 0
for i in range(n_folds):
    # fold_lists holds index *labels* collected from .index, so rows are
    # selected by label (.loc). Positional .iloc would only pick the same
    # rows while res_df keeps a default 0..n-1 index.
    fold_df = res_df.loc[fold_lists[i][0]+fold_lists[i][1],:]
    fold_mols = fold_df.shape[0]
    fold_actives = fold_df[fold_df['PriA-SSB AS Activity'] == 1].shape[0]
    fold_inactives = fold_df[fold_df['PriA-SSB AS Activity'] == 0].shape[0]
    total_mols += fold_mols
    total_actives += fold_actives
    total_inactives += fold_inactives

    fold_df.to_csv(os.path.join(strat_only_dir, 'fold_{}.csv'.format(i)),
                   index=False)
    print('Fold {}: Total molecules: {}, Total active: {}, Total inactive: {}'.format(i, fold_mols,
                                                                                    fold_actives,
                                                                                    fold_inactives))
print('Total molecules: {}, Total active: {}, Total inactive: {}'.format(total_mols,
                                                                         total_actives,
                                                                         total_inactives))

# Every molecule must land in exactly one fold; separate asserts so a failure
# says which total is off.
assert total_mols == res_df.shape[0], 'fold sizes do not add up to the full dataset'
assert total_actives == res_df[res_df['PriA-SSB AS Activity'] == 1].shape[0], \
    'active counts do not add up to the full dataset'
assert total_inactives == res_df[res_df['PriA-SSB AS Activity'] == 0].shape[0], \
    'inactive counts do not add up to the full dataset'

Plain stratify folding
Fold 0: Total molecules: 42730, Total active: 10, Total inactive: 42720
Fold 1: Total molecules: 42730, Total active: 10, Total inactive: 42720
Fold 2: Total molecules: 42730, Total active: 10, Total inactive: 42720
Fold 3: Total molecules: 42730, Total active: 10, Total inactive: 42720
Fold 4: Total molecules: 42730, Total active: 10, Total inactive: 42720
Fold 5: Total molecules: 42730, Total active: 10, Total inactive: 42720
Fold 6: Total molecules: 42730, Total active: 10, Total inactive: 42720
Fold 7: Total molecules: 42730, Total active: 10, Total inactive: 42720
Fold 8: Total molecules: 42730, Total active: 10, Total inactive: 42720
Fold 9: Total molecules: 42730, Total active: 10, Total inactive: 42720
Total molecules: 427300, Total active: 100, Total inactive: 427200


In [5]:
import tarfile

# Bundle everything under <output_dir>/folds into one gzip-compressed tarball;
# inside the archive the tree is rooted at 'lc_mlpcn_folds_2018_10_12'.
# The with-block closes (and so finalizes) the archive even if add() raises.
with tarfile.open(os.path.join(output_dir, 'lc_mlpcn_folds_2018_10_12.tar.gz'), 'w:gz') as tar:
    tar.add(os.path.join(output_dir, 'folds'), arcname='lc_mlpcn_folds_2018_10_12')