In [1]:
import os
import random

#data
import pandas as pd

# chemistry
import rdkit
from rdkit import RDLogger
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdFMCS
from rdkit import DataStructs
from rdkit.Chem import PandasTools
from rdkit.Chem import Lipinski

# math
import numpy as np

# plotting
from matplotlib import pyplot as plt

# Machine learning
import sklearn
from sklearn import tree

Here I am defining a validation set and a training set:

In [2]:
# get the full data set
df = pd.read_csv('../../../Analysis_of_Docking/data/rmsd_values_featurized_w_sasa_without_bad_pairs.csv')

I am going to group together pairs that have each other in common:

to guarantee that the training data is different from the testing and validation data.

In [5]:
# create a list of all the combinations of pairs of compounds that exist in the data set
# the pair ("A"_template, "B"_docked) is the same as ("B"_docked, "A"_template)
list_of_sets = []
# it is a list of lists of indexes
list_of_indexes = []
for index, template, docked in df[['template', 'docked']].itertuples():
    if {template, docked} not in list_of_sets:
        list_of_sets += [{template, docked}]
        list_of_indexes += [[index]]
    else:
        list_of_indexes[list_of_sets.index({template, docked})] += [index]

# group together all sets until all the sets are disjoint in relation to each other
list_of_sets_copy = list_of_sets.copy()
list_of_lists = [[]]
list_of_lists_indexes = [[]]
while_loop_controller_1 = True
while while_loop_controller_1:
    set1 = list_of_sets_copy[0]
    tempset = set()
    tempset.update(set1)
    while_loop_controller_2 = True
    while while_loop_controller_2:
        i = 0
        i_s = []
        for set2 in list_of_sets_copy:
            if not tempset.isdisjoint(set2):
                tempset.update(set2)
                list_of_lists[-1] += [set2]
                list_of_lists_indexes[-1] += list_of_indexes[list_of_sets.index(set2)]
                i_s += [i]
            i += 1
        if len(i_s) == 0:
            while_loop_controller_2 = False
        list_of_sets_copy = [list_of_sets_copy[j] for j in range(len(list_of_sets_copy)) if j not in i_s]
    if len(list_of_sets_copy) > 0:
        list_of_lists += [[]]
        list_of_lists_indexes += [[]]
    else:
        while_loop_controller_1 = False

In [7]:
size_of_groups = {}
df['group'] = -1
group = 0
for list in list_of_lists_indexes:
    group += 1
    size_of_groups[group] = len(list)
    for index in list:
        df.at[index, 'group'] = group

In [9]:
# the size of the different disjoint groups is variable
print(size_of_groups)

{1: 7399, 2: 4296, 3: 2421, 4: 6, 5: 788, 6: 1053, 7: 726, 8: 1215, 9: 408, 10: 695, 11: 593, 12: 260, 13: 106, 14: 2, 15: 94, 16: 39, 17: 5, 18: 119, 19: 504, 20: 2, 21: 483, 22: 313, 23: 248, 24: 222, 25: 324, 26: 314, 27: 39, 28: 199, 29: 218, 30: 93, 31: 2, 32: 13, 33: 190, 34: 162, 35: 20, 36: 2, 37: 3, 38: 55, 39: 92, 40: 76, 41: 89, 42: 84, 43: 103, 44: 116, 45: 48, 46: 118, 47: 74, 48: 11, 49: 17, 50: 37, 51: 53, 52: 6, 53: 45, 54: 23, 55: 2, 56: 37, 57: 48, 58: 22, 59: 7, 60: 7, 61: 11, 62: 20, 63: 8, 64: 7, 65: 11, 66: 2, 67: 5, 68: 6, 69: 8, 70: 2, 71: 2, 72: 2, 73: 2, 74: 4, 75: 4, 76: 1, 77: 6}


In [22]:
# I am going to have a validation set of about 20 % the size of the full data set
def check_sizes_of_folds(folds):
    sizes = []
    size_of_components = []
    for fold in folds:
        sizes += [sum([size_of_groups[g] for g in fold])]
        size_of_components += [np.array([size_of_groups[g] for g in fold])]
    proportions =  [sizes[i]/sum(sizes) for i in range(len(sizes))]
    return size_of_components, sizes, proportions

k = 5

groups = df['group'].drop_duplicates().values


base_size = len(groups)//(k)
base_rest = len(groups)%(k)

print(base_size, base_rest)

random.shuffle(groups)
folds = [groups[i*base_size:(i+1)*base_size] for i in range(k)]

for i in range(base_rest):
    folds[-(i+1)] = np.append(groups[-(i+1)], folds[-(i+1)])

folds, check_sizes_of_folds(folds)

15 2


([array([46,  4, 36,  8,  3, 16, 51, 68, 54, 50, 64,  7, 31, 13, 35]),
  array([27, 72, 48,  2, 66, 61, 60,  6, 18, 24, 55, 53, 75, 25, 26]),
  array([15, 77, 33, 21, 52, 73, 22, 14, 41, 19, 59, 67, 43, 70,  5]),
  array([57, 38, 11, 20, 28, 23, 37,  9, 56, 32, 42, 30, 74, 10, 40, 44]),
  array([29, 58, 71, 17, 47, 63, 45, 62, 65, 49, 34, 76, 69,  1, 39, 12])],
 ([array([ 118,    6,    2, 1215, 2421,   39,   53,    6,   23,   37,    7,
           726,    2,  106,   20]),
   array([  39,    2,   11, 4296,    2,   11,    7, 1053,  119,  222,    2,
            45,    4,  324,  314]),
   array([ 94,   6, 190, 483,   6,   2, 313,   2,  89, 504,   7,   5, 103,
            2, 788]),
   array([ 48,  55, 593,   2, 199, 248,   3, 408,  37,  13,  84,  93,   4,
          695,  76, 116]),
   array([ 218,   22,    2,    5,   74,    8,   48,   20,   11,   17,  162,
             1,    8, 7399,   92,  260])],
  [4781, 6451, 2594, 2674, 8347],
  [0.19241759568559585,
   0.25962892904576007,
   0.1043989

In [40]:
# the validation fold becomes:
# which has a size of 4781 and is about 19% of the total number
validation_fold = [46,  4, 36,  8,  3, 16, 51, 68, 54, 50, 64,  7, 31, 13, 35]

validation_fold_df = df.loc[df['group'].isin(validation_fold)]

# save the validation set
validation_fold_df.to_csv('../data/validation_rmsd_values_featurized_w_sasa_without_bad_pairs.csv',
                          index=False)

In [42]:
# the set used for choosing hyperparameters and training
train_test_df = df.loc[~df['group'].isin(validation_fold)]

# save the train_test
train_test_df.to_csv('../data/train_test_rmsd_values_featurized_w_sasa_without_bad_pairs.csv',
                     index=False)