# load_full_dataset_sample_per_chrom.py
# TODO notes for Ananth: add the recombination data
# Also work on the nearby_indels feature
import cs273b
import load_coverage as lc
import numpy as np
from math import ceil
import utils
import entropy

# Base location of all input data files
data_dir = "/datadrive/project_data/"

class DatasetLoader(object):
    def __init__(self, _kw=0, windowSize=100, batchSize=100, testBatchSize=500, seed=1, pos_frac=0.5, load_coverage=True, load_entropy=False, triclass=False, nearby=0, offset=0, complexity_threshold=0):
        ##
        # A window size of k means we read k base pairs before the center and k after, for a total of 2k+1 base pairs per input
        self.window = windowSize
        # Number of training examples per batch
        self.batchSize = batchSize
        # Number of testing examples per test batch (we can't test everything at once due to memory)
        self.testBatchSize = testBatchSize
        ##
        # Whether to do tri-class classification (insertion, deletion, or neither) as opposed to binary (indel or non-indel)
        self.triclass = triclass
        # If nearby is nonzero, negative examples are sampled only from within 'nearby' base pairs of some positive example; otherwise they are sampled at random from the genome
        self.nearby = nearby
        # Either 0 or 1, to handle the 1-indexing of the gnomad_indels.tsv file. Technically this should be 1, but in practice 0 seems to work just as well
        self.offset = offset
        ##
        # Whether to use calculated sequence entropy as input to the model
        self.load_entropy = load_entropy
        # Whether to use coverage data as input to the model
        self.load_coverage = load_coverage
        # Whether to use recombination data as input to the model
        #self.load_recombination = load_recombination
        ##
        # The minimum complexity a sequence needs in order to be part of our train/test/val sets
        self.complexity_threshold = complexity_threshold
        ##
        # Load the reference genome
        self.referenceChrFull, ambiguous_bases = cs273b.load_bitpacked_reference(data_dir + "Homo_sapiens_assembly19.fasta.bp")
        # Preserve memory
        del ambiguous_bases
        ##
        # Index of the next training example (for batching purposes)
        self.cur_index = 0
        # Index of the next testing example (for batching purposes)
        self.test_index = 0
        # Index of the next location in the chromosome (for the load_chromosome_window_batch function)
        self.chrom_index = 0
        ##
        if seed is not None:
            np.random.seed(seed)
        self.__initializeTrainData(pos_frac)
        self.__initializeTestData()
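
    # A hypothetical construction call (parameter values are illustrative only, and the
    # data files under data_dir are assumed to exist):
    #     loader = DatasetLoader(windowSize=50, batchSize=200, pos_frac=0.5,
    #                            load_coverage=True, load_entropy=True, triclass=True)
    # With triclass=True, labels come back one-hot over {non-indel, insertion, deletion};
    # with nearby=20, negatives would be drawn only within 20 bp of a known indel.
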
    # Builds a dataset in which each element is a 2D array: a window of size k around an indel,
    # one-hot encoded as (2k + 1) x 4 base pairs.
    # Also includes the desired number of negative training examples (positions not listed as indels).
    # In this module we only look at high-complexity sequences, i.e. locations whose complexity
    # (measured over a window of size 20) is >= 1.1.
    def __initializeTrainData(self, frac_positives):
        ##
        # For brevity
        k = self.window
        # The window size used to compute sequence complexity
        k_seq_complexity = 20
        # We use chromosomes 2-22; chromosome 1 is held out until the very end
        num_chrom_used = 21
        ##
        # Number of indels in the entire train/test/val dataset
        lengthIndels = 25000*num_chrom_used
        # Number of non-indels in the entire dataset
        num_negatives = int(int((1./frac_positives-1) * lengthIndels)/num_chrom_used)*num_chrom_used
        # Number of locations in the entire dataset
        total_length = lengthIndels + num_negatives
        ##
        # Number of non-indels per chromosome
        num_negatives_per_chrom = int(num_negatives/num_chrom_used)
        # Number of indels per chromosome
        lengthIndels_per_chrom = int(lengthIndels/num_chrom_used)
        # Number of locations per chromosome
        total_length_per_chrom = lengthIndels_per_chrom + num_negatives_per_chrom
        ##
        # One-hot encoded sequences of size 2*k + 1 around each location
        dataset = np.zeros((total_length, 2*k + 1, 4))
        # Coverage corresponding to each location in the dataset
        coverageDataset = np.zeros((total_length, 2*k + 1))
        # Entropy of expanding windows around each location in the dataset
        entropyDataset = np.zeros((total_length, 2*k + 1))
        # Indices on the genome of the locations in the dataset
        indices = np.zeros(total_length, dtype=np.uint32)
        # Allele count values for indels, 0 for non-indels
        allele_count = np.zeros(total_length, dtype=np.uint32)
        nearby_indels = np.zeros(total_length, dtype=np.uint32)
        # The label is either a bool or an int depending on the number of classes
        if self.triclass:
            labeltype = np.uint8
        else:
            labeltype = np.bool_
        # 0 for non-indels, 1 (and 2 in the tri-class case) for indels
        labels = np.zeros(total_length, dtype=labeltype)
        # Appears to be the same as indices; TODO: does it need to be here?
        genome_positions = np.zeros(total_length, dtype=np.uint32)
        # The chromosome number corresponding to each location
        chrom_num = np.zeros(total_length, dtype=np.uint32)
        # Track the number of indels falling inside non-indel windows, as well as multiple indels inside a single indel window
        num_indel_neg_set = 0
        num_indel_pos_set = 0
        # Load data from chromosomes 2-22:
        # populate the dataset and related variables one chromosome at a time
        for chromosome in range(2, 23):
            ##
            # Load the chromosome from the full genome
            referenceChr = self.referenceChrFull[str(chromosome)]
            ## Load and process the positive (indel) dataset
            # This has 4 columns: indel location, allele count, filter value, insertion (1) or deletion (0)
            indel_data_load = np.load(data_dir + "indelLocationsFiltered" + str(chromosome) + ".npy")
            indel_indices_set = set(np.array(indel_data_load[:, 0], dtype=int))
            indel_data_load = indel_data_load[indel_data_load[:, 0] + k < referenceChr.shape[0]]
            # Remove those that have complexity below the threshold
            indel_sequence_indices = np.arange(2*k_seq_complexity + 1) - k_seq_complexity
            indel_sequence_indices = np.repeat(indel_sequence_indices, indel_data_load.shape[0], axis=0)
            indel_sequence_indices = np.reshape(indel_sequence_indices, [-1, indel_data_load.shape[0]])
            indel_sequence_indices += np.transpose(np.array(indel_data_load[:, 0], dtype=int))
            indel_sequence_complexity = entropy.entropySequence(referenceChr[indel_sequence_indices.transpose(), :])
            del indel_sequence_indices
            # Filter by the filter value and by sequence complexity (20-sized window) against the complexity threshold
            total_indices = np.arange(indel_data_load.shape[0])
            filtered_indices = np.logical_and(indel_data_load[:, 2] == 1, indel_sequence_complexity >= self.complexity_threshold)
            # Add an additional filter for allele count = 1
            filtered_indices = np.logical_and(indel_data_load[:, 1] == 1, filtered_indices)
            # Sample the indels, taking into account the classification problem at hand
            if self.triclass:
                filtered_indices_insert = np.logical_and(indel_data_load[:, 3] == 1, filtered_indices)
                filtered_indices_insert = total_indices[filtered_indices_insert]
                filtered_indices_delete = np.logical_and(indel_data_load[:, 3] == 0, filtered_indices)
                filtered_indices_delete = total_indices[filtered_indices_delete]
                insertionLocations = np.random.choice(filtered_indices_insert, size=int(lengthIndels_per_chrom/2), replace=False)
                deletionLocations = np.random.choice(filtered_indices_delete, size=lengthIndels_per_chrom - int(lengthIndels_per_chrom/2), replace=False)
                indel_indices = np.concatenate((insertionLocations, deletionLocations))
                del filtered_indices_insert, filtered_indices_delete, insertionLocations, deletionLocations
            else:
                filtered_indices = total_indices[filtered_indices]
                indel_indices = np.random.choice(filtered_indices, size=lengthIndels_per_chrom, replace=False)
            ##
            indelLocations = np.array(indel_data_load[indel_indices, 0], dtype=int)
            allele_count_val = indel_data_load[indel_indices, 1]
            del indel_data_load, indel_indices, filtered_indices, total_indices
            indelLocations = indelLocations - self.offset
            ## Load the coverage data if needed
            coverage = None
            if self.load_coverage:
                coverage = lc.load_coverage(data_dir + "coverage/{}.npy".format(chromosome))
            ## Create the negative dataset
            rel_size_neg_large = 2
            neg_positions_large = np.load(data_dir + "nonindelLocationsSampled" + str(chromosome) + '.npy')
            neg_positions_large = np.random.choice(neg_positions_large, size=rel_size_neg_large*num_negatives_per_chrom, replace=False)
            # Remove those that have complexity below the threshold
            neg_sequence_indices = np.arange(2*k_seq_complexity + 1) - k_seq_complexity
            neg_sequence_indices = np.repeat(neg_sequence_indices, len(neg_positions_large), axis=0)
            neg_sequence_indices = np.reshape(neg_sequence_indices, [-1, len(neg_positions_large)])
            neg_sequence_indices += np.transpose(neg_positions_large)
            neg_sequence_complexity = entropy.entropySequence(referenceChr[neg_sequence_indices.transpose(), :])
            neg_positions_large = neg_positions_large[neg_sequence_complexity >= self.complexity_threshold]
            del neg_sequence_indices, neg_sequence_complexity
            ##
            if self.nearby:
                # Create a list of all permissible nearby locations
                nearby_locations = np.arange(-self.nearby, self.nearby + 1)
                nearby_locations = np.repeat(nearby_locations, len(indelLocations), axis=0)
                nearby_locations = np.reshape(nearby_locations, [-1, len(indelLocations)])
                nearby_locations += np.transpose(indelLocations)
                nearby_locations = np.reshape(nearby_locations, -1)
                # Remove all indel locations and low-complexity non-indel locations from the nearby locations
                # (indel_indices_set holds every indel location on this chromosome)
                nearby_locations = np.array(list((set(nearby_locations) - indel_indices_set) & set(neg_positions_large)))
                if len(nearby_locations) >= num_negatives_per_chrom:
                    neg_positions = np.random.choice(nearby_locations, size=num_negatives_per_chrom, replace=False)
                else:
                    # Otherwise sample the remainder from the negative positions; this is the best that can be done. Try increasing the nearby size.
                    print("Try increasing nearby or rel_size_neg_large. Not enough nearby non-indels could be sampled in chromosome {}".format(chromosome))
                    num_neg_needed = num_negatives_per_chrom - len(nearby_locations)
                    not_nearby = np.random.choice(list((set(neg_positions_large) - indel_indices_set) - set(nearby_locations)), size=num_neg_needed, replace=False)
                    neg_positions = np.concatenate((nearby_locations, not_nearby))
            else:
                neg_positions = np.random.choice(neg_positions_large, size=num_negatives_per_chrom, replace=False)
            for i in range(lengthIndels_per_chrom + num_negatives_per_chrom):
                if i < lengthIndels_per_chrom:
                    if not self.triclass:
                        label = 1  # standard binary classification label
                    elif i < int(lengthIndels_per_chrom/2):
                        label = 1  # insertions are labeled 1
                    else:
                        label = 2  # deletions are labeled 2
                    pos = indelLocations[i]
                    allele_count[total_length_per_chrom*(chromosome - 2) + i] = allele_count_val[i]
                    num_indel_pos_set += len(indel_indices_set & set(range(pos - k, pos + k + 1)))
                else:
                    label = 0
                    pos = neg_positions[i - lengthIndels_per_chrom]
                    # TODO: compute the true value of nearby_indels
                    #if self.nearby:
                    num_indel_neg_set += len(indel_indices_set & set(range(pos - k, pos + k + 1)))
                indices[total_length_per_chrom*(chromosome - 2) + i] = pos
                coverageWindow = np.zeros(2*k + 1)
                # Get k base pairs before and after the position
                window = referenceChr[pos - k : pos + k + 1]
                if coverage is not None:
                    # Use the mean coverage across the window (the per-base alternative would be utils.flatten(coverage[pos - k : pos + k + 1]))
                    coverageWindow += np.mean(utils.flatten(coverage[pos - k : pos + k + 1]))
                dataset[total_length_per_chrom*(chromosome - 2) + i] = window
                coverageDataset[total_length_per_chrom*(chromosome - 2) + i] = coverageWindow
                labels[total_length_per_chrom*(chromosome - 2) + i] = label
                genome_positions[total_length_per_chrom*(chromosome - 2) + i] = pos
                chrom_num[total_length_per_chrom*(chromosome - 2) + i] = chromosome
        if self.load_entropy:
            entropyDataset[:, k+1:2*k+1] = entropy.entropyVector(dataset)
        ##
        # Randomly choose the validation and test chromosomes
        self.val_chrom, self.test_chrom = np.random.choice(range(2, 23), 2, replace=False)
        # Set the number of training examples, and the respective set indices
        total_indices = np.arange(total_length)
        self.num_train_examples = total_length_per_chrom*(num_chrom_used - 2)
        self.train_indices = total_indices[np.logical_and(chrom_num != self.val_chrom, chrom_num != self.test_chrom)]
        self.test_indices = total_indices[chrom_num == self.test_chrom]
        self.val_indices = total_indices[chrom_num == self.val_chrom]
        ##
        # Set the respective member variables
        self.dataset = dataset
        self.coverageDataset = coverageDataset
        self.entropyDataset = entropyDataset
        self.indices = indices
        self.allele_count = allele_count
        self.nearby_indels = nearby_indels
        self.genome_positions = genome_positions
        if self.triclass:
            self.labels = utils.to_onehot(labels, 3)
        else:
            self.labels = np.expand_dims(labels, axis=1)  # Make labels n by 1 (for convenience)
        del dataset, coverageDataset, entropyDataset, indices, allele_count, nearby_indels, genome_positions, labels
        # Diagnostics: indel counts inside positive/negative windows, and coverage statistics
        print(num_indel_pos_set)
        print(float(num_indel_pos_set)/lengthIndels)
        print(num_indel_neg_set)
        print(float(num_indel_neg_set)/num_negatives)
        print(np.mean(np.mean(self.coverageDataset, axis=1)))
        print(np.mean(np.var(self.coverageDataset, axis=1)))
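
    # The complexity filter above gathers a fixed-width window around every candidate
    # position in a single fancy-indexing step. A minimal sketch of the same trick,
    # with illustrative values that are not from the pipeline:
    #     w = 2
    #     positions = np.array([10, 50])
    #     offsets = np.arange(2*w + 1) - w                # [-2, -1, 0, 1, 2]
    #     windows = positions[:, None] + offsets[None, :]
    #     # windows == [[ 8,  9, 10, 11, 12],
    #     #             [48, 49, 50, 51, 52]]
    #     # referenceChr[windows, :] then has shape (2, 2*w + 1, 4): one one-hot window per row.
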
    def get_batch(self):
        return self.get_randbatch()  # default: random batch

    # Get the next batch of training examples. The training data is shuffled after every epoch.
    def get_randbatch(self, batchSize=0):
        if batchSize == 0:
            batchSize = self.batchSize
        # Randomize the order of examples if we are at the beginning of the next epoch
        if self.cur_index == 0:
            np.random.shuffle(self.train_indices)
        start, end = self.cur_index, self.cur_index + batchSize
        batch_indices = self.train_indices[start : end]  # Indices of the training examples that will make up the batch
        retval = [self.dataset[batch_indices]]
        if self.load_coverage:  # Load all additional data sources that are requested
            retval.append(self.coverageDataset[batch_indices])
        if self.load_entropy:
            retval.append(self.entropyDataset[batch_indices])
        #if self.load_recombination:
        #    retval.append(self.recombinationDataset[batch_indices])
        retval.append(self.labels[batch_indices])
        self.cur_index = end  # Start of the next batch
        if end >= self.num_train_examples:
            self.cur_index = 0  # Epoch just ended
        return tuple(retval)
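
    # A sketch of one training epoch driven by get_batch(). The tuple layout depends
    # on the flags passed at construction; assuming load_coverage=True and
    # load_entropy=False, each batch unpacks as (sequence, coverage, labels):
    #     for _ in range(loader.num_trainbatches()):
    #         seq, cov, y = loader.get_batch()
    #         ...  # feed the batch to the model
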
    def __initializeTestData(self):
        self.test_data = [self.dataset[self.test_indices]]
        print("Number of test examples: {}".format(len(self.test_data[0])))
        if self.load_coverage:
            self.test_data.append(self.coverageDataset[self.test_indices])
        if self.load_entropy:
            self.test_data.append(self.entropyDataset[self.test_indices])
        self.test_data.append(self.labels[self.test_indices])
        self.test_data = tuple(self.test_data)
    # Total number of training batches based on the given batch size
    def num_trainbatches(self):
        return int(ceil(float(self.num_train_examples) / self.batchSize))

    # Total number of test examples
    def len_testdata(self):
        return len(self.test_data[1])

    # Total number of test batches based on the given test batch size
    def num_testbatches(self):
        return int(ceil(float(len(self.test_data[1])) / self.testBatchSize))

    # Needed if manually testing batch by batch, and doing so multiple times
    def reset_test_index(self):
        self.test_index = 0

    # Get the next batch of testing data
    def get_testbatch(self):
        if self.test_index < len(self.test_data[1]):
            rv = [self.test_data[0][self.test_index:self.test_index+self.testBatchSize]]
            if self.load_coverage:
                rv.append(self.test_data[1][self.test_index:self.test_index+self.testBatchSize])
            if self.load_entropy:
                rv.append(self.test_data[-2][self.test_index:self.test_index+self.testBatchSize])
            rv.append(self.test_data[-1][self.test_index:self.test_index+self.testBatchSize])
        else:
            raise RuntimeError("test index is {}, only {} examples".format(self.test_index, len(self.test_data[1])))
        self.test_index += self.testBatchSize
        return tuple(rv)
    # Get the validation set
    def val_set(self, length=1000):
        np.random.shuffle(self.val_indices)
        val_indices = self.val_indices[:length]
        retval = [self.dataset[val_indices, :, :]]
        if self.load_coverage:
            retval.append(self.coverageDataset[val_indices, :])
        if self.load_entropy:
            retval.append(self.entropyDataset[val_indices, :])
        retval.append(self.labels[val_indices])
        return tuple(retval)
    def load_chromosome_window_data(self, chromosome):
        self.referenceChr = self.referenceChrFull[str(chromosome)]
        indel_data_load = np.load(data_dir + "indelLocationsFiltered" + str(chromosome) + ".npy")
        self.setOfIndelLocations = set(indel_data_load[np.logical_and(indel_data_load[:, 2] == 1, indel_data_load[:, 1] == 1), 0])

    def load_chromosome_window_batch_modified(self, window_size, batch_size):
        lb = max(window_size, self.chrom_index)  # We should probably pad with random values instead (though that may not be needed)
        ub = min(len(self.referenceChr) - window_size, lb + batch_size)  # Ditto; also, ub is not inclusive
        num_ex = ub - lb
        X, Y = np.mgrid[lb - window_size : lb - window_size + num_ex, 0 : 2*window_size + 1]
        self.chrom_index = ub
        labels = [ex in self.setOfIndelLocations for ex in range(lb, ub)]
        return self.referenceChr[X+Y, :], labels, range(lb, ub)
    def load_chromosome_window_batch(self, window_size, batch_size):
        lb = max(window_size, self.chrom_index)  # We should probably pad with random values instead (though that may not be needed)
        ub = min(len(self.referenceChr) - window_size, lb + batch_size)  # Ditto; also, ub is not inclusive
        num_ex = ub - lb
        X, Y = np.mgrid[lb - window_size : lb - window_size + num_ex, 0 : 2*window_size + 1]
        self.chrom_index = ub
        labels = [ex in self.setOfIndelLocations for ex in range(lb, ub)]
        return self.referenceChr[X+Y, :], labels, lb, ub
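
# A minimal, hypothetical driver, assuming the .bp reference and the per-chromosome
# .npy files under data_dir exist; all parameter values below are illustrative only.
if __name__ == "__main__":
    loader = DatasetLoader(windowSize=50, batchSize=200, pos_frac=0.5,
                           load_coverage=True, load_entropy=False)
    # With these flags, each training batch is (sequence windows, coverage, labels)
    seq, cov, y = loader.get_batch()
    print(seq.shape, cov.shape, y.shape)
    # Scan chromosome 21 window by window in fixed-size strides
    loader.load_chromosome_window_data(21)
    windows, labels, positions = loader.load_chromosome_window_batch_modified(
        window_size=50, batch_size=10000)
    print(windows.shape, sum(labels))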