#!/usr/bin/python
import argparse
import subprocess
import os
import numpy
import sys
import multiprocessing as mp
from gen_tree_overlappingCNA import gen_tree
from gen_readcount import gen_readcount
from gen_readcount import get_beta_dist
from Gen_Ref_Fa import make_fa, make_fa_wABs
from Gen_Ref_Fa import init_ref
from joblib import Parallel, delayed
from tqdm import tqdm
import logging
## set logger
# Timestamped INFO-level logging for progress messages throughout the run.
logging.basicConfig(
format='%(asctime)s %(levelname)-8s %(message)s',
level=logging.INFO,
datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger(" ")
#from Gen_Ref_Fa import getlen_ref
# given normal genome size (diploid)
#genome_size = 6.32 * 10^9
# fixed bulk uniformity
# Fixed Lorenz-curve point controlling bulk-sequencing coverage evenness
# (the single-cell counterpart is user-configurable via -x/-y).
x0_bulk = 0.5
y0_bulk = 0.38
# see how many processors I have
#NUM_OF_PROCESSES = mp.cpu_count()
def run_one_seg(dir, fa_f, chr_name, start, end, this_readcount, l, out_fq1, out_fq2):
    """Sequence one window of one chromosome into paired-end reads.

    Extracts [start, end] of chromosome ``chr_name`` from the allele fasta
    ``fa_f`` with ``samtools faidx`` into a temporary per-segment fasta, then
    runs ``wgsim`` to simulate ``this_readcount`` read pairs of length ``l``
    into ``out_fq1``/``out_fq2``.  Segments that are mostly N's are skipped
    because wgsim cannot simulate reads from all-N sequence.  The temporary
    fasta is removed before returning.

    Relies on the module-level ``wgsim_dir`` (location of the wgsim binary)
    and the sibling ``check_Ns`` helper.
    """
    # Per-segment temp fasta lives in the output dir, named after the region.
    tmp_fa_file = "_".join([dir + '/' + fa_f.split('/')[-1], str(chr_name), str(start), str(end)]) + ".fa"
    # Extract the window to be sequenced; shell=True is required for the
    # output redirection.  Inputs are program-generated, not user strings.
    args = "samtools faidx " + fa_f + " " + chr_name + ":" + str(start) + "-" + str(end) + " > " + tmp_fa_file
    print(args)
    try:
        subprocess.check_call(args, shell=True)
    except subprocess.CalledProcessError:
        print("Cannot work on " + args)
    # Skip windows whose fasta lines are >20% N's (see check_Ns).
    N_true = check_Ns(tmp_fa_file)
    if N_true == 1:
        args = (wgsim_dir + "wgsim -h -N " + str(this_readcount) + " -1 " + str(l)
                + " -2 " + str(l) + " " + tmp_fa_file + " " + out_fq1 + " " + out_fq2)
        print(args)
        try:
            subprocess.check_call(args, shell=True)
        except subprocess.CalledProcessError:
            print("Cannot work on " + args)
    # Remove the temp fasta directly instead of spawning a shell for "rm".
    try:
        os.remove(tmp_fa_file)
    except OSError:
        print("Cannot remove " + tmp_fa_file)
    return
# for both bulk and single-cell sampling purposes
def gen_reads(dir, index, leaf_index, all_chrlen, fa_prefix, Alpha, Beta, x0, y0, cov, l, window_size, u, chr_name_array, cell_i, original_level, bulk_or_sc):
    """Generate fastq reads for one tree node (one cell, or one bulk sample).

    For each allele fasta of node ``leaf_index[index]``, sample per-window
    read counts from the Metropolis-Hastings coverage model
    (``gen_readcount``) and sequence each window with ``run_one_seg``,
    which appends to a pair of fastq files named after level/node/cell/allele.

    Key parameters:
      dir            -- output directory for the fastq files.
      index          -- position of this node within ``leaf_index``.
      leaf_index     -- node ids at the level being sequenced.
      all_chrlen     -- per-allele lists of chromosome lengths for this node.
      Alpha, Beta, x0, y0 -- coverage-unevenness (Lorenz/Beta) parameters.
      cov, l         -- target coverage and read length.
      window_size, u -- MH window size and acceptance rate.
      cell_i         -- cell number within the clone (-1 for bulk).
      original_level -- user-specified level string, embedded in file names.
      bulk_or_sc     -- "bulk" or "sc"; controls output naming and messages.
    """
    this_leaf_index = leaf_index[index]
    print(all_chrlen)
    logger.info("Now sequencing cell " + str(cell_i) + " on node number " + str(index) + " which is on level " + str(original_level) + ". ")
    logger.info("len(all_chrlen):" + str(len(all_chrlen)))
    # One fasta (and one fastq pair) per allele.
    for i in range(len(all_chrlen)):
        logger.info('i: ' + str(i))
        fa_f = fa_prefix + str(this_leaf_index) + "_" + str(i + 1) + ".fa"
        # sequence the reads using wgsim
        out_fq_prefix = "level" + original_level + "_node" + str(this_leaf_index) + "_cell" + str(cell_i) + "_allele" + str(i)
        if bulk_or_sc == "bulk":
            # Bulk output is per node, not per cell.
            out_fq_prefix = "level" + original_level + "_node" + str(this_leaf_index) + "_allele" + str(i)
        out_fq1 = dir + "/" + out_fq_prefix + "_1.fq"
        out_fq2 = dir + "/" + out_fq_prefix + "_2.fq"
        # Start from empty fastqs; run_one_seg appends segment by segment.
        # (os.remove replaces the former shell "rm", which printed errors
        # whenever the files did not exist yet.)
        for stale_fq in (out_fq1, out_fq2):
            try:
                os.remove(stale_fq)
            except OSError:
                pass  # absent on a fresh run -- nothing to clean
        # Each chromosome of this allele.
        for j in range(len(all_chrlen[i])):
            this_chrlen = all_chrlen[i][j]
            readcounts = gen_readcount(dir, Alpha, Beta, x0, y0, cov, l, window_size, u, all_chrlen[i][j])
            # Walk the chromosome window by window; k is the sampled read
            # count for a window, halved -- presumably because the counts
            # cover both alleles (TODO confirm against gen_readcount).
            start = 0
            end = 0
            for k in readcounts:
                this_readcount = int(k / 2)
                start = end + 1
                end = start + window_size - 1
                print('end: ' + str(end))
                if end > this_chrlen:
                    end = this_chrlen
                if start > this_chrlen:
                    break
                run_one_seg(dir, fa_f, chr_name_array[j], start, end, this_readcount, l, out_fq1, out_fq2)
    if bulk_or_sc == "bulk":
        print("Done with node " + str(this_leaf_index) + " for bulk sequencing. ")
    else:
        print("Done with node " + str(this_leaf_index) + " cell " + str(cell_i))
def convert_coverage(cov, all_chrlen, ref_total_size):
    """Rescale coverage so it refers to a normal diploid genome length.

    ``cov`` is the coverage requested for a diploid genome whose haploid
    reference totals ``ref_total_size`` bp.  The simulated genome may have
    gained or lost copies, so the same number of reads spread over a longer
    (or shorter) genome would change the effective coverage; rescaling by
    2 * ref_total_size / actual_length keeps the read count
    ploidy-independent.

    all_chrlen -- nested per-allele lists of chromosome lengths.
    Raises ZeroDivisionError if all_chrlen sums to zero length.
    """
    total_len = sum(length for allele in all_chrlen for length in allele)
    return cov / total_len * (2 * ref_total_size)
def check_Ns(file):
    """Return 1 if *file* is sequenceable, 0 if it is mostly N's.

    Counts lines of the fasta that contain at least one 'N' (the header line
    is included in the count); if more than 20% of lines contain an N the
    segment is considered unusable for wgsim and 0 is returned.  An empty
    file returns 1.
    """
    N_line = 0
    total_line = 0
    # The with-block closes the file; no manual close needed.
    with open(file, "r") as f:
        for line in f:
            total_line = total_line + 1
            if 'N' in line:
                N_line = N_line + 1
    if total_line != 0 and N_line / float(total_line) > 0.2:
        return 0
    return 1
# With no command-line arguments, print the full usage text and exit
# cleanly.  (The help-string content is user-facing and left untouched.)
if len(sys.argv) <= 1:
    print("""
A single cell simulator generating low coverage data. The program automatically generates a phylogenetic tree with copy number variations on the branches. On each leave of the tree, it generates the reads whose error profile, such as uneven coverage, mimics the real single cell data.
Usage: python main.py -t [ref.fa] -n [number_leafs] -S [wgsim_dir]
-p (--processors) Numbers of processors available.
-r (--directory) Location of simulated data. The program will remove the whole directory if it already exists. Otherwise it will create one. (default: test)
-S (--wgsim-dir) The directory of the binary of wgsim. It is in the same folder of this main.py. (need to specify)
-n (--cell-num) Number of the cells on a level of interest. Always greater than -F treewidth. Treewidth controls the total number of clones whereas cell-num controls the total number of cells sequenced at a certain tree depth. (default: 8)
-B (--Beta) The program uses the Beta-splitting model to generate the phylogenetic tree. Specify a value between [0, 1]. (default: 0.5)
-A (--Alpha) The Alpha in Beta-splitting model. Specify a value between [0, 1]. The closer Alpha and Beta, the more balanced the tree. (default: 0.5).
-D (--Delta) The rate of a node to disappear. Specify a value between [0, 1]. If all nodes have daughter nodes, take 0. (default: 0)
-G (--treedepth) The mean of the tree depth distribution. The final tree depth will be sampled from a Gaussian with this mean and a fixed standard deviation. (default: 4 counting from the first cancer cell)
-K (--treedepthsigma) The standard deviation of the tree depth distribution. To get exactly the tree depth defined by -F, use a very small standard deviation, e.g., 0.0001. (default: 0.5)
-F (--treewidth) The mean of the tree width distribution. The final tree width will be sampled from a Gaussian with this mean and a fixed standard deviation. (default: 8)
-H (--treewidthsigma) The standard deviation of the tree width distribution. To get exactly the tree width defined by -F, use a very small standard deviation, e.g., 0.0001. (default: 0.5)
-c (--cn-num) The average number of copy number variations to be added on a branch. (default: 1)
-d (--del-rate) The rate of deletion as compared to amplification. (default: 0.5)
-m (--min-cn-size) Minimum copy number size. (default: 200,000bp)
-e (--exp-theta) The parameter for the Exponential distribution for copy number size, beyond the minimum one. (default: 0.000001)
-a (--amp-p) The parameter for the Genometric distribution for the number of copies amplified. (default: 0.5)
-t (--template-ref) The reference file to sequence the reads.
-o (--outfile) The standard output file, will be saved in output folder, just give the file name. (default: std.out)
-f (--fa-prefix) The prefix of the alleles and read names. (default: ref)
-x (--Lorenz-x) The value on the x-axis of the point furthest from the diagonal on the Lorenz curve imitating the real coverage uneveness. (default: 0.5)
-y (--Lorenz-y) The value on the y-axis of the Lorenz curve imitating the real coverage unevenness. x > y. The closer (x, y) to the diagonal, the better the coverage evenness. (default: 0.4)
-v (--coverage) The average coverage of the sequence. (default: 0.02)
-l (--readlen) Read length for each read sequenced. (default: 35bp)
-w (--window-size) Within a window, the coverage is according to a Gaussian distribution. Neighboring windows' read coverage is according to a Metropolis Hasting process. (default: 200000bp)
-u (--acceptance-rate) The probability to accept a proposal in Metropolis Hasting. (default: 0.5)
-k (--skip-first-step) If the alleles for all nodes have been made, the step can be skipped. Make it 1 then. (default: 0)
-R (--snv-rate) The rate of the snv. snv-rate * branch-length = # snvs. (default: 1)
-X (--multi-root) The multiplier of the mean CNV on root. (default: 4)
-W (--whole-amp) If there is whole chromosome amplification, 1 as yes. (default: 1)
-C (--whole-amp-rate) Whole amplification rate: rate of an allele chosen to be amplified (default: 0.2)
-E (--whole-amp-num) Whole amplification copy number addition, which occurs to one allele at a time. (default: 1)
-J (--amp-num-geo-par) Whole amplification copy number distribution (geometric distribution parameter: the smaller, the more evenly distributed). (default: 1)
-Y (--leaf-index-range) For parallele job submission. >= min, < max leaf index will be processed. min.max. This counts leaf nodes from 0. (default: -1)
-L (--levels) This is for both tree inference and sampling ancestral nodes. If the user is interested in sequencing nodes on multiple levels, user can specify the levels of interest by separating them by semicolon. The first tumor cell/clone under the trunk has level 1. If counting from the bottom (leaf) of the tree, use minus before the number. For example, -1 is the leaf level. The range of the level should be within [-depth, depth]. Users can specify desired levels according to -G to know which levels are available. If that is the case, use a very small -K to make sure the depth is not smaller than the biggest level you specify. (default: -1)
-U (--bulk-levels) The levels of the bulk sequencing separated by semicolon. The definition of the levels is the same as in -L. The default for this option is NA, meaning no bulk sequencing.
-V (--cov-bulk) The coverage of the bulk sequencing. The same for all levels. This parameter is needed when -U is identified. (default: 30)
-M (--single-cell-per-node) If this is on, each node represents one cell and there is no clonality in the node. In this case tree_width will be the same as n (leaf num). 1 is on. (default: 0)
""")
    sys.exit(0)
parser = argparse.ArgumentParser(description='SCSim: A simulator for simulating cancer phylogenetic tree with copy number variations and single nucleotide variations for single cells and bulk sequencing.')
parser.add_argument('-p', '--processors', default=4)
parser.add_argument('-r', '--directory', default="test")
parser.add_argument('-S', '--wgsim-dir', default="")
parser.add_argument('-n', '--cell-num', default=8)
parser.add_argument('-B', '--Beta', default=0.5)
parser.add_argument('-A', '--Alpha', default=0.5)
parser.add_argument('-D', '--Delta', default=0)
# add the width to the tree
parser.add_argument('-F', '--treewidth', default=8)
# add the depth to the tree
parser.add_argument('-G', '--treedepth', default=4)
parser.add_argument('-H', '--treewidthsigma', default=0.5)
parser.add_argument('-K', '--treedepthsigma', default=0.5)
#parser.add_argument('-O', '--Output', default="test")
parser.add_argument('-c', '--cn-num', default=1)
parser.add_argument('-d', '--del-rate', default=0.5)
parser.add_argument('-m', '--min-cn-size', default=2000000)
#parser.add_argument('-m', '--min-cn-size', default=10)
parser.add_argument('-e', '--exp-theta', default=5000000)
parser.add_argument('-a', '--amp-p', default=0.5)
parser.add_argument('-t', '--template-ref', default="hg19.fa")
#parser.add_argument('-t', '--template-ref', default="chr1.fa")
parser.add_argument('-o', '--outfile', default="std.out")
parser.add_argument('-f', '--fa-prefix', default="ref")
parser.add_argument('-x', '--Lorenz-x', default=0.5)
parser.add_argument('-y', '--Lorenz-y', default=0.4)
parser.add_argument('-v', '--coverage', default=0.02)
#parser.add_argument('-v', '--coverage', default=10)
parser.add_argument('-l', '--readlen', default=35)
parser.add_argument('-w', '--window-size', default=200000)
#parser.add_argument('-w', '--window-size', default=20)
parser.add_argument('-u', '--acceptance-rate', default=0.5)
parser.add_argument('-k', '--skip-first-step', default=0)
parser.add_argument('-R', '--snv_rate', default=0)
parser.add_argument('-X', '--multi-root', default=4)
parser.add_argument('-W', '--whole-amp', default=0)
parser.add_argument('-C', '--whole-amp-rate', default=0.2)
parser.add_argument('-E', '--whole-amp-num', default=1)
parser.add_argument('-J', '--amp-num-geo-par', default=1)
parser.add_argument('-Y', '--leaf-index-range', default="-1")
#parser.add_argument('-I', '--leaf-ID-range', default="-1")
parser.add_argument('-L', '--levels', default="-1")
parser.add_argument('-U', '--bulk-levels', default="NA")
parser.add_argument('-V', '--cov-bulk', default="30")
parser.add_argument('-M', '--single-cell-per-clone', default=0)
args = parser.parse_args()
NUM_OF_PROCESSES = int(args.processors)
skip = int(args.skip_first_step)
dir = args.directory
save_prefix = dir + "/" + "from_first_step"
wgsim_dir = args.wgsim_dir
n = int(args.cell_num)
Beta = float(args.Beta)
Alpha = float(args.Alpha)
Delta = float(args.Delta)
treeWidth = float(args.treewidth)
treeDepth = float(args.treedepth)
treeWidthSigma = float(args.treewidthsigma)
treeDepthSigma = float(args.treedepthsigma)
#Output = args.Output
cn_num = int(args.cn_num)
del_rate = float(args.del_rate)
min_cn_size = int(args.min_cn_size)
exp_theta = float(args.exp_theta)
amp_p = float(args.amp_p)
template_ref = args.template_ref
outfile = dir + "/" + args.outfile
fa_prefix = dir + "/" + args.fa_prefix
x0 = float(args.Lorenz_x)
y0 = float(args.Lorenz_y)
cov = float(args.coverage)
l = int(args.readlen)
window_size = int(args.window_size)
u = float(args.acceptance_rate)
#sigma = args.sigma_for_Gaussian
snv_rate = float(args.snv_rate)
root_mult = int(args.multi_root)
whole_amp = int(args.whole_amp)
whole_amp_rate = float(args.whole_amp_rate)
whole_amp_num = int(args.whole_amp_num)
amp_num_geo_par = float(args.amp_num_geo_par)
# for one cell per clone, specify leaf_index_range instead of leaf_ID_range
leaf_index_range = args.leaf_index_range
single_cell_per_clone = (int(args.single_cell_per_clone) == 1)
index_min = 0
index_max = 10000000
if leaf_index_range != "-1":
index_min, index_max = leaf_index_range.split(".")
index_min = int(index_min)
index_max = int(index_max)
# leaf_ID_range is related with the level_index
#leaf_ID_range = args.leaf_ID_range
#leaf_index_min = 0
#leaf_index_max = 10000000
#if leaf_ID_range != "-1":
# leaf_index_min, leaf_index_max = leaf_ID_range.split(".")
# leaf_index_min = int(leaf_index_min)
# leaf_index_max = int(leaf_index_max)
# levels for tree inferencer and longitudinal study
# process level so that the negative ones are turned to positive, the ones over the deepest level will be reported, and all levels will be sorted in a numerical increasing order
levels = args.levels.split(";")
bulk_levels = []
if args.bulk_levels != "NA":
bulk_levels = args.bulk_levels.split(";")
cov_bulk = float(args.cov_bulk)
# Step 1 setup: create the output directory (refuse to overwrite an
# existing one so previous simulations are never clobbered).
if skip == 0:
    if not os.path.exists(dir):
        subprocess.check_call("mkdir " + dir, shell=True)
    else:
        print(dir + " exists. Exit. ")
        sys.exit(0)
leaf_chrlen = []
leaf_index = []
chr_name_array = []
# Step 1. generate a phylogentic tree that has copy number alterations on branches.
# Now all CNs are in tree, not generating fa or remember it in the tree nodes.
if skip == 0:
    print("before generating tree:")
    [tree, tree_elements_arr] = gen_tree(Beta, Alpha, Delta, treeWidth, treeWidthSigma, treeDepth, treeDepthSigma, dir, cn_num, del_rate, min_cn_size, exp_theta, amp_p, template_ref, outfile, fa_prefix, snv_rate, root_mult, whole_amp, whole_amp_rate, whole_amp_num, amp_num_geo_par)
    #[tree, tree_elements_arr] = gen_tree(n, Beta, Alpha, Delta, treeDepth, treeDepthSigma, dir, cn_num, del_rate, min_cn_size, exp_theta, amp_p, template_ref, outfile, fa_prefix, snv_rate, root_mult, whole_amp, whole_amp_rate, whole_amp_num, amp_num_geo_par)
    #[level_chrlens, level_indices, chr_name_array, tree] = gen_tree(n, Beta, Alpha, Delta, treeWidth, treeWidthSigma, treeDepth, treeDepthSigma, dir, cn_num, del_rate, min_cn_size, exp_theta, amp_p, template_ref, outfile, fa_prefix, snv_rate, root_mult, whole_amp, whole_amp_rate, whole_amp_num, amp_num_geo_par)
    #numpy.save(save_prefix + ".tree_elements.npy", tree_elements)
    #numpy.save(save_prefix + ".leaf_chrlen.npy", leaf_chrlen)
    #numpy.save(save_prefix + ".leaf_index.npy", leaf_index)
    #numpy.save(save_prefix + ".level_chrlen.npy", level_chrlens)
    #numpy.save(save_prefix + ".level_index.npy", level_indices)
    #numpy.save(save_prefix + ".chr_name_array.npy", chr_name_array)
    # save the tree for parallele job submission afterwards
    numpy.save(save_prefix + ".tree_elements.npy", tree_elements_arr)
    numpy.save(save_prefix + ".tree.npy", tree)
    #numpy.save(save_prefix + ".ref.npy", ref)
    print("Done with generating the tree. Save to npy. ")
    #print leaf_index
#print leaf_index
# Step 2. Consider even coverage, use metropolis hasting to sample read count in each bin based on a given point on Lorenz curve.
# if first step was skipped, read the previous stored file
if skip == 1:
print("Skip the first step. Reading ")
#abs_path = os.getcwd()
tree_elements_f = save_prefix + ".tree_elements.npy"
#leaf_chrlen_f = save_prefix + ".leaf_chrlen.npy"
# level chrlen, instead of having only the chrlen on the leaves, now also record the chrlen on each level
#level_chrlen_f = save_prefix = ".level_chrlen.npy"
#leaf_index_f = save_prefix + ".leaf_index.npy"
# each level leads to the node indices on this level, dict: level (1 for the first tumor cell) -> node indices separated by semi-colon
#level_index_f = save_prefix + ".level_index.npy"
#chr_name_array_f = save_prefix + ".chr_name_array.npy"
tree_f = save_prefix + ".tree.npy"
#ref_f = save_prefix + ".ref.npy"
#leaf_chrlen = numpy.load(leaf_chrlen_f)
# record the chrlen for each level, dict: level (1 for the first tumor cell) -> array with chrlen at this level
#level_chrlens = numpy.load(level_chrlen_f)
#leaf_index = numpy.load(leaf_index_f)
#level_indices = numpy.load(level_index_f)
#chr_name_array = numpy.load(chr_name_array_f)
tree_elements_arr = numpy.load(tree_elements_f, allow_pickle=True)
tree_elements = tree_elements_arr[0]
tree = numpy.load(tree_f, allow_pickle=True)
# depth is the level where the leaves are
depth = tree_elements.tree_D
# max_depth is the level whre the leaves are, tree_D is tempoariraly defunct
max_depth = tree_elements.max_depth
#depth = tree_elements.get_depth()
level_chrlens = tree_elements.level_chrlens
level_indice = tree_elements.level_indice
chr_name_array = tree_elements.chr_name_array
[ref, tmp_chr_name, tmp_len_chr] = init_ref(template_ref)
# get the length of the reference for normalizing the coverage
#tmp_name_array, chr_sz = getlen_ref(ref)
ref_total_sz = 0
for i in tmp_len_chr:
ref_total_sz += i
#ref = numpy.load(ref_f)
#print(leaf_chrlen_f)
#print(leaf_index_f)
#print(chr_name_array_f)
# make it either making a tree, or generating the leaves
# now alpha and beta are redefined and are for the read coverage uniformity
# Map the user's Lorenz-curve point (x0, y0) to Beta-distribution parameters
# that drive the per-window coverage model.
[Alpha, Beta] = get_beta_dist(x0, y0)
# Bulk sequencing: one merged fastq per requested level, with every node on
# the level sequenced in its own process and weighted by its clone fraction.
if len(bulk_levels) != 0:
    [Alpha_bulk, Beta_bulk] = get_beta_dist(x0_bulk, y0_bulk)
    print("Bulk sequencing: Alpha = " + str(Alpha_bulk) + ", Beta = " + str(Beta_bulk))
    # start sequencing bulk
    for bulk_level in bulk_levels:
        index = 0
        # Keep the user's level string for file names; negative levels count
        # from the leaves and are converted to absolute levels here.
        original_level = bulk_level
        bulk_level = int(bulk_level)
        if bulk_level < 0:
            bulk_level = max_depth + bulk_level + 1
        print("bulk_level = " + str(bulk_level))
        print("level_chrlens = " + str(level_chrlens))
        if len(level_chrlens[bulk_level]) == 0:
            sys.exit("Warning: level " + original_level + " is out of range. Please specify a small -K, and use the -G to constrain your specified levels. ")
        level_chrlen = level_chrlens[bulk_level]
        level_index = level_indice[bulk_level]
        processes = []
        fq_file_names = []
        # the parallel computing is on each node on the same level
        for all_chrlen in level_chrlen:
            make_fa_wABs(level_index[index], tree, ref, chr_name_array, fa_prefix)
            # Scale this node's bulk coverage by its clone percentage so the
            # merged fastq reflects the clonal composition.
            perc = tree[level_index[index]].perc
            ref_files = fa_prefix + str(level_index[index]) + "_*.fa"
            this_cov_bulk = perc * convert_coverage(cov_bulk, all_chrlen, ref_total_sz)
            processes.append(mp.Process(target=gen_reads, args=(dir, index, level_index, all_chrlen, fa_prefix, Alpha_bulk, Beta_bulk, x0_bulk, y0_bulk, this_cov_bulk, l, window_size, u, chr_name_array, -1, original_level, "bulk")))
            # when all nodes at this level is sampled, merge them into one big fastq file as this is bulk sequencing
            fq_file_name1 = "level" + original_level + "_node" + str(level_index[index]) + "_allele1.fq"
            fq_file_names.append(fq_file_name1)
            fq_file_name2 = "level" + original_level + "_node" + str(level_index[index]) + "_allele2.fq"
            fq_file_names.append(fq_file_name2)
            index = index + 1
        bulk_fq = "level" + original_level + "_bulk.fq"
        # Run all per-node sequencing processes, then concatenate their
        # per-allele fastqs into the single bulk fastq for this level.
        for p in processes:
            p.start()
        for p in processes:
            p.join()
        for f in fq_file_names:
            os.system("cat " + f + " >> " + bulk_fq)
        print("Done with generating bulk for level " + original_level)
#print("Alpha = %.2f, Beta = %.2f", Alpha, Beta)
print("Number of processes: " + str(NUM_OF_PROCESSES))
# assume for each level, the total number of cells remain the same
print("Number of cells for each level: " + str(n))
print("levels: " + str(levels))
# Serial for each level and each node. For each node, make it parallel.
# Single-cell sequencing: levels are processed serially; within a clone the
# cells are sequenced in parallel via joblib.
for level in levels:
    index = 0
    # process level by level, levels are specified by the user
    # original_level is a string and might be negative. User specified level representation will be in the fq file names.
    original_level = level
    level = int(level)
    if level < 0:
        # -1 is for leaves, will transform it to leaf's level
        level = max_depth + level + 1
    print("Will sequence level " + str(level))
    if len(level_chrlens[level]) == 0:
        sys.exit("Warning: level " + original_level + " is out of range. Please specify a small -K, and use the -G to constrain your specified levels. ")
    level_chrlen = level_chrlens[level]
    level_index = level_indice[level]
    for all_chrlen in level_chrlen:
        # convert the coverage so that it assumes a normal ploidy genome length (this is to avoid inflation of read number due to the increase of ploidy)
        this_cov = convert_coverage(cov, all_chrlen, ref_total_sz)
        print("Given cov is " + str(cov) + ", and the coverage for this cell after considering ploidy is " + str(this_cov))
        # each node at this level
        #for all_chrlen in leaf_chrlen:
        # each leaf
        # parallelize the whole process on the cells on a clone
        # NOTE(review): processes appears unused in this single-cell path
        # (parallelism goes through joblib below) -- likely left over from
        # the bulk path.
        processes = []
        #TODO add a function here to retrieve the CN and apply to reference, and write to a fa file for the two alleles. This has been done.
        #leaf_index_ = leaf_index[index]
        # the index of the node obtained from level_index is w.r.t. the chrlen from level_chrlen
        # this is the node id including interal nodes.
        level_index_ = level_index[index]
        #if leaf_index_range != "-1" and index < index_max and index >= index_min or leaf_ID_range != "-1" and leaf_index_ < leaf_index_max and leaf_index_ >= leaf_index_min or leaf_index_range == "-1" and leaf_ID_range == "-1":
        #if leaf_index_range != "-1" and index < index_max and index >= index_min or leaf_ID_range != "-1" and level_index_ < leaf_index_max and level_index_ >= leaf_index_min or leaf_index_range == "-1" and leaf_ID_range == "-1":
        # Sequence this node only if it falls inside the -Y index window
        # (or no window was given).
        if leaf_index_range != "-1" and index < index_max and index >= index_min or leaf_index_range == "-1":
            #make_fa(leaf_index[index], tree, ref, chr_name_array, fa_prefix)
            make_fa_wABs(level_index[index], tree, ref, chr_name_array, fa_prefix)
            # since this is a subclone, get the percentage of it
            #perc = tree[leaf_index[index]].perc
            perc = tree[level_index[index]].perc
            cell_num = int(n * perc)
            if not single_cell_per_clone:
                print("Cell number for the current clone is " + str(cell_num))
            # allowing not considering multiple cells in a clone (node in a tree), thus each node has only one cell
            else:
                cell_num = 1
            #total_cell_num += cell_num
            #TODO need to take care the difference of the added cell number and the total
            cell_i_last = False
            # Prepare per-cell output dirs and index the allele fastas before
            # the parallel sequencing below.
            for cell_i in range(cell_num):
                # print(" Sequencing cell " + str(cell_i))
                # sequence each cell in this subclone
                if cell_i == cell_num - 1:
                    # for removing the fa file
                    cell_i_last = True
                os.system("mkdir -p "+dir+'/cell%d/'%cell_i)
                # parallelization happens inside each cell
                # print(all_chrlen)
                # p = mp.Process(target=gen_reads, args=(dir, index, level_index, all_chrlen, fa_prefix, Alpha, Beta, x0, y0, this_cov, l, window_size, u, chr_name_array, cell_i, original_level, "sc"))
                # p.start()
                # p.join()
                for i in range(len(all_chrlen)):
                    fa_f = fa_prefix + str(level_index[index]) + "_" + str(i + 1) + ".fa"
                    args = "samtools faidx " + fa_f
                    popen = subprocess.Popen(args, stdout=subprocess.PIPE, shell=True)
                    popen.wait()
            # n_thread = 15
            # Sequence every cell of the clone in parallel; each cell writes
            # into its own dir/cell<i>/ subdirectory.
            sequences = Parallel(n_jobs=NUM_OF_PROCESSES )(delayed(gen_reads)(dir+'/cell%d/'%cell_i,
                index,
                level_index,
                all_chrlen,
                fa_prefix,
                Alpha,
                Beta,
                x0,
                y0,
                this_cov,
                l,
                window_size,
                u,
                chr_name_array,
                cell_i,
                original_level,
                "sc") for cell_i in tqdm(range(cell_num)))
            if cell_i_last:
                #print "Done with the whole clone " + str(leaf_index[index]) + ", will remove it. "
                print("Done with the whole clone " + str(level_index[index]) + ", will remove it. ")
                #ref_files = fa_prefix + str(leaf_index[index]) + "_*.fa"
                ref_files = fa_prefix + str(level_index[index]) + "_*.fa"
                args = "rm " + ref_files
                os.system(args)
        # deal with the next node on this level
        index = index + 1
    print("At level" + str(level))
# clean up the temp fa files
#args = "rm " + fa_prefix + "*_*_*_*_*.fa"
#popen = subprocess.Popen(args, stdout=subprocess.PIPE, shell=True)
#popen.wait()