/
shared.py
302 lines (271 loc) · 12.5 KB
/
shared.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
"""Pipeline functionality shared amongst multiple analysis types.
"""
import os
from contextlib import contextmanager
import functools
import operator
import tempfile
import pybedtools
import pysam
import six
import toolz as tz
from bcbio import bam, broad, utils
from bcbio.bam import ref
from bcbio.pipeline import datadict as dd
from bcbio.pipeline import config_utils
from bcbio.utils import file_exists, save_diskspace
from bcbio.distributed.transaction import file_transaction, tx_tmpdir
from bcbio.provenance import do
from functools import reduce
# ## Split/Combine helpers
def combine_bam(in_files, out_file, config):
"""Parallel target to combine multiple BAM files.
"""
runner = broad.runner_from_path("picard", config)
runner.run_fn("picard_merge", in_files, out_file)
for in_file in in_files:
save_diskspace(in_file, "Merged into {0}".format(out_file), config)
bam.index(out_file, config)
return out_file
def get_noalt_contigs(data):
"""Retrieve contigs without alternatives as defined in bwa *.alts files.
If no alt files present (when we're not aligning with bwa), work around
with standard set of alts based on hg38 -- anything with HLA, _alt or
_decoy in the name.
"""
alts = set([])
alt_files = [f for f in tz.get_in(["reference", "bwa", "indexes"], data, []) if f.endswith("alt")]
if alt_files:
for alt_file in alt_files:
with open(alt_file) as in_handle:
for line in in_handle:
if not line.startswith("@"):
alts.add(line.split()[0].strip())
else:
for contig in ref.file_contigs(dd.get_ref_file(data)):
if ("_alt" in contig.name or "_decoy" in contig.name or
contig.name.startswith("HLA-") or ":" in contig.name):
alts.add(contig.name)
return [c for c in ref.file_contigs(dd.get_ref_file(data)) if c.name not in alts]
def write_nochr_reads(in_file, out_file, config):
"""Write a BAM file of reads that are not mapped on a reference chromosome.
This is useful for maintaining non-mapped reads in parallel processes
that split processing by chromosome.
"""
if not file_exists(out_file):
with file_transaction(config, out_file) as tx_out_file:
samtools = config_utils.get_program("samtools", config)
cmd = "{samtools} view -b -f 4 {in_file} > {tx_out_file}"
do.run(cmd.format(**locals()), "Select unmapped reads")
return out_file
def write_noanalysis_reads(in_file, region_file, out_file, config):
"""Write a BAM file of reads in the specified region file that are not analyzed.
We want to get only reads not in analysis regions but also make use of
the BAM index to perform well on large files. The tricky part is avoiding
command line limits. There is a nice discussion on SeqAnswers:
http://seqanswers.com/forums/showthread.php?t=29538
sambamba supports intersection via an input BED file so avoids command line
length issues.
"""
if not file_exists(out_file):
with file_transaction(config, out_file) as tx_out_file:
bedtools = config_utils.get_program("bedtools", config)
sambamba = config_utils.get_program("sambamba", config)
cl = ("{sambamba} view -f bam -l 0 -L {region_file} {in_file} | "
"{bedtools} intersect -abam - -b {region_file} -f 1.0 -nonamecheck"
"> {tx_out_file}")
do.run(cl.format(**locals()), "Select unanalyzed reads")
return out_file
def subset_bam_by_region(in_file, region, config, out_file_base=None):
"""Subset BAM files based on specified chromosome region.
"""
if out_file_base is not None:
base, ext = os.path.splitext(out_file_base)
else:
base, ext = os.path.splitext(in_file)
out_file = "%s-subset%s%s" % (base, region, ext)
if not file_exists(out_file):
with pysam.Samfile(in_file, "rb") as in_bam:
target_tid = in_bam.gettid(region)
assert region is not None, \
"Did not find reference region %s in %s" % \
(region, in_file)
with file_transaction(config, out_file) as tx_out_file:
with pysam.Samfile(tx_out_file, "wb", template=in_bam) as out_bam:
for read in in_bam:
if read.tid == target_tid:
out_bam.write(read)
return out_file
def subset_bed_by_chrom(in_file, chrom, data, out_dir=None):
"""Subset a BED file to only have items from the specified chromosome.
"""
if out_dir is None:
out_dir = os.path.dirname(in_file)
base, ext = os.path.splitext(os.path.basename(in_file))
out_file = os.path.join(out_dir, "%s-%s%s" % (base, chrom, ext))
if not utils.file_uptodate(out_file, in_file):
with file_transaction(data, out_file) as tx_out_file:
_rewrite_bed_with_chrom(in_file, tx_out_file, chrom)
return out_file
def _rewrite_bed_with_chrom(in_file, out_file, chrom):
with open(in_file) as in_handle:
with open(out_file, "w") as out_handle:
for line in in_handle:
if line.startswith("%s\t" % chrom):
out_handle.write(line)
def _subset_bed_by_region(in_file, out_file, regions, ref_file, do_merge=True):
orig_bed = pybedtools.BedTool(in_file)
region_bed = pybedtools.BedTool("\n".join(["%s\t%s\t%s" % (c, s, e) for c, s, e in regions]) + "\n",
from_string=True)
sort_kwargs = {"faidx": ref.fasta_idx(ref_file)} if ref_file else {}
if do_merge:
orig_bed.intersect(region_bed, nonamecheck=True).saveas().sort(**sort_kwargs).saveas().\
filter(lambda x: len(x) >= 1).saveas().merge().saveas(out_file)
else:
orig_bed.intersect(region_bed, nonamecheck=True).saveas().sort(**sort_kwargs).saveas().\
filter(lambda x: len(x) >= 1).saveas(out_file)
def remove_lcr_regions(orig_bed, items):
"""If configured and available, update a BED file to remove low complexity regions.
"""
lcr_bed = tz.get_in(["genome_resources", "variation", "lcr"], items[0])
if lcr_bed and os.path.exists(lcr_bed) and "lcr" in get_exclude_regions(items):
return _remove_regions(orig_bed, [lcr_bed], "nolcr", items[0])
else:
return orig_bed
def remove_polyx_regions(in_file, items):
"""Remove polyX stretches, contributing to long variant runtimes.
"""
ex_bed = tz.get_in(["genome_resources", "variation", "polyx"], items[0])
if ex_bed and os.path.exists(ex_bed):
return _remove_regions(in_file, [ex_bed], "nopolyx", items[0])
else:
return in_file
def add_highdepth_genome_exclusion(items):
"""Add exclusions to input items to avoid slow runtimes on whole genomes.
"""
out = []
for d in items:
d = utils.deepish_copy(d)
if dd.get_coverage_interval(d) == "genome":
e = dd.get_exclude_regions(d)
if "highdepth" not in e:
e.append("highdepth")
d = dd.set_exclude_regions(d, e)
out.append(d)
return out
def remove_highdepth_regions(in_file, items):
"""Remove high depth regions from a BED file for analyzing a set of calls.
Tries to avoid spurious errors and slow run times in collapsed repeat regions.
Also adds ENCODE blacklist regions which capture additional collapsed repeats
around centromeres.
"""
encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], items[0])
if encode_bed and os.path.exists(encode_bed):
return _remove_regions(in_file, [encode_bed], "glimit", items[0])
else:
return in_file
def _remove_regions(in_file, remove_beds, ext, data):
"""Subtract a list of BED files from an input BED.
General approach handling none, one and more remove_beds.
"""
from bcbio.variation import bedutils
out_file = "%s-%s.bed" % (utils.splitext_plus(in_file)[0], ext)
if not utils.file_uptodate(out_file, in_file):
with file_transaction(data, out_file) as tx_out_file:
with bedtools_tmpdir(data):
if len(remove_beds) == 0:
to_remove = None
elif len(remove_beds) == 1:
to_remove = remove_beds[0]
else:
to_remove = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
with open(to_remove, "w") as out_handle:
for b in remove_beds:
with utils.open_gzipsafe(b) as in_handle:
for line in in_handle:
parts = line.split("\t")
out_handle.write("\t".join(parts[:4]).rstrip() + "\n")
if utils.file_exists(to_remove):
to_remove = bedutils.sort_merge(to_remove, data)
if to_remove and utils.file_exists(to_remove):
cmd = "bedtools subtract -nonamecheck -a {in_file} -b {to_remove} > {tx_out_file}"
do.run(cmd.format(**locals()), "Remove problematic regions: %s" % ext)
else:
utils.symlink_plus(in_file, out_file)
return out_file
@contextmanager
def bedtools_tmpdir(data):
with tx_tmpdir(data) as tmpdir:
orig_tmpdir = tempfile.gettempdir()
pybedtools.set_tempdir(tmpdir)
yield
if orig_tmpdir and os.path.exists(orig_tmpdir):
pybedtools.set_tempdir(orig_tmpdir)
else:
tempfile.tempdir = None
def get_exclude_regions(items):
"""Retrieve regions to exclude from a set of items.
Includes back compatibility for older custom ways of specifying different
exclusions.
"""
def _get_sample_excludes(d):
excludes = dd.get_exclude_regions(d)
# back compatible
if tz.get_in(("config", "algorithm", "remove_lcr"), d, False):
excludes.append("lcr")
return excludes
out = reduce(operator.add, [_get_sample_excludes(d) for d in items])
return sorted(list(set(out)))
def remove_exclude_regions(f):
"""Remove regions to exclude based on configuration: polyA, LCR, high depth.
"""
exclude_fns = {"lcr": remove_lcr_regions, "highdepth": remove_highdepth_regions,
"polyx": remove_polyx_regions}
@functools.wraps(f)
def wrapper(variant_regions, region, out_file, items=None, do_merge=True, data=None):
region_bed = f(variant_regions, region, out_file, items, do_merge, data)
if region_bed and isinstance(region_bed, six.string_types) and os.path.exists(region_bed) and items:
for e in get_exclude_regions(items):
if e in exclude_fns:
region_bed = exclude_fns[e](region_bed, items)
return region_bed
return wrapper
def to_multiregion(region):
"""Convert a single region or multiple region specification into multiregion list.
If a single region (chrom, start, end), returns [(chrom, start, end)]
otherwise returns multiregion.
"""
assert isinstance(region, (list, tuple)), region
if isinstance(region[0], (list, tuple)):
return region
else:
assert len(region) == 3
return [tuple(region)]
@remove_exclude_regions
def subset_variant_regions(variant_regions, region, out_file, items=None, do_merge=True, data=None):
"""Return BED file subset by a specified chromosome region.
variant_regions is a BED file, region is a chromosome name or tuple
of (name, start, end) for a genomic region.
"""
if region is None:
return variant_regions
elif variant_regions is None:
return region
elif not isinstance(region, (list, tuple)) and region.find(":") > 0:
raise ValueError("Partial chromosome regions not supported")
else:
merge_text = "-unmerged" if not do_merge else ""
subset_file = "{0}".format(utils.splitext_plus(out_file)[0])
subset_file += "%s-regions.bed" % (merge_text)
if not os.path.exists(subset_file):
data = items[0] if items else data
with file_transaction(data, subset_file) as tx_subset_file:
if isinstance(region, (list, tuple)):
_subset_bed_by_region(variant_regions, tx_subset_file, to_multiregion(region),
dd.get_ref_file(data), do_merge=do_merge)
else:
_rewrite_bed_with_chrom(variant_regions, tx_subset_file, region)
if os.path.getsize(subset_file) == 0:
return region
else:
return subset_file