-
Notifications
You must be signed in to change notification settings - Fork 2
/
import_bam_ref_nonref_counts.py
644 lines (502 loc) · 23.2 KB
/
import_bam_ref_nonref_counts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
#!/bin/env python
#
# Copyright 2013-2014 Graham McVicker and Bryce van de Geijn
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
#
"""
usage: import_bam_ref_nonref_counts.py [-h] [--assembly ASSEMBLY]
[--snp_index_track SNP_INDEX_TRACK]
[--snp_track SNP_TRACK]
[--data_type {uint8,uint16}]
[--hap_track HAP_TRACK]
[--samples_file SAMPLES_FILE]
[--individual INDIVIDUAL]
[--population POPULATION]
ref_as_count_track alt_as_count_track
other_as_count_track read_count_track
bam_filenames [bam_filenames ...]
positional arguments:
ref_as_count_track name of track to store counts of reads that match
reference
alt_as_count_track name of track to store counts of reads that match
alternate
other_as_count_track name of track to store counts of reads that match
neither reference or alternate
read_count_track name of track to store read counts in--positions of
LEFT end of read are used
bam_filenames BAM file(s) to read mapped reads from. BAMs must be
sorted and indexed.
optional arguments:
-h, --help show this help message and exit
--assembly ASSEMBLY genome assembly that reads were mapped to (e.g. hg19)
--snp_index_track SNP_INDEX_TRACK
name of SNP index track
(default=1000genomes/snp_index)
--snp_track SNP_TRACK
name of track containing table of SNPs
(default=1000genomes/snp_tab)
--data_type {uint8,uint16}
data type of stored counts; uint8 takes up less disk
space but has a maximum value of 255 (default=uint8)
--hap_track HAP_TRACK
name of haplotype track; if supplied when read
overlaps multiple SNPs counts are randomly assigned to
ONE of the overlapping HETEROZYGOUS SNPs; if not
supplied counts are randomly assigned to ONE of
overlapping SNPs (regardless of genotype)
--samples_file SAMPLES_FILE
path to file containing a list of individual
identifiers in the same order that the individuals
appear in the haplotype table. The file is assumed to
be in the same format as the sample file used by
IMPUTE2. This file contains 4 columns: 'sample'
'population' 'group' 'sex'. This must be provided if
hap_track is specified. The sex column is not
currently used and can be omitted
--individual INDIVIDUAL
identifier for individual, used to determine which
SNPs are heterozygous (e.g. 18505). Must be provided
if hap_track is specified and must match one of the
individuals in the provided samples_file
--population POPULATION
indicate that haplotype table contains only
individuals from this population or group (e.g. YRI or
EUR)
This program reads BAM files and counts the number of reads that match
the alternate and reference allele at every SNP position stored in the
/impute2/snps HDF5 track. The read counts are stored in specified
ref_track, alt_track and other_track HDF5 tracks. Additionally counts
of all reads are stored in another track (at the left-most position of
the reads).
This program does not perform filtering of reads based on mappability.
It is assumed that this filtering will be done prior to calling this
script.
Reads that overlap known indels are not included in allele-specific
counts.
We are currently working on an improved allele-specific mapping method
so that these criteria can be relaxed and the filtering can be done at
the level of the BAM (before running this script).
This program requires the use of the genome library, which can be
downloaded from here:
https://github.com/gmcvicker/genome
"""
import sys
import os
import gzip
import tables
import argparse
import numpy as np
import pysam
import genome.db
import genome.coord
import genome.trackstat as trackstat
# codes used by pysam for aligned read CIGAR strings
# (these match the numeric CIGAR operation codes in the BAM spec)
BAM_CMATCH = 0       # M - alignment match (can be a sequence mismatch)
BAM_CINS = 1         # I - insertion to the reference
BAM_CDEL = 2         # D - deletion from the reference
BAM_CREF_SKIP = 3    # N - skipped region of reference (e.g. intron in spliced read)
BAM_CSOFT_CLIP = 4   # S - soft clipping (clipped seq present in read)
BAM_CHARD_CLIP = 5   # H - hard clipping (clipped seq NOT present in read)
BAM_CPAD = 6         # P - padding (silent deletion from padded reference)
BAM_CEQUAL = 7       # = - sequence match
BAM_CDIFF = 8        # X - sequence mismatch

# maps numeric CIGAR codes back to their single-character string form
BAM_CIGAR_DICT = {0 : "M",
                  1 : "I",
                  2 : "D",
                  3 : "N",
                  4 : "S",
                  5 : "H",
                  6 : "P",
                  7 : "=",
                  8 : "X"}

# value used in the SNP index track for positions with no SNP
SNP_UNDEF = -1

# maximum storable counts for each supported data type
MAX_UINT8_COUNT = 255
MAX_UINT16_COUNT = 65535
def create_carray(track, chrom, data_type):
    """Allocate a zero-filled, zlib-compressed HDF5 CArray for one
    chromosome in the given track and return it."""
    atom_types = {"uint8": tables.UInt8Atom,
                  "uint16": tables.UInt16Atom}

    if data_type not in atom_types:
        raise NotImplementedError("unsupported datatype %s" % data_type)

    atom = atom_types[data_type](dflt=0)

    # light zlib compression keeps the sparse count arrays small on disk
    compress_filter = tables.Filters(complevel=1, complib="zlib")

    # one array element per base of the chromosome
    return track.h5f.createCArray(track.h5f.root, chrom.name,
                                  atom, [chrom.length],
                                  filters=compress_filter)
def get_carray(track, chrom):
    """Fetch the existing HDF5 array node for this chromosome from track."""
    node_path = "/%s" % chrom
    return track.h5f.getNode(node_path)
def is_indel(snp):
    """Return True if this SNP record actually describes an indel
    (either allele longer than a single base), False otherwise.

    BUGFIX: the original returned True or fell off the end (implicit
    None) for the non-indel case; now always returns an explicit bool.
    """
    return (len(snp['allele1']) != 1) or (len(snp['allele2']) != 1)
def dump_read(f, read):
    """Write a human-readable summary of one aligned read to file
    handle f (debugging aid)."""
    cigar_parts = []
    for (op, op_len) in read.cigar:
        cigar_parts.append("%s:%d" % (BAM_CIGAR_DICT[op], op_len))
    cigar_str = " ".join(cigar_parts)

    template = ("pos: %d\n"
                "aend: %d\n"
                "alen (len of aligned portion of read on genome): %d\n"
                "qstart: %d\n"
                "qend: %d\n"
                "qlen (len of aligned qry seq): %d\n"
                "rlen (read len): %d\n"
                "tlen (insert size): %d\n"
                "cigar: %s\n"
                "seq: %s\n")

    f.write(template % (read.pos, read.aend, read.alen, read.qstart,
                        read.qend, read.qlen, read.rlen, read.tlen,
                        cigar_str, read.seq))
def get_sam_iter(samfile, chrom):
    """Return an iterator over reads mapped to this chromosome.

    Falls back to the chromosome name without a leading 'chr' prefix
    when the BAM uses the bare naming convention; returns an empty
    iterator if neither name exists in the BAM.
    """
    try:
        return samfile.fetch(reference=chrom.name,
                             start=1, end=chrom.length)
    except ValueError as ve:
        sys.stderr.write("%s\n" % str(ve))

    # could not find chromosome, try stripping leading 'chr'
    # E.g. for drosophila, sometimes 'chr2L' is used but
    # othertimes just '2L' is used. Annoying!
    chrom_name = chrom.name.replace("chr", "")
    sys.stderr.write("WARNING: %s does not exist in BAM file, "
                     "trying %s instead\n" % (chrom.name, chrom_name))

    try:
        return samfile.fetch(reference=chrom_name,
                             start=1, end=chrom.length)
    except ValueError:
        sys.stderr.write("WARNING: %s does not exist in BAM file, "
                         "skipping chromosome\n" % chrom_name)
        return iter([])
def choose_overlap_snp(read, snp_tab, snp_index_array, hap_tab, ind_idx):
    """Picks out a single SNP from those that the read overlaps.

    Returns a tuple containing 4 elements: [0] the index of the SNP in
    the SNP table, [1] the offset into the read sequence, [2] flag
    indicating whether the read was 'split' (i.e. was a spliced
    read), [3] flag indicating whether read overlaps known indel.

    If there are no overlapping SNPs or the read cannot be processed,
    (None, None, is_split, overlap_indel) is returned instead.
    """
    read_offsets = []
    snp_idx = []

    read_start_idx = 0
    genome_start_idx = read.pos
    n_match_segments = 0
    is_split = False
    overlap_indel = False

    for (op, op_len) in read.cigar:
        if op == BAM_CMATCH:
            # this is a block of match/mismatch in read alignment
            read_end = read_start_idx + op_len
            genome_end = genome_start_idx + op_len

            # get offsets of any SNPs that this read overlaps
            idx = snp_index_array[genome_start_idx:genome_end]
            is_def = np.where(idx != SNP_UNDEF)[0]
            read_offsets.extend(read_start_idx + is_def)
            snp_idx.extend(idx[is_def])

            read_start_idx = read_end
            genome_start_idx = genome_end
            n_match_segments += 1
        elif op == BAM_CREF_SKIP:
            # spliced read, skip over this region of genome
            genome_start_idx += op_len
            is_split = True
        else:
            # BUGFIX: warning previously had no trailing newline
            sys.stderr.write("skipping because contains CIGAR code %s "
                             "which is not currently implemented\n"
                             % BAM_CIGAR_DICT[op])

    # are any of the SNPs indels? If so, discard.
    for i in snp_idx:
        if is_indel(snp_tab[i]):
            overlap_indel = True
            return (None, None, is_split, overlap_indel)

    n_overlap_snps = len(read_offsets)
    if n_overlap_snps == 0:
        # no SNPs overlap this read
        return (None, None, is_split, overlap_indel)

    if hap_tab:
        # genotype info is provided by haplotype table
        # BUGFIX: validate the individual index BEFORE slicing the
        # haplotype table (previously checked after the slice, and
        # with '>' rather than accounting for the 2 columns per
        # individual, so an out-of-range index could slip through or
        # raise an IndexError instead of this informative error)
        if (ind_idx * 2 + 2) > hap_tab.shape[1]:
            raise ValueError("index of individual (%d) is >= number of "
                             "individuals in haplotype_tab (%d). probably "
                             "need to specify --population or use a different "
                             "--samples_tab" % (ind_idx, hap_tab.shape[1]/2))

        # pull out subset of overlapping SNPs that are heterozygous
        # in this individual
        het_read_offsets = []
        het_snp_idx = []
        for (i, read_offset) in zip(snp_idx, read_offsets):
            haps = hap_tab[i, (ind_idx*2):(ind_idx*2 + 2)]
            if haps[0] != haps[1]:
                # this is a het
                het_read_offsets.append(read_offset)
                het_snp_idx.append(i)

        n_overlap_hets = len(het_read_offsets)

        if n_overlap_hets == 0:
            # none of the overlapping SNPs are hets
            return (None, None, is_split, overlap_indel)

        if n_overlap_hets == 1:
            # only one overlapping SNP is a het
            return (het_snp_idx[0], het_read_offsets[0],
                    is_split, overlap_indel)

        # choose ONE overlapping HETEROZYGOUS SNP randomly to add counts to
        # we don't want to count same read multiple times
        # BUGFIX: np.random.randint's upper bound is EXCLUSIVE, so the
        # original randint(0, n-1) could never select the last SNP
        r = np.random.randint(0, n_overlap_hets)
        return (het_snp_idx[r], het_read_offsets[r], is_split, overlap_indel)
    else:
        # We don't have haplotype tab, so we don't know which SNPs are
        # heterozygous in this individual. But we can still tell
        # whether read sequence matches reference or non-reference
        # allele. Choose ONE overlapping SNP randomly to add counts to
        if n_overlap_snps == 1:
            return (snp_idx[0], read_offsets[0], is_split, overlap_indel)
        else:
            # BUGFIX: exclusive upper bound, as above
            r = np.random.randint(0, n_overlap_snps)
            return (snp_idx[r], read_offsets[r], is_split, overlap_indel)
def add_read_count(read, chrom, ref_array, alt_array, other_array,
                   read_count_array, snp_index_array, snp_tab, hap_tab,
                   warned_pos, max_count, ind_idx):
    """Add one mapped read to the count arrays.

    The read's left end is counted in read_count_array; if the read
    overlaps a (randomly chosen) SNP, the base at that position is
    compared against the SNP alleles and one of ref_array, alt_array
    or other_array is incremented.  Counts saturate at max_count; a
    warning is emitted (once per position) when saturation occurs.
    Reads overlapping indels or partially mapped reads are skipped.
    """
    # pysam positions start at 0
    start = read.pos + 1
    end = read.aend

    if start < 1 or end > chrom.length:
        sys.stderr.write("WARNING: skipping read aligned past end of "
                         "chromosome. read: %d-%d, %s:1-%d\n" %
                         (start, end, chrom.name, chrom.length))
        return

    if read.qlen != read.rlen:
        sys.stderr.write("WARNING skipping read: handling of "
                         "partially mapped reads not implemented\n")
        return

    # look for SNPs that overlap mapped read position, and if there
    # are more than one, choose one at random
    snp_idx, read_offset, is_split, overlap_indel = \
        choose_overlap_snp(read, snp_tab, snp_index_array, hap_tab, ind_idx)

    if overlap_indel:
        return

    # store counts of reads at start position
    if read_count_array[start-1] < max_count:
        read_count_array[start-1] += 1
    elif start not in warned_pos:
        sys.stderr.write("WARNING read count at position %d "
                         "exceeds max %d\n" % (start, max_count))
        warned_pos[start] = True

    if snp_idx is None:
        return

    snp = snp_tab[snp_idx]
    base = read.seq[read_offset]
    snp_pos = snp['pos']

    if base == snp['allele1']:
        # matches reference allele
        if ref_array[snp_pos-1] < max_count:
            ref_array[snp_pos-1] += 1
        elif snp_pos not in warned_pos:
            sys.stderr.write("WARNING ref allele count at position %d "
                             "exceeds max %d\n" % (snp_pos, max_count))
            warned_pos[snp_pos] = True
    elif base == snp['allele2']:
        # matches alternate allele
        if alt_array[snp_pos-1] < max_count:
            alt_array[snp_pos-1] += 1
        elif snp_pos not in warned_pos:
            sys.stderr.write("WARNING alt allele count at position %d "
                             "exceeds max %d\n" % (snp_pos, max_count))
            warned_pos[snp_pos] = True
    else:
        # matches neither allele
        if other_array[snp_pos-1] < max_count:
            other_array[snp_pos-1] += 1
        elif snp_pos not in warned_pos:
            sys.stderr.write("WARNING other allele count at position %d "
                             "exceeds max %d\n" % (snp_pos, max_count))
            # BUGFIX: record that we warned, so the message is not
            # repeated for every read (ref/alt branches already did this)
            warned_pos[snp_pos] = True
def parse_args():
    """Parse and validate command line arguments.

    Raises a parser error (SystemExit) when --hap_track is given
    without both --individual and --samples_file.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("--assembly", help="genome assembly that reads "
                        "were mapped to (e.g. hg19)", default=None)

    parser.add_argument("--snp_index_track",
                        help="name of SNP index track "
                        "(default=1000genomes/snp_index)",
                        default="1000genomes/snp_index")

    parser.add_argument("--snp_track",
                        help="name of track containing table of SNPs"
                        " (default=1000genomes/snp_tab)",
                        default="1000genomes/snp_tab")

    parser.add_argument("--data_type",
                        help="data type of stored counts; uint8 takes "
                        "up less disk space but has a maximum value of 255 "
                        "(default=uint8)", choices=("uint8", "uint16"),
                        default="uint8")

    parser.add_argument("--hap_track",
                        help="name of haplotype track; if supplied when "
                        "read overlaps multiple SNPs counts are randomly "
                        "assigned to ONE of the overlapping HETEROZYGOUS "
                        "SNPs; if not supplied counts are randomly assigned "
                        "to ONE of overlapping SNPs (regardless of genotype)",
                        default=None)

    parser.add_argument("--samples_file",
                        help="path to file containing a list of individual "
                        "identifiers in the same order that the individuals appear "
                        "in the haplotype table. The file is assumed to be in the "
                        "same format as the sample file used by IMPUTE2. This file "
                        # BUGFIX: removed duplicated word ("is is")
                        "contains 4 columns: 'sample' 'population' 'group' 'sex'. "
                        "This must be provided if hap_track is specified. "
                        "The sex column is not currently used and can be omitted",
                        default=None)

    parser.add_argument("--individual",
                        help="identifier for individual, used to determine which "
                        "SNPs are heterozygous (e.g. 18505). Must be "
                        "provided if hap_track is specified and must match "
                        "one of the individuals in the provided samples_file",
                        default=None)

    parser.add_argument("--population",
                        help="indicate that haplotype table contains only "
                        "individuals from this population or group "
                        "(e.g. YRI or EUR)", default=None)

    parser.add_argument("ref_as_count_track",
                        help="name of track to store counts of reads "
                        "that match reference")

    parser.add_argument("alt_as_count_track",
                        help="name of track to store counts of reads "
                        "that match alternate")

    parser.add_argument("other_as_count_track",
                        help="name of track to store counts of reads "
                        "that match neither reference or alternate")

    parser.add_argument("read_count_track",
                        help="name of track to store read counts in--"
                        "positions of LEFT end of read are used")

    parser.add_argument("bam_filenames", action="store", nargs="+",
                        help="BAM file(s) to read mapped reads from. "
                        "BAMs must be sorted and indexed.")

    args = parser.parse_args()

    if args.hap_track and (args.individual is None or args.samples_file is None):
        # BUGFIX: error message previously misspelled "--indidivual"
        parser.error("--individual and --samples_file arguments "
                     "must be provided when --hap_track is specified")

    return args
def lookup_individual_index(samples_file, ind_name, population=None):
    """Gets the index of individual that is used
    to lookup information in the genotype and haplotype tables.

    The samples file is IMPUTE2-style: whitespace-separated columns
    'sample' 'population' 'group' ['sex'], optionally preceded by a
    header line starting with "samples".  When *population* is given,
    only rows whose population or group matches it (case-insensitive)
    are counted.  Raises ValueError when the individual is not found.
    """
    p = population.lower() if population else None

    idx = 0
    # BUGFIX: use a context manager so the file is closed even on the
    # not-found path (previously leaked when ValueError was raised)
    with open(samples_file) as f:
        for line in f:
            if line.startswith("samples"):
                # header line
                continue

            words = line.rstrip().split()
            # strip 1000-genomes style "NA" prefix (e.g. NA18505 -> 18505)
            name = words[0].replace("NA", "")
            pop = words[1].lower() if len(words) > 1 else ""
            group = words[2].lower() if len(words) > 2 else ""

            # if specified, only consider a single population or group
            if p and pop != p and group != p:
                continue

            if name == ind_name:
                return idx
            idx += 1

    raise ValueError("individual %s (with population=%s) "
                     "is not in samples file %s" %
                     (ind_name, population, samples_file))
def main():
    """Read one or more BAM files and store per-position read counts
    and per-SNP ref/alt/other allele counts in HDF5 tracks."""
    args = parse_args()

    # create a database track
    gdb = genome.db.GenomeDB(assembly=args.assembly)

    ref_count_track = gdb.create_track(args.ref_as_count_track)
    alt_count_track = gdb.create_track(args.alt_as_count_track)
    other_count_track = gdb.create_track(args.other_as_count_track)
    read_count_track = gdb.create_track(args.read_count_track)

    output_tracks = [ref_count_track, alt_count_track,
                     other_count_track, read_count_track]

    snp_track = gdb.open_track(args.snp_track)
    snp_index_track = gdb.open_track(args.snp_index_track)

    if args.hap_track:
        hap_track = gdb.open_track(args.hap_track)
        ind_idx = lookup_individual_index(args.samples_file,
                                          args.individual, args.population)
    else:
        hap_track = None
        ind_idx = None

    # initialize every chromosome in output tracks
    for chrom in gdb.get_all_chromosomes():
        for track in output_tracks:
            create_carray(track, chrom, args.data_type)

    count = 0

    # BUGFIX: pick the numpy dtype to match --data_type; the in-memory
    # arrays were previously always uint8, so with --data_type uint16
    # counts silently wrapped around at 256
    if args.data_type == "uint8":
        max_count = MAX_UINT8_COUNT
        count_dtype = np.uint8
    elif args.data_type == "uint16":
        max_count = MAX_UINT16_COUNT
        count_dtype = np.uint16
    else:
        raise NotImplementedError("unsupported datatype %s" % args.data_type)

    for chrom in gdb.get_chromosomes(get_x=False):
        sys.stderr.write("%s\n" % chrom.name)
        warned_pos = {}

        # initialize count arrays for this chromosome to 0
        ref_carray = get_carray(ref_count_track, chrom)
        alt_carray = get_carray(alt_count_track, chrom)
        other_carray = get_carray(other_count_track, chrom)
        read_count_carray = get_carray(read_count_track, chrom)

        ref_array = np.zeros(chrom.length, count_dtype)
        alt_array = np.zeros(chrom.length, count_dtype)
        other_array = np.zeros(chrom.length, count_dtype)
        read_count_array = np.zeros(chrom.length, count_dtype)

        # fetch SNP info for this chromosome
        sys.stderr.write("fetching SNPs\n")
        snp_tab = snp_track.h5f.getNode("/%s" % chrom.name)
        snp_index_array = snp_index_track.get_nparray(chrom)

        if hap_track:
            hap_tab = hap_track.h5f.getNode("/%s" % chrom.name)
        else:
            hap_tab = None

        # loop over all BAM files, pulling out reads
        # for this chromosome
        for bam_filename in args.bam_filenames:
            sys.stderr.write("reading from file %s\n" % bam_filename)

            samfile = pysam.Samfile(bam_filename, "rb")

            for read in get_sam_iter(samfile, chrom):
                count += 1
                if count == 10000:
                    sys.stderr.write(".")
                    count = 0

                add_read_count(read, chrom, ref_array, alt_array,
                               other_array, read_count_array,
                               snp_index_array, snp_tab, hap_tab,
                               warned_pos, max_count, ind_idx)

            # BUGFIX: close each BAM as we finish with it; previously
            # only the last-opened file per chromosome was closed
            samfile.close()

        # store results for this chromosome
        ref_carray[:] = ref_array
        alt_carray[:] = alt_array
        other_carray[:] = other_array
        read_count_carray[:] = read_count_array

        sys.stderr.write("\n")

    # set track statistics and close HDF5 files
    sys.stderr.write("setting track statistics\n")
    for track in output_tracks:
        sys.stderr.write("%s\n" % track.name)
        trackstat.set_stats(gdb, track)
        track.close()

    snp_track.close()
    snp_index_track.close()
    if hap_track:
        hap_track.close()


# BUGFIX: guard the entry point so importing this module does not
# immediately run the whole pipeline
if __name__ == "__main__":
    main()