/
mapDamage
executable file
·212 lines (157 loc) · 6.24 KB
/
mapDamage
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
#!/usr/bin/env python
# -*- coding: ASCII -*-
from __future__ import print_function
import sys
import os
# check if pysam if available
module = "pysam"
url = "http://code.google.com/p/pysam/"
try:
__import__(module)
except ImportError, e:
sys.stderr.write("Error: Could not import required module '%s':\n\t- %s\n" % (module,e))
sys.stderr.write(" If module is not installed, please download from '%s'.\n" % (url,))
sys.stderr.write(" A local install may be performed using the following command:\n")
sys.stderr.write(" $ python setup.py install --user\n\n")
sys.exit(1)
import pysam
import mapdamage
from mapdamage.version import __version__
"""
plot damage patterns from a SAM/BAM file
:Author: Aurelien Ginolhac
:Contact: aginolhac@snm.ku.dk
:Date: November 2012
:Type: tool
:Input: SAM/BAM
:Output: tabulated tables, pdf
"""
def getCoordinates(read):
if read.is_reverse:
fivep = read.aend
threep = read.pos
else:
fivep = read.pos
threep = read.aend
return(fivep, threep)
def getAround(coord, chrom, reflengths, lg, ref):
i = j = 0
before = after = ""
if min(coord) - lg > 0:
i = min(coord) - lg
if max(coord) + lg > reflengths[chrom]:
j = reflengths[chrom]
else:
j = max(coord) + lg
before = ref.fetch(chrom, i, min(coord))
after = ref.fetch(chrom, max(coord), j)
return(before, after)
def writeFasta(read,ref, seq, refseq, start, end, before, after, fout):
if read.is_reverse:
std = '-'
else:
std = '+'
# output coordinate in 1-based offset
fout.write(">%s:%d-%d\n%s\n" % (ref, start-len(before), start, before))
fout.write(">%s:%d-%d\n%s\n>%s_%s\n%s\n" % (ref, start+1, end+1, refseq, read.qname, std, seq))
fout.write(">%s:%d-%d\n%s\n" % (ref, end+2, end+2+len(after), after))
""" main """
def main(argv):
options = mapdamage.parseoptions.options(argv)
if not options:
return 1
# plot using R if results folder already done
if options.plotonly:
mapdamage.rscript.plot(options)
return 0
# open mapped reads and corresponding reference
in_bam = pysam.Samfile(options.filename)
ref = pysam.Fastafile(options.ref)
# for misincorporation patterns, to record mismaches, from 5'-ends
misincorp = {}
# for fragmentation patterns, record base compositions in the reference around
dnacomp = {}
# fetch all references and associated lengths in nucleotides
reflengths = dict(zip(in_bam.references, in_bam.lengths))
# initialize table of misincorporations with zeros
misincorp = mapdamage.tables.initializeMut(misincorp, reflengths, options.length)
# initialise base composition tables with zeros
dnacomp = mapdamage.tables.initializeComp(dnacomp, reflengths, options.around, options.length)
if not options.quiet:
print("\tReading from '%s'" % options.filename)
if options.verbose:
print("\t%d references are assumed in SAM/BAM file, for a total of %d nucleotides" % (len(reflengths), sum(reflengths.values())))
print("\tWriting results to '%s/'" % options.folder)
# reference in BAM must be in the fasta reference file
#if not referenceCheck():
#sys.stderr.write("Error: %s reference was not found in the reference file provided\n" % (chrom))
# return 1
# open file handler to write alignements in fasta format
if options.fasta:
# use name of the sam/bam filename without extension
ffasta = os.path.splitext(os.path.basename(options.filename))[0]+'.fasta'
if not options.quiet:
print("\tWriting alignments in '%s'" % ffasta)
ff = open(options.folder+"/"+ffasta,"w")
counter = filtered = 0
# main loop
for read in in_bam:
counter += 1
if read.is_unmapped:
filtered +=1
continue
# external coordinates 5' and 3' , 0-based offset
coordinate = getCoordinates(read)
# fetch reference name, chromosome or contig names
chrom = in_bam.getrname(read.tid)
(before, after) = getAround(coordinate, chrom, reflengths, options.around, ref)
refseq = ref.fetch(chrom, min(coordinate), max(coordinate))
# read.query contains aligned sequences while read.seq is the read itself
seq = read.query
# add gaps according to the cigar string
(seq, refseq) = mapdamage.align.align(read.cigar, seq, refseq)
# reverse complement read and reference when mapped reverse strand
if read.is_reverse:
refseq = refseq.translate(mapdamage.seq.table)[::-1]
seq = seq.translate(mapdamage.seq.table)[::-1]
beforerev = after.translate(mapdamage.seq.table)[::-1]
after = before.translate(mapdamage.seq.table)[::-1]
before = beforerev
# record soft clippinp when present
if len(mapdamage.align.parseCigar(read.cigar, 4)) > 0:
mapdamage.align.recordSoftClipping(mapdamage.align.parseCigar(read.cigar, 4), read, chrom, misincorp, options.length)
# count misincorparations by comparing read and reference base by base
mapdamage.align.getMis(read, seq, refseq, chrom, options.length, misincorp, '5p')
# do the same with sequences align to 3'-ends
mapdamage.align.getMis(read, seq[::-1], refseq[::-1], chrom, options.length, misincorp, '3p')
# compute base composition for genomic regions
mapdamage.composition.countRefComp(read, chrom, before, after, dnacomp)
# compute base composition for reads
mapdamage.composition.countReadComp(read, chrom, options.length, dnacomp)
if options.fasta:
writeFasta(read, chrom, seq, refseq, min(coordinate), max(coordinate), before, after, ff)
if options.verbose:
if counter % 50000 == 0:
print("\t%10d reads processed (%d unmapped)" % (counter, filtered))
if not options.quiet:
print("\tDone. %d reads read (%d unmapped)" % (counter, filtered))
# close file handlers
in_bam.close()
ref.close()
if options.fasta:
ff.close()
# open file handlers to output results
fmut = open(options.folder+"/"+"misincorporation.txt", 'w')
fcomp = open(options.folder+"/"+"dnacomp.txt", 'w')
# write summary tables to disk
mapdamage.tables.printMut(misincorp, options, fmut)
mapdamage.tables.printComp(dnacomp, options, fcomp)
# close file handlers
fmut.close()
fcomp.close()
# plot using R
if not options.nor:
mapdamage.rscript.plot(options)
return 0
if __name__ == '__main__':
sys.exit(main(sys.argv[1:]))