/
bwa.rb
312 lines (286 loc) · 15.9 KB
/
bwa.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
module Bio
# @author Francesco Strozzi https://github.com/fstrozzi
class BWA
extend FFI::Library
ffi_lib Bio::BWA::Library.load
# Convert a Fasta to Packed format
# @param [Hash]. params Options.
# @option params [String] :file_in the Fasta or FastQ file (REQUIRED)
# @option params [String] :prefix the prefix name for the PAC file
def self.fa2pac(params={})
valid_params = %w(file_in prefix)
last_params = [:file_in, :prefix]
mandatory_params = [:file_in]
check_mandatory(mandatory_params, params)
args = build_parameters("fa2pac",valid_params,params,last_params)
call_BWA_function(args)
end
# Convert a Packed file format to Burrows-Wheeler Transform format
# @param [Hash]. params Options.
# @option params [String] :file_in the PAC file (REQUIRED)
# @option params [String] :file_out the name of the BWT file (REQUIRED)
def self.pac2bwt(params={})
valid_params = %w(file_in file_out)
last_params = [:file_in,:file_out]
check_mandatory(last_params, params)
args = build_parameters("pac2bwt",valid_params,params,last_params)
call_BWA_function(args)
end
# Convert a BWT file to the new BWT format
# @param [Hash]. params Options.
# @option params [String] :file_in the BWT file (REQUIRED)
# @note this method overwrite existing BWT file
def self.bwtupdate(params={})
valid_params = %w(file_in)
last_params = [:file_in]
check_mandatory(last_params, params)
args = build_parameters("bwtupdate",valid_params,params,last_params)
call_BWA_function(args)
end
# Generate reverse Packed format
# @param [Hash]. params Options.
# @option params [String] :file_in the PAC file (REQUIRED)
# @option params [String] :file_out the name of the REV PAC (REQUIRED)
def self.pac_rev(params={})
valid_params = %w(file_in file_out)
last_params = [:file_in,:file_out]
check_mandatory(last_params, params)
args = build_parameters("pac_rev",valid_params,params,last_params)
call_BWA_function(args)
end
# Generate SA file from BWT and Occ files
# @param [Hash]. params Options.
# @option params [String] :file_in the PAC file (REQUIRED)
# @option params [String] :file_out the name of the REV PAC (REQUIRED)
def self.bwt2sa(params={})
valid_params = %w(file_in file_out i)
last_params = [:file_in,:file_out]
check_mandatory(last_params, params)
args = build_parameters("bwt2sa",valid_params,params,last_params)
call_BWA_function(args)
end
# Generate the BWT index for a Fasta database
# @param [Hash]. params Options.
# @option params [String] :file_in the Fasta file (REQUIRED)
# @option params [String] :p the prefix for the database files that will be generated [default is Fasta name]
# @option params [String] :a the algorithm to be used for indexing: 'is' (short database)[default] or 'bwtsw' (long database)
# @option params [Boolean] :c colorspace database index
# @note Boolean values must be set to 'true'
def self.make_index(params = {})
valid_params = %w(file_in p a c)
mandatory_params = [:file_in]
last_params = [:file_in]
check_mandatory(mandatory_params, params)
params = change_arg_name(params,:prefix,:p) if params[:prefix]
args = build_parameters("index",valid_params,params,last_params)
call_BWA_function(args)
end
# Run the alignment for short query sequences
# @param [Hash] params Options
# @option params [String] :file_in the FastQ file (REQUIRED)
# @option params [String] :prefix the prefix of the database index files (REQUIRED)
# @option params [String] :file_out the output of the alignment in SAI format (REQUIRED)
# @option params [Integer] :n max #diff (int) or missing prob under 0.02 err rate (float) [0.04]
# @option params [Integer] :o maximum number or fraction of gap opens [1]
# @option params [Integer] :e maximum number of gap extensions, -1 for disabling long gaps [-1]
# @option params [Integer] :m maximum entries in the queue [2000000]
# @option params [Integer] :t number of threads [1]
# @option params [Integer] :M mismatch penalty [3]
# @option params [Integer] :O gap open penalty [11]
# @option params [Integer] :R stop searching when there are >INT equally best hits [30]
# @option params [Integer] :q quality threshold for read trimming down to 35bp [0]
# @option params [Integer] :B length of barcode
# @option params [Boolean] :c input sequences are in the color space
# @option params [Boolean] :L log-scaled gap penalty for long deletions
# @option params [Boolean] :N non-iterative mode: search for all n-difference hits (slow)
# @option params [Boolean] :I the input is in the Illumina 1.3+ FASTQ-like format
# @option params [Boolean] :b the input read file is in the BAM format
# @option params [Boolean] :single use single-end reads only (effective with -b)
# @option params [Boolean] :first use the 1st read in a pair (effective with -b)
# @option params [Boolean] :second use the 2nd read in a pair (effective with -b)
# @option params [Integer] :i do not put an indel within INT bp towards the ends [5]
# @option params [Integer] :d maximum occurrences for extending a long deletion [10]
# @option params [Integer] :l seed length [32]
# @option params [Integer] :k maximum differences in the seed [2]
# @option params [Integer] :E gap extension penalty [4]
# @note Boolean values must be set to 'true'
def self.short_read_alignment(params={})
args = ["aln"]
valid_params = %w(n o e i d l k c L R m t N M O E q f b single first second I B prefix file_in)
mandatory_params = [:prefix,:file_in,:file_out]
last_params = [:prefix,:file_in]
check_mandatory(mandatory_params, params)
params = change_arg_name(params,:file_out,:f) if params[:file_out]
params = change_arg_name(params,:single,"0") if params[:single]
params = change_arg_name(params,:first,"1") if params[:first]
params = change_arg_name(params,:second,"2") if params[:second]
args = build_parameters("aln",valid_params,params,last_params)
call_BWA_function(args)
end
# Convert the SAI alignment output into SAM format (single end)
# @param [Hash] params Options
# @option params [String] :fastq the FastQ file (REQUIRED)
# @option params [String] :prefix the prefix of the database index files (REQUIRED)
# @option params [String] :sai the alignment file in SAI format (REQUIRED)
# @option params [String] :file_out the file name of the SAM output
# @option params [Integer] :n max_occ
# @option params [String] :r RG_line
def self.sai_to_sam_single(params = {})
valid_params = %w(n r fastq sai prefix f)
mandatory_params = [:prefix,:sai,:fastq]
last_params = [:prefix,:sai,:fastq]
check_mandatory(mandatory_params, params)
params = change_arg_name(params,:file_out,:f) if params[:file_out]
args = build_parameters("sai2sam_se",valid_params,params,last_params)
call_BWA_function(args)
end
# Convert the SAI alignment output into SAM format (paired ends)
# @param [Hash] params Options
# @option params [String] :prefix the prefix of the database index files (REQUIRED)
# @option params [Array] :sai the two alignment files in SAI format (REQUIRED)
# @option params [Array] :fastq the two fastq files (REQUIRED)
# @option params [Integer] :a maximum insert size [500]
# @option params [Integer] :o maximum occurrences for one end [100000]
# @option params [Integer] :n maximum hits to output for paired reads [3]
# @option params [Integer] :N maximum hits to output for discordant pairs [10]
# @option params [Float] :c prior of chimeric rate (lower bound) [1.0e-05]
# @option params [String] :r read group header line such as '@RG\tID:foo\tSM:bar'
# @option params [Boolean] :P preload index into memory (for base-space reads only)
# @option params [Boolean] :s disable Smith-Waterman for the unmapped mate
# @option params [Boolean] :A disable insert size estimate (force :s)
# @note Boolean values must be set to 'true'
def self.sai_to_sam_paired(params = {})
valid_params = %w(a o s P n N c f A r prefix first_sai second_sai first_fastq second_fastq)
mandatory_params = [:prefix, :sai, :fastq]
last_params = [:prefix, :first_sai, :second_sai, :first_fastq, :second_fastq]
check_mandatory(mandatory_params, params)
params = change_arg_name(params,:file_out,:f) if params[:file_out]
if params[:sai]
raise ArgumentError,"you must provide an array with two SAI files!" unless params[:sai].is_a?(Array) and params[:sai].size == 2
params[:first_sai] = params[:sai][0]
params[:second_sai] = params[:sai][1]
params.delete(:sai)
end
if params[:fastq]
raise ArgumentError,"you must provide an array with two FastQ files!" unless params[:fastq].is_a?(Array) and params[:fastq].size == 2
params[:first_fastq] = params[:fastq][0]
params[:second_fastq] = params[:fastq][1]
params.delete(:fastq)
end
args = build_parameters("sai2sam_pe",valid_params,params,last_params)
call_BWA_function(args)
end
# Run the alignment for long query sequences
# @param [Hash] params Options
# @option params [String] :file_in the FastQ file (REQUIRED)
# @option params [String] :prefix the prefix of the database index files (REQUIRED)
# @option params [String] :file_out the output of the alignment in SAM format (REQUIRED)
# @option params [Integer] :a score for a match [1]
# @option params [Integer] :b mismatch penalty [3]
# @option params [Integer] :q gap open penalty [5]
# @option params [Integer] :r gap extension penalty [2]
# @option params [Integer] :t number of threads [1]
# @option params [Integer] :w band width [50]
# @option params [Float] :m mask level [0.50]
# @option params [Integer] :T score threshold divided by a [30]
# @option params [Integer] :s maximum seeding interval size [3]
# @option params [Integer] :z Z-best [1]
# @option params [Integer] :N number of seeds to trigger reverse alignment [5]
# @option params [Float] :c coefficient of length-threshold adjustment [5.5]
# @option params [Boolean] :H in SAM output, use hard clipping rather than soft
# @note Boolean arguments must be set to 'true'
def self.long_read_alignment(params = {})
valid_params = %w(q r a b t T w d z m y s c N H f prefix file_in)
mandatory_params = [:prefix, :file_in, :file_out]
last_params = [:prefix,:file_in]
check_mandatory(mandatory_params, params)
params = change_arg_name(params,:file_out,:f) if params[:file_out]
args = build_parameters("bwtsw2",valid_params,params,last_params)
call_BWA_function(args)
end
# Run the alignment between multiple short sequences and ONE long sequence
# @param [Hash] params Options
# @option params [String] :short_seq the short query sequence (REQUIRED)
# @option params [String] :long_seq the long database sequence (REQUIRED)
# @option params [String] :file_out the alignment output
# @option params [Integer] :T minimum score [1]
# @option params [Boolean] :p protein alignment (suppressing :r)
# @option params [Boolean] :f forward strand only
# @option params [Boolean] :r reverse strand only
# @option params [Boolean] :g global alignment
# @note Boolean values must be set to 'true'
def self.simple_SW(params = {})
args = ["stdsw"]
valid_params = %w(g T f r p file_out long_seq short_seq)
mandatory_params = [:long_seq,:short_seq]
last_params = mandatory_params
check_mandatory(mandatory_params, params)
file_out = params[:file_out]
params.delete(:file_out)
args = build_parameters("stdsw",valid_params,params,last_params)
$stdout.reopen(file_out,"w") if file_out
call_BWA_function(args)
$stdout.reopen("/dev/tty","w") if file_out
end
######## Methods to handle C functions and arguments ########
attach_function :bwa_fa2pac, [:int,:pointer], :int
attach_function :bwa_pac2bwt, [:int,:pointer], :int
attach_function :bwa_bwtupdate, [:int,:pointer], :int
attach_function :bwa_pac_rev, [:int,:pointer], :int
attach_function :bwa_bwt2sa, [:int,:pointer], :int
attach_function :bwa_index, [:int,:pointer], :int
attach_function :bwa_aln, [:int,:pointer], :int
attach_function :bwa_sai2sam_se, [:int, :pointer], :int
attach_function :bwa_sai2sam_pe, [:int,:pointer], :int
attach_function :bwa_bwtsw2, [:int, :pointer], :int
attach_function :bwa_stdsw, [:int, :pointer], :int
# Internal method to call the BWA C functions
# @note this method should not be called directly
def self.call_BWA_function(args)
c_args = build_args_for_BWA(args)
self.send("bwa_#{args[0]}".to_sym,args.size,c_args) # call the C function and pass the arguments size and parameters list (same as int argc, char *argv[])
end
# Internal method to build argument list for BWA C functions
# @note this method should not be called directly
def self.build_args_for_BWA(args)
cmd_args = args.map do |arg|
FFI::MemoryPointer.from_string(arg.to_s) # convert every parameters into a string and then into a memory pointer
end
exec_args = FFI::MemoryPointer.new(:pointer, cmd_args.length) # creating a pointer to an array of pointers
cmd_args.each_with_index do |arg, i|
exec_args[i].put_pointer(0, arg) # filling in the array of pointers
end
return exec_args
end
# Internal method to produce a correct parameter list for BWA functions
# @note this method should not be called directly
def self.build_parameters(function_name,valid_params,params,last_params)
args = [function_name]
params.each_key do |k|
raise ArgumentError, "Unknown parameter '#{k}'" unless valid_params.include?(k.to_s)
if params[k] and !last_params.include?(k) then # check if value exists and if is not a last_params (required at the end of BWA functions)
args << "-#{k}"
args << params[k] unless params[k] == true # skipping boolean values. just include the param name
end
end
last_params.each {|p| args << params[p]} # now adding the last_params so the parameter list is in the correct order for BWA functions
return args
end
# Internal method to check if mandatory params have been set
# @note this method should not be called directly
def self.check_mandatory(mandatory_params, params)
mandatory_params.each {|mp| raise ArgumentError,"You must provide parameter '#{mp}'" unless params.include?(mp)}
end
# Internal method used to change parameters name from Ruby to BWA functions
# @note this method should not be called directly
def self.change_arg_name(hash,key,new_key)
hash[new_key] = hash[key]
hash.delete(key)
return hash
end
private_class_method :call_BWA_function
private_class_method :build_args_for_BWA
private_class_method :build_parameters
private_class_method :check_mandatory
end
end