Skip to content

Commit

Permalink
Added --one-based option. Treats all intervals as 1-based.
Browse files Browse the repository at this point in the history
Needed because some tools like Ensembl output BED files using 1-based
intervals, even though this is against the standard.

Closes #107.
  • Loading branch information
csw committed Aug 6, 2012
1 parent 4400ad0 commit acd405f
Show file tree
Hide file tree
Showing 4 changed files with 90 additions and 45 deletions.
90 changes: 50 additions & 40 deletions bin/maf_extract
Expand Up @@ -6,12 +6,13 @@ require 'ostruct'

include Bio::MAF

options = OpenStruct.new
options.mode = :intersect
options.format = :maf
options.seq_filter = {}
options.block_filter = {}
options.parse_options = {}
$options = OpenStruct.new
$options.mode = :intersect
$options.format = :maf
$options.one_based = false
$options.seq_filter = {}
$options.block_filter = {}
$options.parse_options = {}

def handle_list_spec(spec)
if spec =~ /^@(.+)/
Expand All @@ -23,7 +24,11 @@ end

def handle_interval_spec(int)
if int =~ /(.+):(\d+)-(\d+)/
Bio::GenomicInterval.zero_based($1, $2.to_i, $3.to_i)
if $options.one_based
Bio::GenomicInterval.new($1, $2.to_i, $3.to_i)
else
Bio::GenomicInterval.zero_based($1, $2.to_i, $3.to_i)
end
else
raise "Invalid interval specification: #{int}"
end
Expand All @@ -34,13 +39,13 @@ $op = OptionParser.new do |opts|
opts.separator ""
opts.separator "MAF source options (either --maf or --maf-dir must be given):"
opts.on("-m", "--maf MAF", "MAF file") do |maf|
options.maf = maf
$options.maf = maf
end
opts.on("-i", "--index INDEX", "MAF index") do |idx|
options.idx = idx
$options.idx = idx
end
opts.on("-d", "--maf-dir DIR", "MAF directory") do |dir|
options.maf_dir = dir
$options.maf_dir = dir
end
opts.separator ""
opts.separator "Extraction options:"
Expand All @@ -49,63 +54,68 @@ $op = OptionParser.new do |opts|
"blocks intersecting the given region,",
"or 'slice' to extract subsets covering ",
"given regions") do |mode|
options.mode = mode
$options.mode = mode
end
opts.on("--bed BED", "Use intervals from the given BED file") do |bed|
options.bed = bed
$options.bed = bed
end
opts.on("--interval SEQ:START:END", "Zero-based genomic interval to match") do |int|
options.interval = handle_interval_spec(int)
$options.interval = handle_interval_spec(int)
end
opts.on("--one-based",
"Treat all intervals as one-based",
"(even from BED files, contrary to the standard)") do
$options.one_based = true
end
opts.separator ""
opts.separator "Output options:"
opts.on("-f", "--format FMT", [:maf, :fasta], "Output format") do |fmt|
options.format = fmt
$options.format = fmt
end
opts.on("-o", "--output OUT", "Write output to file OUT") do |out|
options.out_path = out
$options.out_path = out
end
opts.separator ""
opts.separator "Filtering options:"
opts.on("--only-species SPECIES",
"Filter out all but the species in the",
"given comma-separated list",
"(or @FILE to read from a file)") do |spec|
options.seq_filter[:only_species] = handle_list_spec(spec)
$options.seq_filter[:only_species] = handle_list_spec(spec)
end
opts.on("--with-all-species SPECIES",
"Only match blocks with all the given",
"species, comma-separated",
"(or @FILE to read from a file)") do |spec|
options.block_filter[:with_all_species] = handle_list_spec(spec)
$options.block_filter[:with_all_species] = handle_list_spec(spec)
end
opts.on("--min-sequences N", Integer,
"Match only blocks with at least N sequences") do |n|
options.block_filter[:at_least_n_sequences] = n
$options.block_filter[:at_least_n_sequences] = n
end
opts.on("--min-text-size N", Integer,
"Match only blocks with minimum text size N") do |n|
options.block_filter[:min_size] = n
$options.block_filter[:min_size] = n
end
opts.on("--max-text-size N", Integer,
"Match only blocks with maximum text size N") do |n|
options.block_filter[:max_size] = n
$options.block_filter[:max_size] = n
end
opts.separator ""
opts.separator "Block processing options:"
opts.on("--join-blocks",
"Join blocks if appropriate after filtering",
"out sequences") do
options.parse_options[:join_blocks] = true
$options.parse_options[:join_blocks] = true
end
opts.on("--remove-gaps", "Remove gaps after filtering out sequences") do
options.parse_options[:remove_gaps] = true
$options.parse_options[:remove_gaps] = true
end
opts.on("--parse-extended", "Parse 'extended' MAF data (i, q lines)") do
options.parse_options[:parse_extended] = true
$options.parse_options[:parse_extended] = true
end
opts.on("--parse-empty", "Parse empty (e) lines of MAF data") do
options.parse_options[:parse_empty] = true
$options.parse_options[:parse_empty] = true
end
opts.separator ""
opts.separator "Logging options:"
Expand All @@ -120,24 +130,24 @@ def usage(msg)
exit 2
end

if options.maf
access = Access.file(options.maf, options.idx, options.parse_options)
elsif options.maf_dir
access = Access.maf_dir(options.maf_dir, options.parse_options)
if $options.maf
access = Access.file($options.maf, $options.idx, $options.parse_options)
elsif $options.maf_dir
access = Access.maf_dir($options.maf_dir, $options.parse_options)
else
usage "Must supply --maf or --maf-dir!"
end

begin
access.sequence_filter = options.seq_filter unless options.seq_filter.empty?
access.block_filter = options.block_filter unless options.block_filter.empty?
if options.out_path
outf = File.open(options.out_path, 'w')
access.sequence_filter = $options.seq_filter unless $options.seq_filter.empty?
access.block_filter = $options.block_filter unless $options.block_filter.empty?
if $options.out_path
outf = File.open($options.out_path, 'w')
else
outf = $stdout
end

case options.format
case $options.format
when :maf
writer = Writer.new(outf)
when :fasta
Expand All @@ -146,20 +156,20 @@ begin
raise "unsupported output format #{format}!"
end

if options.bed
intervals = read_bed_intervals(options.bed)
elsif options.interval
intervals = [options.interval]
if $options.bed
intervals = read_bed_intervals($options.bed)
elsif $options.interval
intervals = [$options.interval]
else
usage "Must supply --interval or --bed!"
end

# TODO: provide access to original MAF header?
if options.format == :maf
if $options.format == :maf
writer.write_header(Header.default)
end

case options.mode
case $options.mode
when :intersect
access.find(intervals) do |block|
writer.write_block(block)
Expand All @@ -172,7 +182,7 @@ begin
end
end
else
raise "Unsupported mode #{options.mode}!"
raise "Unsupported mode #{$options.mode}!"
end

ensure
Expand Down
19 changes: 14 additions & 5 deletions bin/maf_tile
Expand Up @@ -39,15 +39,19 @@ o_parser = OptionParser.new do |opts|
end
opts.on("-i", "--interval [CHR:]BEGIN:END", "Genomic interval, zero-based") do |int|
if int =~ /(.+):(\d+):(\d+)/
gi = Bio::GenomicInterval.zero_based($1, ($2.to_i), ($3.to_i))
options.genomic_interval = gi
options.genomic_interval_spec = [$1, $2.to_i, $3.to_i]
elsif int =~ /(\d+):(\d+)/
options.interval = ($1.to_i)...($2.to_i)
else
$stderr.puts "Invalid interval specification #{int}!"
options.usage = true
end
end
opts.on("--one-based",
"Treat all intervals as one-based",
"(even from BED files, contrary to the standard)") do
options.one_based = true
end
opts.on("-s", "--species SPECIES[:NAME]", "Species to use (with mapped name)") do |sp|
if sp =~ /:/
species, mapped = sp.split(/:/)
Expand Down Expand Up @@ -78,7 +82,7 @@ unless (! options.usage) \
&& maf_p && (! options.species.empty?) \
&& (options.output_base \
? options.bed \
: options.interval || options.genomic_interval)
: options.interval || options.genomic_interval_spec)
$stderr.puts o_parser
exit 2
end
Expand All @@ -105,8 +109,13 @@ if options.bed
end
else
# single interval
if options.genomic_interval
interval = options.genomic_interval
if options.genomic_interval_spec
spec = options.genomic_interval_spec
if options.one_based
interval = Bio::GenomicInterval.new(*spec)
else
interval = Bio::GenomicInterval.zero_based(*spec)
end
else
if access.indices.size != 1
raise "Must explicitly specify sequence in --interval argument with multiple candidate MAF files!"
Expand Down
11 changes: 11 additions & 0 deletions features/maf-querying.feature
Expand Up @@ -82,3 +82,14 @@ Feature: Filter results from MAF files
And I run `maf_extract -m mm8.chrM.maf.bgz --interval mm8.chrM:6938-13030 -o m2.maf`
And I run `diff m1.maf m2.maf`
Then the exit status should be 0

@no_jruby
Scenario: One-based indexing with maf_extract
Given test files:
| mm8_chr7_tiny.maf |
| mm8_chr7_tiny.kct |
When I run `sh -c 'maf_extract -d . --one-based --interval mm8.chr7:80082592-80082713 | grep "^a" | wc -l'`
Then it should pass with:
"""
2
"""
15 changes: 15 additions & 0 deletions features/tiling.feature
Expand Up @@ -241,3 +241,18 @@ Feature: Join alignment blocks with reference data
--------GG
"""

@no_jruby
Scenario: Tile with CLI tool and directory, 1-based
Given test files:
| mm8_chr7_tiny.maf |
| mm8_chr7_tiny.kct |
When I run `maf_tile -s mm8 -s rn4 -s hg18 --one-based --interval mm8.chr7:80082335:80082344 .`
Then it should pass with:
"""
>mm8
GGGCTGAGGG
>rn4
GGGCTGAGGG
>hg18
--------GG
"""

0 comments on commit acd405f

Please sign in to comment.