Skip to content

Commit

Permalink
Merge pull request #7 from genome/ins-ctgs
Browse files Browse the repository at this point in the history
rename when inserting haplotigs
remove haplotig that was put into primary ctgs from haplotigs
RI-321
  • Loading branch information
ebelter committed Aug 11, 2018
2 parents af2790e + 3450b2b commit b06842b
Show file tree
Hide file tree
Showing 6 changed files with 129,636 additions and 75 deletions.
147 changes: 90 additions & 57 deletions lib/Pacbio/Command/Assembly/InsertMissingContigs.pm
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use strict;
use warnings 'FATAL';

use Bio::SeqIO;
use List::Util;
use List::MoreUtils;
use Set::Scalar;
use Sx::Index::Fai;
Expand All @@ -30,11 +31,14 @@ class Pacbio::Command::Assembly::InsertMissingContigs {
doc => 'Haplotigs contigs fasta index file. If not given, ".fai" will be appended to the haplotigs contigs fasta file name.',
},
},
has_optional_output => {
output_fasta => {
has_output => {
output_primary_fasta => {
is => 'Text',
default_value => '-',
doc => 'Output fasta file. Defaults to STDOUT.',
doc => 'Output primary contigs fasta file.',
},
output_haplotigs_fasta => {
is => 'Text',
doc => 'Output haplotigs fasta file.',
},
},
has_optional_transient => {
Expand All @@ -47,18 +51,19 @@ sub __init__ {
my ($self) = @_;

for my $type (qw/ primary haplotigs /) {
# input fasta
my $fasta_method = join('_', $type, 'fasta');
$self->fatal_message("File $fasta_method does not exist!") if not -s $self->$fasta_method;
# input fai
my $fai_method = join('_', $type, 'fai');
if ( not $self->$fai_method ) {
$self->$fai_method( join('.', $self->$fasta_method, 'fai') );
}
$self->fatal_message("File $fai_method does not exist!") if not -s $self->$fai_method;
}

my $output_fasta = $self->output_fasta;
if ( $output_fasta and $output_fasta ne '-' ) {
$self->fatal_message("Output file exists: $output_fasta. Please change detination, or remove it.") if -s $output_fasta;
# output fasta
my $method = join('_', 'output', $type, 'fasta');
my $fasta = $self->$method;
$self->fatal_message('Output %s fasta exists: %s. Please change detination, or remove it.', $type, $fasta) if -s $fasta;
}
}

Expand All @@ -70,11 +75,7 @@ sub execute {
my $haplotigs = $self->_get_haplotigs_set_from_fai;
my $missing_ctgs = $haplotigs->difference($ctgs);
$self->fatal_message("No missing contigs found!") if not $missing_ctgs->members;
my $missing_haplotigs = $self->_get_missing_haplotigs($missing_ctgs);
#print Data::Dumper::Dumper([sort $ctgs->members]);
#print Data::Dumper::Dumper([sort $missing_ctgs->members]);
#print Data::Dumper::Dumper([sort $missing_haplotigs->members]);
$self->_write_contigs($ctgs->union($missing_haplotigs));
$self->_write_contigs($ctgs, $missing_ctgs);
1;
}

Expand All @@ -99,67 +100,99 @@ sub _get_haplotigs_set_from_fai {

my $set = Set::Scalar->new;
while ( my $e = $fai->reader->read ) {
my @t = split(/\|/, $e->{id});
$t[0] =~ s/_\d+$//;
$set->insert( join('|', @t) );
}

$set;
}

sub _get_missing_haplotigs {
my ($self, $missing) = @_;

my $fai = $self->_haplotigs_fai;
my $set = Set::Scalar->new;
for my $id ( $missing->members ) {
my @t = split(/\|/, $id);
my $entries = $fai->entries_for_id_regex("^$t[0]");
$self->fatal_message("Failed to get haplotigs for id regex: $id") if not $entries;
my ($max) = sort { $b->{length} <=> $a->{length} } @$entries;
$set->insert($max->{id});
my $haplotig_id_tokens = $self->haplotig_id_tokens($e->{id});
$set->insert( $haplotig_id_tokens->[0].$haplotig_id_tokens->[2] );
}

$set;
}

sub _write_contigs {
my ($self, $contigs) = @_;
my ($self, $ctgs, $missing_ctgs) = @_;

my $pfh = IO::File->new($self->primary_fasta, 'r');
$self->fatal_message('Failed to open primary fasta! %s', $self->primary_fasta) if not $pfh;
my $pseqio = Bio::SeqIO->new(
-fh => $pfh,
# Seq INs
my $p_seqin = Bio::SeqIO->new(
-file => $self->primary_fasta,
-format => 'Fasta',
);

my $hfh = IO::File->new($self->haplotigs_fasta, 'r');
$self->fatal_message('Failed to open haplotigs fasta! %s', $self->haplotigs_fasta) if not $hfh;
my $hseqio = Bio::SeqIO->new(
my $h_seqin = Bio::SeqIO->new(
-fh => $hfh,
-format => 'Fasta',
);

my $output_fasta = $self->output_fasta;
my %seqio_params = ( $self->output_fasta eq '-' )
? ( -fh => \*STDOUT )
: ( -file => ">$output_fasta" );
$seqio_params{'-format'} = 'Fasta';
my $oseqio = Bio::SeqIO->new(%seqio_params);

my $seq;
for my $ctg_id ( sort { $a cmp $b } $contigs->members ) {
my $entry = $self->_haplotigs_fai->entry_for_id($ctg_id);
if ( defined $entry ) {
my $pos = $entry->{offset} - length($ctg_id) - 2;
$hfh->seek($pos, 0);
$seq = $hseqio->next_seq;
# Seq OUTs
my $p_seqout = Bio::SeqIO->new(
-format => 'Fasta',
-file => '>'.$self->output_primary_fasta,
);
my $h_seqout = Bio::SeqIO->new(
-format => 'Fasta',
-file => '>'.$self->output_haplotigs_fasta,
);

for my $ctg_id ( sort { $a cmp $b } ($ctgs->members, $missing_ctgs->members) ) {
if ( $ctgs->has($ctg_id) ) {
# Contig should be next in primary fasta - write it to the output primary fasta
my $seq = $p_seqin->next_seq;
$self->fatal_message('Expected primary contig %s but got %s', $ctg_id, $seq->id) if $ctg_id ne $seq->id;
$p_seqout->write_seq($seq);

# Get entries from haplotig fai for this primary id, write them to haplotig fasta
my $entries = $self->_haplotigs_fai->entries_for_id_regex('^'.$ctg_id.'_');
next if not $entries; # dunno, could happen?
for my $entry ( @$entries ) {
my $seq = $h_seqin->next_seq;
$self->fatal_message('Expected haplotig %s but got %s', $entry->{id}, $seq->id) if $entry->{id} ne $seq->id;
$h_seqout->write_seq($seq);
}
}
else {
$seq = $pseqio->next_seq;
else { # MISSING
# Get entries from haplotig fai, marking the longest one
my $entries = $self->_get_haplotigs_for_primary_ctg_id_marking_longest($ctg_id);
my $i = 1;
for my $entry ( @$entries ) {
my $seq = $h_seqin->next_seq;
$self->fatal_message('Expected haplotig %s but got %s', $entry->{id}, $seq->id) if $entry->{id} ne $seq->id;
if ( exists $entry->{longest} ) { # longest goes to primary using the ctg id
$seq->id($ctg_id);
$p_seqout->write_seq($seq);
}
else { # others go to the output haplotig, re-numbering them
my $haplotig_id_tokens = $self->haplotig_id_tokens($entry->{id});
$seq->id( sprintf('%s_%03d%s', $haplotig_id_tokens->[0], $i, $haplotig_id_tokens->[2]) );
$h_seqout->write_seq($seq);
$i++;
}
}
}
$oseqio->write_seq($seq);
}
}

sub _get_haplotigs_for_primary_ctg_id_marking_longest {
my ($self, $ctg_id) = @_;

my $fai = $self->_haplotigs_fai;
my $entries = $fai->entries_for_id_regex('^'.$ctg_id.'_');
$self->fatal_message('No entries for primary contig id: %s', $ctg_id) if not $entries;

my $lengths = Set::Scalar->new( map { $_->{length} } @$entries );
my $longest_length = List::Util::max( $lengths->members );
my $longest = List::MoreUtils::first_value { $_->{length} == $longest_length } @$entries;
$longest->{longest} = 1;
$entries
}

sub haplotig_id_tokens {
my ($self, $haplotig_id) = @_;
$self->fatal_message("No haplotig id to get tokens!") if not defined $haplotig_id;
my ($ctg_num, $rest) = split(/_/, $haplotig_id, 2);
$self->fatal_message("Could not find haplotig number in id: $haplotig_id") if not $rest;
$rest =~ s/^(\d+)//;
my $num = "$1";
$self->fatal_message("Could not find haplotig number in id: $haplotig_id") if not $num;
[ $ctg_num, $num, $rest ]; # CTG_NUM HAP_NUM REMAINDER
}

1;
2 changes: 1 addition & 1 deletion lib/Sx/Index/Fai.pm
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ sub entries_for_id_regex {
push @e, $reader->read;
}

\@e;
[ sort { $a->{offset} <=> $b->{offset} } @e ];
}

1;
35 changes: 24 additions & 11 deletions t/Pacbio-Command-Assembly-InsertMissingContigs.t
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,12 @@ use warnings 'FATAL';

use TenxTestEnv;

use File::Compare;
use File::Slurp;
use File::Temp;
use Path::Class;
use Test::Exception;
use Test::More tests => 2;
use Test::More tests => 3;

my %test = ( class => 'Pacbio::Command::Assembly::InsertMissingContigs', );
subtest 'setup' => sub{
Expand All @@ -16,33 +19,43 @@ subtest 'setup' => sub{
use_ok($test{class});
$test{data_dir} = TenxTestEnv::test_data_directory_for_class($test{class});
ok(-d $test{data_dir}->stringify, 'data dir exists');
$test{tempdir} = Path::Class::dir( File::Temp::tempdir(CLEANUP => 1) );

};

subtest 'execute' => sub{
plan tests => 6;

#my $output = $test{data_dir}->file('got.fasta')->stringify;
#unlink $output;
plan tests => 7;

my $got_primary = $test{tempdir}->file('p.got.fasta')->stringify;
my $got_haplotigs = $test{tempdir}->file('h.got.fasta')->stringify;
my $cmd = $test{class}->create(
primary_fasta => $test{data_dir}->file('p_ctg.fasta')->stringify,
haplotigs_fasta => $test{data_dir}->file('h_ctg.fasta')->stringify,
#output_fasta => $output,
output_primary_fasta => $got_primary,
output_haplotigs_fasta => $got_haplotigs,
);
ok($cmd, 'create command');

my $output;
open local(*STDOUT), '>', \$output or die $!;
lives_ok(sub{ $cmd->execute }, 'execute');
ok($cmd->result, 'command result');

my $expected_output = File::Slurp::slurp( $test{data_dir}->file('expected.fasta')->stringify );
is($output, $expected_output, 'output fasta matches');
my $expected_primary = $test{data_dir}->file('p.expected.fasta')->stringify;
is(File::Compare::compare($got_primary, $expected_primary), 0, 'output primary fasta matches');
my $expected_haplotigs = $test{data_dir}->file('h.expected.fasta')->stringify;
is(File::Compare::compare($got_haplotigs, $expected_haplotigs), 0, 'output haplotigs fasta matches');

is($cmd->primary_fai, $cmd->primary_fasta.'.fai', 'set primary fai');
is($cmd->haplotigs_fai, $cmd->haplotigs_fasta.'.fai', 'set haplotigs fai');

};

subtest 'haplotig id tokens' => sub{
plan tests => 3;

throws_ok(sub{ $test{class}->haplotig_id_tokens(); } , qr/No haplotig id/, 'fails w/o haplotig id');
throws_ok(sub{ $test{class}->haplotig_id_tokens('000200F|arrow'); } , qr/Could not find haplotig number/, 'fails w/o invalid haplotig id');
my @expected = (qw/ 000200F 002 |arrow /);
is_deeply($test{class}->haplotig_id_tokens('000200F_002|arrow'), \@expected, 'correct primary id');

};

done_testing();
5 changes: 3 additions & 2 deletions t/Sx-Index-Fai.t
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,9 @@ subtest 'entries_for_id_regex' => sub{
my $fai = $test{fai};
throws_ok(sub{ $fai->entries_for_id_regex; }, qr/No id regex given/, 'fails w/o id regex');
lives_ok(sub{ $fai->entries_for_id_regex('blah'); }, 'ok if id not found');
my @e = $fai->entries_for_id_regex('^000000F_004');
is_deeply(@e, [$test{entry}], 'got entries by id regex');
my $expected = [ $fai->entry_for_id('000000F_001|arrow'), $fai->entry_for_id('000000F_004|arrow') ];
my $got = $fai->entries_for_id_regex('^000000F_00[14]');
is_deeply($got, $expected, 'got entries by id regex');

};

Expand Down
Loading

0 comments on commit b06842b

Please sign in to comment.