Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Fixed Marker_build and added an option -remove_dup to remove duplicat…

…e taxons
  • Loading branch information...
commit 00a37d509653e35ec521a2aab6295508c7fec94a 1 parent ee48c87
@gjospin authored
View
3  bin/phylosift
@@ -112,8 +112,9 @@ GetOptions(
"coverage=s" => \$ps->{"coverage"}, #provides a contig/scaffold coverage file
"updated_markers=i" => \$ps->{"updated"}, #Indicates if Phylosift uses the updated versions of the Markers.
"marker_url=s" => \$ps->{"marker_url"}, # an alternate address to retrieve markers from
- "extended" => \$ps->{"extended"}, #Should the full extended set of markers be used?
+ "extended" => \$ps->{"extended"}, #Should the full extended set of markers be used?
"config=s" => \$ps->{"configuration"}, #custom config file
+ "remove_dup" => \$ps->{"remove_dup"}, #removes duplicate taxons when using build_marker
"h" => \$help, #prints the usage part of the README file
"help" => \$help, #same as -h
"debug" => \$my_debug, #print debugging messages?
View
299 lib/Phylosift/MarkerBuild.pm
@@ -14,6 +14,7 @@ Phylosift::MarkerBuild - build a seed marker package from an existing multiple s
Version 0.01
=cut
+
our $VERSION = '0.01';
=head1 SYNOPSIS
@@ -37,56 +38,76 @@ sub build_marker {
my $self = $args{self} || miss("self");
my $aln_file = $args{alignment} || miss("alignment");
my $cutoff = $args{cutoff} || miss("cutoff");
- my $force = $args{force}; #force is not a required option
- my $mapping = $args{mapping}; #not a required argument
+ my $force = $args{force}; #force is not a required option
+ my $mapping = $args{mapping}; #not a required argument
my ( $core, $path, $ext ) = fileparse( $aln_file, qr/\.[^.]*$/ );
- my $marker_dir = Phylosift::Utilities::get_data_path( data_name => "markers", data_path => $Phylosift::Settings::marker_path );
+ my $marker_dir = Phylosift::Utilities::get_data_path(
+ data_name => "markers",
+ data_path => $Phylosift::Settings::marker_path
+ );
my $target_dir = $marker_dir . "/$core";
#$target_dir = $self->{"fileDir"};
if ( -e $target_dir && !$force ) {
croak
"Marker already exists in $marker_dir. Delete Marker and restart the marker build.\nUse -f to force an override of the previous marker.\nUsage:\n>phylosift build_marker -f aln_file cutoff\n";
- } else {
+ }
+ else {
- #`rm -rf "$target_dir"` if $force;
- #`mkdir "$target_dir"`;
+ `rm -rf "$target_dir"` if $force;
+ `mkdir "$target_dir"`;
}
my $fasta_file = "$target_dir/$core.fasta";
- my $seq_count = Phylosift::Utilities::unalign_sequences( aln => $aln_file, output_path => $fasta_file );
+ my $seq_count = Phylosift::Utilities::unalign_sequences(
+ aln => $aln_file,
+ output_path => $fasta_file
+ );
my $masked_aln = "$target_dir/$core.masked";
mask_aln( file => $aln_file, output => $masked_aln ) unless -e $masked_aln;
my $hmm_file = "$target_dir/$core.hmm";
- generate_hmm( file_name => $aln_file, hmm_name => $hmm_file ) unless -e $hmm_file;
- Phylosift::Utilities::fasta2stockholm( fasta => $aln_file, output => "$target_dir/$core.stk" ) unless -e "$target_dir/$core.stk";
+ generate_hmm( file_name => $aln_file, hmm_name => $hmm_file )
+ unless -e $hmm_file;
+ Phylosift::Utilities::fasta2stockholm(
+ fasta => $aln_file,
+ output => "$target_dir/$core.stk"
+ ) unless -e "$target_dir/$core.stk";
my $stk_aln = "$target_dir/$core.stk";
#may need to create an unaligned file for the sequences before aligning them
my $new_alignment_file = hmmalign_to_model(
- hmm_profile => $hmm_file,
- sequence_file => $fasta_file,
- target_dir => $target_dir,
- reference_alignment => $stk_aln,
- sequence_count => $seq_count
+ hmm_profile => $hmm_file,
+ sequence_file => $fasta_file,
+ target_dir => $target_dir,
+ reference_alignment => $stk_aln,
+ sequence_count => $seq_count
);
my $clean_aln = "$target_dir/$core.clean";
- my %id_map = mask_and_clean_alignment( alignment_file => $new_alignment_file, output_file => $clean_aln );
+ my %id_map = mask_and_clean_alignment(
+ alignment_file => $new_alignment_file,
+ output_file => $clean_aln
+ );
debug( "ID_map is " . scalar( keys(%id_map) ) . " long\n" );
- my @array = keys(%id_map);
- my $fasttree_file = "$target_dir/18s_reps.tree";
- my $tree_log_file = "$target_dir/18s_reps.log";
-
- #my ( $fasttree_file, $tree_log_file ) = generate_fasttree( alignment_file => $clean_aln, target_directory => $target_dir ) unless -e "$target_dir/$core.tree";
- #need to generate representatives using PDA
- #my $rep_file = "$target_dir/18s_reps.tree";
- my $rep_file = get_representatives_from_tree( tree => $fasttree_file, target_directory => $target_dir, cutoff => $cutoff )
- unless -e "$target_dir/$core.pda";
-
- #Phylosift::Utilities::generate_hmm();
- #need to read the representatives picked by PDA and generate a representative fasta file
- #my $rep_fasta = "$target_dir/18s_reps.rep";
- my $rep_fasta = get_fasta_from_pda_representatives( pda_file => $rep_file, target_dir => $target_dir, fasta_reference => $fasta_file, id_map => \%id_map )
- unless -e "$target_dir/$core.rep";
+ my @array = keys(%id_map);
+
+ my ( $fasttree_file, $tree_log_file ) = generate_fasttree(
+ alignment_file => $clean_aln,
+ target_directory => $target_dir
+ ) unless -e "$target_dir/$core.tree";
+
+ #need to generate representatives using PDA
+ my $rep_file = get_representatives_from_tree(
+ tree => $fasttree_file,
+ target_directory => $target_dir,
+ cutoff => $cutoff
+ ) unless -e "$target_dir/$core.pda";
+
+#need to read the representatives picked by PDA and generate a representative fasta file
+ my $rep_fasta = get_fasta_from_pda_representatives(
+ pda_file => $rep_file,
+ target_dir => $target_dir,
+ fasta_reference => $fasta_file,
+ id_map => \%id_map
+ ) unless -e "$target_dir/$core.rep";
if ( defined $mapping ) {
my $tmp_jplace = $target_dir . "/" . $core . ".tmpread.jplace";
my $mangled_jplace = $tmp_jplace . ".mangled";
@@ -100,23 +121,39 @@ sub build_marker {
unless -e "target_dir/temp_ref";
#make a dummy jplace file
- #my $tmpread_file = "$target_dir/18s_reps.tmpread";
- my $tmpread_file = Phylosift::UpdateDB::create_temp_read_fasta( file => "$target_dir/$core", aln_file => $clean_aln )
- unless -e "$target_dir/$core.tmpread.fasta";
- `cd "$target_dir";pplacer -c temp_ref -p "$tmpread_file"` unless -e $tmp_jplace;
+ my $tmpread_file = Phylosift::UpdateDB::create_temp_read_fasta(
+ file => "$target_dir/$core",
+ aln_file => $clean_aln
+ ) unless -e "$target_dir/$core.tmpread.fasta";
+ `cd "$target_dir";pplacer -c temp_ref -p "$tmpread_file"`
+ unless -e $tmp_jplace;
jplace_mangler( in => $tmp_jplace, out => $mangled_jplace );
#create a file with a list of IDs
- my @taxon_array = generate_id_to_taxonid_map( id_hash_ref => \%id_map, map_file => $mapping, out_file => $id_taxon_map, marker => $core );
- debug "ID_MAP : " . scalar( keys(%id_map) ) . "\n";
- Phylosift::UpdateDB::make_ncbi_subtree( out_file => $ncbi_sub_tree, taxon_ids => \@taxon_array );
+ my %taxon_hash = generate_id_to_taxonid_map(
+ self => $self,
+ id_hash_ref => \%id_map,
+ map_file => $mapping,
+ out_file => $id_taxon_map,
+ marker => $core
+ );
+ my @taxon_array = keys(%taxon_hash);
+ debug "TAXON_ARRAY " . scalar(@taxon_array) . "\n";
+ Phylosift::UpdateDB::make_ncbi_subtree(
+ out_file => $ncbi_sub_tree,
+ taxon_ids => \@taxon_array
+ );
+ debug "AFTER ncbi_subtree\n";
`readconciler $ncbi_sub_tree $mangled_jplace $id_taxon_map $taxon_map`;
- `rm -rf "$tmpread_file" "$mangled_jplace" "$tmp_jplace" "$id_taxon_map" "$ncbi_sub_tree" "$target_dir/temp_ref"`;
+ debug "AFTER readconciler\n";
+
+`rm -rf "$tmpread_file" "$mangled_jplace" "$tmp_jplace" "$id_taxon_map" "$ncbi_sub_tree" "$target_dir/temp_ref"`;
}
- #use taxit to create a new reference package required for running PhyloSift
- #needed are : 1 alignment file, 1 representatives fasta file, 1 hmm profile, 1 tree file, 1 log tree file.
+#use taxit to create a new reference package required for running PhyloSift
+#needed are : 1 alignment file, 1 representatives fasta file, 1 hmm profile, 1 tree file, 1 log tree file.
`cd "$target_dir";taxit create -c -d "Creating a reference package for PhyloSift for the $core marker" -l "$core" -f "$clean_aln" -t "$target_dir/$core.tree" -s "$target_dir/$core.log" -Y FastTree -P "$core"`;
+
`rm "$target_dir/$core.pda"`;
`rm "$target_dir/$core.tree"`;
`rm "$target_dir/$core.log"`;
@@ -137,6 +174,7 @@ Also returns a list of Taxon IDs
sub generate_id_to_taxonid_map {
my %args = @_;
+ my $self = $args{self} || miss("PS object");
my $list_ref = $args{id_hash_ref} || miss("Hash for seqID to unique IDs");
my $map_file = $args{map_file} || miss("seqID to TaxonID file");
my $out_file = $args{out_file} || miss("ID_taxon mapping filename");
@@ -144,19 +182,41 @@ sub generate_id_to_taxonid_map {
my %map_hash = %{$list_ref};
my $FHOUT = Phylosift::Utilities::ps_open( ">" . $out_file );
my $FHIN = Phylosift::Utilities::ps_open($map_file);
- my @taxon_hash = ();
my @test_array = keys(%map_hash);
+ my %return_hash = ();
+
while (<$FHIN>) {
chomp($_);
- my @line = split( / /, $_ );
- if ( exists $map_hash{ $line[0] } ) { #&& $line[1] ne 'na'
- print $FHOUT $marker . "\t" . $line[1] . "\t" . $map_hash{ $line[0] } . "\n";
- push( @taxon_hash, $line[1] );
+ my @line = split( /\s/, $_ );
+ $_ =~ m/^(\S+)\s+(\S+)$/;
+ my $id = $1;
+ my $taxonid = $2;
+ if ( exists $map_hash{$id} ) {
+ print $FHOUT $marker . "\t"
+ . $taxonid . "\t"
+ . $map_hash{$id} . "\n";
+ if ( exists $return_hash{$taxonid} ) {
+ $return_hash{$taxonid}++;
+ }
+ else {
+ $return_hash{$taxonid} = 1;
+ }
}
#no need to print anything otherwise
}
- return @taxon_hash;
+
+ #removing any taxon that was seen more than once
+ if ( $self->{remove_dup} ) {
+ debug "Removing duplicate Taxons\n";
+ foreach my $key ( sort ( keys(%return_hash) ) ) {
+ if ( $return_hash{$key} > 1 ) {
+ delete( $return_hash{$key} );
+ }
+ }
+ }
+ debug "Return_hash has : " . scalar( keys(%return_hash) ) . " taxons\n";
+ return %return_hash;
}
=head2 jplace_mangler
@@ -202,21 +262,21 @@ sub hmmalign_to_model {
my $hmm_profile = $args{hmm_profile} || miss("hmm_profile");
my $sequence_file = $args{sequence_file} || miss("sequence_file");
my $target_dir = $args{target_dir} || miss("target_dir");
- my $ref_ali = $args{reference_alignment} || miss("reference_alignment");
- my $seq_count = $args{sequence_count} || miss("sequence_count");
+ my $ref_ali = $args{reference_alignment} || miss("reference_alignment");
+ my $seq_count = $args{sequence_count} || miss("sequence_count");
my ( $core_name, $path, $ext ) = fileparse( $sequence_file, qr/\.[^.]*$/ );
- #my $ALNOUT = ps_open(">$target_dir/$core_name.aln");
- #my $ALNIN = ps_open("$Phylosift::Utilities::hmmalign --mapali $ref_ali --trim --outformat afa $hmm_profile $sequence_file |");
- my $ALNIN = ps_open("$target_dir/18s_reps.aln");
- my $s = 0;
+ my $ALNOUT = ps_open(">$target_dir/$core_name.aln");
+ my $ALNIN = ps_open(
+"$Phylosift::Utilities::hmmalign --mapali $ref_ali --trim --outformat afa $hmm_profile $sequence_file |"
+ );
+ my $s = 0;
while ( my $line = <$ALNIN> ) {
if ( $line =~ /^>/ ) {
$s++;
last if $s > $seq_count;
}
-
- #print $ALNOUT $line;
+ print $ALNOUT $line;
}
close $ALNOUT;
return "$target_dir/$core_name.aln";
@@ -233,20 +293,21 @@ sub mask_and_clean_alignment {
my %args = @_;
my $aln_file = $args{alignment_file} || miss("alignment_file");
my $output_file = $args{output_file} || miss("output_file");
- my %id_map; # will store a map of unique IDs to sequence names
+ my %id_map; # will store a map of unique IDs to sequence names
debug "Using $aln_file\n";
- my $in = Phylosift::Utilities::open_SeqIO_object( file => $aln_file );
- my %s = (); #hash remembering the IDs already printed
+ my $in = Phylosift::Utilities::open_SeqIO_object( file => $aln_file );
+ my %s = (); #hash remembering the IDs already printed
my $FILEOUT = ps_open(">$output_file");
my $seq_counter = 0;
+
while ( my $seq_object = $in->next_seq() ) {
my $seq = $seq_object->seq;
my $id = $seq_object->id;
my $unique_id = sprintf( "%09d", $seq_counter++ );
$id_map{$id} = $unique_id;
- $id =~ s/\(\)//g; #removes ( and ) from the header lines
- $seq =~ s/[a-z]//g; # lowercase chars didnt align to model
- $seq =~ s/\.//g; # shouldnt be any dots
+ $id =~ s/\(\)//g; #removes ( and ) from the header lines
+ $seq =~ s/[a-z]//g; # lowercase chars didnt align to model
+ $seq =~ s/\.//g; # shouldnt be any dots
print $FILEOUT ">" . $unique_id . "\n" . $seq . "\n";
}
@@ -270,10 +331,11 @@ sub generate_fasttree {
my %type = %{ Phylosift::Utilities::get_sequence_input_type($aln_file) };
if ( $type{seqtype} eq "dna" ) {
debug "DNA alignment detected\n";
- `$Phylosift::Utilities::fasttree -nt -gtr -log "$target_dir/$core.log" "$aln_file" > "$target_dir/$core.tree" 2> /dev/null`;
- } else {
+`$Phylosift::Utilities::fasttree -nt -gtr -log "$target_dir/$core.log" "$aln_file" > "$target_dir/$core.tree" 2> /dev/null`;
+ }
+ else {
debug "NOT DNA detected\n";
- `$Phylosift::Utilities::fasttree -log "$target_dir/$core.log" "$aln_file" > "$target_dir/$core.tree" 2> /dev/null`;
+`$Phylosift::Utilities::fasttree -log "$target_dir/$core.log" "$aln_file" > "$target_dir/$core.tree" 2> /dev/null`;
}
return ( "$target_dir/$core.tree", "$target_dir/$core.log" );
}
@@ -294,7 +356,8 @@ sub get_representatives_from_tree {
#get the number of taxa in the tree
my $taxa_count = 0;
- my $input_tree = new Bio::TreeIO( -file => $tree_file, -format => "newick" );
+ my $input_tree =
+ new Bio::TreeIO( -file => $tree_file, -format => "newick" );
while ( my $tree = $input_tree->next_tree ) {
for my $node ( $tree->get_nodes ) {
if ( $node->is_Leaf ) {
@@ -303,9 +366,10 @@ sub get_representatives_from_tree {
}
}
- #pda doesn't seem to want to run if $taxa_count is the number of leaves. Decrementing to let pda do the search.
+#pda doesn't seem to want to run if $taxa_count is the number of leaves. Decrementing to let pda do the search.
$taxa_count--;
- my $pda_cmd = "cd \"$target_dir\";$Phylosift::Utilities::pda -g -k $taxa_count -minlen $cutoff \"$tree_file\" \"$target_dir/$core.pda\"";
+ my $pda_cmd =
+"cd \"$target_dir\";$Phylosift::Utilities::pda -g -k $taxa_count -minlen $cutoff \"$tree_file\" \"$target_dir/$core.pda\"";
`$pda_cmd`;
return "$target_dir/$core.pda";
}
@@ -334,9 +398,13 @@ sub get_fasta_from_pda_representatives {
chomp($_);
if ( $_ =~ m/optimal PD set has (\d+) taxa/ ) {
$taxa_number = $1;
- } elsif ( $_ =~ m/Corresponding sub-tree:/ ) {
+ }
+ elsif ( $_ =~ m/Corresponding sub-tree:/ ) {
last;
- } elsif ( $taxa_number != 0 && scalar( keys(%selected_taxa) ) < $taxa_number ) {
+ }
+ elsif ( $taxa_number != 0
+ && scalar( keys(%selected_taxa) ) < $taxa_number )
+ {
$_ =~ m/^(\S+)$/;
$selected_taxa{$1} = 1;
}
@@ -344,9 +412,15 @@ sub get_fasta_from_pda_representatives {
close($REPSIN);
my @lect = keys(%selected_taxa);
- #reading the reference sequences and printing the selected representatives using BioPerl
- my $reference_seqs = Phylosift::Utilities::open_SeqIO_object( file => $reference_fasta, format => "FASTA" );
- my $representatives_fasta = Phylosift::Utilities::open_SeqIO_object( file => ">$target_dir/$core.rep", format => "FASTA" );
+#reading the reference sequences and printing the selected representatives using BioPerl
+ my $reference_seqs = Phylosift::Utilities::open_SeqIO_object(
+ file => $reference_fasta,
+ format => "FASTA"
+ );
+ my $representatives_fasta = Phylosift::Utilities::open_SeqIO_object(
+ file => ">$target_dir/$core.rep",
+ format => "FASTA"
+ );
while ( my $ref_seq = $reference_seqs->next_seq ) {
if ( exists $selected_taxa{ $id_map{ $ref_seq->id } } ) {
$representatives_fasta->write_seq($ref_seq);
@@ -367,7 +441,11 @@ sub mask_aln {
my $outfile = $args{output} || miss("output");
my $gap_cutoff = 10;
my $cutoff = 10;
- my $maskcont = martin_mask( input_file => $args{file}, cutoff => $cutoff, opt_g => $gap_cutoff );
+ my $maskcont = martin_mask(
+ input_file => $args{file},
+ cutoff => $cutoff,
+ opt_g => $gap_cutoff
+ );
my %maskseq;
my @ori_order;
$maskcont =~ s/^>//;
@@ -408,7 +486,8 @@ sub mask_aln {
sub martin_mask {
my %args = @_;
- my ( $input_file, $cutoff, $opt_g ) = ( $args{input_file}, $args{cutoff}, $args{opt_g} );
+ my ( $input_file, $cutoff, $opt_g ) =
+ ( $args{input_file}, $args{cutoff}, $args{opt_g} );
if ( !( length($opt_g) > 1 ) ) { $opt_g = 101; }
my $self = {};
$self->{inputfile} = $input_file;
@@ -435,10 +514,12 @@ sub read_matrix {
$_ = shift(@mx);
if (/amino_acid_order = "(.+)"/) {
@aa = split //, $1;
- } elsif (/MATRIX (.+)\[/) {
+ }
+ elsif (/MATRIX (.+)\[/) {
$matrix_name = $1;
$j = 0;
- } elsif (/\d,/) {
+ }
+ elsif (/\d,/) {
s/(\s|\}|;)//g;
my @score = split /,/;
for ( $i = 0 ; $i <= $#score ; $i++ ) {
@@ -452,7 +533,8 @@ sub read_matrix {
for my $pair ( keys %{ $matrix{$key} } ) {
if ( !$min{$key} ) {
$min{$key} = $matrix{$key}{$pair};
- } elsif ( $matrix{$key}{$pair} < $min{$key} ) {
+ }
+ elsif ( $matrix{$key}{$pair} < $min{$key} ) {
$min{$key} = $matrix{$key}{$pair};
}
}
@@ -482,10 +564,12 @@ sub read_alignment {
if (/%([\S]+)/) {
$id = $1;
push( @order, $id );
- } elsif (/>([\S]+)/) {
+ }
+ elsif (/>([\S]+)/) {
$id = $1;
push( @order, $id );
- } else {
+ }
+ else {
$_ =~ s/\s+//g;
$_ =~ s/\./-/g; #AED: treat . and ? as gaps
$_ =~ s/\?/-/g;
@@ -529,13 +613,15 @@ sub calculate_score {
if ( $seqArray{$sequence}[$i] ne "-" ) {
$freq{ $seqArray{$sequence}[$i] }++;
$number++;
- } else {
+ }
+ else {
$number_gap++;
}
}
for my $aa1 (@aa) {
for my $aps (@aa) {
- $profile{$aa1} += $freq{$aps} * $matrix{"gon250mt"}{"$aa1$aps"} if exists $freq{$aps};
+ $profile{$aa1} += $freq{$aps} * $matrix{"gon250mt"}{"$aa1$aps"}
+ if exists $freq{$aps};
}
# $profile{$aa1} /= $number;
@@ -560,20 +646,24 @@ sub calculate_score {
}
if ($number) {
$dist_mean /= $number;
- } else {
+ }
+ else {
$dist_mean = 0;
}
my @sort = sort { $a <=> $b } ( values %dist );
my $t = $number % 2;
if ( $t == 0 ) {
- $dist_median = ( $sort[ $number / 2 - 1 ] + $sort[ $number / 2 ] ) / 2;
- } else {
+ $dist_median =
+ ( $sort[ $number / 2 - 1 ] + $sort[ $number / 2 ] ) / 2;
+ }
+ else {
$dist_median = $sort[ ( $number - 1 ) / 2 ];
}
- # $column_score2{$i} = exp (-$dist_mean/4) * 100 *$number/$numseq;
- # $column_score{$i} = exp (-$dist_median/1.82) * 100 * ($number/$numseq); # 1.82 make total random alignment score 1.0
- $column_score{$i} = exp( -$dist_median / 3 ) * 100 * ( $number / $numseq );
+# $column_score2{$i} = exp (-$dist_mean/4) * 100 *$number/$numseq;
+# $column_score{$i} = exp (-$dist_median/1.82) * 100 * ($number/$numseq); # 1.82 make total random alignment score 1.0
+ $column_score{$i} =
+ exp( -$dist_median / 3 ) * 100 * ( $number / $numseq );
my $gap_percent = $number_gap / $numseq * 100;
my $gap_cutoff = $self->{gap_cutoff};
if ( ( $gap_cutoff > 0.00001 ) && ( $gap_percent > $gap_cutoff ) ) {
@@ -602,28 +692,40 @@ sub mask {
for my $i ( 0 .. $seqlength - 1 ) {
if ( $i <= 2 or $i >= $seqlength - 3 ) {
$local_score{$i} = $column_score{$i};
- } else {
+ }
+ else {
$local_score{$i} =
( $column_score{ $i - 3 } +
- 2 * $column_score{ $i - 2 } +
- 3 * $column_score{ $i - 1 } +
- 4 * $column_score{$i} +
- 3 * $column_score{ $i + 1 } +
- 2 * $column_score{ $i + 2 } +
- 1 * $column_score{ $i + 3 } ) / 16;
+ 2 * $column_score{ $i - 2 } +
+ 3 * $column_score{ $i - 1 } +
+ 4 * $column_score{$i} +
+ 3 * $column_score{ $i + 1 } +
+ 2 * $column_score{ $i + 2 } +
+ 1 * $column_score{ $i + 3 } ) / 16;
}
if ( $column_score{$i} == 0 ) { $local_score{$i} = 0; }
elsif ( $local_score{$i} / $column_score{$i} > 3 ) {
- my $score_l = $column_score{ $i - 3 } + $column_score{ $i - 2 } + $column_score{ $i - 1 };
- my $score_r = $column_score{ $i + 1 } + $column_score{ $i + 2 } + $column_score{ $i + 3 };
- if ( ($score_r) && ( $score_l / $score_r > 20 or $score_l / $score_r < 0.05 ) ) {
+ my $score_l =
+ $column_score{ $i - 3 } +
+ $column_score{ $i - 2 } +
+ $column_score{ $i - 1 };
+ my $score_r =
+ $column_score{ $i + 1 } +
+ $column_score{ $i + 2 } +
+ $column_score{ $i + 3 };
+ if ( ($score_r)
+ && ( $score_l / $score_r > 20 or $score_l / $score_r < 0.05 ) )
+ {
$local_score{$i} = $column_score{$i};
}
}
###########################################cutoff #######################################
- if ( ( $local_score{$i} >= $cutoff ) && ( !( $self->{rid_gap}->{$i} ) ) ) {
+ if ( ( $local_score{$i} >= $cutoff )
+ && ( !( $self->{rid_gap}->{$i} ) ) )
+ {
$mask .= "1";
- } else {
+ }
+ else {
$mask .= "0";
}
@@ -744,4 +846,5 @@ See http://dev.perl.org/licenses/ for more information.
=cut
+
1; # End of Phylosift::MarkerBuild.pm
View
2  lib/Phylosift/Phylosift.pm
@@ -151,7 +151,7 @@ sub run {
read_phylosift_config( self => $self );
run_program_check( self => $self );
Phylosift::Utilities::data_checks( self => $self ); # unless $self->{"mode"} eq 'sim';
- file_check( self => $self ) unless $self->{"mode"} eq 'index' || $self->{"mode"} eq 'sim';
+ file_check( self => $self ) unless $self->{"mode"} eq 'index' || $self->{"mode"} eq 'sim' || $self->{"mode"} eq 'build_marker';
directory_prep( self => $self, force => $force ) unless $self->{"mode"} eq 'index' ;
$self->{"readsFile"} = prep_isolate_files( self => $self, file => $self->{"readsFile"} ) if $self->{"isolate"} == 1;
View
61 lib/Phylosift/Simulations.pm
@@ -266,19 +266,6 @@ sub get_genome_ids_from_pda {
return @taxa;
}
-sub pick_new_genomes() {
- my $date_cutoff = 20120101;
- my $DETAILS = ps_open("/share/eisen-d2/amphora2/ebi/bacteria.details.txt");
- my %new_genomes;
- my $line = <$DETAILS>;
- while($line = <$DETAILS>){
- chomp $line;
- my ( $acc, $version, $date, $taxid, $description ) = split( /\t/, $line );
- $new_genomes{$taxid} = 1 if($version == 1 && $date > $date_cutoff);
- }
- return \%new_genomes;
-}
-
=head2 knockout_genomes
Prints 2 files in the fileDir for the PS object
@@ -303,36 +290,28 @@ sub knockout_genomes() {
my %ko_taxa;
my $abund_sum=0;
my $nummer = $num_picked;
- my $new_genomes = pick_new_genomes();
- my $use_maxpd = 0;
-
+
for ( my $i = 0 ; $i < $num_picked/2 ; ) {
+ # pick one from the maxpd set
+ my $rand_index = int( rand( scalar(@top_array) - 1 ) );
+ my $rand_taxon = $top_array[$rand_index];
+ $top_array[$rand_index] = $list[$nummer++];
+ debug "picked $rand_taxon, i $i\n";
+ next unless defined($taxon_genome_files->{$rand_taxon}); # don't add this one unless its available
+ $ko_taxa{$rand_taxon}=rand(1000); # assign a uniformly random abundance
+ $abund_sum += $ko_taxa{$rand_taxon};
+ splice( @top_array, $rand_index, 1 ); # remove this one so it doesn't get resampled
+ $i++;
+ }
+ for ( my $i = $num_picked/2 ; $i < $num_picked ; ) {
# pick one uniformly at random
my $rand_taxon = $list[ int( rand($list_length) ) ];
debug "picked $rand_taxon, i $i\n";
- next unless defined($taxon_genome_files->{$rand_taxon}); # is it available on disk?
- next unless defined($new_genomes->{$rand_taxon}); # is it new enough?
- next if defined($ko_taxa{$rand_taxon}); # do we already have this one?
+ next unless defined($taxon_genome_files->{$rand_taxon});
$ko_taxa{$rand_taxon}=rand(1000);
$abund_sum += $ko_taxa{$rand_taxon};
$i++;
}
- if($use_maxpd){
- for ( my $i = $num_picked/2 ; $i < $num_picked ; ) {
- # pick one from the maxpd set
- my $rand_index = int( rand( scalar(@top_array) - 1 ) );
- my $rand_taxon = $top_array[$rand_index];
- $top_array[$rand_index] = $list[$nummer++];
- croak("ran out of genomes, relax your constraints!") if $nummer == @list;
- debug "picked $rand_taxon, i $i\n";
- next unless defined($taxon_genome_files->{$rand_taxon}); # don't add this one unless its available
- next unless defined($new_genomes->{$rand_taxon}); # don't add unless it's new enough
- $ko_taxa{$rand_taxon}=rand(1000); # assign a uniformly random abundance
- $abund_sum += $ko_taxa{$rand_taxon};
- splice( @top_array, $rand_index, 1 ); # remove this one so it doesn't get resampled
- $i++;
- }
- }
foreach my $taxon(keys(%ko_taxa)){
print $KO "$taxon\n";#.($ko_taxa{$taxon}/$abund_sum)."\n";
}
@@ -416,8 +395,6 @@ sub find_neighborhoods {
# now for each deleted node, find it's ancestral valid node & record some stats
my %stats;
my $SIMSTATS = ps_open(">".$self->{"fileDir"} . "/sim_stats.txt");
- print $SIMSTATS "# TaxonID\tParentID\tParentDistance\tParentToGrandParent\tParentOtherChild\tMinLeafDist\n";
-
foreach my $node(@target_nodes){
my $parent=$node;
my $p_len = $node->get_branch_length();
@@ -442,18 +419,10 @@ sub find_neighborhoods {
}
$c = $new_c;
}
- # get distance to nearest valid leaf
- my $min_leaf_dist = -1;
- foreach my $lnode ( @{ $tree->get_entities } ) {
- next unless (@{ $lnode->get_children() } < 2);
- next unless $dvalid{$lnode};
- my $ldist = $lnode->calc_patristic_distance($node);
- $min_leaf_dist = $ldist if($min_leaf_dist < 0 || $ldist < $min_leaf_dist);
- }
my $tid = $id_to_taxon->{$node->get_name()};
$stats{$node} = [$tid, $parent, $p_len, $pp_len, $c_dist];
# seems like there's some problem with c_dist
- print $SIMSTATS join("\t", $tid, $parent, $p_len, $pp_len, $c_dist, $min_leaf_dist)."\n";
+ print $SIMSTATS join("\t", $tid, $parent, $p_len, $pp_len, $c_dist)."\n";
}
my @dn = keys(%del_nodes);
return \@dn;
View
11 lib/Phylosift/UpdateDB.pm
@@ -612,16 +612,19 @@ sub make_ncbi_subtree {
my %tidnodes;
my $phylotree = Bio::Phylo::Forest::Tree->new();
my $ncbi_count = 0;
+ my $count=0;
+
foreach my $tid (@taxonids) {
+ debug "COULD NOT FIND : " .$tid ."\n" unless (defined($idnamemap{$tid}));
next unless(defined($merged->{$tid}) || defined($idnamemap{$tid})); # ensure the id actually exists in NCBI's db
next if ( $tid eq "" );
$ncbi_count++;
my @children;
while ( defined($tid) ) {
-
+
# check if we've already seen this one
last if ( defined( $tidnodes{$tid} ) );
-
+ #debug "ADDING $tid\n";
# process any merging that may have been done
my @mtid;
push( @mtid, $tid );
@@ -643,8 +646,10 @@ sub make_ncbi_subtree {
if ( defined( $parentid ) && defined( $tidnodes{$parentid} ) ){
$newnode = Bio::Phylo::Forest::Node->new( -parent => $tidnodes{$parentid}, -name => $mnode ) ;
}else{
+
$newnode = Bio::Phylo::Forest::Node->new( -name => $mnode );
}
+ $count++;
$tidnodes{$mnode} = $newnode;
# add all children to the new node
@@ -661,6 +666,8 @@ sub make_ncbi_subtree {
}
}
# if there's something in the tree, write it out
+ debug "NCBI COUNT : $ncbi_count\n";
+ debug "Making $count Nodes\n";
if($ncbi_count>0){
my $TREEOUT = ps_open( ">$out_file" );
print $TREEOUT $phylotree->to_newick( "-nodelabels" => 1 );
View
2  tools/readconciler.cpp
@@ -390,6 +390,8 @@ void reconcile( PhyloTree< TreeNode >& reftree, string treefile, unordered_multi
// cout << "found edge " << pg.edge_array[i].first << "\n";
mapout << edgenum_map[pg.edge_array[i].first] << "\t" << refnodename << endl;
}
+ }else{
+ cerr << "Mapping too ambiguous for node " << i << endl;
}
// if(pg_splitlist[i].count() == 1)
// return;
Please sign in to comment.
Something went wrong with that request. Please try again.