From fa1c195afce085b47e9092e8eb0f23d17abca266 Mon Sep 17 00:00:00 2001 From: Ben J Woodcroft Date: Wed, 16 Jul 2014 13:49:01 +1000 Subject: [PATCH] version 0.6: use IMG dereplicated, use -j with parallel --- annotateM | 118 +++++++++++++++++++++++++++++++++----------------------------- 1 file changed, 62 insertions(+), 56 deletions(-) diff --git a/annotateM b/annotateM index 87a2cd2..f51d522 100755 --- a/annotateM +++ b/annotateM @@ -2,17 +2,17 @@ ############################################################################### # # annotateM -# -# The idea here is an automated way of annotating your genome based on -# multiple available databases and to produce a tab-delimited file of +# +# The idea here is an automated way of annotating your genome based on +# multiple available databases and to produce a tab-delimited file of # all the annotations, evalues, scores, descriptions. -# +# # Suggested workflow: # 1) run your genome nucleotide fasta file through annotateM # 2) then run post_annotateM to include the contig id,orf_start and end # 3) generate a tab-delimited file # 4) open the file in ms excel or oo calc -# 5) manually curate the annotations based on evalues/scores/desc etc +# 5) manually curate the annotations based on evalues/scores/desc etc # 6) metabolic reconstruction of organism # # Copyright (C) Mohamed Fauzi Haroon @@ -53,7 +53,7 @@ BEGIN { $| = 1; } -# edit here to log all external commands +# edit here to log all external commands my $global_log_commands = 0; # ext command failure levels @@ -67,6 +67,9 @@ use constant { printAtStart(); my $global_options = checkParams(); +# Database paths +my $img_protein_database = '/srv/db/img/4.1/dereplicated/img_dereplicated_species.genes.faa' + ###################################################################### # CODE HERE ###################################################################### @@ -84,27 +87,28 @@ if (! -e "./prokka_annotation/") "--prefix" => $global_options->{'locustag'}, "--kingdom" => $global_options->{'kingdom'}, "--cpus" => $global_options->{'threads'}, - $global_options->{'in'}, + $global_options->{'in'}, }], DIE_ON_FAILURE); } -# identify the ORF called amino acid fasta file +# identify the ORF called amino acid fasta file my $locus = $global_options->{'locustag'}; # blast against img if (! -e "./$locus.faaVSimg.blastp") { print "BLASTing against IMG 4.1 database...............\n"; - checkAndRunCommand("cat", - [[ - "prokka_annotation/$locus.faa |", + checkAndRunCommand("cat", + [[ + "prokka_annotation/$locus.faa |", "parallel", - "--block"=> "100k", + "-j" => $global_options->{'threads'}, + "--block"=> "100k", "--recstart", "'>'", "--pipe", "blastp", - -db => "/srv/db/img/4.1/blastdbs/prokaryotic.genes.finished.faa", + -db => $img_protein_database, -outfmt => 6, -max_target_seqs => 1, -evalue => $global_options->{'evalue'}, @@ -120,7 +124,7 @@ if (! -e "./subsetimg.faaVS$locus.faa.blastp") checkAndRunCommand("contig_extractor.pl", [[ -i => "$locus.faaVSimg.blastp", - -d => "/srv/db/img/4.1/blastdbs/prokaryotic.genes.finished.faa", + -d => $img_protein_database, -b => '', -S => '', -o => "subsetimg.faa", @@ -144,13 +148,14 @@ if (! -e "./subsetimg.faaVS$locus.faa.blastp") ]], DIE_ON_FAILURE); } -# blast against uniref +# blast against uniref if (! -e "./$locus.faaVSuniref90.blastp") { print "BLASTing against latest Uniref90 April2014 database ................\n"; checkAndRunCommand("cat",[[ "prokka_annotation/$locus.faa |", "parallel", + "-j" => $global_options->{'threads'}, "--block"=> "100k", "--recstart", "'>'", @@ -198,6 +203,7 @@ if (! -e "./$locus.faaVSCOG.blastp") checkAndRunCommand("cat",[[ "prokka_annotation/$locus.faa |", "parallel", + "-j" => $global_options->{'threads'}, "--block"=> "100k", "--recstart", "'>'", @@ -218,7 +224,7 @@ if (! -e "./$locus.faaVSPfam-A.hmm.hmmscanned") { print "HMMscanning against latest Pfam 27 database................\n"; checkAndRunCommand("pfam_scan.pl",[[ - -cpu => $global_options->{'threads'}, + -cpu => $global_options->{'threads'}, -e_seq => $global_options->{'evalue'}, -outfile => "$locus.faaVSPfam-A.hmm.hmmscanned", -fasta => "prokka_annotation/$locus.faa", @@ -241,7 +247,7 @@ if (! -e "./$locus.faaVStigr_all.hmm.hmmscanned") "prokka_annotation/$locus.faa", ]], DIE_ON_FAILURE); } - + # convert the hmmscan output to tab delimited checkAndRunCommand("awk",[[ "'{\$1=\$1}{ print }'", @@ -267,7 +273,7 @@ my %imghash2 =(); #my @orfid = (); # read the img blast output and store in hash -# SAMPLE img blast output - +# SAMPLE img blast output - # phycis_04080 649633083|649978419 38.08 1116 640 14 13 1099 1 1094 0.0 663 # phycis_04081 649633083|649980044 28.40 405 237 10 49 422 20 402 3e-27 119 # phycis_04082 649633030|649661236 42.86 259 144 3 1 256 1 258 1e-61 205 @@ -316,7 +322,7 @@ while (<$imgid2names>) foreach my $orfid (keys $access2imgid{$columns[0]}) { #print "$orfid\n"; - $img2reciprocal{$columns[0]} = "$access2imgid{$columns[0]}{$orfid}\t$columns[1]\t$columns[2]"; + $img2reciprocal{$columns[0]} = "$access2imgid{$columns[0]}{$orfid}\t$columns[1]\t$columns[2]"; print {$img_temp_OUT} "$access2imgid{$columns[0]}{$orfid}\t$columns[1]\t$columns[2]\n"; #$img2reciprocal{$columns[0]} = "$orfid\t$columns[1]\t$columns[2]"; #print {$img_temp_OUT} "$orfid\t$columns[1]\t$columns[2]\n"; @@ -346,17 +352,17 @@ while (<$rIMGblast>) if (exists $img2reciprocal{$columns[0]}) { print {$img_temp_OUT2} $img2reciprocal{$columns[0]} . "\treciprocal\n"; - } + } else { print {$img_temp_OUT2} "$columns[0]\t$columns[1]\tNA\tNA\tNA\tNA\tNOT reciprocal\n"; - } - + } + } close($img_temp_OUT2); -# hashes for uniref +# hashes for uniref my %hash4 = (); my %hash5 =(); my %hash6 = (); @@ -559,7 +565,7 @@ while (<$img_annotation>) my @columns = split (/\t/, $_); # my @baba = @columns[1..$#columns]; my @baba = @columns[2..$#columns]; - #print "@baba \n"; + #print "@baba \n"; #$combined_bighash{$columns[0]}->{'img'} = join("\t", @baba); push @{$combined_bighash{$columns[1]}->{'a-img'}}, join("\t", @baba); } @@ -635,7 +641,7 @@ while (<$pfam_annotation>) my @columns = split (/\t/, $_); my @baba = @columns[1..$#columns]; #$combined_bighash{$columns[0]}->{'pfam'} = join("\t", @baba); - push @{$combined_bighash{$columns[0]}->{'e-pfam'}}, join("\t", @baba); + push @{$combined_bighash{$columns[0]}->{'e-pfam'}}, join("\t", @baba); } #print Dumper \%combined_bighash; @@ -654,14 +660,14 @@ while (<$tigrfam_annotation>) # assign key and value for number of expected columns for each annotation type, important for putting NA in missing annotation types my %column_lengths = ( 'a-img' => 5, - 'b-uniref' => 5, + 'b-uniref' => 5, 'c-prokka' => 1, 'd-cog' => 5, - 'e-pfam' => 3, + 'e-pfam' => 3, 'f-tigrfam' => 4, ); -# print the orfids first +# print the orfids first foreach my $ID (sort(keys %combined_bighash)) { print {$FINAL_OUTPUT} "$ID\t"; @@ -669,7 +675,7 @@ foreach my $ID (sort(keys %combined_bighash)) { # if the annotation type does not exist, print NA in the columns depending on the %column_lengths hash values if (! exists $combined_bighash{$ID}->{$annotation_type}) - { + { # cool way of printing a certain string multiple times based on the values in the hash print {$FINAL_OUTPUT} join("\t", ("NA",) x $column_lengths{$annotation_type}), "\t"; } @@ -679,15 +685,15 @@ foreach my $ID (sort(keys %combined_bighash)) { # place to store the columns in the values of the hash annotation types my @storage_array; - foreach my $line (@{$combined_bighash{$ID}->{$annotation_type}}) + foreach my $line (@{$combined_bighash{$ID}->{$annotation_type}}) { # each annotation types have different number of columns, so we need to split the columns first before # we can add in the extra values if lets say an orfid hits multiple pfam/cog/tigrfam values my @values = split("\t",$line); - # cool and alternate way of doing columns[1] = values[1], and so on.., repetitively + # cool and alternate way of doing columns[1] = values[1], and so on.., repetitively # what it basically means as long as the value i less than the number of columns in each annotation type # add +1 to the string $i and do the push below - for (my $i = 0; $i <= $#values; $i++) + for (my $i = 0; $i <= $#values; $i++) { push @{$storage_array[$i]}, $values[$i]; } @@ -696,19 +702,19 @@ foreach my $ID (sort(keys %combined_bighash)) # array to store the multiple hits in each column. eg. test0001 orfid hits multiple pfam values pf0008 & pf0010 # so we would like to have the values combined together in the same column delimited by a comma my @print_info_array; - for (my $i = 0; $i < $column_lengths{$annotation_type}; $i++) + for (my $i = 0; $i < $column_lengths{$annotation_type}; $i++) { push @print_info_array, join("; ", @{$storage_array[$i]}); } #print Dumper(\@print_info_array); print {$FINAL_OUTPUT} join("\t", @print_info_array), "\t"; - } + } else { print {$FINAL_OUTPUT} "$combined_bighash{$ID}{$annotation_type}\t"; - } + } } - print {$FINAL_OUTPUT} "\n"; + print {$FINAL_OUTPUT} "\n"; } #close all files @@ -741,7 +747,7 @@ sub checkParams { my %options; # Add any other command line options, and the code to handle them - # + # GetOptions( \%options, @standard_options ); # if no arguments supplied print the usage and exit @@ -751,7 +757,7 @@ sub checkParams { # If the -help option is set, print the usage and exit # exec("pod2usage $0") if $options{'help'}; - + # Compulsory items #if(!exists $options{''} ) { printParamError (""); } if(!exists $options{'in'} ) { printParamError ("You MUST supply a fasta file"); } @@ -763,8 +769,8 @@ sub printParamError { #----- # What to do if there's something wrong with a parameter - # - my ($error) = @_; + # + my ($error) = @_; print "**ERROR: $0 : $error\n"; exec("pod2usage $0"); } @@ -774,7 +780,7 @@ sub overrideDefault # Set and override default values for parameters # my ($default_value, $option_name) = @_; - if(exists $global_options->{$option_name}) + if(exists $global_options->{$option_name}) { return $global_options->{$option_name}; } @@ -795,7 +801,7 @@ sub openWrite } sub openRead -{ +{ #----- # Open a file for reading # @@ -809,7 +815,7 @@ sub openRead # # checkAndRunCommand("ls", { # -a => "" -# }, +# }, # WARN_ON_FAILURE); sub checkFileExists { @@ -859,15 +865,15 @@ sub checkAndRunCommand # Run external commands more sanelier # my ($cmd, $params, $failure_type) = @_; - + isCommandInPath($cmd, $failure_type); - + # join the parameters to the command my $param_str = join " ", map {formatParams($_)} @{$params}; my $cmd_str = $cmd . " " . $param_str; - - print "The command currently running:\t$cmd_str\n"; + + print "The command currently running:\t$cmd_str\n"; logExternalCommand($cmd_str); # make sure that all went well @@ -879,11 +885,11 @@ sub checkAndRunCommand sub formatParams { #--------- - # Handles and formats the different ways of passing parameters to + # Handles and formats the different ways of passing parameters to # checkAndRunCommand # my $ref = shift; - + if (ref($ref) eq "ARRAY") { return join(" ", @{$ref}); } elsif (ref($ref) eq "HASH") { @@ -914,15 +920,15 @@ sub handleCommandFailure { sub printAtStart { print<<"EOF"; ----------------------------------------------------------------- - +---------------------------------------------------------------- + $0 annotateM - annotate my genome Due to the blast processes against multiple databases, this whole annotation pipeline will usually take awhile. Please be patient! What you get in the end will save you heaps of time. - ----------------------------------------------------------------- + +---------------------------------------------------------------- EOF } @@ -958,13 +964,13 @@ __DATA__ annotateM -i [fasta_file] -l [locus] -k [kingdom] -t [threads] -e [evalue] - -i FASTA_FILE Nucleotide fasta file + -i FASTA_FILE Nucleotide fasta file -l locustag Name of locus tag -k kingdom (Bacteria/Archaea/Phage/Viruses) Kingdom of genome to be annotated -t threads Number of threads - -e evalue Evalue for BLAST, recommended 1e-3 + -e evalue Evalue for BLAST, recommended 1e-3 [-help -h] Displays basic usage information - + =cut