Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

Already on GitHub? Sign in to your account

version 0.6: use IMG dereplicated, use -j with parallel #1

Merged
merged 1 commit into from Jul 16, 2014
Jump to file or symbol
Failed to load files and symbols.
+62 −56
Split
View
118 annotateM
@@ -2,17 +2,17 @@
###############################################################################
#
# annotateM
-#
-# The idea here is an automated way of annotating your genome based on
-# multiple available databases and to produce a tab-delimited file of
+#
+# The idea here is an automated way of annotating your genome based on
+# multiple available databases and to produce a tab-delimited file of
# all the annotations, evalues, scores, descriptions.
-#
+#
# Suggested workflow:
# 1) run your genome nucleotide fasta file through annotateM
# 2) then run post_annotateM to include the contig id,orf_start and end
# 3) generate a tab-delimited file
# 4) open the file in ms excel or oo calc
-# 5) manually curate the annotations based on evalues/scores/desc etc
+# 5) manually curate the annotations based on evalues/scores/desc etc
# 6) metabolic reconstruction of organism
#
# Copyright (C) Mohamed Fauzi Haroon
@@ -53,7 +53,7 @@ BEGIN {
$| = 1;
}
-# edit here to log all external commands
+# edit here to log all external commands
my $global_log_commands = 0;
# ext command failure levels
@@ -67,6 +67,9 @@ use constant {
printAtStart();
my $global_options = checkParams();
+# Database paths
+my $img_protein_database = '/srv/db/img/4.1/dereplicated/img_dereplicated_species.genes.faa'
+
######################################################################
# CODE HERE
######################################################################
@@ -84,27 +87,28 @@ if (! -e "./prokka_annotation/")
"--prefix" => $global_options->{'locustag'},
"--kingdom" => $global_options->{'kingdom'},
"--cpus" => $global_options->{'threads'},
- $global_options->{'in'},
+ $global_options->{'in'},
}], DIE_ON_FAILURE);
}
-# identify the ORF called amino acid fasta file
+# identify the ORF called amino acid fasta file
my $locus = $global_options->{'locustag'};
# blast against img
if (! -e "./$locus.faaVSimg.blastp")
{
print "BLASTing against IMG 4.1 database...............\n";
- checkAndRunCommand("cat",
- [[
- "prokka_annotation/$locus.faa |",
+ checkAndRunCommand("cat",
+ [[
+ "prokka_annotation/$locus.faa |",
"parallel",
- "--block"=> "100k",
+ "-j" => $global_options->{'threads'},
+ "--block"=> "100k",
"--recstart",
"'>'",
"--pipe",
"blastp",
- -db => "/srv/db/img/4.1/blastdbs/prokaryotic.genes.finished.faa",
+ -db => $img_protein_database,
-outfmt => 6,
-max_target_seqs => 1,
-evalue => $global_options->{'evalue'},
@@ -120,7 +124,7 @@ if (! -e "./subsetimg.faaVS$locus.faa.blastp")
checkAndRunCommand("contig_extractor.pl",
[[
-i => "$locus.faaVSimg.blastp",
- -d => "/srv/db/img/4.1/blastdbs/prokaryotic.genes.finished.faa",
+ -d => $img_protein_database,
-b => '',
-S => '',
-o => "subsetimg.faa",
@@ -144,13 +148,14 @@ if (! -e "./subsetimg.faaVS$locus.faa.blastp")
]], DIE_ON_FAILURE);
}
-# blast against uniref
+# blast against uniref
if (! -e "./$locus.faaVSuniref90.blastp")
{
print "BLASTing against latest Uniref90 April2014 database ................\n";
checkAndRunCommand("cat",[[
"prokka_annotation/$locus.faa |",
"parallel",
+ "-j" => $global_options->{'threads'},
"--block"=> "100k",
"--recstart",
"'>'",
@@ -198,6 +203,7 @@ if (! -e "./$locus.faaVSCOG.blastp")
checkAndRunCommand("cat",[[
"prokka_annotation/$locus.faa |",
"parallel",
+ "-j" => $global_options->{'threads'},
"--block"=> "100k",
"--recstart",
"'>'",
@@ -218,7 +224,7 @@ if (! -e "./$locus.faaVSPfam-A.hmm.hmmscanned")
{
print "HMMscanning against latest Pfam 27 database................\n";
checkAndRunCommand("pfam_scan.pl",[[
- -cpu => $global_options->{'threads'},
+ -cpu => $global_options->{'threads'},
-e_seq => $global_options->{'evalue'},
-outfile => "$locus.faaVSPfam-A.hmm.hmmscanned",
-fasta => "prokka_annotation/$locus.faa",
@@ -241,7 +247,7 @@ if (! -e "./$locus.faaVStigr_all.hmm.hmmscanned")
"prokka_annotation/$locus.faa",
]], DIE_ON_FAILURE);
}
-
+
# convert the hmmscan output to tab delimited
checkAndRunCommand("awk",[[
"'{\$1=\$1}{ print }'",
@@ -267,7 +273,7 @@ my %imghash2 =();
#my @orfid = ();
# read the img blast output and store in hash
-# SAMPLE img blast output -
+# SAMPLE img blast output -
# phycis_04080 649633083|649978419 38.08 1116 640 14 13 1099 1 1094 0.0 663
# phycis_04081 649633083|649980044 28.40 405 237 10 49 422 20 402 3e-27 119
# phycis_04082 649633030|649661236 42.86 259 144 3 1 256 1 258 1e-61 205
@@ -316,7 +322,7 @@ while (<$imgid2names>)
foreach my $orfid (keys $access2imgid{$columns[0]})
{
#print "$orfid\n";
- $img2reciprocal{$columns[0]} = "$access2imgid{$columns[0]}{$orfid}\t$columns[1]\t$columns[2]";
+ $img2reciprocal{$columns[0]} = "$access2imgid{$columns[0]}{$orfid}\t$columns[1]\t$columns[2]";
print {$img_temp_OUT} "$access2imgid{$columns[0]}{$orfid}\t$columns[1]\t$columns[2]\n";
#$img2reciprocal{$columns[0]} = "$orfid\t$columns[1]\t$columns[2]";
#print {$img_temp_OUT} "$orfid\t$columns[1]\t$columns[2]\n";
@@ -346,17 +352,17 @@ while (<$rIMGblast>)
if (exists $img2reciprocal{$columns[0]})
{
print {$img_temp_OUT2} $img2reciprocal{$columns[0]} . "\treciprocal\n";
- }
+ }
else
{
print {$img_temp_OUT2} "$columns[0]\t$columns[1]\tNA\tNA\tNA\tNA\tNOT reciprocal\n";
- }
-
+ }
+
}
close($img_temp_OUT2);
-# hashes for uniref
+# hashes for uniref
my %hash4 = ();
my %hash5 =();
my %hash6 = ();
@@ -559,7 +565,7 @@ while (<$img_annotation>)
my @columns = split (/\t/, $_);
# my @baba = @columns[1..$#columns];
my @baba = @columns[2..$#columns];
- #print "@baba \n";
+ #print "@baba \n";
#$combined_bighash{$columns[0]}->{'img'} = join("\t", @baba);
push @{$combined_bighash{$columns[1]}->{'a-img'}}, join("\t", @baba);
}
@@ -635,7 +641,7 @@ while (<$pfam_annotation>)
my @columns = split (/\t/, $_);
my @baba = @columns[1..$#columns];
#$combined_bighash{$columns[0]}->{'pfam'} = join("\t", @baba);
- push @{$combined_bighash{$columns[0]}->{'e-pfam'}}, join("\t", @baba);
+ push @{$combined_bighash{$columns[0]}->{'e-pfam'}}, join("\t", @baba);
}
#print Dumper \%combined_bighash;
@@ -654,22 +660,22 @@ while (<$tigrfam_annotation>)
# assign key and value for number of expected columns for each annotation type, important for putting NA in missing annotation types
my %column_lengths = (
'a-img' => 5,
- 'b-uniref' => 5,
+ 'b-uniref' => 5,
'c-prokka' => 1,
'd-cog' => 5,
- 'e-pfam' => 3,
+ 'e-pfam' => 3,
'f-tigrfam' => 4,
);
-# print the orfids first
+# print the orfids first
foreach my $ID (sort(keys %combined_bighash))
{
print {$FINAL_OUTPUT} "$ID\t";
foreach my $annotation_type (sort(keys %column_lengths))
{
# if the annotation type does not exist, print NA in the columns depending on the %column_lengths hash values
if (! exists $combined_bighash{$ID}->{$annotation_type})
- {
+ {
# cool way of printing a certain string multiple times based on the values in the hash
print {$FINAL_OUTPUT} join("\t", ("NA",) x $column_lengths{$annotation_type}), "\t";
}
@@ -679,15 +685,15 @@ foreach my $ID (sort(keys %combined_bighash))
{
# place to store the columns in the values of the hash annotation types
my @storage_array;
- foreach my $line (@{$combined_bighash{$ID}->{$annotation_type}})
+ foreach my $line (@{$combined_bighash{$ID}->{$annotation_type}})
{
# each annotation types have different number of columns, so we need to split the columns first before
# we can add in the extra values if lets say an orfid hits multiple pfam/cog/tigrfam values
my @values = split("\t",$line);
- # cool and alternate way of doing columns[1] = values[1], and so on.., repetitively
+ # cool and alternate way of doing columns[1] = values[1], and so on.., repetitively
# what it basically means as long as the value i less than the number of columns in each annotation type
# add +1 to the string $i and do the push below
- for (my $i = 0; $i <= $#values; $i++)
+ for (my $i = 0; $i <= $#values; $i++)
{
push @{$storage_array[$i]}, $values[$i];
}
@@ -696,19 +702,19 @@ foreach my $ID (sort(keys %combined_bighash))
# array to store the multiple hits in each column. eg. test0001 orfid hits multiple pfam values pf0008 & pf0010
# so we would like to have the values combined together in the same column delimited by a comma
my @print_info_array;
- for (my $i = 0; $i < $column_lengths{$annotation_type}; $i++)
+ for (my $i = 0; $i < $column_lengths{$annotation_type}; $i++)
{
push @print_info_array, join("; ", @{$storage_array[$i]});
}
#print Dumper(\@print_info_array);
print {$FINAL_OUTPUT} join("\t", @print_info_array), "\t";
- }
+ }
else
{
print {$FINAL_OUTPUT} "$combined_bighash{$ID}{$annotation_type}\t";
- }
+ }
}
- print {$FINAL_OUTPUT} "\n";
+ print {$FINAL_OUTPUT} "\n";
}
#close all files
@@ -741,7 +747,7 @@ sub checkParams {
my %options;
# Add any other command line options, and the code to handle them
- #
+ #
GetOptions( \%options, @standard_options );
# if no arguments supplied print the usage and exit
@@ -751,7 +757,7 @@ sub checkParams {
# If the -help option is set, print the usage and exit
#
exec("pod2usage $0") if $options{'help'};
-
+
# Compulsory items
#if(!exists $options{''} ) { printParamError (""); }
if(!exists $options{'in'} ) { printParamError ("You MUST supply a fasta file"); }
@@ -763,8 +769,8 @@ sub printParamError
{
#-----
# What to do if there's something wrong with a parameter
- #
- my ($error) = @_;
+ #
+ my ($error) = @_;
print "**ERROR: $0 : $error\n"; exec("pod2usage $0");
}
@@ -774,7 +780,7 @@ sub overrideDefault
# Set and override default values for parameters
#
my ($default_value, $option_name) = @_;
- if(exists $global_options->{$option_name})
+ if(exists $global_options->{$option_name})
{
return $global_options->{$option_name};
}
@@ -795,7 +801,7 @@ sub openWrite
}
sub openRead
-{
+{
#-----
# Open a file for reading
#
@@ -809,7 +815,7 @@ sub openRead
#
# checkAndRunCommand("ls", {
# -a => ""
-# },
+# },
# WARN_ON_FAILURE);
sub checkFileExists {
@@ -859,15 +865,15 @@ sub checkAndRunCommand
# Run external commands more sanelier
#
my ($cmd, $params, $failure_type) = @_;
-
+
isCommandInPath($cmd, $failure_type);
-
+
# join the parameters to the command
my $param_str = join " ", map {formatParams($_)} @{$params};
my $cmd_str = $cmd . " " . $param_str;
-
- print "The command currently running:\t$cmd_str\n";
+
+ print "The command currently running:\t$cmd_str\n";
logExternalCommand($cmd_str);
# make sure that all went well
@@ -879,11 +885,11 @@ sub checkAndRunCommand
sub formatParams {
#---------
- # Handles and formats the different ways of passing parameters to
+ # Handles and formats the different ways of passing parameters to
# checkAndRunCommand
#
my $ref = shift;
-
+
if (ref($ref) eq "ARRAY") {
return join(" ", @{$ref});
} elsif (ref($ref) eq "HASH") {
@@ -914,15 +920,15 @@ sub handleCommandFailure {
sub printAtStart {
print<<"EOF";
-----------------------------------------------------------------
-
+----------------------------------------------------------------
+
$0
annotateM - annotate my genome
Due to the blast processes against multiple databases, this whole
annotation pipeline will usually take awhile. Please be patient!
What you get in the end will save you heaps of time.
-
-----------------------------------------------------------------
+
+----------------------------------------------------------------
EOF
}
@@ -958,13 +964,13 @@ __DATA__
annotateM -i [fasta_file] -l [locus] -k [kingdom] -t [threads] -e [evalue]
- -i FASTA_FILE Nucleotide fasta file
+ -i FASTA_FILE Nucleotide fasta file
-l locustag Name of locus tag
-k kingdom (Bacteria/Archaea/Phage/Viruses) Kingdom of genome to be annotated
-t threads Number of threads
- -e evalue Evalue for BLAST, recommended 1e-3
+ -e evalue Evalue for BLAST, recommended 1e-3
[-help -h] Displays basic usage information
-
+
=cut