diff --git a/annotateM b/annotateM index d09d77b..516a733 100755 --- a/annotateM +++ b/annotateM @@ -73,29 +73,28 @@ my $global_options = checkParams(); # check that the file exists checkFileExists($global_options->{'in'}); +# run prokka to generate the ORFs and also prokka annotations if (! -e "./prokka_annotation/") { -#print "prokka\n"; -# run prokka to generate the ORFs and also prokka annotations -checkAndRunCommand("prokka", [{ + print "Running prokka v1.8\n"; + checkAndRunCommand("prokka", [{ "--locustag" => $global_options->{'locustag'}, "--outdir" => "prokka_annotation", "--prefix" => $global_options->{'locustag'}, "--kingdom" => $global_options->{'kingdom'}, "--cpus" => $global_options->{'threads'}, - "--keep_names", $global_options->{'in'}, }], DIE_ON_FAILURE); } -# identify the ORF called amino acid fasta file for blast-ing +# identify the ORF called amino acid fasta file my $locus = $global_options->{'locustag'}; # blast against img if (! -e "./$locus.faaVSimg.blastp") { -print "BLASTing against IMG 4.0 database...............\n"; -checkAndRunCommand("cat", + print "BLASTing against IMG 4.1 database...............\n"; + checkAndRunCommand("cat", [[ "prokka_annotation/$locus.faa |", "parallel", @@ -104,7 +103,7 @@ checkAndRunCommand("cat", "'>'", "--pipe", "blastp", - -db => "/srv/db/img/4.0/dereplicated/img_dereplicated_species.genes.faa", + -db => "/srv/db/img/4.1/blastdbs/prokaryotic.genes.finished.faa", -outfmt => 6, -max_target_seqs => 1, -evalue => $global_options->{'evalue'}, @@ -116,23 +115,23 @@ checkAndRunCommand("cat", # reciprocal blast of img positive hits against genome ORF if (! -e "./subsetimg.faaVS$locus.faa.blastp") { -print "Reciprocal BLASTing positive IMG hits to $locus.faa ...............\n"; -checkAndRunCommand("contig_extractor.pl", + print "Reciprocal BLASTing positive IMG hits to $locus.faa ...............\n"; + checkAndRunCommand("contig_extractor.pl", [[ -i => "$locus.faaVSimg.blastp", - -d => "/srv/db/img/4.0/dereplicated/img_dereplicated_species.genes.faa", + -d => "/srv/db/img/4.1/blastdbs/prokaryotic.genes.finished.faa", -b => '', -S => '', -o => "subsetimg.faa", ]], DIE_ON_FAILURE); -checkAndRunCommand("makeblastdb", + checkAndRunCommand("makeblastdb", [[ -in => "prokka_annotation/$locus.faa", -dbtype => "prot", ]], DIE_ON_FAILURE); -checkAndRunCommand("blastp", + checkAndRunCommand("blastp", [[ -query => "subsetimg.faa", -db => "prokka_annotation/$locus.faa", @@ -147,8 +146,8 @@ checkAndRunCommand("blastp", # blast against uniref if (! -e "./$locus.faaVSuniref90.blastp") { -print "BLASTing against Uniref90 database................\n"; -checkAndRunCommand("cat",[[ + print "BLASTing against latest Uniref90 April2014 database ................\n"; + checkAndRunCommand("cat",[[ "prokka_annotation/$locus.faa |", "parallel", "--block"=> "100k", @@ -169,9 +168,8 @@ checkAndRunCommand("cat",[[ # reciprocal blast of Uniref positive hits against genome ORF if (! -e "./subsetuniref.faaVS$locus.faa.blastp") { -print "Reciprocal BLASTing positive Uniref hits to $locus.faa ...............\n"; - -checkAndRunCommand("contig_extractor.pl", + print "Reciprocal BLASTing positive Uniref hits to $locus.faa ...............\n"; + checkAndRunCommand("contig_extractor.pl", [[ -i => "$locus.faaVSuniref90.blastp", -d => "/srv/db/uniprot/uniref-20140403/uniref90.fasta", @@ -180,7 +178,7 @@ checkAndRunCommand("contig_extractor.pl", -o => "subsetuniref.faa", ]], DIE_ON_FAILURE); -checkAndRunCommand("blastp", + checkAndRunCommand("blastp", [[ -query => "subsetuniref.faa", -db => "prokka_annotation/$locus.faa", @@ -195,8 +193,8 @@ checkAndRunCommand("blastp", # blast against COG if (! -e "./$locus.faaVSCOG.blastp") { -print "BLASTing against COG database................\n"; -checkAndRunCommand("cat",[[ + print "BLASTing against the one and only COG database................\n"; + checkAndRunCommand("cat",[[ "prokka_annotation/$locus.faa |", "parallel", "--block"=> "100k", @@ -217,8 +215,8 @@ checkAndRunCommand("cat",[[ # HMMSCAN against PFAM if (! -e "./$locus.faaVSPfam-A.hmm.hmmscanned") { -print "HMMscanning against Pfam database................\n"; -checkAndRunCommand("pfam_scan.pl",[[ + print "HMMscanning against latest Pfam 27 database................\n"; + checkAndRunCommand("pfam_scan.pl",[[ -cpu => $global_options->{'threads'}, -e_seq => $global_options->{'evalue'}, -outfile => "$locus.faaVSPfam-A.hmm.hmmscanned", @@ -230,8 +228,8 @@ checkAndRunCommand("pfam_scan.pl",[[ # HMMSCAN against TIGRfam if (! -e "./$locus.faaVStigr_all.hmm.hmmscanned") { -print "HMMscanning against TIGRfam database................\n"; -checkAndRunCommand("hmmscan",[[ + print "HMMscanning against TIGRfam April2014 database................\n"; + checkAndRunCommand("hmmscan",[[ "--tblout", "$locus.faaVStigr_all.hmm.hmmscanned", "--noali", @@ -253,63 +251,106 @@ checkAndRunCommand("awk",[[ checkAndRunCommand("awk",[[ "'{\$1=\$1}{ print }'", - "$locus.faaVStigr_all.hmm.hmmscanned", + "$locus.faaVStigr_all.hmm.hmmscanned", + "| sed 's/^\\s+//'", + "| sed 's/\\s+\$//'", "| sed 's/\\s/\\t/g'", - "> $locus.faaVStigr_all.hmm.hmmscanned.tab", - ]], DIE_ON_FAILURE); + "> $locus.faaVStigr_all.hmm.hmmscanned.tab", + ]], DIE_ON_FAILURE); -# hashes for img -my %hash = (); -my %hash2 = (); -my %hash3 =(); +# declare hashes for img +my %access2imgid=(); +my %img2reciprocal = (); +my %imghash2 =(); +#my @orfid = (); # read the img blast output and store in hash +# SAMPLE img blast output - +# phycis_04080 649633083|649978419 38.08 1116 640 14 13 1099 1 1094 0.0 663 +# phycis_04081 649633083|649980044 28.40 405 237 10 49 422 20 402 3e-27 119 +# phycis_04082 649633030|649661236 42.86 259 144 3 1 256 1 258 1e-61 205 +# phycis_04083 640753047|640896165 61.55 1186 444 3 1 1177 1 1183 0.0 1504 + +# columns[0] = orfid +# columns[1] = imgid +# columns[10] = evalue +# columns[11] = blast score + open my $IMGblast, "./$locus.faaVSimg.blastp", or die "Couldn't open file $locus.faaVSimg.blastp\n"; while (<$IMGblast>) { chomp $_; my @columns = split (/\t/, $_); - # store the orfid which is in columns[0] - my $orfid = $columns[0]; +# push @orfid, $columns[0]; if ($columns[11] > 60) { - # key is $orfid while the value of the key in $columns[1] is the IMG id - $hash{$orfid} = $columns[1]; - $hash{$columns[1]} = $columns[0]; - $hash2{$columns[1]} = "$columns[0]\t$columns[10]\t$columns[11]"; +# push @orfid, $columns[0]; + #store access2imgid hash with the imgid key and point towards the orfid and value is the output i want printed out later + $access2imgid{$columns[1]}->{$columns[0]} = "$columns[1]\t$columns[0]\t$columns[10]\t$columns[11]"; } } +#print Dumper (\%access2imgid); + + +# read img id2names.txt which is the file to get the gene identity of the imgid +# SAMPLE img id2names.txt file - +# 650716001|650846201 Ahos_0001 replication initiator protein Cdc6-3 Acidianus hospitalis W1 +# 650716001|650846202 Ahos_0002 hypothetical protein Acidianus hospitalis W1 +# 650716001|650846203 Ahos_0003 transcriptional coactivator/pterin dehydratase Acidianus hospitalis W1 +# 650716001|650846204 Ahos_0004 GGCT (gamma glutamyl cyclotransferase) domain-containing protein Acidianus hospitalis W1 + +# columns[0] = imgid +# columns[1] = gene name +# columns[2] = organism -# read img id2names.txt -open my $imgid2names, "/srv/db/img/4.0/dereplicated/id2names.txt", or die "Couldn't open id2names.txt\n"; +open my $imgid2names, "/srv/db/img/4.1/blastdbs/img4.1_id2names.txt", or die "Couldn't open img4.1_id2names.txt\n"; open my $img_temp_OUT, ">img_output_temp.txt"; while (<$imgid2names>) { chomp $_; my @columns = split (/\t/, $_); - if (exists $hash2{$columns[0]}) - { - $hash3{$columns[0]} = "$hash2{$columns[0]}\t$columns[1]\t$columns[2]"; - print {$img_temp_OUT} "$hash2{$columns[0]}\t$columns[1]\t$columns[2]\n"; - } + if (exists $access2imgid{$columns[0]}) + { + foreach my $orfid (keys $access2imgid{$columns[0]}) + { + #print "$orfid\n"; + $img2reciprocal{$columns[0]} = "$access2imgid{$columns[0]}{$orfid}\t$columns[1]\t$columns[2]"; + print {$img_temp_OUT} "$access2imgid{$columns[0]}{$orfid}\t$columns[1]\t$columns[2]\n"; + #$img2reciprocal{$columns[0]} = "$orfid\t$columns[1]\t$columns[2]"; + #print {$img_temp_OUT} "$orfid\t$columns[1]\t$columns[2]\n"; + #print Dumper (\%access2imgid); + } + } } +#print Dumper (\%access2imgid); close($IMGblast); close($imgid2names); close($img_temp_OUT); # read my reciprocal img blast output and store in hash +# SAMPLE +# 2513020047|2513221347 phycis_01043 39.30 285 168 4 21 301 21 304 2e-51 172 +# 648028035|648160186 phycis_03502 40.55 217 122 4 7 221 422 633 1e-48 167 +# 639633053|639783588 phycis_00179 49.23 260 121 4 14 269 20 272 3e-80 246 +# 639633064|639773205 phycis_02647 29.24 383 234 11 8 370 3 368 2e-45 160 + open my $rIMGblast, "./subsetimg.faaVS$locus.faa.blastp", or die "Couldn't open file subsetimg.faaVS$locus.faa.blastp\n"; open my $img_temp_OUT2, ">img_output_temp2.txt"; while (<$rIMGblast>) { chomp $_; my @columns = split (/\t/, $_); - if (exists $hash{$columns[0]}) + if (exists $img2reciprocal{$columns[0]}) { - print {$img_temp_OUT2} "$hash3{$columns[0]}\treciprocal\n"; - } + print {$img_temp_OUT2} $img2reciprocal{$columns[0]} . "\treciprocal\n"; + } + else + { + print {$img_temp_OUT2} "$columns[0]\t$columns[1]\tNA\tNA\tNA\tNA\tNOT reciprocal\n"; + } + } close($img_temp_OUT2); @@ -358,7 +399,7 @@ while (<$runirefblast>) { chomp $_; my @columns = split (/\t/, $_); - if (exists $hash4{$columns[0]}) + if (exists $hash6{$columns[0]}) { print {$uniref_temp_OUT2} "$hash6{$columns[0]}\treciprocal\n"; } @@ -415,10 +456,10 @@ open my $tigrfamoutput, "./$locus.faaVStigr_all.hmm.hmmscanned.tab", or die "Cou while (<$tigrfamoutput>) { next if /^\s*(#.*)?$/; - next if $tigrfamoutput =~ /^#/; - chomp $_; - my @columns = split (/\t/, $_); - if ($columns[5] > 10) + next if $tigrfamoutput =~ /^#/; + chomp $_; + my @columns = split (/\t/, $_); + if ($columns[5] > 10) { $hash9{$columns[2]} = $columns[0]; $hash9{$columns[0]} = $columns[2]; @@ -427,14 +468,15 @@ while (<$tigrfamoutput>) } # read tigrfam id2names2description -open my $tigrfamid2names, "/srv/db/tigrfam/14.0/TIGRFAMs_14.0_INFO/tigr_info_combined.parsed", or die "Couldn't open tigr_info_combined.parsed\n"; +open my $tigrfamid2names, "/srv/db/tigrfam/14.0/TIGRFAMs_14.0_INFO/tigr_info_combined.parsed_updated2", or die "Couldn't open tigr_info_combined.parsed_updated2\n"; open my $tigrfam_temp_OUT, ">tigrfam_output_temp.txt"; while (<$tigrfamid2names>) { chomp $_; my @columns = split (/\t/, $_); - $columns[0] =~ s/^\s+//; - $columns[0] =~ s/\s+$//; + $columns[0] =~ s/^\s+|\s+$//g; + #$columns[0] =~ s/^\s+//; + #$columns[0] =~ s/\s+$//; if (exists $hash10{$columns[0]}) { print {$tigrfam_temp_OUT} "$hash10{$columns[0]}\t$columns[1]\t$columns[2]\n"; @@ -514,10 +556,11 @@ while (<$img_annotation>) { chomp $_; my @columns = split (/\t/, $_); - my @baba = @columns[1..$#columns]; +# my @baba = @columns[1..$#columns]; + my @baba = @columns[2..$#columns]; #print "@baba \n"; #$combined_bighash{$columns[0]}->{'img'} = join("\t", @baba); - push @{$combined_bighash{$columns[0]}->{'a-img'}}, join("\t", @baba); + push @{$combined_bighash{$columns[1]}->{'a-img'}}, join("\t", @baba); } # uniref @@ -543,6 +586,25 @@ checkAndRunCommand("grep",[[ "> prokka_temp_output.txt", ]], DIE_ON_FAILURE); +# SAMPLE gff file +##gff-version 3 +##sequence-region contig_3875 1 10320 +#contig_3875 Prodigal:2.60 CDS 334 735 . + 0 ID=test_00001;inference=ab initio prediction:Prodigal:2.60,protein motif:CLUSTERS:PRK10707;locus_tag=test_00001;product=putative NUDIX hydrolase;protein_id=gnl|VBC|test_00001 +#contig_3875 Prodigal:2.60 CDS 930 3221 . + 0 ID=test_00002;eC_number=1.1.1.40;gene=maeB;inference=ab initio prediction:Prodigal:2.60,similar to AA sequence:UniProtKB:P76558;locus_tag=test_00002;product=NADP-dependent malic enzyme;protein_id=gnl|VBC|test_00002 +#contig_3875 Prodigal:2.60 CDS 3229 5175 . - 0 ID=test_00003;inference=ab initio prediction:Prodigal:2.60;locus_tag=test_00003;product=hypothetical protein;protein_id=gnl|VBC|test_00003 + +#open my $prokka_gff, "./prokka_annotation/$locus.gff", or die "Couldn't open $locus.gff\n"; +#while (<$prokka_gff>) +#{ +# next if $prokka_gff =~ /^#/; +# chomp $_; +# my @main_columns = split (/\t/, $_); +# $prokka_gff = my $ID =~ m/[ID\=](.*)[\;]/; +# $prokka_gff = my $product =~ m/[product\=](.*)[\;]/; +# print "$ID\t$product\n"; +#} + + open my $prokka_annotation, "./prokka_temp_output.txt", or die "Couldn't open prokka_temp_output.txt\n"; while (<$prokka_annotation>) { @@ -603,17 +665,17 @@ foreach my $ID (sort(keys %combined_bighash)) { print {$FINAL_OUTPUT} "$ID\t"; foreach my $annotation_type (sort(keys %column_lengths)) + { + # if the annotation type does not exist, print NA in the columns depending on the %column_lengths hash values + if (! exists $combined_bighash{$ID}->{$annotation_type}) + { + # cool way of printing a certain string multiple times based on the values in the hash + print {$FINAL_OUTPUT} join("\t", ("NA",) x $column_lengths{$annotation_type}), "\t"; + } + # check the derefencing made with @{$combined_bighash{$columns[0]}->{'f-tigrfam'}} and so on.. + # the derefencing allows the hash be converted into an array so that we can read the hash for the different types of annotation types + elsif (ref($combined_bighash{$ID}->{$annotation_type}) eq 'ARRAY') { - # if the annotation type does not exist, print NA in the columns depending on the %column_lengths hash values - if (! exists $combined_bighash{$ID}->{$annotation_type}) - { - # cool way of printing a certain string multiple times based on the values in the hash - print {$FINAL_OUTPUT} join("\t", ("NA",) x $column_lengths{$annotation_type}), "\t"; - } - # check the derefencing made with @{$combined_bighash{$columns[0]}->{'f-tigrfam'}} and so on.. - # the derefencing allows the hash be converted into an array so that we can read the hash for the different types of annotation types - elsif (ref($combined_bighash{$ID}->{$annotation_type}) eq 'ARRAY') - { # place to store the columns in the values of the hash annotation types my @storage_array; foreach my $line (@{$combined_bighash{$ID}->{$annotation_type}}) @@ -630,21 +692,21 @@ foreach my $ID (sort(keys %combined_bighash)) } } #print Dumper(\@storage_array); - # array to store the multiple hits in each column.. for example test0001 orfid hits multiple pfam values like pf0008 anf pf0010 etc.. + # array to store the multiple hits in each column. eg. test0001 orfid hits multiple pfam values pf0008 & pf0010 # so we would like to have the values combined together in the same column delimited by a comma my @print_info_array; for (my $i = 0; $i < $column_lengths{$annotation_type}; $i++) { - push @print_info_array, join(", ", @{$storage_array[$i]}); + push @print_info_array, join("; ", @{$storage_array[$i]}); } #print Dumper(\@print_info_array); print {$FINAL_OUTPUT} join("\t", @print_info_array), "\t"; - } - else - { + } + else + { print {$FINAL_OUTPUT} "$combined_bighash{$ID}{$annotation_type}\t"; - } - } + } + } print {$FINAL_OUTPUT} "\n"; } @@ -852,10 +914,12 @@ sub handleCommandFailure { sub printAtStart { print<<"EOF"; ---------------------------------------------------------------- + $0 annotateM - annotate my genome Due to the blast processes against multiple databases, this whole annotation pipeline will usually take awhile. Please be patient! + What you get in the end will save you heaps of time. ---------------------------------------------------------------- EOF @@ -891,14 +955,14 @@ __DATA__ =head1 SYNOPSIS - annotateM -i [fasta_file] -l [locus] -k [Bacteria|Archaea] -t [threads] -e [evalue] + annotateM -i [fasta_file] -l [locus] -k [kingdom] -t [threads] -e [evalue] - -i FASTA_FILE Nucleotide fasta file - -l locustag Name of locus tag - -k kingdom (Bacteria/Archaea) Kingdom of genome to be annotated - -t threads Number of threads - -e evalue Evalue for BLAST, recommended 1e-3 - [-help -h] Displays basic usage information + -i FASTA_FILE Nucleotide fasta file + -l locustag Name of locus tag + -k kingdom (Bacteria/Archaea/Phage/Viruses) Kingdom of genome to be annotated + -t threads Number of threads + -e evalue Evalue for BLAST, recommended 1e-3 + [-help -h] Displays basic usage information =cut