Permalink
Cannot retrieve contributors at this time
Fetching contributors…
| #!/usr/bin/env perl | |
| ############################################################################### | |
| # | |
| # annotateM | |
| # | |
| # The idea here is an automated way of annotating your genome based on | |
| # multiple available databases and to produce a tab-delimited file of | |
| # all the annotations, evalues, scores, descriptions. | |
| # | |
| # Suggested workflow: | |
| # 1) run your genome nucleotide fasta file through annotateM | |
| # 2) generate a tab-delimited file | |
| # 3) open the file in ms excel or oo calc | |
| # 4) manually curate the annotations based on evalues/scores/desc etc | |
| # 5) metabolic reconstruction of organism | |
| # | |
| # Copyright (C) Mohamed Fauzi Haroon | |
| # Special appearance from Adam Skarshewski | |
| # | |
| # This program is free software: you can redistribute it and/or modify | |
| # it under the terms of the GNU General Public License as published by | |
| # the Free Software Foundation, either version 3 of the License, or | |
| # (at your option) any later version. | |
| # | |
| # This program is distributed in the hope that it will be useful, | |
| # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| # GNU General Public License for more details. | |
| # | |
| # You should have received a copy of the GNU General Public License | |
| # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
| # | |
| ############################################################################### | |
| #pragmas | |
| use strict; | |
| use warnings; | |
| #core Perl modules | |
| use Getopt::Long; | |
| use Carp; | |
| use Data::Dumper; | |
| #CPAN modules | |
| #locally-written modules | |
| BEGIN { | |
| select(STDERR); | |
| $| = 1; | |
| select(STDOUT); | |
| $| = 1; | |
| } | |
| # edit here to log all external commands | |
| my $global_log_commands = 0; | |
| # ext command failure levels | |
| use constant { | |
| IGNORE_FAILURE => 0, | |
| WARN_ON_FAILURE => 1, | |
| DIE_ON_FAILURE => 2 | |
| }; | |
| # get input params and print copyright | |
| printAtStart(); | |
| my $global_options = checkParams(); | |
| ###################################################################### | |
| # CODE HERE | |
| ###################################################################### | |
| # check that the file exists | |
| checkFileExists($global_options->{'in'}); | |
| if (! -e "./prokka_annotation/") | |
| { | |
| #print "prokka\n"; | |
| # run prokka to generate the ORFs and also prokka annotations | |
| checkAndRunCommand("prokka", [{ | |
| "--locustag" => $global_options->{'locustag'}, | |
| "--outdir" => "prokka_annotation", | |
| "--prefix" => $global_options->{'locustag'}, | |
| "--kingdom" => $global_options->{'kingdom'}, | |
| "--cpus" => $global_options->{'threads'}, | |
| "--keep_names", | |
| $global_options->{'in'}, | |
| }], DIE_ON_FAILURE); | |
| } | |
| # identify the ORF called amino acid fasta file for blast-ing | |
| my $locus = $global_options->{'locustag'}; | |
| # blast against img | |
| if (! -e "./$locus.faaVSimg.blastp") | |
| { | |
| print "BLASTing against IMG 4.0 database...............\n"; | |
| checkAndRunCommand("cat", | |
| [[ | |
| "prokka_annotation/$locus.faa |", | |
| "parallel", | |
| "--block"=> "100k", | |
| "--recstart", | |
| "'>'", | |
| "--pipe", | |
| "blastp", | |
| -db => "/srv/db/img/4.0/dereplicated/img_dereplicated_species.genes.faa", | |
| -outfmt => 6, | |
| -max_target_seqs => 1, | |
| -evalue => $global_options->{'evalue'}, | |
| -query => "-", | |
| "> $locus.faaVSimg.blastp", | |
| ]], DIE_ON_FAILURE); | |
| } | |
| # reciprocal blast of img positive hits against genome ORF | |
| if (! -e "./subsetimg.faaVS$locus.faa.blastp") | |
| { | |
| print "Reciprocal BLASTing positive IMG hits to $locus.faa ...............\n"; | |
| checkAndRunCommand("contig_extractor.pl", | |
| [[ | |
| -i => "$locus.faaVSimg.blastp", | |
| -d => "/srv/db/img/4.0/dereplicated/img_dereplicated_species.genes.faa", | |
| -b => '', | |
| -S => '', | |
| -o => "subsetimg.faa", | |
| ]], DIE_ON_FAILURE); | |
| checkAndRunCommand("makeblastdb", | |
| [[ | |
| -in => "prokka_annotation/$locus.faa", | |
| -dbtype => "prot", | |
| ]], DIE_ON_FAILURE); | |
| checkAndRunCommand("blastp", | |
| [[ | |
| -query => "subsetimg.faa", | |
| -db => "prokka_annotation/$locus.faa", | |
| -outfmt => 6, | |
| -max_target_seqs => 1, | |
| -evalue => $global_options->{'evalue'}, | |
| -num_threads => $global_options->{'threads'}, | |
| -out => "subsetimg.faaVS$locus.faa.blastp", | |
| ]], DIE_ON_FAILURE); | |
| } | |
| # blast against uniref | |
| if (! -e "./$locus.faaVSuniref90.blastp") | |
| { | |
| print "BLASTing against Uniref90 database................\n"; | |
| checkAndRunCommand("cat",[[ | |
| "prokka_annotation/$locus.faa |", | |
| "parallel", | |
| "--block"=> "100k", | |
| "--recstart", | |
| "'>'", | |
| "--pipe", | |
| "blastp", | |
| -db => "/srv/db/uniprot/uniref-20140403/uniref90.fasta", | |
| -outfmt => 6, | |
| -max_target_seqs => 1, | |
| -evalue => $global_options->{'evalue'}, | |
| -query => "-", | |
| "> $locus.faaVSuniref90.blastp", | |
| #-num_threads => $global_options->{'threads'}, | |
| ]], DIE_ON_FAILURE); | |
| } | |
| # reciprocal blast of Uniref positive hits against genome ORF | |
| if (! -e "./subsetuniref.faaVS$locus.faa.blastp") | |
| { | |
| print "Reciprocal BLASTing positive Uniref hits to $locus.faa ...............\n"; | |
| checkAndRunCommand("contig_extractor.pl", | |
| [[ | |
| -i => "$locus.faaVSuniref90.blastp", | |
| -d => "/srv/db/uniprot/uniref-20140403/uniref90.fasta", | |
| -b => '', | |
| -S => '', | |
| -o => "subsetuniref.faa", | |
| ]], DIE_ON_FAILURE); | |
| checkAndRunCommand("blastp", | |
| [[ | |
| -query => "subsetuniref.faa", | |
| -db => "prokka_annotation/$locus.faa", | |
| -outfmt => 6, | |
| -max_target_seqs => 1, | |
| -evalue => $global_options->{'evalue'}, | |
| -num_threads => $global_options->{'threads'}, | |
| -out => "subsetuniref.faaVS$locus.faa.blastp", | |
| ]], DIE_ON_FAILURE); | |
| } | |
| # blast against COG | |
| if (! -e "./$locus.faaVSCOG.blastp") | |
| { | |
| print "BLASTing against COG database................\n"; | |
| checkAndRunCommand("cat",[[ | |
| "prokka_annotation/$locus.faa |", | |
| "parallel", | |
| "--block"=> "100k", | |
| "--recstart", | |
| "'>'", | |
| "--pipe", | |
| "blastp", | |
| -db => "/srv/db/cog/cog_blast_prot_db", | |
| -outfmt => 6, | |
| -max_target_seqs => 1, | |
| -evalue => $global_options->{'evalue'}, | |
| -query => "-", | |
| "> $locus.faaVSCOG.blastp", | |
| #-num_threads => $global_options->{'threads'}, | |
| ]], DIE_ON_FAILURE); | |
| } | |
| # HMMSCAN against PFAM | |
| if (! -e "./$locus.faaVSPfam-A.hmm.hmmscanned") | |
| { | |
| print "HMMscanning against Pfam database................\n"; | |
| checkAndRunCommand("pfam_scan.pl",[[ | |
| -cpu => $global_options->{'threads'}, | |
| -e_seq => $global_options->{'evalue'}, | |
| -outfile => "$locus.faaVSPfam-A.hmm.hmmscanned", | |
| -fasta => "prokka_annotation/$locus.faa", | |
| -dir => "/srv/db/pfam/27", | |
| ]], DIE_ON_FAILURE); | |
| } | |
| # HMMSCAN against TIGRfam | |
| if (! -e "./$locus.faaVStigr_all.hmm.hmmscanned") | |
| { | |
| print "HMMscanning against TIGRfam database................\n"; | |
| checkAndRunCommand("hmmscan",[[ | |
| "--tblout", | |
| "$locus.faaVStigr_all.hmm.hmmscanned", | |
| "--noali", | |
| -E => $global_options->{'evalue'}, | |
| "--cpu", | |
| $global_options->{'threads'}, | |
| "/srv/db/tigrfam/14.0/TIGRFAMs_14.0_HMM/tigr_all.hmm", | |
| "prokka_annotation/$locus.faa", | |
| ]], DIE_ON_FAILURE); | |
| } | |
| # convert the hmmscan output to tab delimited | |
| checkAndRunCommand("awk",[[ | |
| "'{\$1=\$1}{ print }'", | |
| "$locus.faaVSPfam-A.hmm.hmmscanned", | |
| "| sed 's/\\s/\\t/g'", | |
| "> $locus.faaVSPfam-A.hmm.hmmscanned.tab", | |
| ]], DIE_ON_FAILURE); | |
| checkAndRunCommand("awk",[[ | |
| "'{\$1=\$1}{ print }'", | |
| "$locus.faaVStigr_all.hmm.hmmscanned", | |
| "| sed 's/\\s/\\t/g'", | |
| "> $locus.faaVStigr_all.hmm.hmmscanned.tab", | |
| ]], DIE_ON_FAILURE); | |
| # hashes for img | |
| my %hash = (); | |
| my %hash2 = (); | |
| my %hash3 =(); | |
| # read the img blast output and store in hash | |
| open my $IMGblast, "./$locus.faaVSimg.blastp", or die "Couldn't open file $locus.faaVSimg.blastp\n"; | |
| while (<$IMGblast>) | |
| { | |
| chomp $_; | |
| my @columns = split (/\t/, $_); | |
| # store the orfid which is in columns[0] | |
| my $orfid = $columns[0]; | |
| if ($columns[11] > 60) | |
| { | |
| # key is $orfid while the value of the key in $columns[1] is the IMG id | |
| $hash{$orfid} = $columns[1]; | |
| $hash{$columns[1]} = $columns[0]; | |
| $hash2{$columns[1]} = "$columns[0]\t$columns[10]\t$columns[11]"; | |
| } | |
| } | |
| # read img id2names.txt | |
| open my $imgid2names, "/srv/db/img/4.0/dereplicated/id2names.txt", or die "Couldn't open id2names.txt\n"; | |
| open my $img_temp_OUT, ">img_output_temp.txt"; | |
| while (<$imgid2names>) | |
| { | |
| chomp $_; | |
| my @columns = split (/\t/, $_); | |
| if (exists $hash2{$columns[0]}) | |
| { | |
| $hash3{$columns[0]} = "$hash2{$columns[0]}\t$columns[1]\t$columns[2]"; | |
| print {$img_temp_OUT} "$hash2{$columns[0]}\t$columns[1]\t$columns[2]\n"; | |
| } | |
| } | |
| close($IMGblast); | |
| close($imgid2names); | |
| close($img_temp_OUT); | |
| # read my reciprocal img blast output and store in hash | |
| open my $rIMGblast, "./subsetimg.faaVS$locus.faa.blastp", or die "Couldn't open file subsetimg.faaVS$locus.faa.blastp\n"; | |
| open my $img_temp_OUT2, ">img_output_temp2.txt"; | |
| while (<$rIMGblast>) | |
| { | |
| chomp $_; | |
| my @columns = split (/\t/, $_); | |
| if (exists $hash{$columns[0]}) | |
| { | |
| print {$img_temp_OUT2} "$hash3{$columns[0]}\treciprocal\n"; | |
| } | |
| } | |
| close($img_temp_OUT2); | |
| # hashes for uniref | |
| my %hash4 = (); | |
| my %hash5 =(); | |
| my %hash6 = (); | |
| # read uniref blast and store in hash | |
| open my $unirefblast, "./$locus.faaVSuniref90.blastp", or die "Couldn't open file $locus.faaVSuniref90.blastp\n"; | |
| while (<$unirefblast>) | |
| { | |
| chomp $_; | |
| my @columns = split (/\t/, $_); | |
| if ($columns[11] > 60) | |
| { | |
| $hash4{$columns[0]} = $columns[1]; | |
| $hash4{$columns[1]} = $columns[0]; | |
| $hash5{$columns[1]} = "$columns[0]\t$columns[10]\t$columns[11]"; | |
| } | |
| } | |
| # read uniref id2names.txt | |
| open my $unirefid2names, "/srv/db/uniprot/uniref-20140403/uniref90_id2names.txt", or die "Couldn't open id2names.txt\n"; | |
| open my $uniref_temp_OUT, ">uniref_output_temp.txt"; | |
| while (<$unirefid2names>) | |
| { | |
| chomp $_; | |
| my @columns = split (/\t/, $_); | |
| if (exists $hash5{$columns[0]}) | |
| { | |
| $hash6{$columns[0]} = "$hash5{$columns[0]}\t$columns[1]\t$columns[2]"; | |
| print {$uniref_temp_OUT} "$hash5{$columns[0]}\t$columns[1]\t$columns[2]\n"; | |
| } | |
| } | |
| close($unirefblast); | |
| close($unirefid2names); | |
| close($uniref_temp_OUT); | |
| # read my reciprocal img blast output and store in hash | |
| open my $runirefblast, "./subsetuniref.faaVS$locus.faa.blastp", or die "Couldn't open file subsetuniref.faaVS$locus.faa.blastp\n"; | |
| open my $uniref_temp_OUT2, ">uniref_output_temp2.txt"; | |
| while (<$runirefblast>) | |
| { | |
| chomp $_; | |
| my @columns = split (/\t/, $_); | |
| if (exists $hash4{$columns[0]}) | |
| { | |
| print {$uniref_temp_OUT2} "$hash6{$columns[0]}\treciprocal\n"; | |
| } | |
| } | |
| close($uniref_temp_OUT2); | |
| # hashes for pfam | |
| my %hash7 = (); | |
| my %hash8 = (); | |
| # read pfam hmmscan output and store in hash | |
| open my $pfamoutput, "./$locus.faaVSPfam-A.hmm.hmmscanned.tab", or die "Couldn't open file $locus.faaVSPfam-A.hmm.hmmscanned.tab\n"; | |
| while (<$pfamoutput>) | |
| { | |
| next if /^\s*(#.*)?$/; | |
| next if $pfamoutput =~ /^#/; | |
| next if $pfamoutput =~ /^=/; | |
| chomp $_; | |
| my @columns = split (/\t/, $_); | |
| if ($columns[11] > 60) | |
| { | |
| my @pfam_columns = split (/\./, $columns[5]); | |
| my $pfam_id = $pfam_columns[0]; | |
| $hash7{$columns[0]} = $pfam_columns[0]; | |
| $hash7{$pfam_columns[0]} = $columns[0]; | |
| $hash8{$pfam_columns[0]} = "$columns[0]\t$columns[12]\t$columns[11]"; | |
| } | |
| } | |
| # read Pfam-A.clans.tsv | |
| open my $pfamid2names, "/srv/db/pfam/27/Pfam-A.clans.tsv", or die "Couldn't open Pfam-A.clans.tsv\n"; | |
| open my $pfam_temp_OUT, ">pfam_output_temp.txt"; | |
| while (<$pfamid2names>) | |
| { | |
| chomp $_; | |
| my @columns = split (/\t/, $_); | |
| if (exists $hash8{$columns[0]}) | |
| { | |
| print {$pfam_temp_OUT} "$hash8{$columns[0]}\t$columns[4]\n"; | |
| } | |
| } | |
| close($pfamoutput); | |
| close($pfamid2names); | |
| close($pfam_temp_OUT); | |
| # hashes for tigrfam | |
| my %hash9 = (); | |
| my %hash10 = (); | |
| # read tigrfam hmmscan output and store in hash | |
| open my $tigrfamoutput, "./$locus.faaVStigr_all.hmm.hmmscanned.tab", or die "Couldn't open file $locus.faaVStigr_all.hmm.hmmscanned.tab\n"; | |
| while (<$tigrfamoutput>) | |
| { | |
| next if /^\s*(#.*)?$/; | |
| next if $tigrfamoutput =~ /^#/; | |
| chomp $_; | |
| my @columns = split (/\t/, $_); | |
| if ($columns[5] > 10) | |
| { | |
| $hash9{$columns[2]} = $columns[0]; | |
| $hash9{$columns[0]} = $columns[2]; | |
| $hash10{$columns[0]} = "$columns[2]\t$columns[4]\t$columns[5]"; | |
| } | |
| } | |
| # read tigrfam id2names2description | |
| open my $tigrfamid2names, "/srv/db/tigrfam/14.0/TIGRFAMs_14.0_INFO/tigr_info_combined.parsed", or die "Couldn't open tigr_info_combined.parsed\n"; | |
| open my $tigrfam_temp_OUT, ">tigrfam_output_temp.txt"; | |
| while (<$tigrfamid2names>) | |
| { | |
| chomp $_; | |
| my @columns = split (/\t/, $_); | |
| $columns[0] =~ s/^\s+//; | |
| $columns[0] =~ s/\s+$//; | |
| if (exists $hash10{$columns[0]}) | |
| { | |
| print {$tigrfam_temp_OUT} "$hash10{$columns[0]}\t$columns[1]\t$columns[2]\n"; | |
| } | |
| } | |
| close($tigrfamoutput); | |
| close($tigrfamid2names); | |
| close($tigrfam_temp_OUT); | |
| # hashes for cog | |
| my %hash11 = (); | |
| my %hash12 = (); | |
| my %hash13 = (); | |
| # read cog blastp output and store in hash | |
| open my $cogblast, "./$locus.faaVSCOG.blastp", or die "Couldn't open file $locus.faaVSCOG.blastp\n"; | |
| while (<$cogblast>) | |
| { | |
| chomp $_; | |
| my @columns = split (/\t/, $_); | |
| if ($columns[11] > 60) | |
| { | |
| $hash11{$columns[0]} = $columns[1]; | |
| $hash11{$columns[1]} = $columns[0]; | |
| $hash12{$columns[1]} = "$columns[0]\t$columns[10]\t$columns[11]"; | |
| } | |
| } | |
| # read cog prot2COG.tab | |
| open my $cogid2names, "/srv/db/cog/prot2COG.tab", or die "Couldn't open prot2COG.tab\n"; | |
| open my $cog_temp_OUT, "> cog_output_temp.txt"; | |
| while (<$cogid2names>) | |
| { | |
| chomp $_; | |
| my @columns = split (/\t/, $_); | |
| if (exists $hash12{$columns[0]}) | |
| { | |
| $hash13{$columns[0]} = "$hash12{$columns[0]}\t$columns[1]"; | |
| $hash13{$columns[1]} = $hash12{$columns[0]}; | |
| print {$cog_temp_OUT} "$hash12{$columns[0]}\t$columns[1]\n"; | |
| } | |
| } | |
| close($cogblast); | |
| close($cogid2names); | |
| close($cog_temp_OUT); | |
| # read cog listcogs.txt | |
| open my $cogid2longernames, "/srv/db/cog/listcogs.txt", or die "Couldn't open listcogs.txt\n"; | |
| open my $cog_temp_OUT2, "> cog_output_temp2.txt"; | |
| while(<$cogid2longernames>) | |
| { | |
| chomp $_; | |
| my @columns = split (/\t/, $_); | |
| if (exists $hash13{$columns[5]}) | |
| { | |
| print {$cog_temp_OUT2} "$hash13{$columns[5]}\t$columns[3]\t$columns[4]\t$columns[6]\n"; | |
| } | |
| } | |
| close($cog_temp_OUT2); | |
| ### now to parse all the temporary files and combine into one tab-delimited-file | |
| # to store the IDs => DB => values/annotations | |
| my %combined_bighash =(); | |
| # open file for output | |
| open my $FINAL_OUTPUT, "> ./final_output.txt"; | |
| # print header | |
| print {$FINAL_OUTPUT} "ORF_ID\timg_evalue\timg_score\timg_gene\timg_organism\timg_reciprocal\tuniref_evalue\tuniref_score\tuniref_gene\tuniref_organism\tuniref_reciprocal\tprokka_gene\tcog_evalues\tcog_scores\tcog_classes\tcog_gene_acronyms\tcog_genes\tpfam_evalues\tpfam_scores\tpfam_genes\ttigrfam_evalues\ttigrfam_scores\ttigrfam_genes\ttigrfam_descriptions\n"; | |
| # img | |
| open my $img_annotation, "./img_output_temp2.txt", or die "Couldn't open img_output_temp2.txt\n"; | |
| while (<$img_annotation>) | |
| { | |
| chomp $_; | |
| my @columns = split (/\t/, $_); | |
| my @baba = @columns[1..$#columns]; | |
| #print "@baba \n"; | |
| #$combined_bighash{$columns[0]}->{'img'} = join("\t", @baba); | |
| push @{$combined_bighash{$columns[0]}->{'a-img'}}, join("\t", @baba); | |
| } | |
| # uniref | |
| open my $uniref_annotation, "./uniref_output_temp2.txt", or die "Couldn't open uniref_output_temp2.txt\n"; | |
| while (<$uniref_annotation>) | |
| { | |
| chomp $_; | |
| my @columns = split (/\t/, $_); | |
| my @baba = @columns[1..$#columns]; | |
| #$combined_bighash{$columns[0]}->{'uniref'} = join("\t", @baba); | |
| push @{$combined_bighash{$columns[0]}->{'b-uniref'}}, join("\t", @baba); | |
| } | |
| # prokka | |
| # need to parse faa file to give prokka id2names | |
| checkAndRunCommand("grep",[[ | |
| "'>'", | |
| "prokka_annotation/$locus.faa |", | |
| "sed", | |
| "'s/>//g' |", | |
| "sed", | |
| -e => "'s/ /\\t/'", | |
| "> prokka_temp_output.txt", | |
| ]], DIE_ON_FAILURE); | |
| open my $prokka_annotation, "./prokka_temp_output.txt", or die "Couldn't open prokka_temp_output.txt\n"; | |
| while (<$prokka_annotation>) | |
| { | |
| chomp $_; | |
| my @columns = split (/\t/, $_); | |
| my @baba = @columns[1..$#columns]; | |
| #$combined_bighash{$columns[0]}->{'prokka'} = join("\t", @baba); | |
| push @{$combined_bighash{$columns[0]}->{'c-prokka'}}, join("\t", @baba); | |
| } | |
| # cog | |
| open my $cog_annotation, "./cog_output_temp2.txt", or die "Couldn't open cog_output_temp2.txt\n"; | |
| while (<$cog_annotation>) | |
| { | |
| chomp $_; | |
| my @columns = split (/\t/, $_); | |
| my @baba = @columns[1..$#columns]; | |
| #$combined_bighash{$columns[0]}->{'cog'} = join("\t", @baba); | |
| push @{$combined_bighash{$columns[0]}->{'d-cog'}}, join("\t", @baba); | |
| } | |
| # pfam | |
| open my $pfam_annotation, "./pfam_output_temp.txt", or die "Couldn't open pfam_output_temp.txt\n"; | |
| while (<$pfam_annotation>) | |
| { | |
| chomp $_; | |
| my @columns = split (/\t/, $_); | |
| my @baba = @columns[1..$#columns]; | |
| #$combined_bighash{$columns[0]}->{'pfam'} = join("\t", @baba); | |
| push @{$combined_bighash{$columns[0]}->{'e-pfam'}}, join("\t", @baba); | |
| } | |
| #print Dumper \%combined_bighash; | |
| # tigrfam | |
| open my $tigrfam_annotation, "./tigrfam_output_temp.txt", or die "Couldn't open tigrfam_output_temp.txt\n"; | |
| while (<$tigrfam_annotation>) | |
| { | |
| chomp $_; | |
| my @columns = split (/\t/, $_); | |
| my @baba = @columns[1..$#columns]; | |
| #$combined_bighash{$columns[0]}->{'tigrfam'} = join("\t", @baba); | |
| push @{$combined_bighash{$columns[0]}->{'f-tigrfam'}}, join("\t", @baba); | |
| } | |
| # to print finally................. | |
| # assign key and value for number of expected columns for each annotation type, important for putting NA in missing annotation types | |
| my %column_lengths = ( | |
| 'a-img' => 5, | |
| 'b-uniref' => 5, | |
| 'c-prokka' => 1, | |
| 'd-cog' => 5, | |
| 'e-pfam' => 3, | |
| 'f-tigrfam' => 4, | |
| ); | |
| # print the orfids first | |
| foreach my $ID (sort(keys %combined_bighash)) | |
| { | |
| print {$FINAL_OUTPUT} "$ID\t"; | |
| foreach my $annotation_type (sort(keys %column_lengths)) | |
| { | |
| # if the annotation type does not exist, print NA in the columns depending on the %column_lengths hash values | |
| if (! exists $combined_bighash{$ID}->{$annotation_type}) | |
| { | |
| # cool way of printing a certain string multiple times based on the values in the hash | |
| print {$FINAL_OUTPUT} join("\t", ("NA",) x $column_lengths{$annotation_type}), "\t"; | |
| } | |
| # check the derefencing made with @{$combined_bighash{$columns[0]}->{'f-tigrfam'}} and so on.. | |
| # the derefencing allows the hash be converted into an array so that we can read the hash for the different types of annotation types | |
| elsif (ref($combined_bighash{$ID}->{$annotation_type}) eq 'ARRAY') | |
| { | |
| # place to store the columns in the values of the hash annotation types | |
| my @storage_array; | |
| foreach my $line (@{$combined_bighash{$ID}->{$annotation_type}}) | |
| { | |
| # each annotation types have different number of columns, so we need to split the columns first before | |
| # we can add in the extra values if lets say an orfid hits multiple pfam/cog/tigrfam values | |
| my @values = split("\t",$line); | |
| # cool and alternate way of doing columns[1] = values[1], and so on.., repetitively | |
| # what it basically means as long as the value i less than the number of columns in each annotation type | |
| # add +1 to the string $i and do the push below | |
| for (my $i = 0; $i <= $#values; $i++) | |
| { | |
| push @{$storage_array[$i]}, $values[$i]; | |
| } | |
| } | |
| #print Dumper(\@storage_array); | |
| # array to store the multiple hits in each column.. for example test0001 orfid hits multiple pfam values like pf0008 anf pf0010 etc.. | |
| # so we would like to have the values combined together in the same column delimited by a comma | |
| my @print_info_array; | |
| for (my $i = 0; $i < $column_lengths{$annotation_type}; $i++) | |
| { | |
| push @print_info_array, join(", ", @{$storage_array[$i]}); | |
| } | |
| #print Dumper(\@print_info_array); | |
| print {$FINAL_OUTPUT} join("\t", @print_info_array), "\t"; | |
| } | |
| else | |
| { | |
| print {$FINAL_OUTPUT} "$combined_bighash{$ID}{$annotation_type}\t"; | |
| } | |
| } | |
| print {$FINAL_OUTPUT} "\n"; | |
| } | |
| #close all files | |
| close($img_annotation); | |
| close($uniref_annotation); | |
| close($prokka_annotation); | |
| close($cog_annotation); | |
| close($pfam_annotation); | |
| close($tigrfam_annotation); | |
| close($FINAL_OUTPUT); | |
| ###################################################################### | |
| # CUSTOM SUBS | |
| ###################################################################### | |
| # who needs custom subs... | |
| ###################################################################### | |
| # TEMPLATE SUBS | |
| ###################################################################### | |
| # PARAMETERS | |
| sub checkParams { | |
| #----- | |
| # Do any and all options checking here... | |
| # | |
| my @standard_options = ( "help|h+", "in|i:s", "locustag|l:s", "kingdom|k:s", "threads|t:s", "evalue|e:s"); | |
| my %options; | |
| # Add any other command line options, and the code to handle them | |
| # | |
| GetOptions( \%options, @standard_options ); | |
| # if no arguments supplied print the usage and exit | |
| # | |
| exec("pod2usage $0") if (0 == (keys (%options) )); | |
| # If the -help option is set, print the usage and exit | |
| # | |
| exec("pod2usage $0") if $options{'help'}; | |
| # Compulsory items | |
| #if(!exists $options{''} ) { printParamError (""); } | |
| if(!exists $options{'in'} ) { printParamError ("You MUST supply a fasta file"); } | |
| return \%options; | |
| } | |
| sub printParamError | |
| { | |
| #----- | |
| # What to do if there's something wrong with a parameter | |
| # | |
| my ($error) = @_; | |
| print "**ERROR: $0 : $error\n"; exec("pod2usage $0"); | |
| } | |
| sub overrideDefault | |
| { | |
| #----- | |
| # Set and override default values for parameters | |
| # | |
| my ($default_value, $option_name) = @_; | |
| if(exists $global_options->{$option_name}) | |
| { | |
| return $global_options->{$option_name}; | |
| } | |
| return $default_value; | |
| } | |
| ##################################################################### | |
| # FILE IO | |
| sub openWrite | |
| { | |
| #----- | |
| # Open a file for writing | |
| # | |
| my ($fn) = @_; | |
| open my $fh, ">", $fn or croak "**ERROR: could not open file: $fn for writing $!\n"; | |
| return $fh; | |
| } | |
| sub openRead | |
| { | |
| #----- | |
| # Open a file for reading | |
| # | |
| my ($fn) = @_; | |
| open my $fh, "<", $fn or croak "**ERROR: could not open file: $fn for reading $!\n"; | |
| return $fh; | |
| } | |
| ###################################################################### | |
| # EXTERNAL COMMANDS | |
| # | |
| # checkAndRunCommand("ls", { | |
| # -a => "" | |
| # }, | |
| # WARN_ON_FAILURE); | |
| sub checkFileExists { | |
| #----- | |
| # Does a file exists? | |
| # | |
| my ($file) = @_; | |
| unless(-e $file) { | |
| croak "**ERROR: $0 : Cannot find:\n$file\n"; | |
| } | |
| } | |
| sub logExternalCommand | |
| { | |
| #----- | |
| # Log a command line command to the command line! | |
| # | |
| if(1 == $global_log_commands) { | |
| print $_[0], "\n"; | |
| } | |
| } | |
| sub isCommandInPath | |
| { | |
| #----- | |
| # Is this command in the path? | |
| # | |
| my ($cmd, $failure_type) = @_; | |
| if (system("which $cmd |> /dev/null")) { | |
| handleCommandFailure($cmd, $failure_type); | |
| } | |
| } | |
| sub runExternalCommand | |
| { | |
| #----- | |
| # Run a command line command on the command line! | |
| # | |
| my ($cmd) = @_; | |
| logExternalCommand($cmd); | |
| system($cmd); | |
| } | |
| sub checkAndRunCommand | |
| { | |
| #----- | |
| # Run external commands more sanelier | |
| # | |
| my ($cmd, $params, $failure_type) = @_; | |
| isCommandInPath($cmd, $failure_type); | |
| # join the parameters to the command | |
| my $param_str = join " ", map {formatParams($_)} @{$params}; | |
| my $cmd_str = $cmd . " " . $param_str; | |
| print "The command currently running:\t$cmd_str\n"; | |
| logExternalCommand($cmd_str); | |
| # make sure that all went well | |
| if (system($cmd_str)) { | |
| handleCommandFailure($cmd_str, $failure_type) | |
| } | |
| } | |
| sub formatParams { | |
| #--------- | |
| # Handles and formats the different ways of passing parameters to | |
| # checkAndRunCommand | |
| # | |
| my $ref = shift; | |
| if (ref($ref) eq "ARRAY") { | |
| return join(" ", @{$ref}); | |
| } elsif (ref($ref) eq "HASH") { | |
| return join(" ", map { $_ . " " . $ref->{$_}} keys %{$ref}); | |
| } | |
| croak 'The elements of the $params argument in checkAndRunCommand can ' . | |
| 'only contain references to arrays or hashes\n'; | |
| } | |
| sub handleCommandFailure { | |
| #----- | |
| # What to do when all goes bad! | |
| # | |
| my ($cmd, $failure_type) = @_; | |
| if (defined($failure_type)) { | |
| if ($failure_type == DIE_ON_FAILURE) { | |
| croak "**ERROR: $0 : " . $! . "\n"; | |
| } elsif ($failure_type == WARN_ON_FAILURE) { | |
| carp "**WARNING: $0 : " . $! . "\n"; | |
| } | |
| } | |
| } | |
| ###################################################################### | |
| # MISC | |
| sub printAtStart { | |
| print<<"EOF"; | |
| ---------------------------------------------------------------- | |
| $0 | |
| annotateM - annotate my genome | |
| Due to the blast processes against multiple databases, this whole | |
| annotation pipeline will usually take awhile. Please be patient! | |
| ---------------------------------------------------------------- | |
| EOF | |
| } | |
| __DATA__ | |
| =head1 NAME | |
| annotateM | |
| =head1 COPYRIGHT | |
| Copyright (C) Mohamed Fauzi Haroon | |
| Special appearance from Adam Skarshewski | |
| This program is free software: you can redistribute it and/or modify | |
| it under the terms of the GNU General Public License as published by | |
| the Free Software Foundation, either version 3 of the License, or | |
| (at your option) any later version. | |
| This program is distributed in the hope that it will be useful, | |
| but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| GNU General Public License for more details. | |
| You should have received a copy of the GNU General Public License | |
| along with this program. If not, see <http://www.gnu.org/licenses/>. | |
| =head1 DESCRIPTION | |
| Want to annotate your genome? annotateM! | |
| =head1 SYNOPSIS | |
| annotateM -i [fasta_file] -l [locus] -k [Bacteria|Archaea] -t [threads] -e [evalue] | |
| -i FASTA_FILE Nucleotide fasta file | |
| -l locustag Name of locus tag | |
| -k kingdom (Bacteria/Archaea) Kingdom of genome to be annotated | |
| -t threads Number of threads | |
| -e evalue Evalue for BLAST, recommended 1e-3 | |
| [-help -h] Displays basic usage information | |
| =cut | |