Permalink
Cannot retrieve contributors at this time
Fetching contributors…
| #!/usr/bin/env perl | |
| ############################################################################### | |
| # | |
| # post_annotateM | |
| # | |
| # The idea here is an automated way of annotating your genome based on | |
| # multiple available databases and to produce a tab-delimited file of | |
| # all the annotations, evalues, scores, descriptions. | |
| # | |
| # Suggested workflow: | |
| # 1) run your genome nucleotide fasta file through annotateM | |
| # 2) then run post_annotateM to include the contig id,orf_start and end | |
| # 3) generate a tab-delimited file | |
| # 4) open the file in ms excel or oo calc | |
| # 5) manually curate the annotations based on evalues/scores/desc etc | |
| # 6) metabolic reconstruction of organism | |
| # | |
| # Copyright (C) Mohamed Fauzi Haroon | |
| # Special appearance from Adam Skarshewski | |
| # | |
| # This program is free software: you can redistribute it and/or modify | |
| # it under the terms of the GNU General Public License as published by | |
| # the Free Software Foundation, either version 3 of the License, or | |
| # (at your option) any later version. | |
| # | |
| # This program is distributed in the hope that it will be useful, | |
| # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| # GNU General Public License for more details. | |
| # | |
| # You should have received a copy of the GNU General Public License | |
| # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
| # | |
| ############################################################################### | |
| #pragmas | |
| use strict; | |
| use warnings; | |
| #core Perl modules | |
| use Getopt::Long; | |
| use Carp; | |
| use Data::Dumper; | |
| #CPAN modules | |
| #locally-written modules | |
| BEGIN { | |
| select(STDERR); | |
| $| = 1; | |
| select(STDOUT); | |
| $| = 1; | |
| } | |
| # edit here to log all external commands | |
| my $global_log_commands = 0; | |
| # ext command failure levels | |
| use constant { | |
| IGNORE_FAILURE => 0, | |
| WARN_ON_FAILURE => 1, | |
| DIE_ON_FAILURE => 2 | |
| }; | |
| # get input params and print copyright | |
| printAtStart(); | |
| my $global_options = checkParams(); | |
| ###################################################################### | |
| # CODE HERE | |
| ###################################################################### | |
| # check that the final_output.txt file exists | |
| checkFileExists($global_options->{'in'}); | |
| # identify the ORF called amino acid fasta file | |
| my $locus = $global_options->{'locustag'}; | |
| # gff file | |
| my $gff_file = $global_options->{'gff'}; | |
| # to store hash | |
| my %hashy = (); | |
| my %orf = (); | |
| # open my final_output.txt file | |
| open my $final_output, "./final_output.txt", or die "Couldn't open final_output.txt\n"; | |
| while (<$final_output>) | |
| { | |
| #skips first line | |
| next if $. < 2; | |
| chomp $_; | |
| my @columns = split (/\t/, $_); | |
| my @baba = @columns[1..$#columns]; | |
| $hashy{$columns[0]} = join("\t", @baba); | |
| #$combined_bighash{$columns[0]}->{'uniref'} = join("\t", @baba); | |
| # push @{$combined_bighash{$columns[0]}->{'b-uniref'}}, join("\t", @baba); | |
| } | |
| close($final_output); | |
| #print Dumper (\%orf); | |
| # open the prokka annotation gff file | |
| # sample gff file: | |
| ###gff-version 3 | |
| ##sequence-region contig_3875 1 10320 | |
| #contig_3875 Prodigal:2.60 CDS 334 735 . + 0 ID=test_00001;inference=ab initio prediction:Prodigal:2.60,protein motif:CLUSTERS:PRK10707;locus_tag=test_00001;product=putative NUDIX hydrolase;protein_id=gnl|VBC|test_00001 | |
| #contig_3875 Prodigal:2.60 CDS 930 3221 . + 0 ID=test_00002;eC_number=1.1.1.40;gene=maeB;inference=ab initio prediction:Prodigal:2.60,similar to AA sequence:UniProtKB:P76558;locus_tag=test_00002;product=NADP-dependent malic enzyme;protein_id=gnl|VBC|test_00002 | |
| #contig_3875 Prodigal:2.60 CDS 3229 5175 . - 0 ID=test_00003;inference=ab initio prediction:Prodigal:2.60;locus_tag=test_00003;product=hypothetical protein;protein_id=gnl|VBC|test_00003 | |
| open my $SUPER_FINAL_OUTPUT, "> ./super_final_output.txt"; | |
| print {$SUPER_FINAL_OUTPUT} "ORF_ID\tcontig_ID\tORF_start_pos\tORF_end_pos\tstrand\timg_evalue\timg_score\timg_gene\timg_organism\timg_reciprocal\tuniref_evalue\tuniref_score\tuniref_gene\tuniref_organism\tuniref_reciprocal\tprokka_gene\tcog_evalues\tcog_scores\tcog_classes\tcog_gene_acronyms\tcog_genes\tpfam_evalues\tpfam_scores\tpfam_genes\ttigrfam_evalues\ttigrfam_scores\ttigrfam_genes\ttigrfam_descriptions\n"; | |
| open my $gff, "./$gff_file", or die "Couldn't open $locus.gff\n"; | |
| while (<$gff>) | |
| { | |
| if ($_ =~ /^##/){ | |
| if ($_ =~ /^##FASTA/){ | |
| last; | |
| } | |
| else{ | |
| next; | |
| } | |
| } | |
| chomp $_; | |
| my @orf_id = (); | |
| my @columns = split (/\t/, $_); | |
| #my @last_column = split (/\t/, $columns[8]); | |
| # $columns[8] =~ /(ID=)/; | |
| # print $1; | |
| if ($_ =~ /ID=(\w+);/) | |
| { | |
| push @orf_id, $1; | |
| # print Dumper (\@orf_id); | |
| } | |
| foreach my $value_in_orf_array (@orf_id) | |
| { | |
| if (exists $hashy{$value_in_orf_array}) | |
| { | |
| # print "$value_in_orf_array\t$columns[3]\t$columns[4]"; | |
| my @print_array = [$value_in_orf_array, $columns[3], $columns[4]]; | |
| # unshift @{$hashy{$value_in_orf_array}}, join("\t", @print_array); | |
| # $hashy{$value_in_orf_array} .= "\t" . $value_in_orf_array . "\t$columns[3]\t$columns[4]\t"; | |
| print {$SUPER_FINAL_OUTPUT} "$value_in_orf_array\t$columns[0]\t$columns[3]\t$columns[4]\t$columns[6]\t$hashy{$value_in_orf_array}\n"; | |
| } | |
| } | |
| } | |
| close($gff); | |
| close($SUPER_FINAL_OUTPUT); | |
| print "COMPLETED, super_final_output.txt generated! If i was you, i will open super_final_output.txt in excel\n"; | |
| ###################################################################### | |
| # CUSTOM SUBS | |
| ###################################################################### | |
| # who needs custom subs... | |
| ###################################################################### | |
| # TEMPLATE SUBS | |
| ###################################################################### | |
| # PARAMETERS | |
| sub checkParams { | |
| #----- | |
| # Do any and all options checking here... | |
| # | |
| my @standard_options = ( "help|h+", "in|i:s", "locustag|l:s", "gff|g:s"); | |
| my %options; | |
| # Add any other command line options, and the code to handle them | |
| # | |
| GetOptions( \%options, @standard_options ); | |
| # if no arguments supplied print the usage and exit | |
| # | |
| exec("pod2usage $0") if (0 == (keys (%options) )); | |
| # If the -help option is set, print the usage and exit | |
| # | |
| exec("pod2usage $0") if $options{'help'}; | |
| # Compulsory items | |
| #if(!exists $options{''} ) { printParamError (""); } | |
| if(!exists $options{'in'} ) { printParamError ("You MUST supply your final_output.txt from annotateM"); } | |
| return \%options; | |
| } | |
| sub printParamError | |
| { | |
| #----- | |
| # What to do if there's something wrong with a parameter | |
| # | |
| my ($error) = @_; | |
| print "**ERROR: $0 : $error\n"; exec("pod2usage $0"); | |
| } | |
| sub overrideDefault | |
| { | |
| #----- | |
| # Set and override default values for parameters | |
| # | |
| my ($default_value, $option_name) = @_; | |
| if(exists $global_options->{$option_name}) | |
| { | |
| return $global_options->{$option_name}; | |
| } | |
| return $default_value; | |
| } | |
| ##################################################################### | |
| # FILE IO | |
| sub openWrite | |
| { | |
| #----- | |
| # Open a file for writing | |
| # | |
| my ($fn) = @_; | |
| open my $fh, ">", $fn or croak "**ERROR: could not open file: $fn for writing $!\n"; | |
| return $fh; | |
| } | |
| sub openRead | |
| { | |
| #----- | |
| # Open a file for reading | |
| # | |
| my ($fn) = @_; | |
| open my $fh, "<", $fn or croak "**ERROR: could not open file: $fn for reading $!\n"; | |
| return $fh; | |
| } | |
| ###################################################################### | |
| # EXTERNAL COMMANDS | |
| # | |
| # checkAndRunCommand("ls", { | |
| # -a => "" | |
| # }, | |
| # WARN_ON_FAILURE); | |
| sub checkFileExists { | |
| #----- | |
| # Does a file exists? | |
| # | |
| my ($file) = @_; | |
| unless(-e $file) { | |
| croak "**ERROR: $0 : Cannot find:\n$file\n"; | |
| } | |
| } | |
| sub logExternalCommand | |
| { | |
| #----- | |
| # Log a command line command to the command line! | |
| # | |
| if(1 == $global_log_commands) { | |
| print $_[0], "\n"; | |
| } | |
| } | |
| sub isCommandInPath | |
| { | |
| #----- | |
| # Is this command in the path? | |
| # | |
| my ($cmd, $failure_type) = @_; | |
| if (system("which $cmd |> /dev/null")) { | |
| handleCommandFailure($cmd, $failure_type); | |
| } | |
| } | |
| sub runExternalCommand | |
| { | |
| #----- | |
| # Run a command line command on the command line! | |
| # | |
| my ($cmd) = @_; | |
| logExternalCommand($cmd); | |
| system($cmd); | |
| } | |
| sub checkAndRunCommand | |
| { | |
| #----- | |
| # Run external commands more sanelier | |
| # | |
| my ($cmd, $params, $failure_type) = @_; | |
| isCommandInPath($cmd, $failure_type); | |
| # join the parameters to the command | |
| my $param_str = join " ", map {formatParams($_)} @{$params}; | |
| my $cmd_str = $cmd . " " . $param_str; | |
| print "The command currently running:\t$cmd_str\n"; | |
| logExternalCommand($cmd_str); | |
| # make sure that all went well | |
| if (system($cmd_str)) { | |
| handleCommandFailure($cmd_str, $failure_type) | |
| } | |
| } | |
| sub formatParams { | |
| #--------- | |
| # Handles and formats the different ways of passing parameters to | |
| # checkAndRunCommand | |
| # | |
| my $ref = shift; | |
| if (ref($ref) eq "ARRAY") { | |
| return join(" ", @{$ref}); | |
| } elsif (ref($ref) eq "HASH") { | |
| return join(" ", map { $_ . " " . $ref->{$_}} keys %{$ref}); | |
| } | |
| croak 'The elements of the $params argument in checkAndRunCommand can ' . | |
| 'only contain references to arrays or hashes\n'; | |
| } | |
| sub handleCommandFailure { | |
| #----- | |
| # What to do when all goes bad! | |
| # | |
| my ($cmd, $failure_type) = @_; | |
| if (defined($failure_type)) { | |
| if ($failure_type == DIE_ON_FAILURE) { | |
| croak "**ERROR: $0 : " . $! . "\n"; | |
| } elsif ($failure_type == WARN_ON_FAILURE) { | |
| carp "**WARNING: $0 : " . $! . "\n"; | |
| } | |
| } | |
| } | |
| ###################################################################### | |
| # MISC | |
| sub printAtStart { | |
| print<<"EOF"; | |
| ---------------------------------------------------------------- | |
| $0 | |
| post_annotateM - annotate my genome | |
| Due to the blast processes against multiple databases, this whole | |
| annotation pipeline will usually take awhile. Please be patient! | |
| What you get in the end will save you heaps of time. | |
| ---------------------------------------------------------------- | |
| EOF | |
| } | |
| __DATA__ | |
| =head1 NAME | |
| post_annotateM | |
| =head1 COPYRIGHT | |
| Copyright (C) Mohamed Fauzi Haroon | |
| Special appearance from Adam Skarshewski | |
| This program is free software: you can redistribute it and/or modify | |
| it under the terms of the GNU General Public License as published by | |
| the Free Software Foundation, either version 3 of the License, or | |
| (at your option) any later version. | |
| This program is distributed in the hope that it will be useful, | |
| but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| GNU General Public License for more details. | |
| You should have received a copy of the GNU General Public License | |
| along with this program. If not, see <http://www.gnu.org/licenses/>. | |
| =head1 DESCRIPTION | |
| Want to annotate your genome? annotateM! | |
| =head1 SYNOPSIS | |
| post_annotateM -i [finaloutputfromannotatem] -l [locustag] -g [gff] | |
| -i final_output.txt final_output.txt file generated from annotateM | |
| -l locustag Name of locus tag | |
| -g locustag.gff location of gff file in prokka_annotation folder | |
| [-help -h] Displays basic usage information | |
| =cut | |