Permalink
Please sign in to comment.
Browse files
run this script to get the contig id, orf start and end, and also strand
- Loading branch information...
397
post_annotateM
| @@ -0,0 +1,397 @@ | ||
| +#!/usr/bin/env perl | ||
| +############################################################################### | ||
| +# | ||
| +# post_annotateM | ||
| +# | ||
| +# The idea here is an automated way of annotating your genome based on | ||
| +# multiple available databases and to produce a tab-delimited file of | ||
| +# all the annotations, evalues, scores, descriptions. | ||
| +# | ||
| +# Suggested workflow: | ||
| +# 1) run your genome nucleotide fasta file through annotateM | ||
| +# 2) generate a tab-delimited file | ||
| +# 3) open the file in ms excel or oo calc | ||
| +# 4) manually curate the annotations based on evalues/scores/desc etc | ||
| +# 5) metabolic reconstruction of organism | ||
| +# | ||
| +# Copyright (C) Mohamed Fauzi Haroon | ||
| +# Special appearance from Adam Skarshewski | ||
| +# | ||
| +# This program is free software: you can redistribute it and/or modify | ||
| +# it under the terms of the GNU General Public License as published by | ||
| +# the Free Software Foundation, either version 3 of the License, or | ||
| +# (at your option) any later version. | ||
| +# | ||
| +# This program is distributed in the hope that it will be useful, | ||
| +# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| +# GNU General Public License for more details. | ||
| +# | ||
| +# You should have received a copy of the GNU General Public License | ||
| +# along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
| +# | ||
| +############################################################################### | ||
| + | ||
| +#pragmas | ||
| +use strict; | ||
| +use warnings; | ||
| + | ||
| +#core Perl modules | ||
| +use Getopt::Long; | ||
| +use Carp; | ||
| +use Data::Dumper; | ||
| + | ||
| +#CPAN modules | ||
| + | ||
| +#locally-written modules | ||
| + | ||
| +BEGIN { | ||
| + select(STDERR); | ||
| + $| = 1; | ||
| + select(STDOUT); | ||
| + $| = 1; | ||
| +} | ||
| + | ||
| +# edit here to log all external commands | ||
| +my $global_log_commands = 0; | ||
| + | ||
| +# ext command failure levels | ||
| +use constant { | ||
| + IGNORE_FAILURE => 0, | ||
| + WARN_ON_FAILURE => 1, | ||
| + DIE_ON_FAILURE => 2 | ||
| +}; | ||
| + | ||
| +# get input params and print copyright | ||
| +printAtStart(); | ||
| +my $global_options = checkParams(); | ||
| + | ||
| +###################################################################### | ||
| +# CODE HERE | ||
| +###################################################################### | ||
| + | ||
| +# check that the final_output.txt file exists | ||
| +checkFileExists($global_options->{'in'}); | ||
| + | ||
| +# identify the ORF called amino acid fasta file | ||
| +my $locus = $global_options->{'locustag'}; | ||
| + | ||
| +# gff file | ||
| +my $gff_file = $global_options->{'gff'}; | ||
| + | ||
| +# to store hash | ||
| +my %hashy = (); | ||
| +my %orf = (); | ||
| + | ||
| +# open my final_output.txt file | ||
| +open my $final_output, "./final_output.txt", or die "Couldn't open final_output.txt\n"; | ||
| +while (<$final_output>) | ||
| +{ | ||
| + #skips first line | ||
| + next if $. < 2; | ||
| + chomp $_; | ||
| + my @columns = split (/\t/, $_); | ||
| + my @baba = @columns[1..$#columns]; | ||
| + $hashy{$columns[0]} = join("\t", @baba); | ||
| + #$combined_bighash{$columns[0]}->{'uniref'} = join("\t", @baba); | ||
| +# push @{$combined_bighash{$columns[0]}->{'b-uniref'}}, join("\t", @baba); | ||
| +} | ||
| + | ||
| +close($final_output); | ||
| +#print Dumper (\%orf); | ||
| + | ||
| +# open the prokka annotation gff file | ||
| +# sample gff file: | ||
| +###gff-version 3 | ||
| +##sequence-region contig_3875 1 10320 | ||
| +#contig_3875 Prodigal:2.60 CDS 334 735 . + 0 ID=test_00001;inference=ab initio prediction:Prodigal:2.60,protein motif:CLUSTERS:PRK10707;locus_tag=test_00001;product=putative NUDIX hydrolase;protein_id=gnl|VBC|test_00001 | ||
| +#contig_3875 Prodigal:2.60 CDS 930 3221 . + 0 ID=test_00002;eC_number=1.1.1.40;gene=maeB;inference=ab initio prediction:Prodigal:2.60,similar to AA sequence:UniProtKB:P76558;locus_tag=test_00002;product=NADP-dependent malic enzyme;protein_id=gnl|VBC|test_00002 | ||
| +#contig_3875 Prodigal:2.60 CDS 3229 5175 . - 0 ID=test_00003;inference=ab initio prediction:Prodigal:2.60;locus_tag=test_00003;product=hypothetical protein;protein_id=gnl|VBC|test_00003 | ||
| +open my $SUPER_FINAL_OUTPUT, "> ./super_final_output.txt"; | ||
| +print {$SUPER_FINAL_OUTPUT} "ORF_ID\tcontig_ID\tORF_start_pos\tORF_end_pos\tstrand\timg_evalue\timg_score\timg_gene\timg_organism\timg_reciprocal\tuniref_evalue\tuniref_score\tuniref_gene\tuniref_organism\tuniref_reciprocal\tprokka_gene\tcog_evalues\tcog_scores\tcog_classes\tcog_gene_acronyms\tcog_genes\tpfam_evalues\tpfam_scores\tpfam_genes\ttigrfam_evalues\ttigrfam_scores\ttigrfam_genes\ttigrfam_descriptions\n"; | ||
| + | ||
| +open my $gff, "./$gff_file", or die "Couldn't open $locus.gff\n"; | ||
| +while (<$gff>) | ||
| +{ | ||
| + if ($_ =~ /^##/){ | ||
| + if ($_ =~ /^##FASTA/){ | ||
| + last; | ||
| + } | ||
| + else{ | ||
| + next; | ||
| + } | ||
| + } | ||
| + chomp $_; | ||
| + my @orf_id = (); | ||
| + my @columns = split (/\t/, $_); | ||
| + #my @last_column = split (/\t/, $columns[8]); | ||
| +# $columns[8] =~ /(ID=)/; | ||
| +# print $1; | ||
| + if ($_ =~ /ID=(\w+);/) | ||
| + { | ||
| + push @orf_id, $1; | ||
| +# print Dumper (\@orf_id); | ||
| + } | ||
| + foreach my $value_in_orf_array (@orf_id) | ||
| + { | ||
| + if (exists $hashy{$value_in_orf_array}) | ||
| + { | ||
| + # print "$value_in_orf_array\t$columns[3]\t$columns[4]"; | ||
| + my @print_array = [$value_in_orf_array, $columns[3], $columns[4]]; | ||
| + # unshift @{$hashy{$value_in_orf_array}}, join("\t", @print_array); | ||
| + # $hashy{$value_in_orf_array} .= "\t" . $value_in_orf_array . "\t$columns[3]\t$columns[4]\t"; | ||
| + | ||
| + print {$SUPER_FINAL_OUTPUT} "$value_in_orf_array\t$columns[0]\t$columns[3]\t$columns[4]\t$columns[6]\t$hashy{$value_in_orf_array}\n"; | ||
| + } | ||
| + } | ||
| +} | ||
| +close($gff); | ||
| +close($SUPER_FINAL_OUTPUT); | ||
| + | ||
| +print "COMPLETED, super_final_output.txt generated! If i was you, i will open super_final_output.txt in excel\n"; | ||
| + | ||
| +###################################################################### | ||
| +# CUSTOM SUBS | ||
| +###################################################################### | ||
| + | ||
| +# who needs custom subs... | ||
| + | ||
| +###################################################################### | ||
| +# TEMPLATE SUBS | ||
| + | ||
| +###################################################################### | ||
| +# PARAMETERS | ||
| + | ||
| +sub checkParams { | ||
| + #----- | ||
| + # Do any and all options checking here... | ||
| + # | ||
| + my @standard_options = ( "help|h+", "in|i:s", "locustag|l:s", "gff|g:s"); | ||
| + my %options; | ||
| + | ||
| + # Add any other command line options, and the code to handle them | ||
| + # | ||
| + GetOptions( \%options, @standard_options ); | ||
| + | ||
| + # if no arguments supplied print the usage and exit | ||
| + # | ||
| + exec("pod2usage $0") if (0 == (keys (%options) )); | ||
| + | ||
| + # If the -help option is set, print the usage and exit | ||
| + # | ||
| + exec("pod2usage $0") if $options{'help'}; | ||
| + | ||
| + # Compulsory items | ||
| + #if(!exists $options{''} ) { printParamError (""); } | ||
| + if(!exists $options{'in'} ) { printParamError ("You MUST supply your final_output.txt from annotateM"); } | ||
| + | ||
| + return \%options; | ||
| +} | ||
| + | ||
| +sub printParamError | ||
| +{ | ||
| + #----- | ||
| + # What to do if there's something wrong with a parameter | ||
| + # | ||
| + my ($error) = @_; | ||
| + print "**ERROR: $0 : $error\n"; exec("pod2usage $0"); | ||
| +} | ||
| + | ||
| +sub overrideDefault | ||
| +{ | ||
| + #----- | ||
| + # Set and override default values for parameters | ||
| + # | ||
| + my ($default_value, $option_name) = @_; | ||
| + if(exists $global_options->{$option_name}) | ||
| + { | ||
| + return $global_options->{$option_name}; | ||
| + } | ||
| + return $default_value; | ||
| +} | ||
| + | ||
| +##################################################################### | ||
| +# FILE IO | ||
| + | ||
| +sub openWrite | ||
| +{ | ||
| + #----- | ||
| + # Open a file for writing | ||
| + # | ||
| + my ($fn) = @_; | ||
| + open my $fh, ">", $fn or croak "**ERROR: could not open file: $fn for writing $!\n"; | ||
| + return $fh; | ||
| +} | ||
| + | ||
| +sub openRead | ||
| +{ | ||
| + #----- | ||
| + # Open a file for reading | ||
| + # | ||
| + my ($fn) = @_; | ||
| + open my $fh, "<", $fn or croak "**ERROR: could not open file: $fn for reading $!\n"; | ||
| + return $fh; | ||
| +} | ||
| + | ||
| +###################################################################### | ||
| +# EXTERNAL COMMANDS | ||
| +# | ||
| +# checkAndRunCommand("ls", { | ||
| +# -a => "" | ||
| +# }, | ||
| +# WARN_ON_FAILURE); | ||
| + | ||
| +sub checkFileExists { | ||
| + #----- | ||
| + # Does a file exists? | ||
| + # | ||
| + my ($file) = @_; | ||
| + unless(-e $file) { | ||
| + croak "**ERROR: $0 : Cannot find:\n$file\n"; | ||
| + } | ||
| +} | ||
| + | ||
| +sub logExternalCommand | ||
| +{ | ||
| + #----- | ||
| + # Log a command line command to the command line! | ||
| + # | ||
| + if(1 == $global_log_commands) { | ||
| + print $_[0], "\n"; | ||
| + } | ||
| +} | ||
| + | ||
| +sub isCommandInPath | ||
| +{ | ||
| + #----- | ||
| + # Is this command in the path? | ||
| + # | ||
| + my ($cmd, $failure_type) = @_; | ||
| + if (system("which $cmd |> /dev/null")) { | ||
| + handleCommandFailure($cmd, $failure_type); | ||
| + } | ||
| +} | ||
| + | ||
| +sub runExternalCommand | ||
| +{ | ||
| + #----- | ||
| + # Run a command line command on the command line! | ||
| + # | ||
| + my ($cmd) = @_; | ||
| + logExternalCommand($cmd); | ||
| + system($cmd); | ||
| +} | ||
| + | ||
| +sub checkAndRunCommand | ||
| +{ | ||
| + #----- | ||
| + # Run external commands more sanelier | ||
| + # | ||
| + my ($cmd, $params, $failure_type) = @_; | ||
| + | ||
| + isCommandInPath($cmd, $failure_type); | ||
| + | ||
| + # join the parameters to the command | ||
| + my $param_str = join " ", map {formatParams($_)} @{$params}; | ||
| + | ||
| + my $cmd_str = $cmd . " " . $param_str; | ||
| + | ||
| + print "The command currently running:\t$cmd_str\n"; | ||
| + logExternalCommand($cmd_str); | ||
| + | ||
| + # make sure that all went well | ||
| + if (system($cmd_str)) { | ||
| + handleCommandFailure($cmd_str, $failure_type) | ||
| + } | ||
| +} | ||
| + | ||
| +sub formatParams { | ||
| + | ||
| + #--------- | ||
| + # Handles and formats the different ways of passing parameters to | ||
| + # checkAndRunCommand | ||
| + # | ||
| + my $ref = shift; | ||
| + | ||
| + if (ref($ref) eq "ARRAY") { | ||
| + return join(" ", @{$ref}); | ||
| + } elsif (ref($ref) eq "HASH") { | ||
| + return join(" ", map { $_ . " " . $ref->{$_}} keys %{$ref}); | ||
| + } | ||
| + croak 'The elements of the $params argument in checkAndRunCommand can ' . | ||
| + 'only contain references to arrays or hashes\n'; | ||
| +} | ||
| + | ||
| + | ||
| +sub handleCommandFailure { | ||
| + #----- | ||
| + # What to do when all goes bad! | ||
| + # | ||
| + my ($cmd, $failure_type) = @_; | ||
| + if (defined($failure_type)) { | ||
| + if ($failure_type == DIE_ON_FAILURE) { | ||
| + croak "**ERROR: $0 : " . $! . "\n"; | ||
| + } elsif ($failure_type == WARN_ON_FAILURE) { | ||
| + carp "**WARNING: $0 : " . $! . "\n"; | ||
| + } | ||
| + } | ||
| +} | ||
| + | ||
| + | ||
| +###################################################################### | ||
| +# MISC | ||
| + | ||
| +sub printAtStart { | ||
| +print<<"EOF"; | ||
| +---------------------------------------------------------------- | ||
| + | ||
| + $0 | ||
| + post_annotateM - annotate my genome | ||
| + Due to the blast processes against multiple databases, this whole | ||
| + annotation pipeline will usually take awhile. Please be patient! | ||
| + What you get in the end will save you heaps of time. | ||
| + | ||
| +---------------------------------------------------------------- | ||
| +EOF | ||
| +} | ||
| + | ||
| +__DATA__ | ||
| + | ||
| +=head1 NAME | ||
| + | ||
| + annotateM | ||
| + | ||
| +=head1 COPYRIGHT | ||
| + | ||
| + Copyright (C) Mohamed Fauzi Haroon | ||
| + Special appearance from Adam Skarshewski | ||
| + | ||
| + This program is free software: you can redistribute it and/or modify | ||
| + it under the terms of the GNU General Public License as published by | ||
| + the Free Software Foundation, either version 3 of the License, or | ||
| + (at your option) any later version. | ||
| + | ||
| + This program is distributed in the hope that it will be useful, | ||
| + but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| + GNU General Public License for more details. | ||
| + | ||
| + You should have received a copy of the GNU General Public License | ||
| + along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
| + | ||
| +=head1 DESCRIPTION | ||
| + | ||
| + Want to annotate your genome? annotateM! | ||
| + | ||
| +=head1 SYNOPSIS | ||
| + | ||
| + annotateM -i [finaloutputfromannotatem] -l [locustag] -g [gff] | ||
| + | ||
| + -i final_output.txt final_output.txt file generated from annotateM | ||
| + -l locustag Name of locus tag | ||
| + -g locustag.gff location of gff file in prokka_annotation folder | ||
| + [-help -h] Displays basic usage information | ||
| + | ||
| +=cut | ||
| + | ||
| + |
0 comments on commit
72fe82c