From 72fe82c4645a5a5aad57bc256437448591564e87 Mon Sep 17 00:00:00 2001 From: fauzi Date: Thu, 17 Apr 2014 16:10:24 +1000 Subject: [PATCH] run this script to get the contig id, orf start and end, and also strand --- post_annotateM | 397 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 397 insertions(+) create mode 100755 post_annotateM diff --git a/post_annotateM b/post_annotateM new file mode 100755 index 0000000..458d838 --- /dev/null +++ b/post_annotateM @@ -0,0 +1,397 @@ +#!/usr/bin/env perl +############################################################################### +# +# post_annotateM +# +# The idea here is an automated way of annotating your genome based on +# multiple available databases and to produce a tab-delimited file of +# all the annotations, evalues, scores, descriptions. +# +# Suggested workflow: +# 1) run your genome nucleotide fasta file through annotateM +# 2) generate a tab-delimited file +# 3) open the file in ms excel or oo calc +# 4) manually curate the annotations based on evalues/scores/desc etc +# 5) metabolic reconstruction of organism +# +# Copyright (C) Mohamed Fauzi Haroon +# Special appearance from Adam Skarshewski +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +############################################################################### + +#pragmas +use strict; +use warnings; + +#core Perl modules +use Getopt::Long; +use Carp; +use Data::Dumper; + +#CPAN modules + +#locally-written modules + +BEGIN { + select(STDERR); + $| = 1; + select(STDOUT); + $| = 1; +} + +# edit here to log all external commands +my $global_log_commands = 0; + +# ext command failure levels +use constant { + IGNORE_FAILURE => 0, + WARN_ON_FAILURE => 1, + DIE_ON_FAILURE => 2 +}; + +# get input params and print copyright +printAtStart(); +my $global_options = checkParams(); + +###################################################################### +# CODE HERE +###################################################################### + +# check that the final_output.txt file exists +checkFileExists($global_options->{'in'}); + +# identify the ORF called amino acid fasta file +my $locus = $global_options->{'locustag'}; + +# gff file +my $gff_file = $global_options->{'gff'}; + +# to store hash +my %hashy = (); +my %orf = (); + +# open my final_output.txt file +open my $final_output, "./final_output.txt", or die "Couldn't open final_output.txt\n"; +while (<$final_output>) +{ + #skips first line + next if $. < 2; + chomp $_; + my @columns = split (/\t/, $_); + my @baba = @columns[1..$#columns]; + $hashy{$columns[0]} = join("\t", @baba); + #$combined_bighash{$columns[0]}->{'uniref'} = join("\t", @baba); +# push @{$combined_bighash{$columns[0]}->{'b-uniref'}}, join("\t", @baba); +} + +close($final_output); +#print Dumper (\%orf); + +# open the prokka annotation gff file +# sample gff file: +###gff-version 3 +##sequence-region contig_3875 1 10320 +#contig_3875 Prodigal:2.60 CDS 334 735 . + 0 ID=test_00001;inference=ab initio prediction:Prodigal:2.60,protein motif:CLUSTERS:PRK10707;locus_tag=test_00001;product=putative NUDIX hydrolase;protein_id=gnl|VBC|test_00001 +#contig_3875 Prodigal:2.60 CDS 930 3221 . + 0 ID=test_00002;eC_number=1.1.1.40;gene=maeB;inference=ab initio prediction:Prodigal:2.60,similar to AA sequence:UniProtKB:P76558;locus_tag=test_00002;product=NADP-dependent malic enzyme;protein_id=gnl|VBC|test_00002 +#contig_3875 Prodigal:2.60 CDS 3229 5175 . - 0 ID=test_00003;inference=ab initio prediction:Prodigal:2.60;locus_tag=test_00003;product=hypothetical protein;protein_id=gnl|VBC|test_00003 +open my $SUPER_FINAL_OUTPUT, "> ./super_final_output.txt"; +print {$SUPER_FINAL_OUTPUT} "ORF_ID\tcontig_ID\tORF_start_pos\tORF_end_pos\tstrand\timg_evalue\timg_score\timg_gene\timg_organism\timg_reciprocal\tuniref_evalue\tuniref_score\tuniref_gene\tuniref_organism\tuniref_reciprocal\tprokka_gene\tcog_evalues\tcog_scores\tcog_classes\tcog_gene_acronyms\tcog_genes\tpfam_evalues\tpfam_scores\tpfam_genes\ttigrfam_evalues\ttigrfam_scores\ttigrfam_genes\ttigrfam_descriptions\n"; + +open my $gff, "./$gff_file", or die "Couldn't open $locus.gff\n"; +while (<$gff>) +{ + if ($_ =~ /^##/){ + if ($_ =~ /^##FASTA/){ + last; + } + else{ + next; + } + } + chomp $_; + my @orf_id = (); + my @columns = split (/\t/, $_); + #my @last_column = split (/\t/, $columns[8]); +# $columns[8] =~ /(ID=)/; +# print $1; + if ($_ =~ /ID=(\w+);/) + { + push @orf_id, $1; +# print Dumper (\@orf_id); + } + foreach my $value_in_orf_array (@orf_id) + { + if (exists $hashy{$value_in_orf_array}) + { + # print "$value_in_orf_array\t$columns[3]\t$columns[4]"; + my @print_array = [$value_in_orf_array, $columns[3], $columns[4]]; + # unshift @{$hashy{$value_in_orf_array}}, join("\t", @print_array); + # $hashy{$value_in_orf_array} .= "\t" . $value_in_orf_array . "\t$columns[3]\t$columns[4]\t"; + + print {$SUPER_FINAL_OUTPUT} "$value_in_orf_array\t$columns[0]\t$columns[3]\t$columns[4]\t$columns[6]\t$hashy{$value_in_orf_array}\n"; + } + } +} +close($gff); +close($SUPER_FINAL_OUTPUT); + +print "COMPLETED, super_final_output.txt generated! If i was you, i will open super_final_output.txt in excel\n"; + +###################################################################### +# CUSTOM SUBS +###################################################################### + +# who needs custom subs... + +###################################################################### +# TEMPLATE SUBS + +###################################################################### +# PARAMETERS + +sub checkParams { + #----- + # Do any and all options checking here... + # + my @standard_options = ( "help|h+", "in|i:s", "locustag|l:s", "gff|g:s"); + my %options; + + # Add any other command line options, and the code to handle them + # + GetOptions( \%options, @standard_options ); + + # if no arguments supplied print the usage and exit + # + exec("pod2usage $0") if (0 == (keys (%options) )); + + # If the -help option is set, print the usage and exit + # + exec("pod2usage $0") if $options{'help'}; + + # Compulsory items + #if(!exists $options{''} ) { printParamError (""); } + if(!exists $options{'in'} ) { printParamError ("You MUST supply your final_output.txt from annotateM"); } + + return \%options; +} + +sub printParamError +{ + #----- + # What to do if there's something wrong with a parameter + # + my ($error) = @_; + print "**ERROR: $0 : $error\n"; exec("pod2usage $0"); +} + +sub overrideDefault +{ + #----- + # Set and override default values for parameters + # + my ($default_value, $option_name) = @_; + if(exists $global_options->{$option_name}) + { + return $global_options->{$option_name}; + } + return $default_value; +} + +##################################################################### +# FILE IO + +sub openWrite +{ + #----- + # Open a file for writing + # + my ($fn) = @_; + open my $fh, ">", $fn or croak "**ERROR: could not open file: $fn for writing $!\n"; + return $fh; +} + +sub openRead +{ + #----- + # Open a file for reading + # + my ($fn) = @_; + open my $fh, "<", $fn or croak "**ERROR: could not open file: $fn for reading $!\n"; + return $fh; +} + +###################################################################### +# EXTERNAL COMMANDS +# +# checkAndRunCommand("ls", { +# -a => "" +# }, +# WARN_ON_FAILURE); + +sub checkFileExists { + #----- + # Does a file exists? + # + my ($file) = @_; + unless(-e $file) { + croak "**ERROR: $0 : Cannot find:\n$file\n"; + } +} + +sub logExternalCommand +{ + #----- + # Log a command line command to the command line! + # + if(1 == $global_log_commands) { + print $_[0], "\n"; + } +} + +sub isCommandInPath +{ + #----- + # Is this command in the path? + # + my ($cmd, $failure_type) = @_; + if (system("which $cmd |> /dev/null")) { + handleCommandFailure($cmd, $failure_type); + } +} + +sub runExternalCommand +{ + #----- + # Run a command line command on the command line! + # + my ($cmd) = @_; + logExternalCommand($cmd); + system($cmd); +} + +sub checkAndRunCommand +{ + #----- + # Run external commands more sanelier + # + my ($cmd, $params, $failure_type) = @_; + + isCommandInPath($cmd, $failure_type); + + # join the parameters to the command + my $param_str = join " ", map {formatParams($_)} @{$params}; + + my $cmd_str = $cmd . " " . $param_str; + + print "The command currently running:\t$cmd_str\n"; + logExternalCommand($cmd_str); + + # make sure that all went well + if (system($cmd_str)) { + handleCommandFailure($cmd_str, $failure_type) + } +} + +sub formatParams { + + #--------- + # Handles and formats the different ways of passing parameters to + # checkAndRunCommand + # + my $ref = shift; + + if (ref($ref) eq "ARRAY") { + return join(" ", @{$ref}); + } elsif (ref($ref) eq "HASH") { + return join(" ", map { $_ . " " . $ref->{$_}} keys %{$ref}); + } + croak 'The elements of the $params argument in checkAndRunCommand can ' . + 'only contain references to arrays or hashes\n'; +} + + +sub handleCommandFailure { + #----- + # What to do when all goes bad! + # + my ($cmd, $failure_type) = @_; + if (defined($failure_type)) { + if ($failure_type == DIE_ON_FAILURE) { + croak "**ERROR: $0 : " . $! . "\n"; + } elsif ($failure_type == WARN_ON_FAILURE) { + carp "**WARNING: $0 : " . $! . "\n"; + } + } +} + + +###################################################################### +# MISC + +sub printAtStart { +print<<"EOF"; +---------------------------------------------------------------- + + $0 + post_annotateM - annotate my genome + Due to the blast processes against multiple databases, this whole + annotation pipeline will usually take awhile. Please be patient! + What you get in the end will save you heaps of time. + +---------------------------------------------------------------- +EOF +} + +__DATA__ + +=head1 NAME + + annotateM + +=head1 COPYRIGHT + + Copyright (C) Mohamed Fauzi Haroon + Special appearance from Adam Skarshewski + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + +=head1 DESCRIPTION + + Want to annotate your genome? annotateM! + +=head1 SYNOPSIS + + annotateM -i [finaloutputfromannotatem] -l [locustag] -g [gff] + + -i final_output.txt final_output.txt file generated from annotateM + -l locustag Name of locus tag + -g locustag.gff location of gff file in prokka_annotation folder + [-help -h] Displays basic usage information + +=cut + +