Permalink
Newer
Older
100755 399 lines (327 sloc) 11.3 KB
Apr 17, 2014 @fauziharoon run this script to get the contig id, orf start and end, and also strand
1 #!/usr/bin/env perl
2 ###############################################################################
3 #
4 # post_annotateM
5 #
6 # The idea here is an automated way of annotating your genome based on
7 # multiple available databases and to produce a tab-delimited file of
8 # all the annotations, evalues, scores, descriptions.
9 #
10 # Suggested workflow:
11 # 1) run your genome nucleotide fasta file through annotateM
Apr 17, 2014 @fauziharoon newest version
12 # 2) then run post_annotateM to include the contig id,orf_start and end
13 # 3) generate a tab-delimited file
14 # 4) open the file in ms excel or oo calc
15 # 5) manually curate the annotations based on evalues/scores/desc etc
16 # 6) metabolic reconstruction of organism
Apr 17, 2014 @fauziharoon run this script to get the contig id, orf start and end, and also strand
17 #
18 # Copyright (C) Mohamed Fauzi Haroon
19 # Special appearance from Adam Skarshewski
20 #
21 # This program is free software: you can redistribute it and/or modify
22 # it under the terms of the GNU General Public License as published by
23 # the Free Software Foundation, either version 3 of the License, or
24 # (at your option) any later version.
25 #
26 # This program is distributed in the hope that it will be useful,
27 # but WITHOUT ANY WARRANTY; without even the implied warranty of
28 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 # GNU General Public License for more details.
30 #
31 # You should have received a copy of the GNU General Public License
32 # along with this program. If not, see <http://www.gnu.org/licenses/>.
33 #
34 ###############################################################################
35
36 #pragmas
37 use strict;
38 use warnings;
39
40 #core Perl modules
41 use Getopt::Long;
42 use Carp;
43 use Data::Dumper;
44
45 #CPAN modules
46
47 #locally-written modules
48
49 BEGIN {
50 select(STDERR);
51 $| = 1;
52 select(STDOUT);
53 $| = 1;
54 }
55
56 # edit here to log all external commands
57 my $global_log_commands = 0;
58
59 # ext command failure levels
60 use constant {
61 IGNORE_FAILURE => 0,
62 WARN_ON_FAILURE => 1,
63 DIE_ON_FAILURE => 2
64 };
65
66 # get input params and print copyright
67 printAtStart();
68 my $global_options = checkParams();
69
70 ######################################################################
71 # CODE HERE
72 ######################################################################
73
74 # check that the final_output.txt file exists
75 checkFileExists($global_options->{'in'});
76
77 # identify the ORF called amino acid fasta file
78 my $locus = $global_options->{'locustag'};
79
80 # gff file
81 my $gff_file = $global_options->{'gff'};
82
83 # to store hash
84 my %hashy = ();
85 my %orf = ();
86
87 # open my final_output.txt file
88 open my $final_output, "./final_output.txt", or die "Couldn't open final_output.txt\n";
89 while (<$final_output>)
90 {
91 #skips first line
92 next if $. < 2;
93 chomp $_;
94 my @columns = split (/\t/, $_);
95 my @baba = @columns[1..$#columns];
96 $hashy{$columns[0]} = join("\t", @baba);
97 #$combined_bighash{$columns[0]}->{'uniref'} = join("\t", @baba);
98 # push @{$combined_bighash{$columns[0]}->{'b-uniref'}}, join("\t", @baba);
99 }
100
101 close($final_output);
102 #print Dumper (\%orf);
103
104 # open the prokka annotation gff file
105 # sample gff file:
106 ###gff-version 3
107 ##sequence-region contig_3875 1 10320
108 #contig_3875 Prodigal:2.60 CDS 334 735 . + 0 ID=test_00001;inference=ab initio prediction:Prodigal:2.60,protein motif:CLUSTERS:PRK10707;locus_tag=test_00001;product=putative NUDIX hydrolase;protein_id=gnl|VBC|test_00001
109 #contig_3875 Prodigal:2.60 CDS 930 3221 . + 0 ID=test_00002;eC_number=1.1.1.40;gene=maeB;inference=ab initio prediction:Prodigal:2.60,similar to AA sequence:UniProtKB:P76558;locus_tag=test_00002;product=NADP-dependent malic enzyme;protein_id=gnl|VBC|test_00002
110 #contig_3875 Prodigal:2.60 CDS 3229 5175 . - 0 ID=test_00003;inference=ab initio prediction:Prodigal:2.60;locus_tag=test_00003;product=hypothetical protein;protein_id=gnl|VBC|test_00003
111 open my $SUPER_FINAL_OUTPUT, "> ./super_final_output.txt";
112 print {$SUPER_FINAL_OUTPUT} "ORF_ID\tcontig_ID\tORF_start_pos\tORF_end_pos\tstrand\timg_evalue\timg_score\timg_gene\timg_organism\timg_reciprocal\tuniref_evalue\tuniref_score\tuniref_gene\tuniref_organism\tuniref_reciprocal\tprokka_gene\tcog_evalues\tcog_scores\tcog_classes\tcog_gene_acronyms\tcog_genes\tpfam_evalues\tpfam_scores\tpfam_genes\ttigrfam_evalues\ttigrfam_scores\ttigrfam_genes\ttigrfam_descriptions\n";
113
114 open my $gff, "./$gff_file", or die "Couldn't open $locus.gff\n";
115 while (<$gff>)
116 {
117 if ($_ =~ /^##/){
118 if ($_ =~ /^##FASTA/){
119 last;
120 }
121 else{
122 next;
123 }
124 }
125 chomp $_;
126 my @orf_id = ();
127 my @columns = split (/\t/, $_);
128 #my @last_column = split (/\t/, $columns[8]);
129 # $columns[8] =~ /(ID=)/;
130 # print $1;
131 if ($_ =~ /ID=(\w+);/)
132 {
133 push @orf_id, $1;
134 # print Dumper (\@orf_id);
135 }
136 foreach my $value_in_orf_array (@orf_id)
137 {
138 if (exists $hashy{$value_in_orf_array})
139 {
140 # print "$value_in_orf_array\t$columns[3]\t$columns[4]";
141 my @print_array = [$value_in_orf_array, $columns[3], $columns[4]];
142 # unshift @{$hashy{$value_in_orf_array}}, join("\t", @print_array);
143 # $hashy{$value_in_orf_array} .= "\t" . $value_in_orf_array . "\t$columns[3]\t$columns[4]\t";
144
145 print {$SUPER_FINAL_OUTPUT} "$value_in_orf_array\t$columns[0]\t$columns[3]\t$columns[4]\t$columns[6]\t$hashy{$value_in_orf_array}\n";
146 }
147 }
148 }
149 close($gff);
150 close($SUPER_FINAL_OUTPUT);
151
152 print "COMPLETED, super_final_output.txt generated! If i was you, i will open super_final_output.txt in excel\n";
153
154 ######################################################################
155 # CUSTOM SUBS
156 ######################################################################
157
158 # who needs custom subs...
159
160 ######################################################################
161 # TEMPLATE SUBS
162
163 ######################################################################
164 # PARAMETERS
165
166 sub checkParams {
167 #-----
168 # Do any and all options checking here...
169 #
170 my @standard_options = ( "help|h+", "in|i:s", "locustag|l:s", "gff|g:s");
171 my %options;
172
173 # Add any other command line options, and the code to handle them
174 #
175 GetOptions( \%options, @standard_options );
176
177 # if no arguments supplied print the usage and exit
178 #
179 exec("pod2usage $0") if (0 == (keys (%options) ));
180
181 # If the -help option is set, print the usage and exit
182 #
183 exec("pod2usage $0") if $options{'help'};
184
185 # Compulsory items
186 #if(!exists $options{''} ) { printParamError (""); }
187 if(!exists $options{'in'} ) { printParamError ("You MUST supply your final_output.txt from annotateM"); }
188
189 return \%options;
190 }
191
192 sub printParamError
193 {
194 #-----
195 # What to do if there's something wrong with a parameter
196 #
197 my ($error) = @_;
198 print "**ERROR: $0 : $error\n"; exec("pod2usage $0");
199 }
200
201 sub overrideDefault
202 {
203 #-----
204 # Set and override default values for parameters
205 #
206 my ($default_value, $option_name) = @_;
207 if(exists $global_options->{$option_name})
208 {
209 return $global_options->{$option_name};
210 }
211 return $default_value;
212 }
213
214 #####################################################################
215 # FILE IO
216
217 sub openWrite
218 {
219 #-----
220 # Open a file for writing
221 #
222 my ($fn) = @_;
223 open my $fh, ">", $fn or croak "**ERROR: could not open file: $fn for writing $!\n";
224 return $fh;
225 }
226
227 sub openRead
228 {
229 #-----
230 # Open a file for reading
231 #
232 my ($fn) = @_;
233 open my $fh, "<", $fn or croak "**ERROR: could not open file: $fn for reading $!\n";
234 return $fh;
235 }
236
237 ######################################################################
238 # EXTERNAL COMMANDS
239 #
240 # checkAndRunCommand("ls", {
241 # -a => ""
242 # },
243 # WARN_ON_FAILURE);
244
245 sub checkFileExists {
246 #-----
247 # Does a file exists?
248 #
249 my ($file) = @_;
250 unless(-e $file) {
251 croak "**ERROR: $0 : Cannot find:\n$file\n";
252 }
253 }
254
255 sub logExternalCommand
256 {
257 #-----
258 # Log a command line command to the command line!
259 #
260 if(1 == $global_log_commands) {
261 print $_[0], "\n";
262 }
263 }
264
265 sub isCommandInPath
266 {
267 #-----
268 # Is this command in the path?
269 #
270 my ($cmd, $failure_type) = @_;
271 if (system("which $cmd |> /dev/null")) {
272 handleCommandFailure($cmd, $failure_type);
273 }
274 }
275
276 sub runExternalCommand
277 {
278 #-----
279 # Run a command line command on the command line!
280 #
281 my ($cmd) = @_;
282 logExternalCommand($cmd);
283 system($cmd);
284 }
285
286 sub checkAndRunCommand
287 {
288 #-----
289 # Run external commands more sanelier
290 #
291 my ($cmd, $params, $failure_type) = @_;
292
293 isCommandInPath($cmd, $failure_type);
294
295 # join the parameters to the command
296 my $param_str = join " ", map {formatParams($_)} @{$params};
297
298 my $cmd_str = $cmd . " " . $param_str;
299
300 print "The command currently running:\t$cmd_str\n";
301 logExternalCommand($cmd_str);
302
303 # make sure that all went well
304 if (system($cmd_str)) {
305 handleCommandFailure($cmd_str, $failure_type)
306 }
307 }
308
309 sub formatParams {
310
311 #---------
312 # Handles and formats the different ways of passing parameters to
313 # checkAndRunCommand
314 #
315 my $ref = shift;
316
317 if (ref($ref) eq "ARRAY") {
318 return join(" ", @{$ref});
319 } elsif (ref($ref) eq "HASH") {
320 return join(" ", map { $_ . " " . $ref->{$_}} keys %{$ref});
321 }
322 croak 'The elements of the $params argument in checkAndRunCommand can ' .
323 'only contain references to arrays or hashes\n';
324 }
325
326
327 sub handleCommandFailure {
328 #-----
329 # What to do when all goes bad!
330 #
331 my ($cmd, $failure_type) = @_;
332 if (defined($failure_type)) {
333 if ($failure_type == DIE_ON_FAILURE) {
334 croak "**ERROR: $0 : " . $! . "\n";
335 } elsif ($failure_type == WARN_ON_FAILURE) {
336 carp "**WARNING: $0 : " . $! . "\n";
337 }
338 }
339 }
340
341
342 ######################################################################
343 # MISC
344
345 sub printAtStart {
346 print<<"EOF";
347 ----------------------------------------------------------------
348
349 $0
350 post_annotateM - annotate my genome
351 Due to the blast processes against multiple databases, this whole
352 annotation pipeline will usually take awhile. Please be patient!
353 What you get in the end will save you heaps of time.
354
355 ----------------------------------------------------------------
356 EOF
357 }
358
359 __DATA__
360
361 =head1 NAME
362
Apr 17, 2014 @fauziharoon newest version
363 post_annotateM
Apr 17, 2014 @fauziharoon run this script to get the contig id, orf start and end, and also strand
364
365 =head1 COPYRIGHT
366
367 Copyright (C) Mohamed Fauzi Haroon
368 Special appearance from Adam Skarshewski
369
370 This program is free software: you can redistribute it and/or modify
371 it under the terms of the GNU General Public License as published by
372 the Free Software Foundation, either version 3 of the License, or
373 (at your option) any later version.
374
375 This program is distributed in the hope that it will be useful,
376 but WITHOUT ANY WARRANTY; without even the implied warranty of
377 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
378 GNU General Public License for more details.
379
380 You should have received a copy of the GNU General Public License
381 along with this program. If not, see <http://www.gnu.org/licenses/>.
382
383 =head1 DESCRIPTION
384
385 Want to annotate your genome? annotateM!
386
387 =head1 SYNOPSIS
388
Apr 17, 2014 @fauziharoon newest version
389 post_annotateM -i [finaloutputfromannotatem] -l [locustag] -g [gff]
Apr 17, 2014 @fauziharoon run this script to get the contig id, orf start and end, and also strand
390
391 -i final_output.txt final_output.txt file generated from annotateM
392 -l locustag Name of locus tag
393 -g locustag.gff location of gff file in prokka_annotation folder
394 [-help -h] Displays basic usage information
395
396 =cut
397
398