Apr 17, 2014
run this script to get the contig id, orf start and end, and also strand
|
|
|
17 |
# |
|
18 |
# Copyright (C) Mohamed Fauzi Haroon |
|
19 |
# Special appearance from Adam Skarshewski |
|
20 |
# |
|
21 |
# This program is free software: you can redistribute it and/or modify |
|
22 |
# it under the terms of the GNU General Public License as published by |
|
23 |
# the Free Software Foundation, either version 3 of the License, or |
|
24 |
# (at your option) any later version. |
|
25 |
# |
|
26 |
# This program is distributed in the hope that it will be useful, |
|
27 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
28 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
29 |
# GNU General Public License for more details. |
|
30 |
# |
|
31 |
# You should have received a copy of the GNU General Public License |
|
32 |
# along with this program. If not, see <http://www.gnu.org/licenses/>. |
|
33 |
# |
|
34 |
############################################################################### |
|
35 |
|
|
36 |
#pragmas |
|
37 |
use strict; |
|
38 |
use warnings; |
|
39 |
|
|
40 |
#core Perl modules |
|
41 |
use Getopt::Long; |
|
42 |
use Carp; |
|
43 |
use Data::Dumper; |
|
44 |
|
|
45 |
#CPAN modules |
|
46 |
|
|
47 |
#locally-written modules |
|
48 |
|
|
49 |
BEGIN { |
|
50 |
select(STDERR); |
|
51 |
$| = 1; |
|
52 |
select(STDOUT); |
|
53 |
$| = 1; |
|
54 |
} |
|
55 |
|
|
56 |
# edit here to log all external commands |
|
57 |
my $global_log_commands = 0; |
|
58 |
|
|
59 |
# ext command failure levels |
|
60 |
use constant { |
|
61 |
IGNORE_FAILURE => 0, |
|
62 |
WARN_ON_FAILURE => 1, |
|
63 |
DIE_ON_FAILURE => 2 |
|
64 |
}; |
|
65 |
|
|
66 |
# get input params and print copyright |
|
67 |
printAtStart(); |
|
68 |
my $global_options = checkParams(); |
|
69 |
|
|
70 |
###################################################################### |
|
71 |
# CODE HERE |
|
72 |
###################################################################### |
|
73 |
|
|
74 |
# check that the final_output.txt file exists |
|
75 |
checkFileExists($global_options->{'in'}); |
|
76 |
|
|
77 |
# identify the ORF called amino acid fasta file |
|
78 |
my $locus = $global_options->{'locustag'}; |
|
79 |
|
|
80 |
# gff file |
|
81 |
my $gff_file = $global_options->{'gff'}; |
|
82 |
|
|
83 |
# to store hash |
|
84 |
my %hashy = (); |
|
85 |
my %orf = (); |
|
86 |
|
|
87 |
# open my final_output.txt file |
|
88 |
open my $final_output, "./final_output.txt", or die "Couldn't open final_output.txt\n"; |
|
89 |
while (<$final_output>) |
|
90 |
{ |
|
91 |
#skips first line |
|
92 |
next if $. < 2; |
|
93 |
chomp $_; |
|
94 |
my @columns = split (/\t/, $_); |
|
95 |
my @baba = @columns[1..$#columns]; |
|
96 |
$hashy{$columns[0]} = join("\t", @baba); |
|
97 |
#$combined_bighash{$columns[0]}->{'uniref'} = join("\t", @baba); |
|
98 |
# push @{$combined_bighash{$columns[0]}->{'b-uniref'}}, join("\t", @baba); |
|
99 |
} |
|
100 |
|
|
101 |
close($final_output); |
|
102 |
#print Dumper (\%orf); |
|
103 |
|
|
104 |
# open the prokka annotation gff file |
|
105 |
# sample gff file: |
|
106 |
###gff-version 3 |
|
107 |
##sequence-region contig_3875 1 10320 |
|
108 |
#contig_3875 Prodigal:2.60 CDS 334 735 . + 0 ID=test_00001;inference=ab initio prediction:Prodigal:2.60,protein motif:CLUSTERS:PRK10707;locus_tag=test_00001;product=putative NUDIX hydrolase;protein_id=gnl|VBC|test_00001 |
|
109 |
#contig_3875 Prodigal:2.60 CDS 930 3221 . + 0 ID=test_00002;eC_number=1.1.1.40;gene=maeB;inference=ab initio prediction:Prodigal:2.60,similar to AA sequence:UniProtKB:P76558;locus_tag=test_00002;product=NADP-dependent malic enzyme;protein_id=gnl|VBC|test_00002 |
|
110 |
#contig_3875 Prodigal:2.60 CDS 3229 5175 . - 0 ID=test_00003;inference=ab initio prediction:Prodigal:2.60;locus_tag=test_00003;product=hypothetical protein;protein_id=gnl|VBC|test_00003 |
|
111 |
open my $SUPER_FINAL_OUTPUT, "> ./super_final_output.txt"; |
|
112 |
print {$SUPER_FINAL_OUTPUT} "ORF_ID\tcontig_ID\tORF_start_pos\tORF_end_pos\tstrand\timg_evalue\timg_score\timg_gene\timg_organism\timg_reciprocal\tuniref_evalue\tuniref_score\tuniref_gene\tuniref_organism\tuniref_reciprocal\tprokka_gene\tcog_evalues\tcog_scores\tcog_classes\tcog_gene_acronyms\tcog_genes\tpfam_evalues\tpfam_scores\tpfam_genes\ttigrfam_evalues\ttigrfam_scores\ttigrfam_genes\ttigrfam_descriptions\n"; |
|
113 |
|
|
114 |
open my $gff, "./$gff_file", or die "Couldn't open $locus.gff\n"; |
|
115 |
while (<$gff>) |
|
116 |
{ |
|
117 |
if ($_ =~ /^##/){ |
|
118 |
if ($_ =~ /^##FASTA/){ |
|
119 |
last; |
|
120 |
} |
|
121 |
else{ |
|
122 |
next; |
|
123 |
} |
|
124 |
} |
|
125 |
chomp $_; |
|
126 |
my @orf_id = (); |
|
127 |
my @columns = split (/\t/, $_); |
|
128 |
#my @last_column = split (/\t/, $columns[8]); |
|
129 |
# $columns[8] =~ /(ID=)/; |
|
130 |
# print $1; |
|
131 |
if ($_ =~ /ID=(\w+);/) |
|
132 |
{ |
|
133 |
push @orf_id, $1; |
|
134 |
# print Dumper (\@orf_id); |
|
135 |
} |
|
136 |
foreach my $value_in_orf_array (@orf_id) |
|
137 |
{ |
|
138 |
if (exists $hashy{$value_in_orf_array}) |
|
139 |
{ |
|
140 |
# print "$value_in_orf_array\t$columns[3]\t$columns[4]"; |
|
141 |
my @print_array = [$value_in_orf_array, $columns[3], $columns[4]]; |
|
142 |
# unshift @{$hashy{$value_in_orf_array}}, join("\t", @print_array); |
|
143 |
# $hashy{$value_in_orf_array} .= "\t" . $value_in_orf_array . "\t$columns[3]\t$columns[4]\t"; |
|
144 |
|
|
145 |
print {$SUPER_FINAL_OUTPUT} "$value_in_orf_array\t$columns[0]\t$columns[3]\t$columns[4]\t$columns[6]\t$hashy{$value_in_orf_array}\n"; |
|
146 |
} |
|
147 |
} |
|
148 |
} |
|
149 |
close($gff); |
|
150 |
close($SUPER_FINAL_OUTPUT); |
|
151 |
|
|
152 |
print "COMPLETED, super_final_output.txt generated! If i was you, i will open super_final_output.txt in excel\n"; |
|
153 |
|
|
154 |
###################################################################### |
|
155 |
# CUSTOM SUBS |
|
156 |
###################################################################### |
|
157 |
|
|
158 |
# who needs custom subs... |
|
159 |
|
|
160 |
###################################################################### |
|
161 |
# TEMPLATE SUBS |
|
162 |
|
|
163 |
###################################################################### |
|
164 |
# PARAMETERS |
|
165 |
|
|
166 |
sub checkParams { |
|
167 |
#----- |
|
168 |
# Do any and all options checking here... |
|
169 |
# |
|
170 |
my @standard_options = ( "help|h+", "in|i:s", "locustag|l:s", "gff|g:s"); |
|
171 |
my %options; |
|
172 |
|
|
173 |
# Add any other command line options, and the code to handle them |
|
174 |
# |
|
175 |
GetOptions( \%options, @standard_options ); |
|
176 |
|
|
177 |
# if no arguments supplied print the usage and exit |
|
178 |
# |
|
179 |
exec("pod2usage $0") if (0 == (keys (%options) )); |
|
180 |
|
|
181 |
# If the -help option is set, print the usage and exit |
|
182 |
# |
|
183 |
exec("pod2usage $0") if $options{'help'}; |
|
184 |
|
|
185 |
# Compulsory items |
|
186 |
#if(!exists $options{''} ) { printParamError (""); } |
|
187 |
if(!exists $options{'in'} ) { printParamError ("You MUST supply your final_output.txt from annotateM"); } |
|
188 |
|
|
189 |
return \%options; |
|
190 |
} |
|
191 |
|
|
192 |
sub printParamError |
|
193 |
{ |
|
194 |
#----- |
|
195 |
# What to do if there's something wrong with a parameter |
|
196 |
# |
|
197 |
my ($error) = @_; |
|
198 |
print "**ERROR: $0 : $error\n"; exec("pod2usage $0"); |
|
199 |
} |
|
200 |
|
|
201 |
sub overrideDefault |
|
202 |
{ |
|
203 |
#----- |
|
204 |
# Set and override default values for parameters |
|
205 |
# |
|
206 |
my ($default_value, $option_name) = @_; |
|
207 |
if(exists $global_options->{$option_name}) |
|
208 |
{ |
|
209 |
return $global_options->{$option_name}; |
|
210 |
} |
|
211 |
return $default_value; |
|
212 |
} |
|
213 |
|
|
214 |
##################################################################### |
|
215 |
# FILE IO |
|
216 |
|
|
217 |
sub openWrite |
|
218 |
{ |
|
219 |
#----- |
|
220 |
# Open a file for writing |
|
221 |
# |
|
222 |
my ($fn) = @_; |
|
223 |
open my $fh, ">", $fn or croak "**ERROR: could not open file: $fn for writing $!\n"; |
|
224 |
return $fh; |
|
225 |
} |
|
226 |
|
|
227 |
sub openRead |
|
228 |
{ |
|
229 |
#----- |
|
230 |
# Open a file for reading |
|
231 |
# |
|
232 |
my ($fn) = @_; |
|
233 |
open my $fh, "<", $fn or croak "**ERROR: could not open file: $fn for reading $!\n"; |
|
234 |
return $fh; |
|
235 |
} |
|
236 |
|
|
237 |
###################################################################### |
|
238 |
# EXTERNAL COMMANDS |
|
239 |
# |
|
240 |
# checkAndRunCommand("ls", { |
|
241 |
# -a => "" |
|
242 |
# }, |
|
243 |
# WARN_ON_FAILURE); |
|
244 |
|
|
245 |
sub checkFileExists { |
|
246 |
#----- |
|
247 |
# Does a file exists? |
|
248 |
# |
|
249 |
my ($file) = @_; |
|
250 |
unless(-e $file) { |
|
251 |
croak "**ERROR: $0 : Cannot find:\n$file\n"; |
|
252 |
} |
|
253 |
} |
|
254 |
|
|
255 |
sub logExternalCommand |
|
256 |
{ |
|
257 |
#----- |
|
258 |
# Log a command line command to the command line! |
|
259 |
# |
|
260 |
if(1 == $global_log_commands) { |
|
261 |
print $_[0], "\n"; |
|
262 |
} |
|
263 |
} |
|
264 |
|
|
265 |
sub isCommandInPath |
|
266 |
{ |
|
267 |
#----- |
|
268 |
# Is this command in the path? |
|
269 |
# |
|
270 |
my ($cmd, $failure_type) = @_; |
|
271 |
if (system("which $cmd |> /dev/null")) { |
|
272 |
handleCommandFailure($cmd, $failure_type); |
|
273 |
} |
|
274 |
} |
|
275 |
|
|
276 |
sub runExternalCommand |
|
277 |
{ |
|
278 |
#----- |
|
279 |
# Run a command line command on the command line! |
|
280 |
# |
|
281 |
my ($cmd) = @_; |
|
282 |
logExternalCommand($cmd); |
|
283 |
system($cmd); |
|
284 |
} |
|
285 |
|
|
286 |
sub checkAndRunCommand |
|
287 |
{ |
|
288 |
#----- |
|
289 |
# Run external commands more sanelier |
|
290 |
# |
|
291 |
my ($cmd, $params, $failure_type) = @_; |
|
292 |
|
|
293 |
isCommandInPath($cmd, $failure_type); |
|
294 |
|
|
295 |
# join the parameters to the command |
|
296 |
my $param_str = join " ", map {formatParams($_)} @{$params}; |
|
297 |
|
|
298 |
my $cmd_str = $cmd . " " . $param_str; |
|
299 |
|
|
300 |
print "The command currently running:\t$cmd_str\n"; |
|
301 |
logExternalCommand($cmd_str); |
|
302 |
|
|
303 |
# make sure that all went well |
|
304 |
if (system($cmd_str)) { |
|
305 |
handleCommandFailure($cmd_str, $failure_type) |
|
306 |
} |
|
307 |
} |
|
308 |
|
|
309 |
sub formatParams { |
|
310 |
|
|
311 |
#--------- |
|
312 |
# Handles and formats the different ways of passing parameters to |
|
313 |
# checkAndRunCommand |
|
314 |
# |
|
315 |
my $ref = shift; |
|
316 |
|
|
317 |
if (ref($ref) eq "ARRAY") { |
|
318 |
return join(" ", @{$ref}); |
|
319 |
} elsif (ref($ref) eq "HASH") { |
|
320 |
return join(" ", map { $_ . " " . $ref->{$_}} keys %{$ref}); |
|
321 |
} |
|
322 |
croak 'The elements of the $params argument in checkAndRunCommand can ' . |
|
323 |
'only contain references to arrays or hashes\n'; |
|
324 |
} |
|
325 |
|
|
326 |
|
|
327 |
sub handleCommandFailure { |
|
328 |
#----- |
|
329 |
# What to do when all goes bad! |
|
330 |
# |
|
331 |
my ($cmd, $failure_type) = @_; |
|
332 |
if (defined($failure_type)) { |
|
333 |
if ($failure_type == DIE_ON_FAILURE) { |
|
334 |
croak "**ERROR: $0 : " . $! . "\n"; |
|
335 |
} elsif ($failure_type == WARN_ON_FAILURE) { |
|
336 |
carp "**WARNING: $0 : " . $! . "\n"; |
|
337 |
} |
|
338 |
} |
|
339 |
} |
|
340 |
|
|
341 |
|
|
342 |
###################################################################### |
|
343 |
# MISC |
|
344 |
|
|
345 |
sub printAtStart { |
|
346 |
print<<"EOF"; |
|
347 |
---------------------------------------------------------------- |
|
348 |
|
|
349 |
$0 |
|
350 |
post_annotateM - annotate my genome |
|
351 |
Due to the blast processes against multiple databases, this whole |
|
352 |
annotation pipeline will usually take awhile. Please be patient! |
|
353 |
What you get in the end will save you heaps of time. |
|
354 |
|
|
355 |
---------------------------------------------------------------- |
|
356 |
EOF |
|
357 |
} |
|
358 |
|
|
359 |
__DATA__ |
|
360 |
|
|
361 |
=head1 NAME |
|
362 |
|