Permalink
Newer
Older
100755 977 lines (848 sloc) 30.6 KB
Apr 3, 2014 @fauziharoon This is my first commit of annotateM
1 #!/usr/bin/env perl
2 ###############################################################################
3 #
4 # annotateM
Jul 16, 2014 @wwood version 0.6: use IMG dereplicated, use -j with parallel
5 #
6 # The idea here is an automated way of annotating your genome based on
7 # multiple available databases and to produce a tab-delimited file of
Apr 9, 2014 @fauziharoon aesthetics
8 # all the annotations, evalues, scores, descriptions.
Jul 16, 2014 @wwood version 0.6: use IMG dereplicated, use -j with parallel
9 #
Apr 9, 2014 @fauziharoon aesthetics
10 # Suggested workflow:
11 # 1) run your genome nucleotide fasta file through annotateM
Apr 17, 2014 @fauziharoon newest version
12 # 2) then run post_annotateM to include the contig id,orf_start and end
13 # 3) generate a tab-delimited file
14 # 4) open the file in ms excel or oo calc
Jul 16, 2014 @wwood version 0.6: use IMG dereplicated, use -j with parallel
15 # 5) manually curate the annotations based on evalues/scores/desc etc
Apr 17, 2014 @fauziharoon newest version
16 # 6) metabolic reconstruction of organism
Apr 3, 2014 @fauziharoon This is my first commit of annotateM
17 #
18 # Copyright (C) Mohamed Fauzi Haroon
Apr 9, 2014 @fauziharoon aesthetics
19 # Special appearance from Adam Skarshewski
Apr 3, 2014 @fauziharoon This is my first commit of annotateM
20 #
21 # This program is free software: you can redistribute it and/or modify
22 # it under the terms of the GNU General Public License as published by
23 # the Free Software Foundation, either version 3 of the License, or
24 # (at your option) any later version.
25 #
26 # This program is distributed in the hope that it will be useful,
27 # but WITHOUT ANY WARRANTY; without even the implied warranty of
28 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 # GNU General Public License for more details.
30 #
31 # You should have received a copy of the GNU General Public License
32 # along with this program. If not, see <http://www.gnu.org/licenses/>.
33 #
34 ###############################################################################
35
36 #pragmas
37 use strict;
38 use warnings;
39
40 #core Perl modules
41 use Getopt::Long;
42 use Carp;
Apr 8, 2014 @fauziharoon This version produces the tab-delimited file ready for consumption. S…
43 use Data::Dumper;
Apr 3, 2014 @fauziharoon This is my first commit of annotateM
44
45 #CPAN modules
46
47 #locally-written modules
48
BEGIN {
    # Switch on autoflush ($| = 1) for STDERR first and then for STDOUT,
    # which leaves STDOUT as the currently selected output handle.
    for my $stream (\*STDERR, \*STDOUT) {
        select($stream);
        $| = 1;
    }
}
55
Jul 16, 2014 @wwood version 0.6: use IMG dereplicated, use -j with parallel
# edit here to log all external commands
my $global_log_commands = 0;

# ext command failure levels; one of these is handed to checkAndRunCommand()
# as its last argument
use constant {
    IGNORE_FAILURE  => 0,
    WARN_ON_FAILURE => 1,
    DIE_ON_FAILURE  => 2
};

# get input params and print copyright
printAtStart();
my $global_options = checkParams();

# Database paths
# (the statement must be terminated with a semicolon, otherwise the script
# does not compile)
my $img_protein_database = '/srv/db/img/4.1/dereplicated/img_dereplicated_species.genes.faa';
72
Apr 3, 2014 @fauziharoon This is my first commit of annotateM
73 ######################################################################
74 # CODE HERE
75 ######################################################################
76
Apr 7, 2014 @fauziharoon removed the hashes for the first part of the script
# check that the file exists
checkFileExists($global_options->{'in'});

# run prokka to generate the ORFs and also prokka annotations;
# bypassed if the prokka_annotation folder is already present
if (! -e "./prokka_annotation/")
{
    print "Running prokka v1.8\n";
    # The arguments are passed as an array ref, like every other external
    # command in this script. A hash ref has no defined key order and needs
    # an even number of elements, so the positional input file could not be
    # kept as the last argument.
    checkAndRunCommand("prokka", [[
                                    "--locustag" => $global_options->{'locustag'},
                                    "--outdir"   => "prokka_annotation",
                                    "--prefix"   => $global_options->{'locustag'},
                                    "--kingdom"  => $global_options->{'kingdom'},
                                    "--cpus"     => $global_options->{'threads'},
                                    $global_options->{'in'},
                                  ]], DIE_ON_FAILURE);
}
Apr 3, 2014 @fauziharoon This is my first commit of annotateM
93
Jul 16, 2014 @wwood version 0.6: use IMG dereplicated, use -j with parallel
# The locus tag doubles as the prokka output prefix, so the ORF amino acid
# fasta file used below is prokka_annotation/$locus.faa
my $locus = $global_options->{'locustag'};

# blast against img, unless the result file is already there
unless (-e "./$locus.faaVSimg.blastp")
{
    print "BLASTing against IMG 4.1 database...............\n";

    # cat pipes the ORFs into parallel, which hands blocks of records
    # (record start '>') to blastp on stdin; the output is redirected
    # into the result file.
    my @img_blast_args = (
        "prokka_annotation/$locus.faa |",
        "parallel",
        "-j", $global_options->{'threads'},
        "--block", "100k",
        "--recstart",
        "'>'",
        "--pipe",
        "blastp",
        "-db", $img_protein_database,
        "-outfmt", 6,
        "-max_target_seqs", 1,
        "-evalue", $global_options->{'evalue'},
        "-query", "-",
        "> $locus.faaVSimg.blastp",
    );
    checkAndRunCommand("cat", [\@img_blast_args], DIE_ON_FAILURE);
}
119
# reciprocal blast of img positive hits against genome ORF
# (skipped when the reciprocal result file already exists)
unless (-e "./subsetimg.faaVS$locus.faa.blastp")
{
    print "Reciprocal BLASTing positive IMG hits to $locus.faa ...............\n";

    # step 1: contig_extractor.pl reads the forward blast result and the IMG
    # database and writes subsetimg.faa
    # (presumably the sequences of the IMG hits - confirm against the tool)
    my @extract_args = (
        "-i", "$locus.faaVSimg.blastp",
        "-d", $img_protein_database,
        "-b", '',
        "-S", '',
        "-o", "subsetimg.faa",
    );
    checkAndRunCommand("contig_extractor.pl", [\@extract_args], DIE_ON_FAILURE);

    # step 2: turn the genome ORFs into a protein blast database
    my @makedb_args = (
        "-in", "prokka_annotation/$locus.faa",
        "-dbtype", "prot",
    );
    checkAndRunCommand("makeblastdb", [\@makedb_args], DIE_ON_FAILURE);

    # step 3: blast the extracted subset back against the genome ORFs
    my @reciprocal_args = (
        "-query", "subsetimg.faa",
        "-db", "prokka_annotation/$locus.faa",
        "-outfmt", 6,
        "-max_target_seqs", 1,
        "-evalue", $global_options->{'evalue'},
        "-num_threads", $global_options->{'threads'},
        "-out", "subsetimg.faaVS$locus.faa.blastp",
    );
    checkAndRunCommand("blastp", [\@reciprocal_args], DIE_ON_FAILURE);
}
150
Jul 16, 2014 @wwood version 0.6: use IMG dereplicated, use -j with parallel
# blast against uniref, unless the result file is already there
unless (-e "./$locus.faaVSuniref90.blastp")
{
    print "BLASTing against latest Uniref90 April2014 database ................\n";

    # same cat | parallel | blastp pipeline as for IMG; blastp's own
    # -num_threads is not used here, parallel's -j takes the thread count
    my @uniref_blast_args = (
        "prokka_annotation/$locus.faa |",
        "parallel",
        "-j", $global_options->{'threads'},
        "--block", "100k",
        "--recstart",
        "'>'",
        "--pipe",
        "blastp",
        "-db", "/srv/db/uniprot/uniref-20140403/uniref90.fasta",
        "-outfmt", 6,
        "-max_target_seqs", 1,
        "-evalue", $global_options->{'evalue'},
        "-query", "-",
        "> $locus.faaVSuniref90.blastp",
    );
    checkAndRunCommand("cat", [\@uniref_blast_args], DIE_ON_FAILURE);
}
173
# reciprocal blast of Uniref positive hits against genome ORF
# (skipped when the reciprocal result file already exists)
unless (-e "./subsetuniref.faaVS$locus.faa.blastp")
{
    print "Reciprocal BLASTing positive Uniref hits to $locus.faa ...............\n";

    # step 1: contig_extractor.pl reads the forward blast result and the
    # Uniref90 fasta and writes subsetuniref.faa
    my @extract_args = (
        "-i", "$locus.faaVSuniref90.blastp",
        "-d", "/srv/db/uniprot/uniref-20140403/uniref90.fasta",
        "-b", '',
        "-S", '',
        "-o", "subsetuniref.faa",
    );
    checkAndRunCommand("contig_extractor.pl", [\@extract_args], DIE_ON_FAILURE);

    # step 2: blast the subset back against the genome ORFs; no makeblastdb
    # call is made here, so the blast database of prokka_annotation/$locus.faa
    # has to exist already
    my @reciprocal_args = (
        "-query", "subsetuniref.faa",
        "-db", "prokka_annotation/$locus.faa",
        "-outfmt", 6,
        "-max_target_seqs", 1,
        "-evalue", $global_options->{'evalue'},
        "-num_threads", $global_options->{'threads'},
        "-out", "subsetuniref.faaVS$locus.faa.blastp",
    );
    checkAndRunCommand("blastp", [\@reciprocal_args], DIE_ON_FAILURE);
}
Apr 7, 2014 @fauziharoon added parsing of IMG and Uniref90 results into temporary files
198
Apr 7, 2014 @fauziharoon Parsed all the database id2names and also parsed output for Pfam and …
# blast against COG, unless the result file is already there
unless (-e "./$locus.faaVSCOG.blastp")
{
    print "BLASTing against the one and only COG database................\n";

    # same cat | parallel | blastp pipeline as for IMG and Uniref90;
    # parallel's -j takes the thread count instead of blastp's -num_threads
    my @cog_blast_args = (
        "prokka_annotation/$locus.faa |",
        "parallel",
        "-j", $global_options->{'threads'},
        "--block", "100k",
        "--recstart",
        "'>'",
        "--pipe",
        "blastp",
        "-db", "/srv/db/cog/cog_blast_prot_db",
        "-outfmt", 6,
        "-max_target_seqs", 1,
        "-evalue", $global_options->{'evalue'},
        "-query", "-",
        "> $locus.faaVSCOG.blastp",
    );
    checkAndRunCommand("cat", [\@cog_blast_args], DIE_ON_FAILURE);
}
221
# HMMSCAN against PFAM via pfam_scan.pl, unless the result file is already there
unless (-e "./$locus.faaVSPfam-A.hmm.hmmscanned")
{
    print "HMMscanning against latest Pfam 27 database................\n";

    my @pfam_scan_args = (
        "-cpu", $global_options->{'threads'},
        "-e_seq", $global_options->{'evalue'},
        "-outfile", "$locus.faaVSPfam-A.hmm.hmmscanned",
        "-fasta", "prokka_annotation/$locus.faa",
        "-dir", "/srv/db/pfam/27",
    );
    checkAndRunCommand("pfam_scan.pl", [\@pfam_scan_args], DIE_ON_FAILURE);
}
234
# HMMSCAN against TIGRfam, unless the result file is already there
unless (-e "./$locus.faaVStigr_all.hmm.hmmscanned")
{
    print "HMMscanning against TIGRfam April2014 database................\n";

    # options first, then the HMM database and the ORF fasta as the two
    # positional arguments
    my @hmmscan_args = (
        "--tblout",
        "$locus.faaVStigr_all.hmm.hmmscanned",
        "--noali",
        "-E", $global_options->{'evalue'},
        "--cpu",
        $global_options->{'threads'},
        "/srv/db/tigrfam/14.0/TIGRFAMs_14.0_HMM/tigr_all.hmm",
        "prokka_annotation/$locus.faa",
    );
    checkAndRunCommand("hmmscan", [\@hmmscan_args], DIE_ON_FAILURE);
}
Jul 16, 2014 @wwood version 0.6: use IMG dereplicated, use -j with parallel
250
Apr 9, 2014 @fauziharoon aesthetics
# convert the hmmscan output to tab delimited
# Both conversions run awk with '{$1=$1}{ print }' over the scan output and
# pipe the result through sed before redirecting it into a .tab file.
{
    # Pfam result -> $locus.faaVSPfam-A.hmm.hmmscanned.tab
    my @pfam_to_tab = (
        "'{\$1=\$1}{ print }'",
        "$locus.faaVSPfam-A.hmm.hmmscanned",
        "| sed 's/\\s/\\t/g'",
        "> $locus.faaVSPfam-A.hmm.hmmscanned.tab",
    );
    checkAndRunCommand("awk", [\@pfam_to_tab], DIE_ON_FAILURE);

    # TIGRfam result -> $locus.faaVStigr_all.hmm.hmmscanned.tab
    # (two extra sed stages run before the whitespace-to-tab replacement)
    my @tigrfam_to_tab = (
        "'{\$1=\$1}{ print }'",
        "$locus.faaVStigr_all.hmm.hmmscanned",
        "| sed 's/^\\s+//'",
        "| sed 's/\\s+\$//'",
        "| sed 's/\\s/\\t/g'",
        "> $locus.faaVStigr_all.hmm.hmmscanned.tab",
    );
    checkAndRunCommand("awk", [\@tigrfam_to_tab], DIE_ON_FAILURE);
}
Apr 7, 2014 @fauziharoon Parsed all the database id2names and also parsed output for Pfam and …
267
268
Apr 15, 2014 @fauziharoon uses latest prokka version 1.8 and Pfam_scan now
# declare hashes for img
my %access2imgid   = ();   # imgid => { orfid => "imgid\torfid\tevalue\tscore" }
my %img2reciprocal = ();   # imgid => forward-hit record plus id2names columns 1 and 2
my %imghash2       = ();
#my @orfid = ();

# read the img blast output and store in hash
# SAMPLE img blast output -
# phycis_04080 649633083|649978419 38.08 1116 640 14 13 1099 1 1094 0.0 663
# phycis_04081 649633083|649980044 28.40 405 237 10 49 422 20 402 3e-27 119
# phycis_04082 649633030|649661236 42.86 259 144 3 1 256 1 258 1e-61 205
# phycis_04083 640753047|640896165 61.55 1186 444 3 1 1177 1 1183 0.0 1504

# columns[0] = orfid
# columns[1] = imgid
# columns[10] = evalue
# columns[11] = blast score

open my $IMGblast, '<', "./$locus.faaVSimg.blastp"
    or die "Couldn't open file $locus.faaVSimg.blastp: $!\n";
while (my $line = <$IMGblast>)
{
    chomp $line;
    my @columns = split (/\t/, $line);
    # blank or truncated lines have no score column to compare
    next if @columns < 12;
    # only hits with a blast score above 60 are kept
    if ($columns[11] > 60)
    {
        # store access2imgid hash with the imgid key and point towards the
        # orfid; the value is the output to be printed out later
        $access2imgid{$columns[1]}->{$columns[0]} = "$columns[1]\t$columns[0]\t$columns[10]\t$columns[11]";
    }
}
#print Dumper (\%access2imgid);


# read img id2names.txt which is the file to get the gene identity of the imgid
# SAMPLE img id2names.txt file -
# 650716001|650846201 Ahos_0001 replication initiator protein Cdc6-3 Acidianus hospitalis W1
# 650716001|650846202 Ahos_0002 hypothetical protein Acidianus hospitalis W1
# 650716001|650846203 Ahos_0003 transcriptional coactivator/pterin dehydratase Acidianus hospitalis W1
# 650716001|650846204 Ahos_0004 GGCT (gamma glutamyl cyclotransferase) domain-containing protein Acidianus hospitalis W1

# columns[0] = imgid
# columns[1] = gene name
# columns[2] = organism

open my $imgid2names, '<', "/srv/db/img/4.1/blastdbs/img4.1_id2names.txt"
    or die "Couldn't open img4.1_id2names.txt: $!\n";
open my $img_temp_OUT, '>', "img_output_temp.txt"
    or die "Couldn't open img_output_temp.txt for writing: $!\n";
while (my $line = <$imgid2names>)
{
    chomp $line;
    my @columns = split (/\t/, $line);
    # skip blank lines and ids that had no hit above the score cutoff
    next unless @columns;
    next unless exists $access2imgid{$columns[0]};

    # the hash reference is dereferenced explicitly: keys() on a bare
    # reference is a compile error on Perl 5.24 and later
    foreach my $orfid (keys %{ $access2imgid{$columns[0]} })
    {
        my $record = "$access2imgid{$columns[0]}{$orfid}\t$columns[1]\t$columns[2]";
        # NOTE(review): keyed by imgid only, so when several ORFs hit the
        # same imgid just the last record written here is kept
        $img2reciprocal{$columns[0]} = $record;
        print {$img_temp_OUT} "$record\n";
    }
}
#print Dumper (\%access2imgid);

close($IMGblast);
close($imgid2names);
# a failed close on a write handle means buffered output was lost
close($img_temp_OUT) or die "Couldn't close img_output_temp.txt: $!\n";

# read my reciprocal img blast output and store in hash
# SAMPLE
# 2513020047|2513221347 phycis_01043 39.30 285 168 4 21 301 21 304 2e-51 172
# 648028035|648160186 phycis_03502 40.55 217 122 4 7 221 422 633 1e-48 167
# 639633053|639783588 phycis_00179 49.23 260 121 4 14 269 20 272 3e-80 246
# 639633064|639773205 phycis_02647 29.24 383 234 11 8 370 3 368 2e-45 160

open my $rIMGblast, '<', "./subsetimg.faaVS$locus.faa.blastp"
    or die "Couldn't open file subsetimg.faaVS$locus.faa.blastp: $!\n";
open my $img_temp_OUT2, '>', "img_output_temp2.txt"
    or die "Couldn't open img_output_temp2.txt for writing: $!\n";
while (my $line = <$rIMGblast>)
{
    chomp $line;
    my @columns = split (/\t/, $line);
    # both the imgid and the orfid column are needed below
    next if @columns < 2;
    if (exists $img2reciprocal{$columns[0]})
    {
        print {$img_temp_OUT2} $img2reciprocal{$columns[0]} . "\treciprocal\n";
    }
    else
    {
        # pad with NA so the row has as many fields as a reciprocal row
        print {$img_temp_OUT2} "$columns[0]\t$columns[1]\tNA\tNA\tNA\tNA\tNOT reciprocal\n";
    }
}

close($img_temp_OUT2) or die "Couldn't close img_output_temp2.txt: $!\n";
364
Jul 16, 2014 @wwood version 0.6: use IMG dereplicated, use -j with parallel
# hashes for uniref
my %hash4 = ();   # orfid => unirefid and unirefid => orfid
my %hash5 = ();   # unirefid => "orfid\tevalue\tscore"
my %hash6 = ();   # unirefid => hash5 record plus id2names columns 1 and 2

# read uniref blast and store in hash
# (same column layout as the img blast output: 0 = orfid, 1 = hit id,
# 10 = evalue, 11 = blast score)
open my $unirefblast, '<', "./$locus.faaVSuniref90.blastp"
    or die "Couldn't open file $locus.faaVSuniref90.blastp: $!\n";
while (my $line = <$unirefblast>)
{
    chomp $line;
    my @columns = split (/\t/, $line);
    # blank or truncated lines have no score column to compare
    next if @columns < 12;
    # only hits with a blast score above 60 are kept
    if ($columns[11] > 60)
    {
        $hash4{$columns[0]} = $columns[1];
        $hash4{$columns[1]} = $columns[0];
        $hash5{$columns[1]} = "$columns[0]\t$columns[10]\t$columns[11]";
    }
}

# read uniref id2names.txt
open my $unirefid2names, '<', "/srv/db/uniprot/uniref-20140403/uniref90_id2names.txt"
    or die "Couldn't open id2names.txt: $!\n";
open my $uniref_temp_OUT, '>', "uniref_output_temp.txt"
    or die "Couldn't open uniref_output_temp.txt for writing: $!\n";
while (my $line = <$unirefid2names>)
{
    chomp $line;
    my @columns = split (/\t/, $line);
    # skip blank lines and ids that had no hit above the score cutoff
    next unless @columns;
    next unless exists $hash5{$columns[0]};

    my $record = "$hash5{$columns[0]}\t$columns[1]\t$columns[2]";
    $hash6{$columns[0]} = $record;
    print {$uniref_temp_OUT} "$record\n";
}

close($unirefblast);
close($unirefid2names);
# a failed close on a write handle means buffered output was lost
close($uniref_temp_OUT) or die "Couldn't close uniref_output_temp.txt: $!\n";

# read my reciprocal uniref blast output and write the annotated hits;
# only ids found in hash6 are written, other lines are dropped
open my $runirefblast, '<', "./subsetuniref.faaVS$locus.faa.blastp"
    or die "Couldn't open file subsetuniref.faaVS$locus.faa.blastp: $!\n";
open my $uniref_temp_OUT2, '>', "uniref_output_temp2.txt"
    or die "Couldn't open uniref_output_temp2.txt for writing: $!\n";
while (my $line = <$runirefblast>)
{
    chomp $line;
    my @columns = split (/\t/, $line);
    next unless @columns;
    if (exists $hash6{$columns[0]})
    {
        print {$uniref_temp_OUT2} "$hash6{$columns[0]}\treciprocal\n";
    }
}

close($uniref_temp_OUT2) or die "Couldn't close uniref_output_temp2.txt: $!\n";
416
Apr 9, 2014 @fauziharoon aesthetics
# hashes for pfam
my %hash7 = ();   # orfid => pfam id and pfam id => orfid
my %hash8 = ();   # pfam id => "orfid\t<column 12>\t<column 11>"

# read pfam hmmscan output and store in hash
open my $pfamoutput, '<', "./$locus.faaVSPfam-A.hmm.hmmscanned.tab"
    or die "Couldn't open file $locus.faaVSPfam-A.hmm.hmmscanned.tab: $!\n";
while (my $line = <$pfamoutput>)
{
    # skip blank lines and comment lines
    next if $line =~ /^\s*(#.*)?$/;
    # skip lines starting with '='; the test has to look at the line itself,
    # matching the filehandle variable can never succeed
    next if $line =~ /^=/;
    chomp $line;
    my @columns = split (/\t/, $line);
    # truncated lines have no score column to compare
    next if @columns < 12;
    # only rows whose column 11 is above 60 are kept
    if ($columns[11] > 60)
    {
        # column 5 looks like "<id>.<suffix>"; only the part before the
        # first dot is used as the pfam id
        my ($pfam_id) = split (/\./, $columns[5]);
        $hash7{$columns[0]} = $pfam_id;
        $hash7{$pfam_id} = $columns[0];
        $hash8{$pfam_id} = "$columns[0]\t$columns[12]\t$columns[11]";
    }
}

# read Pfam-A.clans.tsv and append its column 4 to every stored pfam record
open my $pfamid2names, '<', "/srv/db/pfam/27/Pfam-A.clans.tsv"
    or die "Couldn't open Pfam-A.clans.tsv: $!\n";
open my $pfam_temp_OUT, '>', "pfam_output_temp.txt"
    or die "Couldn't open pfam_output_temp.txt for writing: $!\n";
while (my $line = <$pfamid2names>)
{
    chomp $line;
    my @columns = split (/\t/, $line);
    next unless @columns;
    if (exists $hash8{$columns[0]})
    {
        print {$pfam_temp_OUT} "$hash8{$columns[0]}\t$columns[4]\n";
    }
}

close($pfamoutput);
close($pfamid2names);
# a failed close on a write handle means buffered output was lost
close($pfam_temp_OUT) or die "Couldn't close pfam_output_temp.txt: $!\n";
456
Apr 9, 2014 @fauziharoon aesthetics
# hashes for tigrfam
my %hash9  = ();   # column 2 => column 0 and column 0 => column 2
my %hash10 = ();   # column 0 => "<column 2>\t<column 4>\t<column 5>"

# read tigrfam hmmscan output and store in hash
open my $tigrfamoutput, '<', "./$locus.faaVStigr_all.hmm.hmmscanned.tab"
    or die "Couldn't open file $locus.faaVStigr_all.hmm.hmmscanned.tab: $!\n";
while (my $line = <$tigrfamoutput>)
{
    # skip blank lines and comment lines; this test has to look at the line
    # itself (matching the filehandle variable can never succeed)
    next if $line =~ /^\s*(#.*)?$/;
    chomp $line;
    my @columns = split (/\t/, $line);
    # truncated lines have no column 5 to compare
    next if @columns < 6;
    # only rows whose column 5 is above 10 are kept
    if ($columns[5] > 10)
    {
        $hash9{$columns[2]} = $columns[0];
        $hash9{$columns[0]} = $columns[2];
        $hash10{$columns[0]} = "$columns[2]\t$columns[4]\t$columns[5]";
    }
}

# read tigrfam id2names2description
open my $tigrfamid2names, '<', "/srv/db/tigrfam/14.0/TIGRFAMs_14.0_INFO/tigr_info_combined.parsed_updated2"
    or die "Couldn't open tigr_info_combined.parsed_updated2: $!\n";
open my $tigrfam_temp_OUT, '>', "tigrfam_output_temp.txt"
    or die "Couldn't open tigrfam_output_temp.txt for writing: $!\n";
while (my $line = <$tigrfamid2names>)
{
    chomp $line;
    my @columns = split (/\t/, $line);
    next unless @columns;
    # the id column may carry surrounding whitespace; trim it before the lookup
    $columns[0] =~ s/^\s+|\s+$//g;
    if (exists $hash10{$columns[0]})
    {
        print {$tigrfam_temp_OUT} "$hash10{$columns[0]}\t$columns[1]\t$columns[2]\n";
    }
}

close($tigrfamoutput);
close($tigrfamid2names);
# a failed close on a write handle means buffered output was lost
close($tigrfam_temp_OUT) or die "Couldn't close tigrfam_output_temp.txt: $!\n";
496
Apr 8, 2014 @fauziharoon This version produces the tab-delimited file ready for consumption. S…
# hashes for cog
my %hash11 = ();   # orfid => hit id and hit id => orfid
my %hash12 = ();   # hit id => "orfid\tevalue\tscore"
my %hash13 = ();   # hit id => hash12 record plus prot2COG column 1,
                   # and prot2COG column 1 => hash12 record

# read cog blastp output and store in hash
# (same column layout as the img blast output: 0 = orfid, 1 = hit id,
# 10 = evalue, 11 = blast score)
open my $cogblast, '<', "./$locus.faaVSCOG.blastp"
    or die "Couldn't open file $locus.faaVSCOG.blastp: $!\n";
while (my $line = <$cogblast>)
{
    chomp $line;
    my @columns = split (/\t/, $line);
    # blank or truncated lines have no score column to compare
    next if @columns < 12;
    # only hits with a blast score above 60 are kept
    if ($columns[11] > 60)
    {
        $hash11{$columns[0]} = $columns[1];
        $hash11{$columns[1]} = $columns[0];
        $hash12{$columns[1]} = "$columns[0]\t$columns[10]\t$columns[11]";
    }
}

# read cog prot2COG.tab
open my $cogid2names, '<', "/srv/db/cog/prot2COG.tab"
    or die "Couldn't open prot2COG.tab: $!\n";
open my $cog_temp_OUT, '>', "cog_output_temp.txt"
    or die "Couldn't open cog_output_temp.txt for writing: $!\n";
while (my $line = <$cogid2names>)
{
    chomp $line;
    my @columns = split (/\t/, $line);
    next unless @columns;
    if (exists $hash12{$columns[0]})
    {
        $hash13{$columns[0]} = "$hash12{$columns[0]}\t$columns[1]";
        # NOTE(review): keyed by prot2COG column 1 only, so when several
        # proteins share that value just the last record is kept
        $hash13{$columns[1]} = $hash12{$columns[0]};
        print {$cog_temp_OUT} "$hash12{$columns[0]}\t$columns[1]\n";
    }
}

close($cogblast);
close($cogid2names);
# a failed close on a write handle means buffered output was lost
close($cog_temp_OUT) or die "Couldn't close cog_output_temp.txt: $!\n";

# read cog listcogs.txt; column 5 is looked up in hash13 and columns 3, 4
# and 6 are appended to the stored record
open my $cogid2longernames, '<', "/srv/db/cog/listcogs.txt"
    or die "Couldn't open listcogs.txt: $!\n";
open my $cog_temp_OUT2, '>', "cog_output_temp2.txt"
    or die "Couldn't open cog_output_temp2.txt for writing: $!\n";
while (my $line = <$cogid2longernames>)
{
    chomp $line;
    my @columns = split (/\t/, $line);
    # lines without a column 5 cannot be looked up
    next unless defined $columns[5];
    if (exists $hash13{$columns[5]})
    {
        print {$cog_temp_OUT2} "$hash13{$columns[5]}\t$columns[3]\t$columns[4]\t$columns[6]\n";
    }
}

close($cog_temp_OUT2) or die "Couldn't close cog_output_temp2.txt: $!\n";
549
550
### now to parse all the temporary files and combine into one tab-delimited-file
# to store the IDs => DB => values/annotations
my %combined_bighash =();

# open file for output; stop right away if it cannot be created instead of
# printing to an unopened handle
open my $FINAL_OUTPUT, '>', "./final_output.txt"
    or die "Couldn't open final_output.txt for writing: $!\n";

# print header: one column name per field, joined with tabs
my @header_fields = qw(
    ORF_ID
    img_evalue img_score img_gene img_organism img_reciprocal
    uniref_evalue uniref_score uniref_gene uniref_organism uniref_reciprocal
    prokka_gene
    cog_evalues cog_scores cog_classes cog_gene_acronyms cog_genes
    pfam_evalues pfam_scores pfam_genes
    tigrfam_evalues tigrfam_scores tigrfam_genes tigrfam_descriptions
);
print {$FINAL_OUTPUT} join("\t", @header_fields) . "\n";
Apr 8, 2014 @fauziharoon This version produces the tab-delimited file ready for consumption. S…
559
# img
# Each line of img_output_temp2.txt has the imgid in column 0 and the orfid
# in column 1; the record is filed under the orfid and keeps everything
# after it.
open my $img_annotation, '<', "./img_output_temp2.txt"
    or die "Couldn't open img_output_temp2.txt: $!\n";
while (my $line = <$img_annotation>)
{
    chomp $line;
    my @columns = split (/\t/, $line);
    # a line without an orfid column would only create a bogus empty key
    next if @columns < 2;
    my @annotation = @columns[2..$#columns];
    push @{$combined_bighash{$columns[1]}->{'a-img'}}, join("\t", @annotation);
}
572
# uniref
# Column 0 of uniref_output_temp2.txt is the key; the remaining columns are
# stored as one tab-joined record under 'b-uniref'.
open my $uniref_annotation, '<', "./uniref_output_temp2.txt"
    or die "Couldn't open uniref_output_temp2.txt: $!\n";
while (my $line = <$uniref_annotation>)
{
    chomp $line;
    my @columns = split (/\t/, $line);
    # a blank line would only create a bogus empty key
    next unless @columns;
    my @annotation = @columns[1..$#columns];
    push @{$combined_bighash{$columns[0]}->{'b-uniref'}}, join("\t", @annotation);
}
583
584 # prokka
585 # need to parse faa file to give prokka id2names
586 checkAndRunCommand("grep",[[
587 "'>'",
588 "prokka_annotation/$locus.faa |",
589 "sed",
590 "'s/>//g' |",
591 "sed",
592 -e => "'s/ /\\t/'",
593 "> prokka_temp_output.txt",
594 ]], DIE_ON_FAILURE);
595
Apr 15, 2014 @fauziharoon uses latest prokka version 1.8 and Pfam_scan now
596 # SAMPLE gff file
597 ##gff-version 3
598 ##sequence-region contig_3875 1 10320
599 #contig_3875 Prodigal:2.60 CDS 334 735 . + 0 ID=test_00001;inference=ab initio prediction:Prodigal:2.60,protein motif:CLUSTERS:PRK10707;locus_tag=test_00001;product=putative NUDIX hydrolase;protein_id=gnl|VBC|test_00001
600 #contig_3875 Prodigal:2.60 CDS 930 3221 . + 0 ID=test_00002;eC_number=1.1.1.40;gene=maeB;inference=ab initio prediction:Prodigal:2.60,similar to AA sequence:UniProtKB:P76558;locus_tag=test_00002;product=NADP-dependent malic enzyme;protein_id=gnl|VBC|test_00002
601 #contig_3875 Prodigal:2.60 CDS 3229 5175 . - 0 ID=test_00003;inference=ab initio prediction:Prodigal:2.60;locus_tag=test_00003;product=hypothetical protein;protein_id=gnl|VBC|test_00003
602
603 #open my $prokka_gff, "./prokka_annotation/$locus.gff", or die "Couldn't open $locus.gff\n";
604 #while (<$prokka_gff>)
605 #{
606 # next if $prokka_gff =~ /^#/;
607 # chomp $_;
608 # my @main_columns = split (/\t/, $_);
609 # $prokka_gff = my $ID =~ m/[ID\=](.*)[\;]/;
610 # $prokka_gff = my $product =~ m/[product\=](.*)[\;]/;
611 # print "$ID\t$product\n";
612 #}
613
614
Apr 8, 2014 @fauziharoon This version produces the tab-delimited file ready for consumption. S…
# Read the id -> name table written to prokka_temp_output.txt.  The first
# column is the ORF id; the remaining columns are stored, joined by tabs, as
# one hit under 'c-prokka'.
open my $prokka_annotation, "<", "./prokka_temp_output.txt"
    or die "Couldn't open prokka_temp_output.txt: $!\n";
while (my $prokka_line = <$prokka_annotation>)
{
    chomp $prokka_line;
    # a limit of -1 keeps trailing empty fields so the columns stay aligned
    my @columns = split(/\t/, $prokka_line, -1);
    # a blank line has no ORF id; skip it instead of creating an empty-id row
    next unless @columns;
    my @hit_fields = @columns[1..$#columns];
    push @{$combined_bighash{$columns[0]}->{'c-prokka'}}, join("\t", @hit_fields);
}
624
# cog
# Read the tab-separated COG hits.  The first column is the ORF id; the
# remaining columns are stored, joined by tabs, as one hit under 'd-cog'.
open my $cog_annotation, "<", "./cog_output_temp2.txt"
    or die "Couldn't open cog_output_temp2.txt: $!\n";
while (my $cog_line = <$cog_annotation>)
{
    chomp $cog_line;
    # a limit of -1 keeps trailing empty fields so the columns stay aligned
    my @columns = split(/\t/, $cog_line, -1);
    # a blank line has no ORF id; skip it instead of creating an empty-id row
    next unless @columns;
    my @hit_fields = @columns[1..$#columns];
    push @{$combined_bighash{$columns[0]}->{'d-cog'}}, join("\t", @hit_fields);
}
635
# pfam
# Read the tab-separated Pfam hits.  The first column is the ORF id; the
# remaining columns are stored, joined by tabs, as one hit under 'e-pfam'.
# An ORF may appear on several lines (one per hit); every hit is kept.
open my $pfam_annotation, "<", "./pfam_output_temp.txt"
    or die "Couldn't open pfam_output_temp.txt: $!\n";
while (my $pfam_line = <$pfam_annotation>)
{
    chomp $pfam_line;
    # a limit of -1 keeps trailing empty fields so the columns stay aligned
    my @columns = split(/\t/, $pfam_line, -1);
    # a blank line has no ORF id; skip it instead of creating an empty-id row
    next unless @columns;
    my @hit_fields = @columns[1..$#columns];
    push @{$combined_bighash{$columns[0]}->{'e-pfam'}}, join("\t", @hit_fields);
}
Apr 9, 2014 @fauziharoon aesthetics
646 #print Dumper \%combined_bighash;
Apr 8, 2014 @fauziharoon This version produces the tab-delimited file ready for consumption. S…
647
# tigrfam
# Read the tab-separated TIGRFAM hits.  The first column is the ORF id; the
# remaining columns are stored, joined by tabs, as one hit under 'f-tigrfam'.
open my $tigrfam_annotation, "<", "./tigrfam_output_temp.txt"
    or die "Couldn't open tigrfam_output_temp.txt: $!\n";
while (my $tigrfam_line = <$tigrfam_annotation>)
{
    chomp $tigrfam_line;
    # a limit of -1 keeps trailing empty fields so the columns stay aligned
    my @columns = split(/\t/, $tigrfam_line, -1);
    # a blank line has no ORF id; skip it instead of creating an empty-id row
    next unless @columns;
    my @hit_fields = @columns[1..$#columns];
    push @{$combined_bighash{$columns[0]}->{'f-tigrfam'}}, join("\t", @hit_fields);
}
658
# to print finally.................
# Number of output columns expected for each annotation type.  It is used to
# pad missing annotation types (and missing fields) with NA.  The letter
# prefix of each key makes a plain sort of the keys yield the column order
# img, uniref, prokka, cog, pfam, tigrfam.
my %column_lengths = (
    'a-img'     => 5,
    'b-uniref'  => 5,
    'c-prokka'  => 1,
    'd-cog'     => 5,
    'e-pfam'    => 3,
    'f-tigrfam' => 4,
);

# One output row per ORF id, ids in sorted order.  Every field, including the
# last one of a row, is followed by a tab.
foreach my $ID (sort(keys %combined_bighash))
{
    print {$FINAL_OUTPUT} "$ID\t";
    foreach my $annotation_type (sort(keys %column_lengths))
    {
        my $expected_columns = $column_lengths{$annotation_type};
        my $hits             = $combined_bighash{$ID}->{$annotation_type};

        # annotation type absent for this ORF: fill all of its columns with NA
        if (! exists $combined_bighash{$ID}->{$annotation_type})
        {
            print {$FINAL_OUTPUT} join("\t", ("NA") x $expected_columns), "\t";
        }
        # the loaders push one tab-joined string per hit, so the value is an
        # array reference holding one or more hits
        elsif (ref($hits) eq 'ARRAY')
        {
            # $storage_array[$i] collects the i-th field of every hit
            my @storage_array = map { [] } 1 .. $expected_columns;
            foreach my $line (@{$hits})
            {
                # limit -1: keep trailing empty fields instead of dropping them
                my @values = split(/\t/, $line, -1);
                # Pad a short hit with NA.  Previously a column that no hit
                # supplied stayed undefined and was dereferenced below (fatal
                # under strict refs), and hits of different widths ended up
                # misaligned inside the "; "-joined cells.
                for my $i (0 .. $expected_columns - 1)
                {
                    push @{$storage_array[$i]}, defined($values[$i]) ? $values[$i] : "NA";
                }
            }
            # several hits for one ORF (e.g. two pfam domains) share a cell,
            # separated by "; "
            my @print_info_array = map { join("; ", @{$_}) } @storage_array;
            print {$FINAL_OUTPUT} join("\t", @print_info_array), "\t";
        }
        # plain scalar value: print it unchanged
        else
        {
            print {$FINAL_OUTPUT} "$combined_bighash{$ID}{$annotation_type}\t";
        }
    }
    print {$FINAL_OUTPUT} "\n";
}
719
#close all files
close($img_annotation);
close($uniref_annotation);
close($prokka_annotation);
close($cog_annotation);
close($pfam_annotation);
close($tigrfam_annotation);
# buffered write errors (e.g. a full disk) only surface when the output handle
# is closed, so this close is checked
close($FINAL_OUTPUT) or die "Couldn't close the final output file: $!\n";
Apr 7, 2014 @fauziharoon Parsed all the database id2names and also parsed output for Pfam and …
728
Apr 3, 2014 @fauziharoon Added additional features
729
Apr 3, 2014 @fauziharoon This is my first commit of annotateM
730 ######################################################################
731 # CUSTOM SUBS
732 ######################################################################
733
Apr 9, 2014 @fauziharoon aesthetics
734 # who needs custom subs...
735
Apr 3, 2014 @fauziharoon This is my first commit of annotateM
736 ######################################################################
737 # TEMPLATE SUBS
738
739 ######################################################################
740 # PARAMETERS
741
sub checkParams {
    #-----
    # Parse the command line with Getopt::Long and do all options checking.
    #
    # Returns a reference to a hash holding the options that were supplied
    # (keys: help, in, locustag, kingdom, threads, evalue).
    # Replaces the process with pod2usage when no option or -help is given,
    # and reports an error through printParamError when -in is missing.
    #
    my @standard_options = ( "help|h+", "in|i:s", "locustag|l:s", "kingdom|k:s", "threads|t:s", "evalue|e:s");
    my %options;

    # Add any other command line options, and the code to handle them
    #
    GetOptions( \%options, @standard_options );

    # if no arguments supplied, or the -help option is set, print the usage
    # and exit.  List-form exec bypasses the shell, so a script path that
    # contains spaces or shell metacharacters is handed over intact; exec only
    # returns when it failed, in which case we stop instead of carrying on.
    #
    if (0 == (keys (%options)) or $options{'help'}) {
        exec("pod2usage", $0) or die "**ERROR: $0 : could not run pod2usage: $!\n";
    }

    # Compulsory items
    # "in|i:s" takes an optional value, so a bare -i yields an empty string;
    # treat that the same as a missing option.
    if(!defined $options{'in'} or $options{'in'} eq '') { printParamError ("You MUST supply a fasta file"); }

    return \%options;
}
767
sub printParamError
{
    #-----
    # What to do if there's something wrong with a parameter:
    # print the error message, then replace the process with pod2usage so the
    # usage text is shown.
    #
    # $error - text describing what is wrong
    #
    my ($error) = @_;
    print "**ERROR: $0 : $error\n";
    # List-form exec bypasses the shell (safe for script paths with spaces).
    # exec only returns on failure; die then, so the script never continues
    # with invalid parameters.
    exec("pod2usage", $0) or die "**ERROR: $0 : could not run pod2usage: $!\n";
}
776
sub overrideDefault
{
    #-----
    # Pick the value for a parameter: the one stored in the global options
    # when the option is present there, otherwise the supplied default.
    #
    # $default_value - value to fall back on
    # $option_name   - key to look up in $global_options
    #
    my ($default_value, $option_name) = @_;
    return exists $global_options->{$option_name}
        ? $global_options->{$option_name}
        : $default_value;
}
789
Apr 9, 2014 @fauziharoon aesthetics
790 #####################################################################
Apr 3, 2014 @fauziharoon This is my first commit of annotateM
791 # FILE IO
792
sub openWrite
{
    #-----
    # Open the named file for writing (truncating it) and hand back the
    # filehandle; croaks when the file cannot be opened.
    #
    my $path = shift;
    open(my $handle, ">", $path)
        or croak "**ERROR: could not open file: $path for writing $!\n";
    return $handle;
}
802
sub openRead
{
    #-----
    # Open the named file for reading and hand back the filehandle; croaks
    # when the file cannot be opened.
    #
    my $path = shift;
    open(my $handle, "<", $path)
        or croak "**ERROR: could not open file: $path for reading $!\n";
    return $handle;
}
812
813 ######################################################################
814 # EXTERNAL COMMANDS
815 #
816 # checkAndRunCommand("ls", {
817 # -a => ""
Jul 16, 2014 @wwood version 0.6: use IMG dereplicated, use -j with parallel
818 # },
Apr 3, 2014 @fauziharoon This is my first commit of annotateM
819 # WARN_ON_FAILURE);
820
sub checkFileExists {
    #-----
    # Croak with the offending path unless it exists on disk.
    #
    my $path = shift;
    croak "**ERROR: $0 : Cannot find:\n$path\n" unless -e $path;
}
830
sub logExternalCommand
{
    #-----
    # Echo a command line to standard output, but only when command logging
    # is switched on ($global_log_commands equals 1).
    #
    my ($command) = @_;
    print $command, "\n" if $global_log_commands == 1;
}
840
sub isCommandInPath
{
    #-----
    # Is this command in the path?
    #
    # $cmd          - program name (only the first whitespace-separated word
    #                 is checked) or a path containing a '/'
    # $failure_type - passed on to handleCommandFailure when the program
    #                 cannot be found
    #
    # The previous check ran "which $cmd |> /dev/null".  That pipes the output
    # of which into an empty command whose exit status is 0, so a missing
    # program was never reported.  The search is now done in Perl, which also
    # keeps $cmd away from the shell.
    #
    my ($cmd, $failure_type) = @_;
    require File::Spec;

    my ($program) = split(' ', defined $cmd ? $cmd : '');
    my $found = 0;
    if (defined $program && length $program) {
        if ($program =~ m{/}) {
            # an explicit path is checked as given, not searched in PATH
            $found = 1 if -f $program && -x _;
        }
        else {
            foreach my $dir (File::Spec->path()) {
                my $candidate = File::Spec->catfile($dir, $program);
                if (-f $candidate && -x _) {
                    $found = 1;
                    last;
                }
            }
        }
    }

    handleCommandFailure($cmd, $failure_type) unless $found;
}
851
sub runExternalCommand
{
    #-----
    # Log a command line via logExternalCommand, run it with system and
    # return the status that system reports.
    #
    my $command = shift;
    logExternalCommand($command);
    return system($command);
}
861
sub checkAndRunCommand
{
    #-----
    # Run external commands more sanelier
    #
    # $cmd          - name of the program to run
    # $params       - reference to an ARRAY whose elements are array or hash
    #                 references; each one is flattened by formatParams
    # $failure_type - forwarded to the failure handler (DIE_ON_FAILURE or
    #                 WARN_ON_FAILURE at the call sites in this file)
    #
    my ($cmd, $params, $failure_type) = @_;

    # check that the program can be found before building the command line
    isCommandInPath($cmd, $failure_type);

    # flatten every parameter group, then glue program and parameters together
    my @formatted_groups = map { formatParams($_) } @{$params};
    my $cmd_str = join(" ", $cmd, join(" ", @formatted_groups));

    print "The command currently running:\t$cmd_str\n";
    logExternalCommand($cmd_str);

    # a non-zero status from system means something went wrong
    my $status = system($cmd_str);
    handleCommandFailure($cmd_str, $failure_type) if $status;
}
884
sub formatParams {

    #---------
    # Handles and formats the different ways of passing parameters to
    # checkAndRunCommand
    #
    # $ref - an ARRAY reference (elements are joined with single spaces) or a
    #        HASH reference (rendered as "key value" pairs joined with single
    #        spaces)
    #
    # Returns the formatted parameter string; croaks for anything else.
    #
    my $ref = shift;

    if (ref($ref) eq "ARRAY") {
        return join(" ", @{$ref});
    } elsif (ref($ref) eq "HASH") {
        # sorted keys give the same command line on every run; plain keys()
        # order differs between runs
        return join(" ", map { $_ . " " . $ref->{$_} } sort keys %{$ref});
    }
    # the first part stays single-quoted so that $params is printed literally;
    # the second part is double-quoted so that \n is a real newline rather
    # than a backslash followed by n
    croak 'The elements of the $params argument in checkAndRunCommand can ' .
          "only contain references to arrays or hashes\n";
}
901
902
sub handleCommandFailure {
    #-----
    # What to do when all goes bad!
    #
    # $cmd          - the command (line) that failed; it is now part of the
    #                 message, previously it was accepted but never shown
    # $failure_type - DIE_ON_FAILURE croaks, WARN_ON_FAILURE carps, anything
    #                 else (including undef) is ignored
    #
    my ($cmd, $failure_type) = @_;
    return unless defined($failure_type);

    # capture both right away, before anything else can overwrite them
    my $os_error    = "$!";
    my $wait_status = $?;

    # $! is only meaningful when the command could not be started ($? == -1);
    # otherwise the wait status holds the signal or the exit code
    my $reason;
    if ($wait_status == -1) {
        $reason = "could not be started: $os_error";
    } elsif ($wait_status & 127) {
        $reason = "was terminated by signal " . ($wait_status & 127);
    } elsif ($wait_status) {
        $reason = "exited with status " . ($wait_status >> 8);
    } else {
        $reason = "failed or could not be found";
    }
    my $message = "$0 : command '$cmd' $reason\n";

    if ($failure_type == DIE_ON_FAILURE) {
        croak "**ERROR: " . $message;
    } elsif ($failure_type == WARN_ON_FAILURE) {
        carp "**WARNING: " . $message;
    }
}
916
917
918 ######################################################################
919 # MISC
920
921 sub printAtStart {
#-----
# Print the start-up banner to standard output: two separator rules
# enclosing the script name ($0 is interpolated by the double-quoted
# heredoc), the tool's tag line and a notice that the pipeline takes a while.
#
922 print<<"EOF";
Jul 16, 2014 @wwood version 0.6: use IMG dereplicated, use -j with parallel
923 ----------------------------------------------------------------
924
Apr 3, 2014 @fauziharoon This is my first commit of annotateM
925 $0
Apr 9, 2014 @fauziharoon aesthetics
926 annotateM - annotate my genome
927 Due to the blast processes against multiple databases, this whole
928 annotation pipeline will usually take awhile. Please be patient!
Apr 15, 2014 @fauziharoon uses latest prokka version 1.8 and Pfam_scan now
929 What you get in the end will save you heaps of time.
Jul 16, 2014 @wwood version 0.6: use IMG dereplicated, use -j with parallel
930
931 ----------------------------------------------------------------
Apr 3, 2014 @fauziharoon This is my first commit of annotateM
932 EOF
933 }
934
935 __DATA__
936
937 =head1 NAME
938
939 annotateM
940
941 =head1 COPYRIGHT
942
Apr 9, 2014 @fauziharoon aesthetics
943 Copyright (C) Mohamed Fauzi Haroon
944 Special appearance from Adam Skarshewski
Apr 3, 2014 @fauziharoon This is my first commit of annotateM
945
946 This program is free software: you can redistribute it and/or modify
947 it under the terms of the GNU General Public License as published by
948 the Free Software Foundation, either version 3 of the License, or
949 (at your option) any later version.
950
951 This program is distributed in the hope that it will be useful,
952 but WITHOUT ANY WARRANTY; without even the implied warranty of
953 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
954 GNU General Public License for more details.
955
956 You should have received a copy of the GNU General Public License
957 along with this program. If not, see <http://www.gnu.org/licenses/>.
958
959 =head1 DESCRIPTION
960
Apr 3, 2014 @fauziharoon Added additional features
961 Want to annotate your genome? annotateM!
Apr 3, 2014 @fauziharoon This is my first commit of annotateM
962
963 =head1 SYNOPSIS
964
Apr 15, 2014 @fauziharoon uses latest prokka version 1.8 and Pfam_scan now
965 annotateM -i [fasta_file] -l [locus] -k [kingdom] -t [threads] -e [evalue]
Apr 3, 2014 @fauziharoon This is my first commit of annotateM
966
Jul 16, 2014 @wwood version 0.6: use IMG dereplicated, use -j with parallel
967 -i FASTA_FILE Nucleotide fasta file
Apr 15, 2014 @fauziharoon uses latest prokka version 1.8 and Pfam_scan now
968 -l locustag Name of locus tag
969 -k kingdom (Bacteria/Archaea/Phage/Viruses) Kingdom of genome to be annotated
970 -t threads Number of threads
Jul 16, 2014 @wwood version 0.6: use IMG dereplicated, use -j with parallel
971 -e evalue Evalue for BLAST, recommended 1e-3
Apr 15, 2014 @fauziharoon uses latest prokka version 1.8 and Pfam_scan now
972 [-help -h] Displays basic usage information
Jul 16, 2014 @wwood version 0.6: use IMG dereplicated, use -j with parallel
973
Apr 3, 2014 @fauziharoon This is my first commit of annotateM
974 =cut
975
976