Apr 3, 2014
This is my first commit of annotateM
20
#
21
# This program is free software: you can redistribute it and/or modify
22
# it under the terms of the GNU General Public License as published by
23
# the Free Software Foundation, either version 3 of the License, or
24
# (at your option) any later version.
25
#
26
# This program is distributed in the hope that it will be useful,
27
# but WITHOUT ANY WARRANTY; without even the implied warranty of
28
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29
# GNU General Public License for more details.
30
#
31
# You should have received a copy of the GNU General Public License
32
# along with this program. If not, see <http://www.gnu.org/licenses/>.
33
#
34
# ##############################################################################
35
36
# pragmas
37
use strict;
38
use warnings;
39
40
# core Perl modules
41
use Getopt::Long;
42
use Carp;
Apr 7, 2014
removed the hashes for the first part of the script
159
" --block" => " 100k" ,
160
" --recstart" ,
161
" '>'" ,
162
" --pipe" ,
163
" blastp" ,
164
-db => " /srv/db/uniprot/uniref-20140403/uniref90.fasta" ,
165
-outfmt => 6,
166
-max_target_seqs => 1,
167
-evalue => $global_options -> {' evalue' },
168
-query => " -" ,
169
" > $locus .faaVSuniref90.blastp" ,
170
# -num_threads => $global_options->{'threads'},
171
]], DIE_ON_FAILURE);
172
}
173
174
# reciprocal blast of Uniref positive hits against genome ORF
175
if (! -e " ./subsetuniref.faaVS$locus .faa.blastp" )
176
{
Apr 15, 2014
uses latest prokka version 1.8 and Pfam_scan now
300
# print Dumper (\%access2imgid);
301
302
303
# read img id2names.txt which is the file to get the gene identity of the imgid
304
# SAMPLE img id2names.txt file -
305
# 650716001|650846201 Ahos_0001 replication initiator protein Cdc6-3 Acidianus hospitalis W1
306
# 650716001|650846202 Ahos_0002 hypothetical protein Acidianus hospitalis W1
307
# 650716001|650846203 Ahos_0003 transcriptional coactivator/pterin dehydratase Acidianus hospitalis W1
308
# 650716001|650846204 Ahos_0004 GGCT (gamma glutamyl cyclotransferase) domain-containing protein Acidianus hospitalis W1
309
310
# columns[0] = imgid
311
# columns[1] = gene name
312
# columns[2] = organism
Apr 7, 2014
added parsing of IMG and Uniref90 results into temporary files
385
# Join the UniRef90 hits collected in %hash5 with the id -> name/organism
# table and write the result both to %hash6 and to a temporary file.
#
# Reads:  /srv/db/uniprot/uniref-20140403/uniref90_id2names.txt (tab-delimited;
#         field 0 is used as the lookup key, fields 1 and 2 are appended)
# Writes: uniref_output_temp.txt, %hash6
# Dies:   when either file cannot be opened or the output cannot be flushed
open my $unirefid2names, '<', '/srv/db/uniprot/uniref-20140403/uniref90_id2names.txt'
    or die "Couldn't open id2names.txt: $!\n";
open my $uniref_temp_OUT, '>', 'uniref_output_temp.txt'
    or die "Couldn't open uniref_output_temp.txt for writing: $!\n";
while (my $line = <$unirefid2names>)
{
    chomp $line;
    my @columns = split(/\t/, $line);

    # split() drops trailing empty fields; pad so that a missing name or
    # organism becomes an empty string instead of an "uninitialized" warning
    push @columns, q{} while @columns < 3;

    if (exists $hash5{$columns[0]})
    {
        my $annotation = "$hash5{$columns[0]}\t$columns[1]\t$columns[2]";
        $hash6{$columns[0]} = $annotation;
        print {$uniref_temp_OUT} "$annotation\n";
    }
}

# $unirefblast was opened before this section; it is released here together
# with the handles opened above
close($unirefblast);
close($unirefid2names);
# buffered write errors only surface when the handle is closed
close($uniref_temp_OUT)
    or die "Couldn't finish writing uniref_output_temp.txt: $!\n";
401
402
# read my reciprocal img blast output and store in hash
403
open my $runirefblast , " ./subsetuniref.faaVS$locus .faa.blastp" , or die " Couldn't open file subsetuniref.faaVS$locus .faa.blastp\n " ;
404
open my $uniref_temp_OUT2 , " >uniref_output_temp2.txt" ;
405
while (<$runirefblast >)
406
{
407
chomp $_ ;
408
my @columns = split (/ \t / , $_ );
Apr 8, 2014
This version produces the tab-delimited file ready for consumption. S…
497
# hashes for cog
#   %hash11 : query id <-> subject id (both directions) for accepted hits
#   %hash12 : subject id => "query\tfield10\tfield11" of the accepted hit
#   %hash13 : subject id => hit plus its prot2COG value, and
#             prot2COG value => hit
my %hash11 = ();
my %hash12 = ();
my %hash13 = ();

# read cog blastp output and store in hash
# NOTE(review): presumably BLAST tabular output where field 10 is the e-value
# and field 11 the bit score -- confirm against the blastp call for COG.
open my $cogblast, '<', "./$locus.faaVSCOG.blastp"
    or die "Couldn't open file $locus.faaVSCOG.blastp: $!\n";
while (my $line = <$cogblast>)
{
    chomp $line;
    my @columns = split(/\t/, $line);

    # blank or truncated lines have no field 11 to compare
    next if @columns < 12;

    if ($columns[11] > 60)
    {
        $hash11{$columns[0]} = $columns[1];
        $hash11{$columns[1]} = $columns[0];
        $hash12{$columns[1]} = "$columns[0]\t$columns[10]\t$columns[11]";
    }
}

# read cog prot2COG.tab
open my $cogid2names, '<', '/srv/db/cog/prot2COG.tab'
    or die "Couldn't open prot2COG.tab: $!\n";
open my $cog_temp_OUT, '>', 'cog_output_temp.txt'
    or die "Couldn't open cog_output_temp.txt for writing: $!\n";
while (my $line = <$cogid2names>)
{
    chomp $line;
    my @columns = split(/\t/, $line);

    # split() drops trailing empty fields; pad so a missing value is an
    # empty string rather than undef
    push @columns, q{} while @columns < 2;

    if (exists $hash12{$columns[0]})
    {
        $hash13{$columns[0]} = "$hash12{$columns[0]}\t$columns[1]";
        $hash13{$columns[1]} = $hash12{$columns[0]};
        print {$cog_temp_OUT} "$hash12{$columns[0]}\t$columns[1]\n";
    }
}

close($cogblast);
close($cogid2names);
# buffered write errors only surface when the handle is closed
close($cog_temp_OUT)
    or die "Couldn't finish writing cog_output_temp.txt: $!\n";

# read cog listcogs.txt
open my $cogid2longernames, '<', '/srv/db/cog/listcogs.txt'
    or die "Couldn't open listcogs.txt: $!\n";
open my $cog_temp_OUT2, '>', 'cog_output_temp2.txt'
    or die "Couldn't open cog_output_temp2.txt for writing: $!\n";
while (my $line = <$cogid2longernames>)
{
    chomp $line;
    my @columns = split(/\t/, $line);

    # fields 3..6 are used below; pad short lines with empty strings
    push @columns, q{} while @columns < 7;

    if (exists $hash13{$columns[5]})
    {
        print {$cog_temp_OUT2} "$hash13{$columns[5]}\t$columns[3]\t$columns[4]\t$columns[6]\n";
    }
}

close($cog_temp_OUT2)
    or die "Couldn't finish writing cog_output_temp2.txt: $!\n";
549
550
551
### now to parse all the temporary files and combine into one tab-delimited-file
# to store the IDs => DB => values/annotations
my %combined_bighash = ();

# open file for output; stop immediately if it cannot be created, otherwise
# every later print to this handle would fail silently
open my $FINAL_OUTPUT, '>', './final_output.txt'
    or die "Couldn't open ./final_output.txt for writing: $!\n";
557
# print header
Apr 15, 2014
uses latest prokka version 1.8 and Pfam_scan now
596
# SAMPLE gff file
597
# #gff-version 3
598
# #sequence-region contig_3875 1 10320
599
# contig_3875 Prodigal:2.60 CDS 334 735 . + 0 ID=test_00001;inference=ab initio prediction:Prodigal:2.60,protein motif:CLUSTERS:PRK10707;locus_tag=test_00001;product=putative NUDIX hydrolase;protein_id=gnl|VBC|test_00001
600
# contig_3875 Prodigal:2.60 CDS 930 3221 . + 0 ID=test_00002;eC_number=1.1.1.40;gene=maeB;inference=ab initio prediction:Prodigal:2.60,similar to AA sequence:UniProtKB:P76558;locus_tag=test_00002;product=NADP-dependent malic enzyme;protein_id=gnl|VBC|test_00002
601
# contig_3875 Prodigal:2.60 CDS 3229 5175 . - 0 ID=test_00003;inference=ab initio prediction:Prodigal:2.60;locus_tag=test_00003;product=hypothetical protein;protein_id=gnl|VBC|test_00003
602
603
# open my $prokka_gff, "./prokka_annotation/$locus.gff", or die "Couldn't open $locus.gff\n";
604
# while (<$prokka_gff>)
605
# {
606
# next if $prokka_gff =~ /^#/;
607
# chomp $_;
608
# my @main_columns = split (/\t/, $_);
609
# $prokka_gff = my $ID =~ m/[ID\=](.*)[\;]/;
610
# $prokka_gff = my $product =~ m/[product\=](.*)[\;]/;
611
# print "$ID\t$product\n";
612
# }
613
614
Apr 3, 2014
This is my first commit of annotateM
819
# WARN_ON_FAILURE);
820
821
sub checkFileExists {
    #-----
    # Does a file exist?
    #
    # Params:  $file - path to test (checked with -e, so any file type counts)
    # Returns: 1 when the path exists
    # Dies:    via croak when no path is given or the path does not exist
    #
    my ($file) = @_;

    # -e on undef would only emit a warning and then report a blank name
    unless (defined $file) {
        croak "**ERROR: $0 : Cannot find:\n(no file name given)\n";
    }
    unless (-e $file) {
        croak "**ERROR: $0 : Cannot find:\n$file\n";
    }
    return 1;
}
830
831
sub logExternalCommand
{
    #-----
    # Log a command line command to the command line!
    #
    # Params: $cmd - the command string to echo
    # Prints $cmd followed by a newline to the currently selected output
    # handle, but only when the file-level $global_log_commands equals 1.
    #
    my ($cmd) = @_;
    if (1 == $global_log_commands) {
        print $cmd, "\n";
    }
}
840
841
sub isCommandInPath
{
    #-----
    # Is this command in the path?
    #
    # Params: $cmd          - name of the external program to look for
    #         $failure_type - passed through to handleCommandFailure when
    #                         the program is not found
    # Runs `which` through the shell and hands over to handleCommandFailure
    # when it reports a non-zero status.
    #
    my ($cmd, $failure_type) = @_;

    # Redirect which's output directly. Writing "which $cmd |> /dev/null"
    # pipes into an empty command consisting only of a redirection, and the
    # shell then reports that command's status (always 0), so a missing
    # program would never be detected.
    if (system("which $cmd > /dev/null")) {
        handleCommandFailure($cmd, $failure_type);
    }
}
851
852
sub runExternalCommand
{
    #-----
    # Run a command line command on the command line!
    #
    # Params:  $cmd - complete command string; it is passed to system() as
    #                 a single string
    # Returns: the value returned by system() (0 on success)
    # The command is first handed to logExternalCommand.
    #
    my ($cmd) = @_;
    logExternalCommand($cmd);
    return system($cmd);
}
861
862
sub checkAndRunCommand
863
{
864
# -----
865
# Run external commands more sanelier
866
#
867
my ($cmd , $params , $failure_type ) = @_ ;
Apr 3, 2014
This is my first commit of annotateM
893
if (ref ($ref ) eq " ARRAY" ) {
894
return join (" " , @{$ref });
895
} elsif (ref ($ref ) eq " HASH" ) {
896
return join (" " , map { $_ . " " . $ref -> {$_ }} keys %{$ref });
897
}
898
croak ' The elements of the $params argument in checkAndRunCommand can ' .
899
' only contain references to arrays or hashes\n' ;
900
}
901
902
903
sub handleCommandFailure {
    #-----
    # What to do when all goes bad!
    #
    # Params: $cmd          - the command that failed (used in the message)
    #         $failure_type - DIE_ON_FAILURE  : croak
    #                         WARN_ON_FAILURE : carp and carry on
    #                         undef / other   : do nothing
    #
    my ($cmd, $failure_type) = @_;
    return unless defined $failure_type;

    # $! is only meaningful when the child could not be started at all; for
    # a child that ran and failed, the status lives in $?.
    # NOTE(review): assumes $? still holds the status of the system() call
    # that triggered this handler -- confirm for callers outside this view.
    my $reason;
    if ($? == -1) {
        $reason = "failed to execute: $!";
    } elsif ($? & 127) {
        $reason = "died with signal " . ($? & 127);
    } else {
        $reason = "exited with status " . ($? >> 8);
    }
    my $what = defined $cmd ? $cmd : '(unknown command)';

    if ($failure_type == DIE_ON_FAILURE) {
        croak "**ERROR: $0 : $what $reason\n";
    } elsif ($failure_type == WARN_ON_FAILURE) {
        carp "**WARNING: $0 : $what $reason\n";
    }
}
916
917
918
# #####################################################################
919
# MISC
920
921
sub printAtStart {
922
print <<"EOF" ;
Apr 3, 2014
This is my first commit of annotateM
945
946
This program is free software: you can redistribute it and/or modify
947
it under the terms of the GNU General Public License as published by
948
the Free Software Foundation, either version 3 of the License, or
949
(at your option) any later version.
950
951
This program is distributed in the hope that it will be useful,
952
but WITHOUT ANY WARRANTY; without even the implied warranty of
953
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
954
GNU General Public License for more details.
955
956
You should have received a copy of the GNU General Public License
957
along with this program. If not, see <http://www.gnu.org/licenses/>.
958
959
=head1 DESCRIPTION
960