Permalink
Browse files

This version produces the tab-delimited file ready for consumption. S…

…till requires cleaning up though.
  • Loading branch information...
1 parent 564f8fb commit 39b54fa2ee2e69b4bf1bc640ba7539dc5bdf483b @fauziharoon committed Apr 8, 2014
Showing with 189 additions and 19 deletions.
  1. +189 −19 annotateM
View
208 annotateM
@@ -30,6 +30,7 @@ use warnings;
#core Perl modules
use Getopt::Long;
use Carp;
+use Data::Dumper;
#CPAN modules
@@ -227,7 +228,7 @@ checkAndRunCommand("hmmscan",[[
]], DIE_ON_FAILURE);
}
-# convert the hmmscan output to tab delimited
+#convert the hmmscan output to tab delimited
checkAndRunCommand("awk",[[
"'{\$1=\$1}{ print }'",
"$locus.faaVSPfam-A.hmm.hmmscanned",
@@ -243,12 +244,12 @@ checkAndRunCommand("awk",[[
]], DIE_ON_FAILURE);
-# hashes for img
+#hashes for img
my %hash = ();
my %hash2 = ();
my %hash3 =();
-# read the img blast output and store in hash
+#read the img blast output and store in hash
open my $IMGblast, "./$locus.faaVSimg.blastp", or die "Couldn't open file $locus.faaVSimg.blastp\n";
while (<$IMGblast>)
{
@@ -298,12 +299,12 @@ while (<$rIMGblast>)
close($img_temp_OUT2);
-# hashes for uniref
+#hashes for uniref
my %hash4 = ();
my %hash5 =();
my %hash6 = ();
-# read uniref blast and store in hash
+#read uniref blast and store in hash
open my $unirefblast, "./$locus.faaVSuniref90.blastp", or die "Couldn't open file $locus.faaVSuniref90.blastp\n";
while (<$unirefblast>)
{
@@ -317,7 +318,7 @@ while (<$unirefblast>)
}
}
-# read uniref id2names.txt
+#read uniref id2names.txt
open my $unirefid2names, "/srv/db/uniprot/uniref-20140403/uniref90_id2names.txt", or die "Couldn't open id2names.txt\n";
open my $uniref_temp_OUT, ">uniref_output_temp.txt";
while (<$unirefid2names>)
@@ -350,12 +351,11 @@ while (<$runirefblast>)
close($uniref_temp_OUT2);
-# hashes for pfam
+#hashes for pfam
my %hash7 = ();
my %hash8 = ();
-my %hash9 = ();
-# read pfam hmmscan output and store in hash
+#read pfam hmmscan output and store in hash
open my $pfamoutput, "./$locus.faaVSPfam-A.hmm.hmmscanned.tab", or die "Couldn't open file $locus.faaVSPfam-A.hmm.hmmscanned.tab\n";
while (<$pfamoutput>)
{
@@ -396,15 +396,14 @@ close($pfamid2names);
close($pfam_temp_OUT);
#hashes for tigrfam
+my %hash9 = ();
my %hash10 = ();
-my %hash11 = ();
-my %hash12 = ();
#read tigrfam hmmscan output and store in hash
open my $tigrfamoutput, "./$locus.faaVStigr_all.hmm.hmmscanned.tab", or die "Couldn't open file $locus.faaVStigr_all.hmm.hmmscanned.tab\n";
while (<$tigrfamoutput>)
{
- next if /^\s*(#.*)?$/;
+ next if /^\s*(#.*)?$/;
next if $tigrfamoutput =~ /^#/;
# next if $pfamoutput =! /TIGR/;
# if ($tigrfamoutput =~ /TIGR.*/)
@@ -414,9 +413,9 @@ while (<$tigrfamoutput>)
if ($columns[5] > 10)
# if ($columns[5])
{
- $hash10{$columns[2]} = $columns[0];
- $hash10{$columns[0]} = $columns[2];
- $hash11{$columns[0]} = "$columns[2]\t$columns[5]\t$columns[4]";
+ $hash9{$columns[2]} = $columns[0];
+ $hash9{$columns[0]} = $columns[2];
+ $hash10{$columns[0]} = "$columns[2]\t$columns[5]\t$columns[4]";
}
# }
}
@@ -431,17 +430,188 @@ while (<$tigrfamid2names>)
#print "$columns[0]\n";
$columns[0] =~ s/^\s+//;
$columns[0] =~ s/\s+$//;
- if (exists $hash11{$columns[0]})
+ if (exists $hash10{$columns[0]})
{
- print {$tigrfam_temp_OUT} "$hash11{$columns[0]}\t$columns[1]\t$columns[2]\n";
+ print {$tigrfam_temp_OUT} "$hash10{$columns[0]}\t$columns[1]\t$columns[2]\n";
}
}
close($tigrfamoutput);
close($tigrfamid2names);
close($tigrfam_temp_OUT);
+# hashes for cog
+my %hash11 = ();
+my %hash12 = ();
+my %hash13 = ();
+
+# read cog blastp output and store in hash
+open my $cogblast, "./$locus.faaVSCOG.blastp", or die "Couldn't open file $locus.faaVSCOG.blastp\n";
+while (<$cogblast>)
+{
+ chomp $_;
+ my @columns = split (/\t/, $_);
+ if ($columns[11] > 60)
+ {
+ $hash11{$columns[0]} = $columns[1];
+ $hash11{$columns[1]} = $columns[0];
+ $hash12{$columns[1]} = "$columns[0]\t$columns[10]\t$columns[11]";
+ }
+}
+
+# read cog prot2COG.tab
+open my $cogid2names, "/srv/db/cog/prot2COG.tab", or die "Couldn't open prot2COG.tab\n";
+open my $cog_temp_OUT, "> cog_output_temp.txt";
+while (<$cogid2names>)
+{
+ chomp $_;
+ my @columns = split (/\t/, $_);
+ if (exists $hash12{$columns[0]})
+ {
+ $hash13{$columns[0]} = "$hash12{$columns[0]}\t$columns[1]";
+ $hash13{$columns[1]} = $hash12{$columns[0]};
+ print {$cog_temp_OUT} "$hash12{$columns[0]}\t$columns[1]\n";
+ }
+}
+
+close($cogblast);
+close($cogid2names);
+close($cog_temp_OUT);
+
+# read cog listcogs.txt
+open my $cogid2longernames, "/srv/db/cog/listcogs.txt", or die "Couldn't open listcogs.txt\n";
+open my $cog_temp_OUT2, "> cog_output_temp2.txt";
+while(<$cogid2longernames>)
+{
+ chomp $_;
+ my @columns = split (/\t/, $_);
+ if (exists $hash13{$columns[5]})
+ {
+ print {$cog_temp_OUT2} "$hash13{$columns[5]}\t$columns[3]\t$columns[4]\t$columns[6]\n";
+ }
+}
+
+close($cog_temp_OUT2);
+
+
+### now to parse all the temporary files and combine into one tab-delimited-file
+# to store the IDs => DB => values/annotations
+my %combined_bighash =();
+
+# open file for output
+open my $FINAL_OUTPUT, "> ./final_output.txt";
+# print header
+print {$FINAL_OUTPUT} "ORF_ID\t\n";
+
+# img
+open my $img_annotation, "./img_output_temp2.txt", or die "Couldn't open img_output_temp2.txt\n";
+while (<$img_annotation>)
+{
+ chomp $_;
+ my @columns = split (/\t/, $_);
+ my @baba = @columns[1..$#columns];
+ #print "@baba \n";
+ $combined_bighash{$columns[0]}->{'img'} = join("\t", @baba);
+}
+
+# uniref
+open my $uniref_annotation, "./uniref_output_temp2.txt", or die "Couldn't open uniref_output_temp2.txt\n";
+while (<$uniref_annotation>)
+{
+ chomp $_;
+ my @columns = split (/\t/, $_);
+ my @baba = @columns[1..$#columns];
+ $combined_bighash{$columns[0]}->{'uniref'} = join("\t", @baba);
+}
+
+# prokka
+# need to parse faa file to give prokka id2names
+checkAndRunCommand("grep",[[
+ "'>'",
+ "prokka_annotation/$locus.faa |",
+ "sed",
+ "'s/>//g' |",
+ "sed",
+ -e => "'s/ /\\t/'",
+ "> prokka_temp_output.txt",
+ ]], DIE_ON_FAILURE);
+
+open my $prokka_annotation, "./prokka_temp_output.txt", or die "Couldn't open prokka_temp_output.txt\n";
+while (<$prokka_annotation>)
+{
+ chomp $_;
+ my @columns = split (/\t/, $_);
+ my @baba = @columns[1..$#columns];
+ $combined_bighash{$columns[0]}->{'prokka'} = join("\t", @baba);
+}
+
+# cog
+open my $cog_annotation, "./cog_output_temp2.txt", or die "Couldn't open cog_output_temp2.txt\n";
+while (<$cog_annotation>)
+{
+ chomp $_;
+ my @columns = split (/\t/, $_);
+ my @baba = @columns[1..$#columns];
+ $combined_bighash{$columns[0]}->{'cog'} = join("\t", @baba);
+}
+
+# pfam
+open my $pfam_annotation, "./pfam_output_temp.txt", or die "Couldn't open pfam_output_temp.txt\n";
+while (<$pfam_annotation>)
+{
+ chomp $_;
+ my @columns = split (/\t/, $_);
+ my @baba = @columns[1..$#columns];
+ $combined_bighash{$columns[0]}->{'pfam'} = join("\t", @baba);
+}
+
+# tigrfam
+open my $tigrfam_annotation, "./tigrfam_output_temp.txt", or die "Couldn't open tigrfam_output_temp.txt\n";
+while (<$tigrfam_annotation>)
+{
+ chomp $_;
+ my @columns = split (/\t/, $_);
+ my @baba = @columns[1..$#columns];
+ $combined_bighash{$columns[0]}->{'tigrfam'} = join("\t", @baba);
+}
+
+# to print finally.................
+my %column_lengths = (
+ 'img' => 5,
+ 'uniref' => 5,
+ 'prokka' => 1,
+ 'cog' => 5,
+ 'pfam' => 3,
+ 'tigrfam' => 4,
+);
+
+foreach my $item (sort(keys %combined_bighash))
+{
+ print {$FINAL_OUTPUT} "$item\t";
+ foreach my $annotation_type (keys %column_lengths)
+ {
+ if (! exists $combined_bighash{$item}->{$annotation_type})
+ {
+ print {$FINAL_OUTPUT} join("\t", ("NA",) x $column_lengths{$annotation_type}), "\t";
+ }
+ else
+ {
+ print {$FINAL_OUTPUT} "$annotation_type = $combined_bighash{$item}{$annotation_type}\t";
+ }
+ }
+ print {$FINAL_OUTPUT} "\n";
+}
+
+#close all files
+close($img_annotation);
+close($uniref_annotation);
+close($prokka_annotation);
+close($cog_annotation);
+close($pfam_annotation);
+close($tigrfam_annotation);
+close($FINAL_OUTPUT);
+# to beautify the output
######################################################################
@@ -637,7 +807,7 @@ sub printAtStart {
print<<"EOF";
----------------------------------------------------------------
$0
- Copyright (C) Mohamed Fauzi Haroon
+ Copyright (C) Mohamed Fauzi Haroon, Yuji Sekiguchi, Adam Skarshewski
This program comes with ABSOLUTELY NO WARRANTY;
This is free software, and you are welcome to redistribute it
@@ -654,7 +824,7 @@ __DATA__
=head1 COPYRIGHT
- copyright (C) Mohamed Fauzi Haroon
+ Copyright (C) Mohamed Fauzi Haroon, Yuji Sekiguchi, Adam Skarshewski
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by

0 comments on commit 39b54fa

Please sign in to comment.