-
Notifications
You must be signed in to change notification settings - Fork 5
/
Eukaryotic_virus_Generate_report_All_Samples.pl
483 lines (425 loc) · 18 KB
/
Eukaryotic_virus_Generate_report_All_Samples.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
################################################################################
## Author:
## Guoyan Zhao <gzhao@pathology.wustl.edu>
#*
#* Copyright (C) Washington University in St. Louis Developed by Guoyan Zhao,
#* Washington University in St. Louis, Department of Pathology and Immunology.
#*
###############################################################################
#!/usr/bin/perl
use strict;
my $usage = "
This script will read corresponding files in the given director and
generate a report which contains SampleDescription, SequenceReport,
AssignmentSummary, InterestingReads.
perl $0 <run folder> <program version>
<run folder> = full path of the folder holding files for this sequence run
<program version> = e.g. 0.063
";
die $usage unless scalar @ARGV == 2;
my ( $dir, $version ) = @ARGV;
my @temp = split("/", $dir);
my $run_name = pop @temp;
my $outFile = $dir."/Analysis_Report_".$run_name.".txt";
open (OUT, ">$outFile") or die "can not open file $outFile!\n";
my ($wkday,$month,$day,$time,$year) = split(/\s+/, localtime);
print OUT "Illumina V${version}; Processing date: $day-$month-$year\n";
print OUT "cd-hit parameter: -c 0.95 -n 8 -G 0 -aS 0.95 -g 1 -r 1 -M 9000 -d 0\n";
print OUT "MegaBLAST against ref genome database parameter: -evalue 1e-9 -word_size 16 -show_gis \n";
print OUT "BLASTn against NT VIRUSDB database parameter: -evalue 1e-4 -show_gis -task blastn \n";
print OUT "BLASTx against NR VIRUSDB parameter: -evalue 1e-2 -show_gis \n";
print OUT "MegaBLAST against full NT database parameter:\n";
print OUT " -evalue 1e-9 -word_size 16 -show_gis -num_descriptions 50 -num_alignments 50 \n";
print OUT "BLASTn against nt database parameter:\n";
print OUT " -evalue 1e-9 -show_gis -task blastn -num_descriptions 50 -num_alignments 50 \n";
print OUT "BLASTx against nr database parameter: \n";
print OUT "-evalue 1e-2 -show_gis -num_descriptions 50 -num_alignments 50 \n";
print OUT "database version: Human genomic: human_g1k_v37.fasta\n";
print OUT "NR, NT, taxonomy: 20160802\n";
print OUT "E-value cutoff for significant hit:\n";
print OUT "BLASTn VIRUSDB 1e-5, MegaBLAST ref genome, MegaBLAST NT: e-10\n";
print OUT "BLASTn NT: e-10, BLASTx VIRUSDB and BLASTx NR: e-3\n";
my $c = "**************************************************************************\n";
my $c2 = "#########################################################################\n\n";
print OUT $c;
###################################################
print OUT "How to read this file:\n\n";
print OUT "For the Summary section:\n";
print OUT "column 1: sample name\n";
print OUT "column 2: total number of reads obtained for this sample (pairs)\n\n";
print OUT "If there is any viral sequence identified in this sample, it will show up under \n";
print OUT "the information of this sample. There are 3 columns to describe the viral sequences identified in this sample:\n";
print OUT "column 1: number of viral sequences (could be read or contig)\n";
print OUT "column 2: range of percentage identity to blast hits. Some times one sequence can hit multiple \n";
print OUT "sequences in the database with the same e value. Only the first best hit percent identity is taken.\n";
print OUT "column 3: name of the virus (the last rank in the taxonomy lineage of the virus)\n\n";
###################################################
print OUT "For the Sequence Report section: It was divided into three subsections.\n\n";
print OUT "First Subsection:\n";
print OUT "column 1: #totalPair: total number of reads in pairs obtained for this sample.\n";
print OUT "column 2: #unjoined: number of reads not joined by fastq-join\n";
print OUT "column 3: #joined: number of reads joined by fastq-join \n";
print OUT "column 4: %total: percentage of joined reads over total pair of reads\n";
print OUT "column 5: #TotalAfterStitch: number of reads (unjoined+joined) after fastq-join \n";
print OUT "column 6: #QC: number of reads left after quality control (removing low quality nt and read, low complexity read, exact duplicates etc)\n";
print OUT "column 7: %Stitched: percentage of reads left after quality control (reads left after quality control divided by total number of reads after join read 1 and 2 (column 5)\n";
print OUT "column 8: #unique: number of unique reads after removing redundancy using CD-HIT\n";
print OUT "column 9: %Stitched: percentage of deduplicated reads (reads left after deduplication divided by total number of reads after join read 1 and 2 (column 5)\n";
print OUT "column 10: sample: name of the sample\n\n";
print OUT "Second Subsection:\n";
print OUT "column 1: #TANTAN_good: number of good quality reads after TANTAN filtering low complexity sequence.\n";
print OUT "column 2: %unique: percentage of #TANTAN_good reads over total number of deduplicated reads (column 8 in the first subsection\n";
print OUT "column 3: #RM_good: number of good quality reads after RepeatMasker filtering low complexity sequence.\n";
print OUT "column 4: %unique: percentage of #RM_good reads over total number of deduplicated reads (column 8 in the first subsection\n";
print OUT "column 5: #RefGenomeFiltered: number of reads after reference genome filtering.\n";
print OUT "column 6: %unique: percentage of #RM_good reads over total number of deduplicated reads (column 8 in the first subsection\n";
print OUT "column 7: #BN_VIRUSDB_Hit: number of reads have significant hit in BLASTn Virus-only database\n";
print OUT "column 8: #BX_VIRUSDB_Hit: number of reads have significant hit in BLASTx Virus-only database\n";
print OUT "column 9: #Total_VIRUSDB_Hit: number of reads have significant hit in both BLASTn and BLASTx Virus-only database (candidate eukaryotic viral sequences)\n";
print OUT "column 10: %unique: percentage of #RM_good reads over total number of deduplicated reads (column 8 in the first subsection\n";
print OUT "column 11: VIRUSDB_Hit_Bac: number of candidate eukaryotic viral reads mapped to bacteria\n";
print OUT "column 12: %VIRUSDB_Hit: percentage of VIRUSDB_Hit_Bac over total number of candidate eukaryotic viral sequences (column 9\n";
print OUT "column 13: VIRUSDB_Hit_NON_Bac: number of candidate eukaryotic viral reads not mapped to bacteria\n";
print OUT "column 14: %VIRUSDB_Hit: percentage of VIRUSDB_Hit_NON_Bac over total number of candidate eukaryotic viral sequences (column 9\n";
print OUT "column 15: sample: name of the sample\n\n";
print OUT "Third Subsection: information about BLAST results.\n";
print OUT "column 1: #BNinput: number of reads entered BLASTn NT step\n";
print OUT "column 2: %VIRUSDB_Hit: percentage of #BNinput over total number of candidate \n eukaryotic viral sequences (column 3)\n";
print OUT "column 3: BXinput: number of reads entered BLASTx NR step\n";
print OUT "column 4: %VIRUSDB_Hit: percentage of BXinput over total number of candidate \n eukaryotic viral sequences (column 3)\n";
print OUT "column 5: Unassigned: number of reads that do not have sequence similarity with \n anything in the database.\n";
print OUT "column 6: %VIRUSDB_Hit: percentage of Unassigned over total number of candidate \n eukaryotic viral sequences (column 3)\n";
print OUT "column 7: sample name \n\n";
###################################################
print OUT "For the Taxonomy Assignment section:\n";
print OUT "Describes the number of sequences assigned to each category.\n\n";
print OUT "If any viral sequence is detected in this sample, one line of description for each viral lineage will\n";
print OUT "be under the assignment information. The description includes the full taxonomy lineage of the virus, \n";
print OUT "the total number of reads that are assigned to this taxonomy lineage and the percentage of identify to the\n";
print OUT "most related viruses.\n\n";
print OUT "For the Interesting Reads section:\n";
print OUT "Sample name\n";
print OUT "Viral lineage, total number of reads assigned to this lineage, range of percentage of identity to the most related viruses\n\n";
print OUT "Description of the reads and the top hit BLAST information:\n";
print OUT "The fields are: Query Name, Query Length, Hit Name, Hit Length, Hit Description, Alignment Length, Percent Identity, Hit Start, Hit End, e value\n\n";
print OUT "Sequences in FASTA format\n\n";
print OUT $c;
print OUT "Viral sequence Summary:\n\n";
&generate_SampleDescription( $dir );
print OUT "End of Summary\n\n";
print OUT $c ;
print OUT "Ambiguous sequence Summary:\n\n";
&generate_SampleDescription_ambiguous( $dir );
print OUT "End of Summary\n\n";
print OUT $c ;
print OUT "\n\nSequence Report\n\n";
&generate_SequenceReport( $dir );
print OUT "End of Sequence Report\n\n";
print OUT $c ;
print OUT "\n\nTaxonomy Assignment:\n\n";
&generate_AssignmentSummary( $dir );
print OUT "End of Assignment\n\n";
print OUT $c ;
print OUT "\n\nViral Reads\n\n";
&generate_InterestingReads( $dir );
print OUT "End of Viral Reads\n\n";
print OUT $c ;
print OUT "\n\nAmbiguous Reads\n\n";
&generate_InterestingReads_ambiguous( $dir );
print OUT "End of Ambiguous Reads\n\n";
print "\n";
print OUT "# Finished\n";
exit;
############################################################################
sub generate_SampleDescription {
my ($dir) = @_;
print OUT $dir,"\n";
printf OUT "%15s\t%15s\t%40s\n", "ViralRead", "PercentIDrange", "IdentifiedVirus";
opendir(DH, $dir) or die "Can not open dir $dir!\n";
my @files = readdir DH;
foreach my $name (sort {$a cmp $b} @files) {
if (!($name eq ".") && !($name eq "..")) {
# name is either file name or sample name (directory)
my $full_path = $dir."/".$name;
if (-d $full_path) { # is a directory, sample directory
# get total number of sequences in the sample
my $SE1 = $full_path."/".$name."_SE1.RemoveAdapter.fastq";
my $total_seq = 0;
if (-e $SE1) {
my $line_number_SE1 = `wc -l $SE1`;
$total_seq = $line_number_SE1/4;
}
# print out report for this sample
printf OUT "%30s\t%8d\n", $name, $total_seq;
my $Summary_file = $full_path."/".$name.".AssignmentSummary";
if (-e $Summary_file) {
open (IN, $Summary_file) or die "can not open file $Summary_file!\n";
foreach (1..21) {
<IN>;
}
while (<IN>) {
if ($_ =~ /^\s*$/) { # empty line
next;
}
elsif ($_ =~ /# Finished Assignment Summary/) {
next;
}
else {
chomp $_;
my $number_reads = 0;
my $range = "";
my @temp = split(/\t/, $_);
my $range = pop @temp;
my $info = pop @temp;
my $virus_info = pop @temp;
my $virus = "";
if ($info =~ /total number of reads: (\d+)/) {
$number_reads = $1;
}
if ($virus_info =~ /hit does not have taxonomy entry/) {
my @temp2 = split (",", $virus_info);
$virus = shift @temp2;
}
else {
my @temp2 = split(";", $virus_info);
$virus = pop @temp2;
}
printf OUT "%15d\t%15s\t%40s\n", $number_reads, $range, $virus;
}
}
close IN;
}
else {
print OUT "$Summary_file does not exist!\n";
}
}
}
}
}
############################################################################
sub generate_SampleDescription_ambiguous {
my ($dir) = @_;
printf OUT "%15s\t%15s\t%40s\n", "AmbiguousRead", "PercentIDrange", "VirusLineage";
opendir(DH, $dir) or die "Can not open dir $dir!\n";
my @files = readdir DH;
foreach my $name (sort {$a cmp $b} @files) {
if (!($name eq ".") && !($name eq "..")) {
# name is either file name or sample name (directory)
my $full_path = $dir."/".$name;
if (-d $full_path) { # is a directory, sample directory
# get total number of sequences in the sample
my $SE1 = $full_path."/".$name."_SE1.RemoveAdapter.fastq";
my $total_seq = 0;
if (-e $SE1) {
my $line_number_SE1 = `wc -l $SE1`;
$total_seq = $line_number_SE1/4;
}
# print out report for this sample
printf OUT "%30s\t%8d\n", $name, $total_seq;
my $Summary_file = $full_path."/".$name.".AmbiguousReadsAssignmentSummary";
if (-e $Summary_file) {
open (IN, $Summary_file) or die "can not open file $Summary_file!\n";
while (<IN>) {
if ($_ =~ /^\s*$/) { # empty line
next;
}
elsif ($_ =~ /# Finished Assignment Summary/) {
next;
}
else {
chomp $_;
my $number_reads = 0;
my $range = "";
my @temp = split(/\t/, $_);
my $range = pop @temp;
my $info = pop @temp;
my $virus_info = pop @temp;
my $virus = "";
if ($info =~ /total number of reads: (\d+)/) {
$number_reads = $1;
}
if ($virus_info =~ /hit does not have taxonomy entry/) {
my @temp2 = split (",", $virus_info);
$virus = shift @temp2;
}
else {
my @temp2 = split(";", $virus_info);
$virus = pop @temp2;
}
printf OUT "%15d\t%15s\t%40s\n", $number_reads, $range, $virus;
}
}
close IN;
}
else {
print OUT "$Summary_file does not exist!\n";
}
}
}
}
}
##########################################################################
sub generate_SequenceReport {
my ( $dir ) = @_;
opendir(DH, $dir) or die "Can not open dir $dir!\n";
my @files = readdir DH;
# use the same format as in the script generating assignment report
printf OUT "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", "#totalPair", "#unjoined", "#joined", "%total", "#TotalAfterStitch", "#QC", "%Stitched", "#unique", "%Stitched", "sample" ;
foreach my $name (sort {$a cmp $b} @files) {
# name is either file name or sample name (directory)
my $Summary_file = $dir."/".$name."/".$name.".AssignmentSummary";
if (-e $Summary_file) {
open (IN, $Summary_file) or die "can not open file $Summary_file!\n";
while (my $line = <IN>) {
if ($line =~ /#total/) {
$line = <IN>;
chomp $line;
print OUT $line, "\t", $name, "\n";
next;
}
}
}
}
print OUT $c2 ;
printf OUT "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", "#TANTAN__good", "%unique", "#RM_good", "%unique","#RefGenomeFiltered", "%unique", "#BN_VIRUSDB_Hit", "#BX_VIRUSDB_Hit", "Total_VIRUSDB_Hit", "%unique","VIRUSDB_Hit_Bac", "%VIRUSDB_Hit", "VIRUSDB_Hit_NON_Bac", "%VIRUSDB_Hit", "sample";
foreach my $name (sort {$a cmp $b} @files) {
# name is either file name or sample name (directory)
my $Summary_file = $dir."/".$name."/".$name.".AssignmentSummary";
if (-e $Summary_file) {
open (IN, $Summary_file) or die "can not open file $Summary_file!\n";
while (my $line = <IN>) {
if ($line =~ /#RM_good/) {
$line = <IN>;
chomp $line;
print OUT $line, "\t", $name, "\n";
next;
}
}
}
}
print OUT $c2 ;
printf OUT "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", "#BNinput", "%VIRUSDB_Hit" , "#BXinput", "%VIRUSDB_Hit" , "#unassigned", "%VIRUSDB_Hit", "sample" ;
foreach my $name (sort {$a cmp $b} @files) {
# name is either file name or sample name (directory)
my $Summary_file = $dir."/".$name."/".$name.".AssignmentSummary";
if (-e $Summary_file) {
open (IN, $Summary_file) or die "can not open file $Summary_file!\n";
while (my $line = <IN>) {
if ($line =~ /#BNinput/) {
$line = <IN>;
chomp $line;
print OUT $line, "\t", $name, "\n";
next;
}
}
}
}
print OUT $c2 ;
}
#####################################################################
# Assignment Summary
sub generate_AssignmentSummary {
my ( $dir ) = @_;
opendir(DH, $dir) or die "Can not open dir $dir!\n";
my @files = readdir DH;
foreach my $name (sort {$a cmp $b} @files) {
# name is either file name or sample name (directory)
my $full_path = $dir."/".$name;
if (!($name eq ".") && !($name eq "..")) {
# if (!($name =~ /\./)) {
if (-d $full_path) { # is a directory
my $Summary_file = $full_path."/".$name.".AssignmentSummary";
if (-e $Summary_file) {
open (IN, $Summary_file) or die "can not open file $Summary_file!\n";
while (<IN>) {
if ($_ =~ /# Finished Assignment Summary/) {
next;
}
print OUT $_;
}
}
print OUT $c2 ;
}
}
}
}
############################################################################
sub count_num_of_seq () {
my ($fastaFile) = @_;
my $count = 0;
open (FastaFile, $fastaFile) or die "Can't Open FASTA file: $fastaFile";
while (my $line = <FastaFile>){
if ($line =~ ">") {
$count++;
}
}
close FastaFile;
return $count;
}
####################################################################################
# viral sequence
sub generate_InterestingReads {
my ( $dir ) = @_;
opendir(DH, $dir) or die "Can not open dir $dir!\n";
my @files = readdir DH;
foreach my $name (sort {$a cmp $b} @files) {
# name is either file name or sample name (directory)
my $full_path = $dir."/".$name;
if (!($name eq ".") && !($name eq "..")) {
# if (!($name =~ /\./)) {
if (-d $full_path) { # is a directory
print OUT $name, "\n";
my $tempF = $full_path."/".$name.".AssignmentReport";
if ( -e $tempF ) {
open (IN, $tempF) or die "can not open file $tempF!\n";
foreach (1..20) {
<IN>;
}
while (<IN>) {
if ($_ =~ /# Finished Assignment Report/) {
next;
}
print OUT $_;
}
close IN;
}
else {
print OUT "$name does not have .AssignmentReport file!\n";
}
print OUT $c2;
}
}
}
}
####################################################################################
# ambiguous sequence
sub generate_InterestingReads_ambiguous {
my ( $dir ) = @_;
opendir(DH, $dir) or die "Can not open dir $dir!\n";
my @files = readdir DH;
foreach my $name (sort {$a cmp $b} @files) {
# name is either file name or sample name (directory)
my $full_path = $dir."/".$name;
if (!($name eq ".") && !($name eq "..")) {
# if (!($name =~ /\./)) {
if (-d $full_path) { # is a directory
print OUT $name, "\n";
my $tempF = $full_path."/".$name.".AmbiguousReadsAssignmentReport";
if ( -e $tempF ) {
open (IN, $tempF) or die "can not open file $tempF!\n";
while (<IN>) {
if ($_ =~ /# Finished Assignment Report/) {
next;
}
print OUT $_;
}
close IN;
}
else {
print OUT "$name does not have .AssignmentReport file!\n";
}
print OUT $c2;
}
}
}
}