-
Notifications
You must be signed in to change notification settings - Fork 2
/
cbrain_check_jobs.pl
executable file
·98 lines (92 loc) · 2.88 KB
/
cbrain_check_jobs.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/perl
use strict;
use Getopt::Std;
use FindBin;use lib $FindBin::Bin;
# Help text printed (via die) whenever the command line cannot be parsed or a
# required positional argument is missing. Both supported options, -o and -w,
# are described here.
my $usage = q{Usage:
cbrain_check_jobs.pl [-o report.tab] [-w tasks_to_rerun.cfa] dataset taskdb.cfa
dataset is the base directory name where output was directed.
It assumes the log files are alongside output files as:
in ~/work/cbrain/[aln|strg]/dataset/sampleID/sampleID.log
..and that sampleID is the 3rd token in the taskdb.cfa header, e.g.:
>5 /path/to/fastq_or_aln_dir MSSM_RNA_BP_PFC_10 ...
Option -o writes the report to the given file instead of standard output.
Option -w allows creation of a copy and renumbered list of tasks entries
that were found having issues.
};
# Only the "other" write bit is masked, so created files stay group-writable.
umask 0002;
getopts('o:w:') || die($usage."\n");
my $outfile=$Getopt::Std::opt_o;
if ($outfile) {
    # Three-argument open: the file name is never parsed for mode characters.
    open(OUTF, '>', $outfile)
        || die("Error creating output file $outfile: $!\n");
    # From here on every plain print() writes to the report file.
    select(OUTF);
}
# --
# Positional arguments: the dataset directory name, then the task database file.
my $ds=shift(@ARGV) || die($usage." No dataset name provided\n");
my $tfa=shift(@ARGV) || die($usage." No taskdb file provided\n");
# Optional -w target: receives renumbered copies of the entries found with issues.
my $tdbout=$Getopt::Std::opt_w;
if ($tdbout) {
    # Opening the -w file for writing truncates it, so it must not be the input.
    die("Error: -w option cannot be the same with input taskdb!\n")
        if $tfa eq $tdbout;
    # Three-argument open: the name is used verbatim, never parsed as a mode/pipe.
    open(WT, '>', $tdbout) || die("Error creating $tdbout: $!\n");
}
open(TF, '<', $tfa) || die(" Error opening taskdb file $tfa: $!\n" );
my $wtid=0;  # running task ID assigned to entries copied to the -w file
my $wrest=0; # true while the remaining lines of a flagged record must be copied
# Minimum size, in bytes, below which an alignment file is reported as bad.
my $min_aln_size=300000000;
# Case-insensitive pattern selecting the log lines worth reporting.
my $log_issue_re=qr/erro|warn|fail|couldn|invalid|unable|cann|not found|broken|dump|fault/i;

# Walk the task database. Every header line (">taskID path sampleID ...") starts
# a record; for each one the sample's CRAM/BAM files and log are checked under
# $HOME/work/cbrain/aln/<dataset>/<sampleID>/. A record with any issue yields one
# tab-separated report line (task ID, CRAM status, BAM status, log messages) on
# the currently selected output handle and, when -w was given, is copied with a
# new sequential ID to the -w file together with its continuation lines.
while(<TF>) {
    my $line=$_;
    if (m/^>(\d+)/) {
        my $tid=$1; #task ID
        $wrest=0;
        chomp;
        my @t=split();
        my $sid=$t[2];
        # Without a sample ID every path below would silently point at "<dir>/.bam" etc.
        die("Error: no sample ID (3rd token) in taskdb header: $line")
            unless defined($sid) && length($sid);
        my $odir=$ENV{HOME}."/work/cbrain/aln/$ds/$sid";
        die("Error: no sample directory: $odir\n") unless -d $odir;
        my $fbam="$odir/$sid.bam";
        my $fcram="$odir/$sid.cram";
        my $flog="$odir/$sid.log";
        my $nobam= (! -f $fbam);
        my $nocram= (! -f $fcram);
        # -s yields undef for a missing file; treat that as size 0.
        my $bamsize= (-s $fbam) || 0;
        my $cramsize= (-s $fcram) || 0;
        # A missing or undersized CRAM is always an issue; a BAM is only judged
        # when it exists with a non-zero size.
        my $badcram=($cramsize<$min_aln_size);
        my $badbam=($bamsize>0 && $bamsize<$min_aln_size);
        # Integrity checks run only on files that passed the size test. The list
        # form of system() bypasses the shell, so unusual characters in a path
        # cannot be interpreted as shell syntax.
        if (!$badcram && system('samtools', 'quickcheck', $fcram)!=0) {
            $badcram=1;
        }
        if ($bamsize>0 && !$badbam && system('samtools', 'quickcheck', $fbam)!=0) {
            $badbam=1;
        }
        my $errmsg='.';
        # also check logs for anything suspicious
        my @hits;
        if (open(my $lfh, '<', $flog)) {
            while (my $logline=<$lfh>) {
                push(@hits, $logline) if $logline =~ $log_issue_re;
            }
            close($lfh);
        } else {
            # An unreadable log is only noted on STDERR; it does not flag the task.
            warn("Warning: cannot read log file $flog: $!\n");
        }
        if (@hits) {
            my $errs=join('', @hits);
            $errs=~s/[\n\r]+$//;
            $errmsg=join(";", (split(/[\n\r]+/, $errs) ));
        }
        if ($badcram || $badbam || @hits) {
            my $bst=$nobam ? 'nobam' : ($badbam ? 'badbam' : 'bamOK');
            my $cst=$nocram ? 'nocram' : ($badcram ? 'badcram' : 'cramOK');
            print join("\t",$tid, $cst, $bst, $errmsg)."\n";
            if ($tdbout) {
                $wtid++;
                $line=~s/^>\d+/>$wtid/;
                print WT $line;
                $wrest=1; #to print the other lines for this record, if any
            }
        }
    } elsif ($wrest) {
        # Continuation line of a flagged record: it belongs in the -w file next
        # to its renumbered header, not in the report.
        print WT $line;
    }
}
# Release the file handles. Buffered write errors (e.g. a full disk) only
# surface when a write handle is closed, so both output files are checked.
if ($tdbout) {
    close(WT) || die("Error closing $tdbout: $!\n");
}
close(TF);
# --
if ($outfile) {
    # Restore STDOUT as the default print handle before closing the report.
    select(STDOUT);
    close(OUTF) || die("Error closing output file $outfile: $!\n");
}
#************ Subroutines **************