|
|
@@ -8,54 +8,56 @@ |
|
|
import os
|
|
|
import sys
|
|
|
|
|
|
-def ParseLogs(trimlist, maplist, software, sample):
|
|
|
+def ParseLogs(trimlist, maplist, software, sample, layout):
|
|
|
"""
|
|
|
Merges statistics from lists of split trimmomatic and hisat/tophat log files for a given sample
|
|
|
-
|
|
|
-
|
|
|
+
|
|
|
+
|
|
|
arguments:
|
|
|
trimlist --> a list of trimmomatic log files
|
|
|
maplist --> a list of hisat2 or tophat2 log files
|
|
|
software --> read mapping software that generated log files. 'hisat' or 'tophat'
|
|
|
sample --> A sample number that is present at the beginning of each filename
|
|
|
-
|
|
|
+
|
|
|
output:
|
|
|
A dictionary, where the sample ID number is the key, and the following statistics are stored in the list value:
|
|
|
Number of raw input read pairs
|
|
|
Number of read pairs following trimming
|
|
|
Number of concordant, single alignments
|
|
|
Overall Alignment rate
|
|
|
-
|
|
|
+
|
|
|
The overall Alignment rate for hisat2 is calculated as follows:
|
|
|
-
|
|
|
+
|
|
|
(concordant single alignments + concordant multiple alignments + non-paired single alignments)/(cleaned input pairs)
|
|
|
-
|
|
|
+
|
|
|
The overall Alignment rate for tophat2 is calculated as follows:
|
|
|
-
|
|
|
+
|
|
|
((left mapped reads + right mapped reads)/2)/(cleaned input pairs)
|
|
|
-
|
|
|
+
|
|
|
Logfile names must be formatted as follows:
|
|
|
-
|
|
|
+
|
|
|
<sample number>-<unique file identifier>-align_summary.txt (only for tophat)
|
|
|
<sample number>-<unique file identifier>-fastq-out.txt (for hisat)
|
|
|
<sample number>-unique file identifier>-fastq-trimmomatic.txt
|
|
|
-
|
|
|
+
|
|
|
for example:
|
|
|
2-000000-000000.fastq-align_summary.txt
|
|
|
1-000000-000000.fastq-out.txt
|
|
|
- 1-000000-000000.fastq-trimmomatic.txt
|
|
|
-
|
|
|
+ 1-000000-000000.fastq-trimmomatic.txt
|
|
|
+
|
|
|
NOTE: do not mix log files for tophat and hisat
|
|
|
-
|
|
|
+
|
|
|
"""
|
|
|
triminput = []
|
|
|
trimoutput = []
|
|
|
mapped = []
|
|
|
concordant = []
|
|
|
+ multi = []
|
|
|
+ single = []
|
|
|
sample = str(sample)
|
|
|
software = str(software)
|
|
|
report = {}
|
|
|
- if software == 'hisat2':
|
|
|
+ if software == 'hisat2' and layout == 'paired':
|
|
|
for file in trimlist:
|
|
|
basename = os.path.splitext(file)[0]
|
|
|
dataset = basename.split('-')[0]
|
|
|
@@ -81,8 +83,8 @@ def ParseLogs(trimlist, maplist, software, sample): |
|
|
elif 'aligned exactly 1 time' in line:
|
|
|
singlemap = int(line.split('(')[0])
|
|
|
mapped.append(singlemap)
|
|
|
-
|
|
|
- elif software == 'tophat2':
|
|
|
+
|
|
|
+ elif software == 'tophat2' and layout == 'paired':
|
|
|
for file in trimlist:
|
|
|
basename = os.path.splitext(file)[0]
|
|
|
dataset = basename.split('-')[0]
|
|
|
@@ -102,10 +104,10 @@ def ParseLogs(trimlist, maplist, software, sample): |
|
|
discordpair = []
|
|
|
leftmap = []
|
|
|
rightmap = []
|
|
|
-
|
|
|
+
|
|
|
count = 0
|
|
|
for line in open(file):
|
|
|
- count +=1
|
|
|
+ count +=1
|
|
|
if count == 11:
|
|
|
totalpairs = int(line.split(':')[1].strip())
|
|
|
totalpair.append(totalpairs)
|
|
|
@@ -121,19 +123,19 @@ def ParseLogs(trimlist, maplist, software, sample): |
|
|
elif count == 7:
|
|
|
rightmapped = int(line.split(':')[1].split('(')[0].strip())
|
|
|
rightmap.append(rightmapped)
|
|
|
-
|
|
|
+
|
|
|
totalpair = sum(totalpair)
|
|
|
multipair = sum(multipair)
|
|
|
discordpair = sum(discordpair)
|
|
|
leftmap = sum(leftmap)
|
|
|
rightmap = sum(rightmap)
|
|
|
mappedsingles = leftmap + rightmap
|
|
|
- concordantpairs = totalpair - multipair - discordpair
|
|
|
+ concordantpairs = totalpair - multipair - discordpair
|
|
|
concordant.append(concordantpairs)
|
|
|
mapped.append(mappedsingles)
|
|
|
-
|
|
|
- if software == 'tophat2':
|
|
|
-
|
|
|
+
|
|
|
+ if software == 'tophat2' and layout == 'paired':
|
|
|
+
|
|
|
inputraw = sum(triminput)
|
|
|
inputclean = sum(trimoutput)
|
|
|
concordant = sum(concordant)
|
|
|
@@ -142,30 +144,110 @@ def ParseLogs(trimlist, maplist, software, sample): |
|
|
float2 = float(inputclean)
|
|
|
float3 = float1/2
|
|
|
rate = float3/float2
|
|
|
-
|
|
|
+
|
|
|
report[sample] = [inputraw, inputclean, concordant, rate*100]
|
|
|
return report
|
|
|
-
|
|
|
- elif software == 'hisat2':
|
|
|
+
|
|
|
+ elif software == 'hisat2' and layout == 'paired':
|
|
|
inputraw = sum(triminput)
|
|
|
inputclean = sum(trimoutput)
|
|
|
concordant = sum(concordant)
|
|
|
mapped = sum(mapped)
|
|
|
float1 = float(mapped)
|
|
|
float2 = float(inputclean)
|
|
|
rate = float1/float2
|
|
|
-
|
|
|
+
|
|
|
report[sample] = [inputraw, inputclean, concordant, rate*100]
|
|
|
return report
|
|
|
-
|
|
|
-
|
|
|
+
|
|
|
+ elif software == 'tophat2' and layout == 'single':
|
|
|
+ for file in trimlist:
|
|
|
+ basename = os.path.splitext(file)[0]
|
|
|
+ dataset = basename.split('-')[0]
|
|
|
+ if dataset == sample:
|
|
|
+ for line in open(file):
|
|
|
+ if 'Input Reads' in line:
|
|
|
+ inp = int(line.split(' ')[2])
|
|
|
+ triminput.append(inp)
|
|
|
+ out = int(line.split(' ')[4])
|
|
|
+ trimoutput.append(out)
|
|
|
+ for file in maplist:
|
|
|
+ basename = os.path.splitext(file)[0]
|
|
|
+ dataset = basename.split('-')[0]
|
|
|
+ if dataset == sample:
|
|
|
+ count = 0
|
|
|
+ for line in open(file):
|
|
|
+ count +=1
|
|
|
+ if count == 3:
|
|
|
+ map = int(line.split(':')[1].split('(')[0].strip())
|
|
|
+ mapped.append(map)
|
|
|
+ elif count == 4:
|
|
|
+ mult = int(line.split(':')[1].split('(')[0].strip())
|
|
|
+ multi.append(mult)
|
|
|
+ totalreads = sum(triminput)
|
|
|
+ cleanreads = sum(trimoutput)
|
|
|
+ mappedtotal = sum(mapped)
|
|
|
+ multimapped = sum(multi)
|
|
|
+ singlemapped = mappedtotal - multimapped
|
|
|
+ float1 = float(cleanreads)
|
|
|
+ float2 = float(mappedtotal)
|
|
|
+ rate = float2/float1
|
|
|
+
|
|
|
+ report[sample] = [totalreads, cleanreads, singlemapped, rate*100]
|
|
|
+ return report
|
|
|
+
|
|
|
+
|
|
|
+ elif software == 'hisat2' and layout == 'single':
|
|
|
+ for file in trimlist:
|
|
|
+ basename = os.path.splitext(file)[0]
|
|
|
+ dataset = basename.split('-')[0]
|
|
|
+ if dataset == sample:
|
|
|
+ for line in open(file):
|
|
|
+ if 'Input Reads' in line:
|
|
|
+ inp = int(line.split(' ')[2])
|
|
|
+ triminput.append(inp)
|
|
|
+ out = int(line.split(' ')[4])
|
|
|
+ trimoutput.append(out)
|
|
|
+ for file in maplist:
|
|
|
+ basename = os.path.splitext(file)[0]
|
|
|
+ dataset = basename.split('-')[0]
|
|
|
+ if dataset == sample:
|
|
|
+ count = 0
|
|
|
+ for line in open(file):
|
|
|
+ count +=1
|
|
|
+ if count == 7:
|
|
|
+ sing = int(line.split('(')[0].strip())
|
|
|
+ single.append(sing)
|
|
|
+ elif count == 8:
|
|
|
+ mult = int(line.split('(')[0].strip())
|
|
|
+ multi.append(mult)
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+ totalreads = sum(triminput)
|
|
|
+ cleanreads = sum(trimoutput)
|
|
|
+ totalsingle = sum(single)
|
|
|
+ totalmulti = sum(multi)
|
|
|
+ mappedtotal = int(totalsingle + totalmulti)
|
|
|
+ float1 = float(cleanreads)
|
|
|
+ float2 = float(mappedtotal)
|
|
|
+ rate = float2/float1
|
|
|
+
|
|
|
+ report[sample] = [totalreads, cleanreads, totalsingle, rate*100]
|
|
|
+ return report
|
|
|
+
|
|
|
+
|
|
|
|
|
|
def main():
|
|
|
|
|
|
software = sys.argv[1]
|
|
|
+ layout = sys.argv[2]
|
|
|
|
|
|
#Print header of QC report
|
|
|
- print 'Sample' + '\t' + 'raw_input_pairs' + '\t' + 'cleaned_input_pairs' + '\t' + 'concordant_single_alignments' + '\t' + 'Overall_Alignment_Rate'
|
|
|
+ if layout == 'paired':
|
|
|
+ print 'Sample' + '\t' + 'raw_input_pairs' + '\t' + 'cleaned_input_pairs' + '\t' + 'concordant_single_alignments' + '\t' + 'Overall_Alignment_Rate'
|
|
|
+ elif layout == 'single':
|
|
|
+ print 'Sample' + '\t' + 'raw_input_reads' + '\t' + 'cleaned_input_reads' + '\t' + 'single_alignments' + '\t' + 'Overall_Alignment_Rate'
|
|
|
|
|
|
#locate input log files and append to lists
|
|
|
|
|
|
@@ -185,19 +267,14 @@ def main(): |
|
|
dataset = basename.split('-')[0]
|
|
|
if dataset not in inputlist:
|
|
|
inputlist.append(dataset)
|
|
|
-
|
|
|
- #for each unique sample, call the ParseLogs function and print results in tab-delimited format
|
|
|
+
|
|
|
+ #for each unique sample, call the ParseLogs function and print results in tab-delimited format
|
|
|
for dataset in inputlist:
|
|
|
- stats = ParseLogs(trimloglist, maploglist, software, dataset)
|
|
|
+ stats = ParseLogs(trimloglist, maploglist, software, dataset, layout)
|
|
|
print dataset + '\t' + str(stats[dataset][0]) + '\t' + str(stats[dataset][1]) + '\t' + str(stats[dataset][2]) + '\t' + str(stats[dataset][3]) + '%'
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
main()
|
|
|
|
|
|
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
0 comments on commit
505b81a