Permalink
Switch branches/tags
Nothing to show
Find file
8c88334 May 4, 2017
373 lines (314 sloc) 13.6 KB
## William Poehlman ##
## LogParse.py v1.1 ##
## wpoehlm@clemson.edu ##
## First Write February 1st, 2017 ##
## This script merges the standard output log files of the OSG-GEM workflow for each sample, and prints a quality control report ##
import glob
import os
import sys
def ParseLogs(trimlist, maplist, software, sample, layout):
"""
Merges statistics from lists of split trimmomatic and hisat/tophat log files for a given sample
arguments:
trimlist --> a list of trimmomatic log files
maplist --> a list of hisat2 or tophat2 log files
software --> read mapping software that generated log files. 'hisat' or 'tophat'
sample --> A sample number that is present at the beginning of each filename
layout --> Sequencing layout. 'single' or 'paired'
output:
A dictionary, where the sample ID number is the key, and the following statistics are stored in the list value:
Number of raw input read pairs
Number of read pairs following trimming
Number of concordant, single alignments
Overall Alignment rate
The overall Alignment rate for hisat2 is calculated as follows:
(concordant single alignments + concordant multiple alignments + non-paired single alignments)/(cleaned input pairs)
The overall Alignment rate for tophat2 is calculated as follows:
((left mapped reads + right mapped reads)/2)/(cleaned input pairs)
The overall Alignment rate for STAR is calculated as follows:
(uniquely mapped reads + multiple alignments) / (cleaned input pairs)
Logfile names must be formatted as follows:
<sample number>-<unique file identifier>-align_summary.txt (only for tophat)
<sample number>-<unique file identifier>-fastq-out.txt (for hisat)
<sample number>-<unique file identifier>-Log.final.out
<sample number>-unique file identifier>-fastq-trimmomatic.txt
for example:
2-000000-000000.fastq-align_summary.txt
1-000000-000000.fastq-out.txt
1-000000-000000.fastq-trimmomatic.txt
NOTE: do not mix log files for tophat/hisat/star
"""
triminput = []
trimoutput = []
mapped = []
concordant = []
multi = []
single = []
sample = str(sample)
software = str(software)
report = {}
if software == 'hisat2' and layout == 'paired':
for file in trimlist:
basename = os.path.splitext(file)[0]
dataset = basename.split('-')[0]
if dataset == sample:
for line in open(file):
if 'Input Read Pairs' in line:
inp = int(line.split(' ')[3])
triminput.append(inp)
out = int(line.split(' ')[6])
trimoutput.append(out)
for file in maplist:
basename = os.path.splitext(file)[0]
dataset = basename.split('-')[0]
if dataset == sample:
for line in open(file):
if 'aligned concordantly exactly 1 time' in line:
conc = int(line.split('(')[0])
concordant.append(conc)
mapped.append(conc)
elif 'aligned concordantly >1 times' in line:
mult = int(line.split('(')[0])
mapped.append(mult)
elif 'aligned exactly 1 time' in line:
singlemap = int(line.split('(')[0])
mapped.append(singlemap)
elif software == 'tophat2' and layout == 'paired':
for file in trimlist:
basename = os.path.splitext(file)[0]
dataset = basename.split('-')[0]
if dataset == sample:
for line in open(file):
if 'Input Read Pairs' in line:
inp = int(line.split(' ')[3])
triminput.append(inp)
out = int(line.split(' ')[6])
trimoutput.append(out)
for file in maplist:
basename = os.path.splitext(file)[0]
dataset = basename.split('-')[0]
if dataset == sample:
totalpair = []
multipair = []
discordpair = []
leftmap = []
rightmap = []
count = 0
for line in open(file):
count +=1
if count == 11:
totalpairs = int(line.split(':')[1].strip())
totalpair.append(totalpairs)
elif count == 12:
multipairs = int(line.split(':')[1].split('(')[0].strip())
multipair.append(multipairs)
elif count == 13:
discordpairs = int(line.split('(')[0].strip())
discordpair.append(discordpairs)
elif count == 3:
leftmapped = int(line.split(':')[1].split('(')[0].strip())
leftmap.append(leftmapped)
elif count == 7:
rightmapped = int(line.split(':')[1].split('(')[0].strip())
rightmap.append(rightmapped)
totalpair = sum(totalpair)
multipair = sum(multipair)
discordpair = sum(discordpair)
leftmap = sum(leftmap)
rightmap = sum(rightmap)
mappedsingles = leftmap + rightmap
concordantpairs = totalpair - multipair - discordpair
concordant.append(concordantpairs)
mapped.append(mappedsingles)
elif software == 'star' and layout == 'paired':
multimapped = []
uniqmapped = []
for file in trimlist:
basename = os.path.splitext(file)[0]
dataset = basename.split('-')[0]
if dataset == sample:
for line in open(file):
if 'Input Read Pairs' in line:
inp = int(line.split(' ')[3])
triminput.append(inp)
out = int(line.split(' ')[6])
trimoutput.append(out)
for file in maplist:
basename = os.path.splitext(file)[0]
dataset = basename.split('-')[0]
if dataset == sample:
count = 0
for line in open(file):
count +=1
if count == 9:
uniq = int(line.split('|')[1].strip())
uniqmapped.append(uniq)
if count == 24:
multi = int(line.split('|')[1].strip())
multimapped.append(multi)
concordant = sum(uniqmapped)
mult = sum(multimapped)
mapped = concordant + mult
inputraw = sum(triminput)
inputclean = sum(trimoutput)
float1 = float(mapped)
float2 = float(inputclean)
rate = float1/float2
report[sample] = [inputraw, inputclean, concordant, rate*100]
return report
if software == 'star' and layout == 'single':
multimapped = []
uniqmapped = []
for file in trimlist:
basename = os.path.splitext(file)[0]
dataset = basename.split('-')[0]
if dataset == sample:
for line in open(file):
if 'Input Reads' in line:
inp = int(line.split(' ')[2])
triminput.append(inp)
out = int(line.split(' ')[4])
trimoutput.append(out)
for file in maplist:
basename = os.path.splitext(file)[0]
dataset = basename.split('-')[0]
if dataset == sample:
count = 0
for line in open(file):
count +=1
if count == 9:
uniq = int(line.split('|')[1].strip())
uniqmapped.append(uniq)
if count == 24:
multi = int(line.split('|')[1].strip())
multimapped.append(multi)
concordant = sum(uniqmapped)
mult = sum(multimapped)
mapped = concordant + mult
inputraw = sum(triminput)
inputclean = sum(trimoutput)
float1 = float(mapped)
float2 = float(inputclean)
rate = float1/float2
report[sample] = [inputraw, inputclean, concordant, rate*100]
return report
if software == 'tophat2' and layout == 'paired':
inputraw = sum(triminput)
inputclean = sum(trimoutput)
concordant = sum(concordant)
mapped = sum(mapped)
float1 = float(mapped)
float2 = float(inputclean)
float3 = float1/2
rate = float3/float2
report[sample] = [inputraw, inputclean, concordant, rate*100]
return report
elif software == 'hisat2' and layout == 'paired':
inputraw = sum(triminput)
inputclean = sum(trimoutput)
concordant = sum(concordant)
mapped = sum(mapped)
float1 = float(mapped)
float2 = float(inputclean)
rate = float1/float2
report[sample] = [inputraw, inputclean, concordant, rate*100]
return report
elif software == 'tophat2' and layout == 'single':
for file in trimlist:
basename = os.path.splitext(file)[0]
dataset = basename.split('-')[0]
if dataset == sample:
for line in open(file):
if 'Input Reads' in line:
inp = int(line.split(' ')[2])
triminput.append(inp)
out = int(line.split(' ')[4])
trimoutput.append(out)
for file in maplist:
basename = os.path.splitext(file)[0]
dataset = basename.split('-')[0]
if dataset == sample:
count = 0
for line in open(file):
count +=1
if count == 3:
map = int(line.split(':')[1].split('(')[0].strip())
mapped.append(map)
elif count == 4:
mult = int(line.split(':')[1].split('(')[0].strip())
multi.append(mult)
totalreads = sum(triminput)
cleanreads = sum(trimoutput)
mappedtotal = sum(mapped)
multimapped = sum(multi)
singlemapped = mappedtotal - multimapped
float1 = float(cleanreads)
float2 = float(mappedtotal)
rate = float2/float1
report[sample] = [totalreads, cleanreads, singlemapped, rate*100]
return report
elif software == 'hisat2' and layout == 'single':
for file in trimlist:
basename = os.path.splitext(file)[0]
dataset = basename.split('-')[0]
if dataset == sample:
for line in open(file):
if 'Input Reads' in line:
inp = int(line.split(' ')[2])
triminput.append(inp)
out = int(line.split(' ')[4])
trimoutput.append(out)
for file in maplist:
basename = os.path.splitext(file)[0]
dataset = basename.split('-')[0]
if dataset == sample:
count = 0
for line in open(file):
count +=1
if count == 7:
sing = int(line.split('(')[0].strip())
single.append(sing)
elif count == 8:
mult = int(line.split('(')[0].strip())
multi.append(mult)
totalreads = sum(triminput)
cleanreads = sum(trimoutput)
totalsingle = sum(single)
totalmulti = sum(multi)
mappedtotal = int(totalsingle + totalmulti)
float1 = float(cleanreads)
float2 = float(mappedtotal)
rate = float2/float1
report[sample] = [totalreads, cleanreads, totalsingle, rate*100]
return report
def main():
software = sys.argv[1]
layout = sys.argv[2]
#Print header of QC report
if layout == 'paired':
print 'Sample' + '\t' + 'raw_input_pairs' + '\t' + 'cleaned_input_pairs' + '\t' + 'concordant_single_alignments' + '\t' + 'Overall_Alignment_Rate'
elif layout == 'single':
print 'Sample' + '\t' + 'raw_input_reads' + '\t' + 'cleaned_input_reads' + '\t' + 'single_alignments' + '\t' + 'Overall_Alignment_Rate'
#locate input log files and append to lists
#tophat alignment summary
if software == 'tophat2':
maploglist = [file for file in glob.glob(('*summary.txt'))]
elif software == 'hisat2':
maploglist = [file for file in glob.glob('*fastq-out.txt')]
elif software == 'star':
maploglist = [file for file in glob.glob('*Log.final.out')]
trimloglist = [file for file in glob.glob('*-trimmomatic.txt')]
if len(maploglist) != len(trimloglist):
print "WARNING: the number of trimmomatic log files is different than the number of hisat log files. Output may have frameshift."
inputlist = []
for file in maploglist:
basename = os.path.splitext(file)[0]
dataset = basename.split('-')[0]
if dataset not in inputlist:
inputlist.append(dataset)
#for each unique sample, call the ParseLogs function and print results in tab-delimited format
for dataset in inputlist:
stats = ParseLogs(trimloglist, maploglist, software, dataset, layout)
print dataset + '\t' + str(stats[dataset][0]) + '\t' + str(stats[dataset][1]) + '\t' + str(stats[dataset][2]) + '\t' + str(stats[dataset][3]) + '%'
if __name__ == '__main__':
main()