Permalink
Browse files

Added Log Merge Script to workflow. In addition to all standard outpu…

…t files, a report of trimming and alignment statistics, QC_Report.tab, will be present in outputs directory
  • Loading branch information...
1 parent 0d942b4 commit 869eb354ff6398124dc605f2bedf271c7d3bf536 @wpoehlm wpoehlm committed Feb 2, 2017
Showing with 220 additions and 4 deletions.
  1. +1 −1 submit
  2. +203 −0 task-files/gem-merge/Log_merge.py
  3. +13 −2 tools/dax-level-2
  4. +3 −1 tools/gem-merge
View
2 submit
@@ -33,7 +33,7 @@ if [ -e /stash2 ]; then
echo "An 'Output's directory will be created within the base of the workflow directory."
echo "This directory, $RUN_DIR/outputs"
echo "will have a 'merged_GEM.tab' file, an expression vector for each individual file,"
- echo "and all standard output files from trimmomatic/hisat2 jobs."
+ echo "and a report of trimming and alignment statistics, QC_Report.tab"
# plan and submit the workflow
echo
@@ -0,0 +1,203 @@
+## William Poehlman ##
+## LogParse.py v1.1 ##
+## wpoehlm@clemson.edu ##
+## First Write February 1st, 2017 ##
+## This script merges the standard output log files of the OSG-GEM workflow for each sample, and prints a quality control report ##
+
+import glob
+import os
+import sys
+
+def ParseLogs(trimlist, maplist, software, sample):
+ """
+ Merges statistics from lists of split trimmomatic and hisat/tophat log files for a given sample
+
+
+ arguments:
+ trimlist --> a list of trimmomatic log files
+ maplist --> a list of hisat2 or tophat2 log files
+ software --> read mapping software that generated log files. 'hisat' or 'tophat'
+ sample --> A sample number that is present at the beginning of each filename
+
+ output:
+ A dictionary, where the sample ID number is the key, and the following statistics are stored in the list value:
+ Number of raw input read pairs
+ Number of read pairs following trimming
+ Number of concordant, single alignments
+ Overall Alignment rate
+
+ The overall Alignment rate for hisat2 is calculated as follows:
+
+ (concordant single alignments + concordant multiple alignments + non-paired single alignments)/(cleaned input pairs)
+
+ The overall Alignment rate for tophat2 is calculated as follows:
+
+ ((left mapped reads + right mapped reads)/2)/(cleaned input pairs)
+
+ Logfile names must be formatted as follows:
+
+ <sample number>-<unique file identifier>-align_summary.txt (only for tophat)
+ <sample number>-<unique file identifier>-fastq-out.txt (for hisat)
+ <sample number>-unique file identifier>-fastq-trimmomatic.txt
+
+ for example:
+ 2-000000-000000.fastq-align_summary.txt
+ 1-000000-000000.fastq-out.txt
+ 1-000000-000000.fastq-trimmomatic.txt
+
+ NOTE: do not mix log files for tophat and hisat
+
+ """
+ triminput = []
+ trimoutput = []
+ mapped = []
+ concordant = []
+ sample = str(sample)
+ software = str(software)
+ report = {}
+ if software == 'hisat2':
+ for file in trimlist:
+ basename = os.path.splitext(file)[0]
+ dataset = basename.split('-')[0]
+ if dataset == sample:
+ for line in open(file):
+ if 'Input Read Pairs' in line:
+ inp = int(line.split(' ')[3])
+ triminput.append(inp)
+ out = int(line.split(' ')[6])
+ trimoutput.append(out)
+ for file in maplist:
+ basename = os.path.splitext(file)[0]
+ dataset = basename.split('-')[0]
+ if dataset == sample:
+ for line in open(file):
+ if 'aligned concordantly exactly 1 time' in line:
+ conc = int(line.split('(')[0])
+ concordant.append(conc)
+ mapped.append(conc)
+ elif 'aligned concordantly >1 times' in line:
+ mult = int(line.split('(')[0])
+ mapped.append(mult)
+ elif 'aligned exactly 1 time' in line:
+ singlemap = int(line.split('(')[0])
+ mapped.append(singlemap)
+
+ elif software == 'tophat2':
+ for file in trimlist:
+ basename = os.path.splitext(file)[0]
+ dataset = basename.split('-')[0]
+ if dataset == sample:
+ for line in open(file):
+ if 'Input Read Pairs' in line:
+ inp = int(line.split(' ')[3])
+ triminput.append(inp)
+ out = int(line.split(' ')[6])
+ trimoutput.append(out)
+ for file in maplist:
+ basename = os.path.splitext(file)[0]
+ dataset = basename.split('-')[0]
+ if dataset == sample:
+ totalpair = []
+ multipair = []
+ discordpair = []
+ leftmap = []
+ rightmap = []
+
+ count = 0
+ for line in open(file):
+ count +=1
+ if count == 11:
+ totalpairs = int(line.split(':')[1].strip())
+ totalpair.append(totalpairs)
+ elif count == 12:
+ multipairs = int(line.split(':')[1].split('(')[0].strip())
+ multipair.append(multipairs)
+ elif count == 13:
+ discordpairs = int(line.split('(')[0].strip())
+ discordpair.append(discordpairs)
+ elif count == 3:
+ leftmapped = int(line.split(':')[1].split('(')[0].strip())
+ leftmap.append(leftmapped)
+ elif count == 7:
+ rightmapped = int(line.split(':')[1].split('(')[0].strip())
+ rightmap.append(rightmapped)
+
+ totalpair = sum(totalpair)
+ multipair = sum(multipair)
+ discordpair = sum(discordpair)
+ leftmap = sum(leftmap)
+ rightmap = sum(rightmap)
+ mappedsingles = leftmap + rightmap
+ concordantpairs = totalpair - multipair - discordpair
+ concordant.append(concordantpairs)
+ mapped.append(mappedsingles)
+
+ if software == 'tophat2':
+
+ inputraw = sum(triminput)
+ inputclean = sum(trimoutput)
+ concordant = sum(concordant)
+ mapped = sum(mapped)
+ float1 = float(mapped)
+ float2 = float(inputclean)
+ float3 = float1/2
+ rate = float3/float2
+
+ report[sample] = [inputraw, inputclean, concordant, rate*100]
+ return report
+
+ elif software == 'hisat2':
+ inputraw = sum(triminput)
+ inputclean = sum(trimoutput)
+ concordant = sum(concordant)
+ mapped = sum(mapped)
+ float1 = float(mapped)
+ float2 = float(inputclean)
+ rate = float1/float2
+
+ report[sample] = [inputraw, inputclean, concordant, rate*100]
+ return report
+
+
+
+def main():
+
+ software = sys.argv[1]
+
+ #Print header of QC report
+ print 'Sample' + '\t' + 'raw_input_pairs' + '\t' + 'cleaned_input_pairs' + '\t' + 'concordant_single_alignments' + '\t' + 'Overall_Alignment_Rate'
+
+ #locate input log files and append to lists
+
+ #tophat alignment summary
+ maploglist = [file for file in glob.glob(('*summary.txt'))]
+
+ if len(maploglist) == 0:
+ #If there are no tophat alignment summaries, look for hisat standard output instead
+ maploglist = [file for file in glob.glob('*fastq-out.txt')]
+
+ trimloglist = [file for file in glob.glob('*-trimmomatic.txt')]
+ if len(maploglist) != len(trimloglist):
+ print "WARNING: the number of trimmomatic log files is different than the number of hisat log files. Output may have frameshift."
+ inputlist = []
+ for file in maploglist:
+ basename = os.path.splitext(file)[0]
+ dataset = basename.split('-')[0]
+ if dataset not in inputlist:
+ inputlist.append(dataset)
+
+ #for each unique sample, call the ParseLogs function and print results in tab-delimited format
+ for dataset in inputlist:
+ stats = ParseLogs(trimloglist, maploglist, software, dataset)
+ print dataset + '\t' + str(stats[dataset][0]) + '\t' + str(stats[dataset][1]) + '\t' + str(stats[dataset][2]) + '\t' + str(stats[dataset][3]) + '%'
+
+
+if __name__ == '__main__':
+ main()
+
+
+
+
+
+
+
View
@@ -24,7 +24,7 @@ from Pegasus.DAX3 import *
ref_files = {}
gff3_files = {}
fpkm_files = []
-
+log_files = []
# read the config file
conf = ConfigParser.SafeConfigParser()
r = conf.read(base_dir + "/osg-gem.conf")
@@ -115,9 +115,11 @@ def hisat2(task_files, dax, base_name, part, common_part, forward_file, reverse_
#f2 = File(common_part + "-align_summary.txt")
#j.uses(f2, link=Link.OUTPUT, transfer=True)
f3 = File(base_name + "-" + common_part + "-out.txt")
+ log_files.append(f3)
j.uses(f3, link=Link.OUTPUT, transfer=True)
f4 = File(base_name + "-" + common_part + "-trimmomatic.txt")
j.uses(f4, link=Link.OUTPUT, transfer=True)
+ log_files.append(f4)
dax.addJob(j)
return f1
@@ -142,10 +144,12 @@ def tophat(task_files, dax, base_name, part, common_part, forward_file, reverse_
j.uses(f1, link=Link.OUTPUT, transfer=False)
f2 = File(base_name + "-" + common_part + "-align_summary.txt")
j.uses(f2, link=Link.OUTPUT, transfer=True)
+ log_files.append(f2)
f3 = File(base_name + "-" + common_part + "-out.txt")
j.uses(f3, link=Link.OUTPUT, transfer=True)
f4 = File(base_name + "-" + common_part + "-trimmomatic.txt")
j.uses(f4, link=Link.OUTPUT, transfer=True)
+ log_files.append(f4)
dax.addJob(j)
return f1
@@ -308,15 +312,22 @@ def main():
# merge all the fpkms
j = Job(name="gem-merge")
-
+ if conf.getboolean("config", "hisat2"):
+ j.addArguments("hisat2")
+ elif conf.getboolean("config", "tophat2"):
+ j.addArguments("tophat2")
# inputs
add_task_files(task_files, dax, j, "gem-merge")
for f in fpkm_files:
j.uses(f, link=Link.INPUT)
+ for g in log_files:
+ j.uses(g, link=Link.INPUT)
# outputs
f = File("merged_GEM.tab")
+ g = File("QC_Report.tab")
j.uses(f, link=Link.OUTPUT, transfer=True)
+ j.uses(g, link=Link.OUTPUT, transfer=True)
dax.addJob(j)
# Write the DAX
View
@@ -1,5 +1,7 @@
#!/bin/bash
+SOFTWARE=$1
+
set -e
# remove "empty" inputs
@@ -31,4 +33,4 @@ mv *.fpkm inputs/
perl GEM_merge.pl inputs/ > merged_GEM.tab
-
+python Log_merge.py $SOFTWARE > QC_Report.tab

0 comments on commit 869eb35

Please sign in to comment.