Permalink
Browse files

Added support for single end reads. User may select single or paired …

…end reads as input
  • Loading branch information...
1 parent 869eb35 commit 505b81a63d472f5e5e3222fbd6bae3eefb3dab34 @wpoehlm wpoehlm committed Feb 21, 2017
Binary file not shown.
View
@@ -15,16 +15,28 @@ reference_prefix = chr21-GRCh38
#
# input2 = DRR046893
#
+# or a single fastq file for single end reads. Example:
+# input3 = SRR4343300.fastq.gz
input1 = ./Test_data/TEST_1.fastq.gz ./Test_data/TEST_2.fastq.gz
+#input1 = ./Test_data/SRR4343300.fastq.gz
+#input2 = SRR4343300
#input2 = DRR046893
+
[config]
+
# Memory available to the jobs. This should be roughly 2X the
# size of the reference genome, rounded up whole GB
memory = 4 GB
+# Reads are single end
+single = False
+
+# Reads are paired end
+paired = True
+
# process using TopHat2
tophat2 = False
@@ -37,3 +49,4 @@ cufflinks = False
# process using StringTie
stringtie = True
+
@@ -8,54 +8,56 @@
import os
import sys
-def ParseLogs(trimlist, maplist, software, sample):
+def ParseLogs(trimlist, maplist, software, sample, layout):
"""
Merges statistics from lists of split trimmomatic and hisat/tophat log files for a given sample
-
-
+
+
arguments:
trimlist --> a list of trimmomatic log files
maplist --> a list of hisat2 or tophat2 log files
software --> read mapping software that generated log files. 'hisat' or 'tophat'
sample --> A sample number that is present at the beginning of each filename
-
+
output:
A dictionary, where the sample ID number is the key, and the following statistics are stored in the list value:
Number of raw input read pairs
Number of read pairs following trimming
Number of concordant, single alignments
Overall Alignment rate
-
+
The overall Alignment rate for hisat2 is calculated as follows:
-
+
(concordant single alignments + concordant multiple alignments + non-paired single alignments)/(cleaned input pairs)
-
+
The overall Alignment rate for tophat2 is calculated as follows:
-
+
((left mapped reads + right mapped reads)/2)/(cleaned input pairs)
-
+
Logfile names must be formatted as follows:
-
+
<sample number>-<unique file identifier>-align_summary.txt (only for tophat)
<sample number>-<unique file identifier>-fastq-out.txt (for hisat)
<sample number>-unique file identifier>-fastq-trimmomatic.txt
-
+
for example:
2-000000-000000.fastq-align_summary.txt
1-000000-000000.fastq-out.txt
- 1-000000-000000.fastq-trimmomatic.txt
-
+ 1-000000-000000.fastq-trimmomatic.txt
+
NOTE: do not mix log files for tophat and hisat
-
+
"""
triminput = []
trimoutput = []
mapped = []
concordant = []
+ multi = []
+ single = []
sample = str(sample)
software = str(software)
report = {}
- if software == 'hisat2':
+ if software == 'hisat2' and layout == 'paired':
for file in trimlist:
basename = os.path.splitext(file)[0]
dataset = basename.split('-')[0]
@@ -81,8 +83,8 @@ def ParseLogs(trimlist, maplist, software, sample):
elif 'aligned exactly 1 time' in line:
singlemap = int(line.split('(')[0])
mapped.append(singlemap)
-
- elif software == 'tophat2':
+
+ elif software == 'tophat2' and layout == 'paired':
for file in trimlist:
basename = os.path.splitext(file)[0]
dataset = basename.split('-')[0]
@@ -102,10 +104,10 @@ def ParseLogs(trimlist, maplist, software, sample):
discordpair = []
leftmap = []
rightmap = []
-
+
count = 0
for line in open(file):
- count +=1
+ count +=1
if count == 11:
totalpairs = int(line.split(':')[1].strip())
totalpair.append(totalpairs)
@@ -121,19 +123,19 @@ def ParseLogs(trimlist, maplist, software, sample):
elif count == 7:
rightmapped = int(line.split(':')[1].split('(')[0].strip())
rightmap.append(rightmapped)
-
+
totalpair = sum(totalpair)
multipair = sum(multipair)
discordpair = sum(discordpair)
leftmap = sum(leftmap)
rightmap = sum(rightmap)
mappedsingles = leftmap + rightmap
- concordantpairs = totalpair - multipair - discordpair
+ concordantpairs = totalpair - multipair - discordpair
concordant.append(concordantpairs)
mapped.append(mappedsingles)
-
- if software == 'tophat2':
-
+
+ if software == 'tophat2' and layout == 'paired':
+
inputraw = sum(triminput)
inputclean = sum(trimoutput)
concordant = sum(concordant)
@@ -142,30 +144,110 @@ def ParseLogs(trimlist, maplist, software, sample):
float2 = float(inputclean)
float3 = float1/2
rate = float3/float2
-
+
report[sample] = [inputraw, inputclean, concordant, rate*100]
return report
-
- elif software == 'hisat2':
+
+ elif software == 'hisat2' and layout == 'paired':
inputraw = sum(triminput)
inputclean = sum(trimoutput)
concordant = sum(concordant)
mapped = sum(mapped)
float1 = float(mapped)
float2 = float(inputclean)
rate = float1/float2
-
+
report[sample] = [inputraw, inputclean, concordant, rate*100]
return report
-
-
+
+ elif software == 'tophat2' and layout == 'single':
+ for file in trimlist:
+ basename = os.path.splitext(file)[0]
+ dataset = basename.split('-')[0]
+ if dataset == sample:
+ for line in open(file):
+ if 'Input Reads' in line:
+ inp = int(line.split(' ')[2])
+ triminput.append(inp)
+ out = int(line.split(' ')[4])
+ trimoutput.append(out)
+ for file in maplist:
+ basename = os.path.splitext(file)[0]
+ dataset = basename.split('-')[0]
+ if dataset == sample:
+ count = 0
+ for line in open(file):
+ count +=1
+ if count == 3:
+ map = int(line.split(':')[1].split('(')[0].strip())
+ mapped.append(map)
+ elif count == 4:
+ mult = int(line.split(':')[1].split('(')[0].strip())
+ multi.append(mult)
+ totalreads = sum(triminput)
+ cleanreads = sum(trimoutput)
+ mappedtotal = sum(mapped)
+ multimapped = sum(multi)
+ singlemapped = mappedtotal - multimapped
+ float1 = float(cleanreads)
+ float2 = float(mappedtotal)
+ rate = float2/float1
+
+ report[sample] = [totalreads, cleanreads, singlemapped, rate*100]
+ return report
+
+
+ elif software == 'hisat2' and layout == 'single':
+ for file in trimlist:
+ basename = os.path.splitext(file)[0]
+ dataset = basename.split('-')[0]
+ if dataset == sample:
+ for line in open(file):
+ if 'Input Reads' in line:
+ inp = int(line.split(' ')[2])
+ triminput.append(inp)
+ out = int(line.split(' ')[4])
+ trimoutput.append(out)
+ for file in maplist:
+ basename = os.path.splitext(file)[0]
+ dataset = basename.split('-')[0]
+ if dataset == sample:
+ count = 0
+ for line in open(file):
+ count +=1
+ if count == 7:
+ sing = int(line.split('(')[0].strip())
+ single.append(sing)
+ elif count == 8:
+ mult = int(line.split('(')[0].strip())
+ multi.append(mult)
+
+
+
+ totalreads = sum(triminput)
+ cleanreads = sum(trimoutput)
+ totalsingle = sum(single)
+ totalmulti = sum(multi)
+ mappedtotal = int(totalsingle + totalmulti)
+ float1 = float(cleanreads)
+ float2 = float(mappedtotal)
+ rate = float2/float1
+
+ report[sample] = [totalreads, cleanreads, totalsingle, rate*100]
+ return report
+
+
def main():
software = sys.argv[1]
+ layout = sys.argv[2]
#Print header of QC report
- print 'Sample' + '\t' + 'raw_input_pairs' + '\t' + 'cleaned_input_pairs' + '\t' + 'concordant_single_alignments' + '\t' + 'Overall_Alignment_Rate'
+ if layout == 'paired':
+ print 'Sample' + '\t' + 'raw_input_pairs' + '\t' + 'cleaned_input_pairs' + '\t' + 'concordant_single_alignments' + '\t' + 'Overall_Alignment_Rate'
+ elif layout == 'single':
+ print 'Sample' + '\t' + 'raw_input_reads' + '\t' + 'cleaned_input_reads' + '\t' + 'single_alignments' + '\t' + 'Overall_Alignment_Rate'
#locate input log files and append to lists
@@ -185,19 +267,14 @@ def main():
dataset = basename.split('-')[0]
if dataset not in inputlist:
inputlist.append(dataset)
-
- #for each unique sample, call the ParseLogs function and print results in tab-delimited format
+
+ #for each unique sample, call the ParseLogs function and print results in tab-delimited format
for dataset in inputlist:
- stats = ParseLogs(trimloglist, maploglist, software, dataset)
+ stats = ParseLogs(trimloglist, maploglist, software, dataset, layout)
print dataset + '\t' + str(stats[dataset][0]) + '\t' + str(stats[dataset][1]) + '\t' + str(stats[dataset][2]) + '\t' + str(stats[dataset][3]) + '%'
if __name__ == '__main__':
main()
-
-
-
-
-
View
Binary file not shown.
Oops, something went wrong.

0 comments on commit 505b81a

Please sign in to comment.