-
Notifications
You must be signed in to change notification settings - Fork 590
/
kallisto.py
131 lines (105 loc) · 4.89 KB
/
kallisto.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/env python
""" MultiQC module to parse output from Kallisto """
from __future__ import print_function
from collections import OrderedDict
import logging
import re
from multiqc import config
from multiqc.plots import bargraph
from multiqc.modules.base_module import BaseMultiqcModule
# Initialise the logger
log = logging.getLogger(__name__)
class MultiqcModule(BaseMultiqcModule):
""" Kallisto module """
def __init__(self):
# Initialise the parent object
super(MultiqcModule, self).__init__(name='Kallisto', anchor='kallisto',
href="http://pachterlab.github.io/kallisto/",
info="is a program for quantifying abundances of transcripts from RNA-Seq data.")
# Find and load any Kallisto reports
self.kallisto_data = dict()
for f in self.find_log_files('kallisto', filehandles=True):
self.parse_kallisto_log(f)
# Filter to strip out ignored sample names
self.kallisto_data = self.ignore_samples(self.kallisto_data)
if len(self.kallisto_data) == 0:
raise UserWarning
log.info("Found {} reports".format(len(self.kallisto_data)))
# Write parsed report data to a file
self.write_data_file(self.kallisto_data, 'multiqc_kallisto')
# Basic Stats Table
self.kallisto_general_stats_table()
# Alignment Rate Plot
self.add_section( plot = self.kallisto_alignment_plot() )
def parse_kallisto_log(self, f):
s_name = total_reads = paligned_reads = fraglength = None
for l in f['f']:
# Get input filename
match = re.search(r'\[quant\] will process (pair|file) 1: (\S+)', l)
if match:
s_name = self.clean_s_name(match.group(2), f['root'])
if s_name is not None:
# Alignment rates
aligned = re.search(r'\[quant\] processed ([\d,]+) reads, ([\d,]+) reads pseudoaligned', l)
if aligned:
total_reads = float(aligned.group(1).replace(',',''))
paligned_reads = float(aligned.group(2).replace(',',''))
# Paired end fragment lengths
flength = re.search(r'\[quant\] estimated average fragment length: ([\d\.]+)', l)
if flength:
fraglength = float(flength.group(1).replace(',',''))
if 'quantifying the abundances' in l:
if s_name in self.kallisto_data:
log.debug("Duplicate sample name found! Overwriting: {}".format(s_name))
self.add_data_source(f, s_name)
self.kallisto_data[s_name] = {
'total_reads': total_reads,
'pseudoaligned_reads': paligned_reads,
'not_pseudoaligned_reads': total_reads - paligned_reads,
'percent_aligned': (paligned_reads / total_reads) * 100
}
if fraglength is not None:
self.kallisto_data[s_name]['fragment_length'] = fraglength
s_name = total_reads = paligned_reads = fraglength = None
def kallisto_general_stats_table(self):
""" Take the parsed stats from the Kallisto report and add it to the
basic stats table at the top of the report """
headers = OrderedDict()
headers['fragment_length'] = {
'title': 'Frag Length',
'description': 'Estimated average fragment length',
'min': 0,
'suffix': 'bp',
'scale': 'RdYlGn'
}
headers['percent_aligned'] = {
'title': '% Aligned',
'description': '% processed reads that were pseudoaligned',
'max': 100,
'min': 0,
'suffix': '%',
'scale': 'YlGn'
}
headers['pseudoaligned_reads'] = {
'title': '{} Aligned'.format(config.read_count_prefix),
'description': 'Pseudoaligned reads ({})'.format(config.read_count_desc),
'min': 0,
'scale': 'PuRd',
'modify': lambda x: x * config.read_count_multiplier,
'shared_key': 'read_count'
}
self.general_stats_addcols(self.kallisto_data, headers)
def kallisto_alignment_plot (self):
""" Make the HighCharts HTML to plot the alignment rates """
# Specify the order of the different possible categories
keys = OrderedDict()
keys['pseudoaligned_reads'] = { 'color': '#437bb1', 'name': 'Pseudoaligned' }
keys['not_pseudoaligned_reads'] = { 'color': '#b1084c', 'name': 'Not aligned' }
# Config for the plot
config = {
'id': 'kallisto_alignment',
'title': 'Kallisto: Alignment Scores',
'ylab': '# Reads',
'cpswitch_counts_label': 'Number of Reads'
}
return bargraph.plot(self.kallisto_data, keys, config)