/
tophat.py
executable file
·144 lines (117 loc) · 5.76 KB
/
tophat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#!/usr/bin/env python
""" MultiQC module to parse output from Tophat """
from __future__ import print_function
from collections import OrderedDict
import logging
import os
import re
from multiqc import config
from multiqc.plots import bargraph
from multiqc.modules.base_module import BaseMultiqcModule
# Initialise the logger
log = logging.getLogger(__name__)
class MultiqcModule(BaseMultiqcModule):
def __init__(self):
# Initialise the parent object
super(MultiqcModule, self).__init__(name='Tophat', anchor='tophat',
href="https://ccb.jhu.edu/software/tophat/",
info="is a fast splice junction mapper for RNA-Seq reads. "\
"It aligns RNA-Seq reads to mammalian-sized genomes.")
# Find and load any Tophat reports
self.tophat_data = dict()
for f in self.find_log_files('tophat'):
parsed_data = self.parse_tophat_log(f['f'])
if parsed_data is not None:
if (f['s_name'] == "align" or
f['s_name'] == 'align_summary.txt'):
s_name = os.path.basename(f['root'])
else:
s_name = f['s_name'].split("align_summary.txt",1)[0]
s_name = self.clean_s_name(s_name, f['root'])
if s_name in self.tophat_data:
log.debug("Duplicate sample name found! Overwriting: {}".format(s_name))
self.add_data_source(f, s_name)
self.tophat_data[s_name] = parsed_data
# Filter to strip out ignored sample names
self.tophat_data = self.ignore_samples(self.tophat_data)
if len(self.tophat_data) == 0:
raise UserWarning
log.info("Found {} reports".format(len(self.tophat_data)))
# Write parsed report data to a file
self.write_data_file(self.tophat_data, 'multiqc_tophat.txt')
# Basic Stats Table
self.tophat_general_stats_table()
# Alignment Rate Plot
self.tophat_alignment_plot()
def parse_tophat_log (self, raw_data):
""" Parse the Tophat alignment log file. """
if 'Aligned pairs' in raw_data:
# Paired end data
regexes = {
'overall_aligned_percent': r"([\d\.]+)% overall read mapping rate.",
'concordant_aligned_percent': r"([\d\.]+)% concordant pair alignment rate.",
'aligned_total': r"Aligned pairs:\s+(\d+)",
'aligned_multimap': r"Aligned pairs:\s+\d+\n\s+of these:\s+(\d+)",
'aligned_discordant': r"(\d+) \([\s\d\.]+%\) are discordant alignments",
'total_reads': r"[Rr]eads:\n\s+Input\s*:\s+(\d+)",
}
else:
# Single end data
regexes = {
'total_reads': r"[Rr]eads:\n\s+Input\s*:\s+(\d+)",
'aligned_total': r"Mapped\s*:\s+(\d+)",
'aligned_multimap': r"of these\s*:\s+(\d+)",
'overall_aligned_percent': r"([\d\.]+)% overall read mapping rate.",
}
parsed_data = {}
for k, r in regexes.items():
r_search = re.search(r, raw_data, re.MULTILINE)
if r_search:
parsed_data[k] = float(r_search.group(1))
# Exit if we didn't manage to parse enough fields - probably not a TopHat log
# Note that Bowtie2 / HiSAT2 logs contain some but not all of these strings
if len(parsed_data) < 4: return None
parsed_data['concordant_aligned_percent'] = parsed_data.get('concordant_aligned_percent', 0)
parsed_data['aligned_total'] = parsed_data.get('aligned_total', 0)
parsed_data['aligned_multimap'] = parsed_data.get('aligned_multimap', 0)
parsed_data['aligned_discordant'] = parsed_data.get('aligned_discordant', 0)
parsed_data['unaligned_total'] = parsed_data['total_reads'] - parsed_data['aligned_total']
parsed_data['aligned_not_multimapped_discordant'] = parsed_data['aligned_total'] - parsed_data['aligned_multimap'] - parsed_data['aligned_discordant']
return parsed_data
def tophat_general_stats_table(self):
""" Take the parsed stats from the Tophat report and add it to the
basic stats table at the top of the report """
headers = OrderedDict()
headers['overall_aligned_percent'] = {
'title': '% Aligned',
'description': 'overall read mapping rate',
'max': 100,
'min': 0,
'suffix': '%',
'scale': 'YlGn'
}
headers['aligned_not_multimapped_discordant'] = {
'title': '{} Aligned'.format(config.read_count_prefix),
'description': 'Aligned reads, not multimapped or discordant ({})'.format(config.read_count_desc),
'min': 0,
'scale': 'PuRd',
'modify': lambda x: x * config.read_count_multiplier,
'shared_key': 'read_count'
}
self.general_stats_addcols(self.tophat_data, headers)
def tophat_alignment_plot (self):
""" Make the HighCharts HTML to plot the alignment rates """
# Specify the order of the different possible categories
keys = OrderedDict()
keys['aligned_not_multimapped_discordant'] = { 'color': '#437bb1', 'name': 'Aligned' }
keys['aligned_multimap'] = { 'color': '#f7a35c', 'name': 'Multimapped' }
keys['aligned_discordant'] = { 'color': '#e63491', 'name': 'Discordant mappings' }
keys['unaligned_total'] = { 'color': '#7f0000', 'name': 'Not aligned' }
# Config for the plot
config = {
'id': 'tophat_alignment',
'title': 'Tophat: Alignment Scores',
'ylab': '# Reads',
'cpswitch_counts_label': 'Number of Reads'
}
self.add_section( plot = bargraph.plot(self.tophat_data, keys, config) )