# multiqc/modules/tophat/tophat.py

#!/usr/bin/env python

""" MultiQC module to parse output from Tophat """

from __future__ import print_function
from collections import OrderedDict
import logging
import os
import re

from multiqc import config
from multiqc.plots import bargraph
from multiqc.modules.base_module import BaseMultiqcModule

# Initialise the logger
log = logging.getLogger(__name__)

class MultiqcModule(BaseMultiqcModule):

    def __init__(self):
        """ Collect Tophat alignment summaries and assemble the report content.

        Every file returned for the 'tophat' search key is handed to
        parse_tophat_log(); files for which it returns None are skipped.
        The remaining samples are filtered with ignore_samples(), passed to
        write_data_file() as 'multiqc_tophat.txt', and then used for the
        general stats columns and the alignment bar graph.

        Raises:
            UserWarning: when no samples are left after filtering.
        """

        # Initialise the parent object
        super(MultiqcModule, self).__init__(
            name='Tophat',
            anchor='tophat',
            href="https://ccb.jhu.edu/software/tophat/",
            info="is a fast splice junction mapper for RNA-Seq reads. "
                 "It aligns RNA-Seq reads to mammalian-sized genomes."
        )

        # Parsed statistics, keyed by cleaned sample name
        self.tophat_data = dict()
        for log_file in self.find_log_files('tophat'):
            stats = self.parse_tophat_log(log_file['f'])
            if stats is None:
                # Not recognised as a Tophat summary
                continue

            reported_name = log_file['s_name']
            if reported_name in ("align", 'align_summary.txt'):
                # The reported name is just the generic summary file name, so
                # use the last component of the file's root path instead
                sample = os.path.basename(log_file['root'])
            else:
                # Keep whatever precedes the generic summary file name
                sample = reported_name.split("align_summary.txt", 1)[0]
            sample = self.clean_s_name(sample, log_file['root'])

            if sample in self.tophat_data:
                log.debug("Duplicate sample name found! Overwriting: {}".format(sample))
            self.add_data_source(log_file, sample)
            self.tophat_data[sample] = stats

        # Filter to strip out ignored sample names
        self.tophat_data = self.ignore_samples(self.tophat_data)

        sample_count = len(self.tophat_data)
        if sample_count == 0:
            raise UserWarning

        log.info("Found {} reports".format(sample_count))

        # Write parsed report data to a file
        self.write_data_file(self.tophat_data, 'multiqc_tophat.txt')

        # Columns for the general statistics table
        self.tophat_general_stats_table()

        # Bar graph of the alignment categories
        self.tophat_alignment_plot()


    def parse_tophat_log (self, raw_data):
        """ Parse the Tophat alignment log file. """

        if 'Aligned pairs' in raw_data:
            # Paired end data
            regexes = {
                'overall_aligned_percent': r"([\d\.]+)% overall read mapping rate.",
                'concordant_aligned_percent': r"([\d\.]+)% concordant pair alignment rate.",
                'aligned_total': r"Aligned pairs:\s+(\d+)",
                'aligned_multimap': r"Aligned pairs:\s+\d+\n\s+of these:\s+(\d+)",
                'aligned_discordant': r"(\d+) \([\s\d\.]+%\) are discordant alignments",
                'total_reads': r"[Rr]eads:\n\s+Input\s*:\s+(\d+)",
            }
        else:
            # Single end data
            regexes = {
                'total_reads': r"[Rr]eads:\n\s+Input\s*:\s+(\d+)",
                'aligned_total': r"Mapped\s*:\s+(\d+)",
                'aligned_multimap': r"of these\s*:\s+(\d+)",
                'overall_aligned_percent': r"([\d\.]+)% overall read mapping rate.",
            }

        parsed_data = {}
        for k, r in regexes.items():
            r_search = re.search(r, raw_data, re.MULTILINE)
            if r_search:
                parsed_data[k] = float(r_search.group(1))

        # Exit if we didn't manage to parse enough fields - probably not a TopHat log
        # Note that Bowtie2 / HiSAT2 logs contain some but not all of these strings
        if len(parsed_data) < 4: return None

        parsed_data['concordant_aligned_percent'] = parsed_data.get('concordant_aligned_percent', 0)
        parsed_data['aligned_total'] = parsed_data.get('aligned_total', 0)
        parsed_data['aligned_multimap'] = parsed_data.get('aligned_multimap', 0)
        parsed_data['aligned_discordant'] = parsed_data.get('aligned_discordant', 0)
        parsed_data['unaligned_total'] = parsed_data['total_reads'] - parsed_data['aligned_total']
        parsed_data['aligned_not_multimapped_discordant'] = parsed_data['aligned_total'] - parsed_data['aligned_multimap'] - parsed_data['aligned_discordant']
        return parsed_data


    def tophat_general_stats_table(self):
        """ Register the Tophat columns with the general statistics table.

        Two columns are passed to general_stats_addcols() together with the
        parsed sample data: the overall mapping rate, and the count of
        aligned reads that are neither multimapped nor discordant. """

        # Percentage column
        aligned_percent_column = {
            'title': '% Aligned',
            'description': 'overall read mapping rate',
            'max': 100,
            'min': 0,
            'suffix': '%',
            'scale': 'YlGn'
        }

        # Count column; title, description and multiplier all come from the
        # read count settings in the config module
        aligned_count_column = {
            'title': '{} Aligned'.format(config.read_count_prefix),
            'description': 'Aligned reads, not multimapped or discordant ({})'.format(config.read_count_desc),
            'min': 0,
            'scale': 'PuRd',
            'modify': lambda x: x * config.read_count_multiplier,
            'shared_key': 'read_count'
        }

        # The OrderedDict fixes the order in which the columns are declared
        headers = OrderedDict([
            ('overall_aligned_percent', aligned_percent_column),
            ('aligned_not_multimapped_discordant', aligned_count_column),
        ])
        self.general_stats_addcols(self.tophat_data, headers)

    def tophat_alignment_plot (self):
        """ Add a bar graph of the alignment categories as a report section.

        The parsed sample data is passed to bargraph.plot() with four
        categories (aligned, multimapped, discordant, not aligned) and the
        result is handed to add_section(). """

        # Categories in the order they should be given to the plot, each with
        # its colour and display name
        categories = OrderedDict([
            ('aligned_not_multimapped_discordant', { 'color': '#437bb1', 'name': 'Aligned' }),
            ('aligned_multimap', { 'color': '#f7a35c', 'name': 'Multimapped' }),
            ('aligned_discordant', { 'color': '#e63491', 'name': 'Discordant mappings' }),
            ('unaligned_total', { 'color': '#7f0000', 'name': 'Not aligned' }),
        ])

        # Plot settings; named so as not to shadow the imported config module
        plot_settings = {
            'id': 'tophat_alignment',
            'title': 'Tophat: Alignment Scores',
            'ylab': '# Reads',
            'cpswitch_counts_label': 'Number of Reads'
        }

        alignment_plot = bargraph.plot(self.tophat_data, categories, plot_settings)
        self.add_section(plot=alignment_plot)