# tools/stats/grouping.py

#!/usr/bin/env python
# Guruprasad Ananda
# Refactored 2011 to use numpy instead of rpy, Kanwei Li
"""
This tool provides the SQL "group by" functionality.
"""
from __future__ import print_function

import random
import subprocess
import sys
import tempfile
from itertools import groupby

import numpy


def stop_err(msg):
    """Report a fatal error and terminate the process.

    Writes *msg* to stderr unchanged (no newline is appended) and raises
    ``SystemExit`` with status 1 so that the calling process sees a failure
    rather than a successful exit.
    """
    sys.stderr.write(msg)
    # A bare sys.exit() would exit with status 0, i.e. report success.
    sys.exit(1)


def mode(data):
    """Return the most frequent value(s) of *data* as a comma-separated string.

    Every value whose occurrence count equals the highest count is included,
    converted with ``str`` and listed in the iteration order of the tally
    dict. Raises ``ValueError`` (from ``max``) when *data* is empty.
    """
    tally = {}
    for value in data:
        if value in tally:
            tally[value] += 1
        else:
            tally[value] = 1

    highest = max(tally.values())
    return ','.join(str(value) for value in tally if tally[value] == highest)


def _drop_lines_by_first_char(inputfile, ascii_codes, cleaned_path="input_cleaned.tsv"):
    """Copy *inputfile* to *cleaned_path*, skipping lines by their first character.

    *ascii_codes* is a comma-separated string of decimal character codes; a
    line is dropped when its first character is one of those characters.
    Returns *cleaned_path*.
    """
    unwanted = set(chr(int(code)) for code in ascii_codes.split(','))
    with open(inputfile, 'r') as source, open(cleaned_path, 'w') as cleaned:
        for line in source:
            if line[0] not in unwanted:
                cleaned.write(line)
    return cleaned_path


def _aggregate(op, data, do_round):
    """Apply the aggregate *op* to *data*, the string values of one group.

    The named operations (mode, length, random, cat, cat_uniq, unique) work
    on the raw strings. Any other *op* is looked up as a numpy function and
    applied to the values converted to float; its result is rounded to an
    int when *do_round* is 'yes' and formatted with '%g' otherwise.
    Writes a message to stderr and exits with status 1 when the values are
    not numeric or numpy has no callable named *op*.
    """
    if op == "mode":
        return mode(data)
    if op == "length":
        return len(data)
    if op == "random":
        return random.choice(data)
    if op == "cat":
        return ','.join(data)
    if op == "cat_uniq":
        return ','.join(numpy.unique(data))
    if op == "unique":
        return len(numpy.unique(data))

    # some kind of numpy fn
    try:
        numbers = [float(value) for value in data]
    except ValueError:
        sys.stderr.write("Operation %s expected number values but got %s instead.\n" % (op, data))
        sys.exit(1)
    numpy_fn = getattr(numpy, op, None)
    if not callable(numpy_fn):
        sys.stderr.write("Unknown operation: %s\n" % op)
        sys.exit(1)
    result = numpy_fn(numbers)
    if do_round == 'yes':
        return int(round(result))
    return '%g' % result


def main():
    """Group a tab-delimited file by one column and aggregate other columns.

    Command-line arguments (``sys.argv``):
        1   output file path
        2   input file path
        3   1-based index of the column to group by
        4   integer flag; 1 groups case-insensitively (keys are lower-cased)
        5   "None", or comma-separated decimal character codes; input lines
            whose first character matches one of them are dropped
        6+  one "<op> <column> <round>" specification per output column,
            e.g. "mean 1 no"

    The input is sorted on the group column with the external ``sort``
    command, then one output line is written per group: the group key
    followed by one tab-separated value per operation. A one-line summary
    of the operations is printed to stdout.
    """
    inputfile = sys.argv[2]
    ignorecase = int(sys.argv[4])

    if sys.argv[5] != "None":
        asciitodelete = sys.argv[5]
        if asciitodelete:
            inputfile = _drop_lines_by_first_char(inputfile, asciitodelete)

    # After this loop ops, cols and round_val look something like this:
    #   ops:       ['mean', 'min', 'c']
    #   cols:      ['1', '3', '4']
    #   round_val: ['no', 'yes', 'no']
    ops = []
    cols = []
    round_val = []
    for var in sys.argv[6:]:
        try:
            op, col, do_round = var.split()
        except ValueError:
            sys.stderr.write('Invalid operation specification "%s": expected "<op> <column> <round>".\n' % var)
            sys.exit(1)
        ops.append(op)
        cols.append(col)
        round_val.append(do_round)

    try:
        group_col = int(sys.argv[3]) - 1
    except (ValueError, IndexError):
        stop_err("Group column not specified.")

    # cXX from galaxy is 1-based; convert once instead of once per line.
    col_indexes = [int(col) - 1 for col in cols]

    # The -k option for the Posix sort command is as follows:
    #   -k, --key=POS1[,POS2]
    #   start a key at POS1, end it at POS2 (origin 1)
    # In other words, column positions start at 1 rather than 0, so we need
    # to add 1 to group_col. If POS2 is not specified, the newer versions of
    # sort will consider the entire line for sorting. To prevent this, we
    # set POS2=POS1.
    #
    # The command is passed as an argument list (no shell), so file names
    # containing spaces or shell metacharacters are handled literally.
    sort_cmd = ['sort', '-t', '\t']
    if ignorecase == 1:
        sort_cmd.append('-f')

    def group_key(line):
        """Return the grouping key of *line* (lower-cased when ignoring case)."""
        # NOTE(review): strip() also removes leading/trailing tabs, so a line
        # whose first or last fields are empty has its columns shifted.
        try:
            item = line.strip().split("\t")[group_col]
        except IndexError:
            stop_err("The following line didn't have %s columns: %s" % (group_col + 1, line))

        if ignorecase == 1:
            return item.lower()
        return item

    with tempfile.NamedTemporaryFile(mode='r') as tmpfile:
        sort_cmd += ['-k%d,%d' % (group_col + 1, group_col + 1), '-o', tmpfile.name, inputfile]
        try:
            subprocess.check_output(sort_cmd, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError as e:
            stop_err("Sorting input dataset resulted in error: %s: %s" % (e.returncode, e.output))
        except OSError as e:
            # Without a shell, a missing/unrunnable sort binary surfaces here.
            stop_err("Sorting input dataset resulted in error: %s" % e)

        with open(sys.argv[1], "w") as fout:
            # groupby only merges adjacent lines, hence the sort above.
            for key, line_list in groupby(tmpfile, key=group_key):
                op_vals = [[] for _ in ops]

                for line in line_list:
                    fields = line.strip().split("\t")
                    for i, col in enumerate(col_indexes):
                        try:
                            op_vals[i].append(fields[col].strip())
                        except IndexError:
                            sys.stderr.write('Could not access the value for column %s on line: "%s". Make sure file is tab-delimited.\n' % (col + 1, line))
                            sys.exit(1)

                # Generate string for each op for this group
                out_fields = [key]
                for i, op in enumerate(ops):
                    out_fields.append("%s" % _aggregate(op, op_vals[i], round_val[i]))

                fout.write("\t".join(out_fields) + "\n")

    # Generate a useful info message.
    display_names = {
        'cat': 'concat',
        'cat_uniq': 'concat_distinct',
        'length': 'count',
        'unique': 'count_distinct',
        'random': 'randomly_pick',
    }
    msg = "--Group by c%d: " % (group_col + 1)
    for op, col in zip(ops, cols):
        msg += "%s[c%s] " % (display_names.get(op, op), col)

    print(msg)


# Run the tool only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()