/
grouping.py
179 lines (153 loc) · 5.37 KB
/
grouping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#!/usr/bin/env python
# Guruprasad Ananda
# Refactored 2011 to use numpy instead of rpy, Kanwei Li
"""
This tool provides the SQL "group by" functionality.
"""
from __future__ import print_function
import random
import subprocess
import sys
import tempfile
from itertools import groupby
import numpy
def stop_err(msg):
    """Report a fatal error and terminate the process.

    Writes ``msg`` verbatim (no newline is appended) to standard error and
    raises ``SystemExit`` with status 1, so that callers and job runners that
    inspect the exit code see the run as failed.
    """
    sys.stderr.write(msg)
    # A bare sys.exit() would exit with status 0, i.e. report success.
    sys.exit(1)
def mode(data):
    """Return the most frequent value(s) of ``data`` as a comma-joined string.

    Every value that reaches the highest occurrence count is included, each
    converted with ``str``, in the order the values are yielded when iterating
    the internal count dictionary. An empty ``data`` yields an empty string
    instead of raising.
    """
    counts = {}
    for x in data:
        counts[x] = counts.get(x, 0) + 1
    if not counts:
        # max() on an empty sequence would raise ValueError.
        return ''
    maxcount = max(counts.values())
    return ','.join(str(x) for x in counts if counts[x] == maxcount)
def _drop_lines_by_first_char(inputfile, ascii_codes, cleaned_path="input_cleaned.tsv"):
    """Copy ``inputfile`` to ``cleaned_path``, skipping lines by first character.

    ``ascii_codes`` is a comma-separated string of decimal character codes;
    any line whose first character matches one of them is left out of the
    copy. Returns ``cleaned_path``.
    """
    unwanted = set(chr(int(code)) for code in ascii_codes.split(','))
    with open(inputfile, 'r') as source, open(cleaned_path, 'w') as cleaned:
        for line in source:
            if line[0] not in unwanted:
                cleaned.write(line)
    return cleaned_path


def _aggregate(op, data, do_round):
    """Compute one aggregate over ``data``, the string values of a single group.

    ``op`` is one of ``mode``, ``length``, ``random``, ``cat``, ``cat_uniq``,
    ``unique``, or otherwise the name of a numpy function that is applied to
    the values converted to floats. ``do_round`` (``'yes'`` or anything else)
    only affects the numpy branch: ``'yes'`` rounds the result to an int,
    any other value formats it with ``%g``.

    Exits with status 1 if a numeric operation receives non-numeric values.
    """
    if op == "mode":
        return mode(data)
    if op == "length":
        return len(data)
    if op == "random":
        return random.choice(data)
    if op == "cat":
        return ','.join(data)
    if op == "cat_uniq":
        return ','.join(numpy.unique(data))
    if op == "unique":
        return len(numpy.unique(data))

    # Anything else is looked up as a numpy function of the same name.
    try:
        numbers = [float(value) for value in data]
    except ValueError:
        sys.stderr.write("Operation %s expected number values but got %s instead.\n" % (op, data))
        sys.exit(1)
    result = getattr(numpy, op)(numbers)
    if do_round == 'yes':
        return int(round(result))
    return '%g' % result


def main():
    """Group a tab-delimited file by one column and aggregate other columns.

    Command-line arguments (``sys.argv``):
      1   path of the output file
      2   path of the tab-delimited input file
      3   1-based index of the column to group by
      4   ``1`` to group case-insensitively, any other integer otherwise
      5   ``None``, or a comma-separated list of decimal character codes;
          input lines starting with one of those characters are dropped
      6+  one ``"<op> <column> <round>"`` triple per requested aggregate

    The input is sorted on the grouping column with the external ``sort``
    command, then each run of equal keys is written to the output as one
    tab-delimited line: the key followed by one value per requested aggregate.
    A one-line summary of the operations is printed to standard output.
    """
    inputfile = sys.argv[2]
    ignorecase = int(sys.argv[4])

    if sys.argv[5] != "None" and sys.argv[5]:
        inputfile = _drop_lines_by_first_char(inputfile, sys.argv[5])

    ops = []
    cols = []
    round_val = []
    for var in sys.argv[6:]:
        op, col, do_round = var.split()
        ops.append(op)
        cols.append(col)
        round_val.append(do_round)
    # At this point, ops, cols and round_val will look something like this:
    #   ops:       ['mean', 'min', 'c']
    #   cols:      ['1', '3', '4']
    #   round_val: ['no', 'yes', 'no']

    try:
        group_col = int(sys.argv[3]) - 1
    except ValueError:
        stop_err("Group column not specified.")

    # Column numbers on the command line are 1-based; convert them once.
    col_indexes = [int(col) - 1 for col in cols]

    with tempfile.NamedTemporaryFile(mode='r') as tmpfile:
        # The -k option of POSIX sort takes 1-based field positions
        # (-k POS1[,POS2]), hence group_col + 1. Without POS2 the key would
        # extend to the end of the line, so POS2 is set equal to POS1.
        # The command is passed as an argument list (no shell) so that the
        # tab delimiter and file names containing spaces or shell
        # metacharacters reach sort unmodified.
        sort_field = group_col + 1
        sort_cmd = ["sort", "-t", "\t"]
        if ignorecase == 1:
            sort_cmd.append("-f")
        sort_cmd += ["-k%d,%d" % (sort_field, sort_field), "-o", tmpfile.name, "--", inputfile]

        try:
            subprocess.check_output(sort_cmd, stderr=subprocess.STDOUT, universal_newlines=True)
        except subprocess.CalledProcessError as e:
            stop_err("Sorting input dataset resulted in error: %s: %s" % (e.returncode, e.output))
        except OSError as e:
            # Raised when the sort executable itself cannot be started.
            stop_err("Sorting input dataset resulted in error: %s" % e)

        def is_new_item(line):
            # Only the line terminator is removed: a plain strip() would also
            # eat leading/trailing tabs and shift the columns of rows whose
            # first or last field is empty.
            fields = line.rstrip("\r\n").split("\t")
            try:
                item = fields[group_col]
            except IndexError:
                stop_err("The following line didn't have %s columns: %s" % (group_col + 1, line))
            if ignorecase == 1:
                return item.lower()
            return item

        with open(sys.argv[1], "w") as fout:
            for key, line_list in groupby(tmpfile, key=is_new_item):
                op_vals = [[] for _ in ops]

                for line in line_list:
                    fields = line.rstrip("\r\n").split("\t")
                    for i, col in enumerate(col_indexes):
                        try:
                            op_vals[i].append(fields[col].strip())
                        except IndexError:
                            sys.stderr.write('Could not access the value for column %s on line: "%s". Make sure file is tab-delimited.\n' % (col + 1, line))
                            sys.exit(1)

                # One output cell per requested operation for this group.
                out_fields = [key]
                for i, op in enumerate(ops):
                    out_fields.append("%s" % _aggregate(op, op_vals[i], round_val[i]))
                fout.write("\t".join(out_fields) + "\n")

    # Generate a useful info message.
    display_names = {
        'cat': 'concat',
        'cat_uniq': 'concat_distinct',
        'length': 'count',
        'unique': 'count_distinct',
        'random': 'randomly_pick',
    }
    msg = "--Group by c%d: " % (group_col + 1)
    for op, col in zip(ops, cols):
        msg += display_names.get(op, op) + "[c" + col + "] "
    print(msg)
# Run the tool only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()