-
Notifications
You must be signed in to change notification settings - Fork 2k
/
h2o_kmeans.py
226 lines (188 loc) · 10.6 KB
/
h2o_kmeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
import h2o_cmd, h2o_util
import h2o_nodes
import re, math, random
from h2o_test import check_sandbox_for_errors
from operator import itemgetter
from h2o_test import OutputObj, dump_json
def pickRandKMeansParams(paramDict, params):
    """Copy a random selection of candidate parameter values into ``params``.

    Between 1 and ``len(paramDict)`` draws are made. Each draw picks a key of
    ``paramDict`` at random and then one of that key's candidate values. The
    same key can be drawn more than once (the last draw wins), so fewer keys
    than draws may end up being set.

    Args:
        paramDict: dict mapping a parameter name to a non-empty sequence of
            candidate values.
        params: dict that is updated in place with the chosen values.

    Returns:
        None. ``params`` is left untouched when ``paramDict`` is empty.
    """
    # random.randint(1, 0) raises ValueError, so there is nothing to pick from
    # an empty dict.
    if not paramDict:
        return

    # Materialize the keys once: random.choice needs an indexable sequence,
    # and rebuilding the list on every draw is wasted work.
    candidateKeys = list(paramDict)
    randomGroupSize = random.randint(1, len(candidateKeys))
    for _ in range(randomGroupSize):
        randomKey = random.choice(candidateKeys)
        params[randomKey] = random.choice(paramDict[randomKey])
# FIX! what about ignored columns during kmeans
def simpleCheckKMeans(modelResult, parameters, numRows, numCols, labels):
    """Build a KMeansObj from a model result and return its headline values.

    All validation happens inside the KMeansObj constructor, which needs the
    row count, column count and labels in addition to the model result and
    the parameters; they are forwarded unchanged.

    Args:
        modelResult: KMeans model-builder result passed to KMeansObj.
        parameters: dict of the parameters used to build the model.
        numRows: expected total row count (falsy to skip that check).
        numCols: expected column count, already decremented by any ignored
            columns (falsy to skip that check).
        labels: expected column labels with the ignored columns removed, in
            the same order as the model's names (falsy to skip that check).

    Returns:
        Tuple ``(tuplesSorted, iterations, totss, names)`` read from the
        KMeansObj. To unzip ``tuplesSorted`` use
        ``ids, centers, size, withinss = zip(*tuplesSorted)``.
    """
    # KMeansObj.__init__ requires numRows, numCols and labels positionally;
    # leaving them out raises TypeError before any check can run.
    ko = KMeansObj(modelResult, parameters, numRows, numCols, labels)
    return ko.tuplesSorted, ko.iterations, ko.totss, ko.names
class KMeansObj(OutputObj):
    """KMeans model output wrapped as an object, validated on construction.

    The constructor hands ``kmeansResult['models'][0]['output']`` to
    OutputObj, then reads the resulting attributes (size, names, centers,
    withinss, ...) and checks them against what the caller expects.

    After construction ``self.tuplesSorted`` holds one
    ``(cluster_index, center, size, withinss)`` tuple per cluster, sorted by
    the sum of the center's coordinates.
    """

    def __init__(self, kmeansResult, parameters, numRows, numCols, labels, noPrint=False, **kwargs):
        """Validate a KMeans result and print a per-cluster summary.

        Args:
            kmeansResult: model-builder result; must contain
                ``['models'][0]['output']`` and may contain 'python_elapsed'.
            parameters: dict of build parameters; 'k' and 'max_iterations'
                are checked when present.
            numRows: expected sum of the cluster sizes (falsy skips the check).
            numCols: expected number of columns (falsy skips the check).
            labels: expected column names (falsy skips the check).
            noPrint: forwarded to OutputObj.
            **kwargs: accepted and ignored.

        Raises:
            AssertionError: when a consistency check fails.
            Exception: when a center coordinate is NaN.
        """
        modelOutput = kmeansResult['models'][0]['output']
        super(KMeansObj, self).__init__(modelOutput, "KMeans", noPrint=noPrint)

        # withinss is per cluster; the others are totals
        for metricName in ("withinss", "totss", "tot_withinss", "betweenss"):
            print(getattr(self, metricName))

        # should model builder add this to the kmeansResult?
        if 'python_elapsed' in kmeansResult:
            self.python_elapsed = kmeansResult['python_elapsed']

        # Several of these are read but not used further below.
        clusterSizes = self.size  # e.g. [78, 5, 41, 76]
        model_category = self.model_category  # e.g. Clustering
        iterationCount = self.iterations  # e.g. 11.0
        domains = self.domains
        columnNames = self.names
        categorical_column_count = self.categorical_column_count  # e.g. 0
        rawCenters = self.centers.data

        # h2o returns the centers sliced across clusters: skip entry 0 (the
        # cluster id?), transpose, and turn the strings into numbers.
        centers = [[float(value) for value in perCluster] for perCluster in zip(*rawCenters[1:])]

        clusterWithinss = self.withinss
        totss = self.totss

        if numRows:
            assert numRows == sum(clusterSizes)

        if 'k' in parameters:
            expectedK = parameters['k']
            assert len(centers) == expectedK
            assert len(clusterSizes) == expectedK

        if numCols:
            assert len(columnNames) == numCols, \
                "Need to pass correct numCols after ignored columns decrement %s %s %s" % (len(columnNames), numCols, columnNames)
            for center in centers:
                assert len(center) == numCols, "%s %s" % (len(center), numCols)

        # this should be true
        if labels:
            assert len(labels) == numCols, \
                "Need to pass correct labels and numCols after ignored columns removal %s %s" % (len(labels), numCols)
            assert len(labels) == len(columnNames), \
                "Need to pass correct labels after ignored columns removal %s %s" % (len(labels), len(columnNames))
            assert labels == columnNames

        if 'max_iterations' in parameters:
            iterationLimit = parameters['max_iterations']
            assert iterationLimit >= iterationCount

        # we could check the centers are within the min/max of each column
        for clusterIndex, center in enumerate(centers):
            for coordinate in center:
                if math.isnan(float(coordinate)):
                    raise Exception("cluster", clusterIndex, "has NaN:", coordinate, "center:", center)

        # One tuple per cluster, with the cluster index leftmost so it is not
        # lost when the tuples are reordered by the sum of each center.
        clusterTuples = list(zip(range(len(centers)), centers, clusterSizes, clusterWithinss))
        clusterTuples.sort(key=lambda entry: sum(entry[1]))
        self.tuplesSorted = clusterTuples

        print("iterations: %s" % (iterationCount,))

        # undo the zip for printing what the caller will see
        sortedIds, sortedCenters, sortedSizes, sortedWithinss = zip(*self.tuplesSorted)
        for position, center in enumerate(sortedCenters):
            print("cluster id %s (2 places): %s" % (sortedIds[position], h2o_util.twoDecimals(center)))
            # two spaces after the colon: the Python 2 print statement adds a
            # separator after a literal that already ends in a space
            print("rows_per_cluster[%s]:  %s" % (position, sortedSizes[position]))
            print("withinss[%s]:  %s" % (position, sortedWithinss[position]))
            print("size[%s]: %s" % (position, sortedSizes[position]))

        print("KMeansObj created for: ???")  # vars(self)

        # shouldn't have any errors
        check_sandbox_for_errors()
# This is all messed up now...really want it to just do predict and compare histogram, and also do the compare results to expected
# will have to fix all this (and don't overlap with simpleCheck above)
def bigCheckResults(kmeansObj, kmeans, csvPathname, parseResult, predictKey, **kwargs):
    """Generate predictions for the parsed data and compare per-cluster list lengths.

    NOTE(review): this function cannot run as written. ``model_key``,
    ``cluster_variances``, ``centers``, ``iterations`` and ``max_iter`` are
    neither parameters nor assigned anywhere in this function, so unless they
    exist as module globals outside the visible source the first statement
    raises NameError. ``kmeansObj``, ``kmeans``, ``csvPathname`` and
    ``kwargs`` are accepted but never read - presumably the missing values
    were meant to come from them; confirm before repairing.

    Args:
        kmeansObj: unused here.
        kmeans: unused here.
        csvPathname: unused here.
        parseResult: dict whose 'destination_key' names the data to score.
        predictKey: key under which the predictions are stored and summarized.
        **kwargs: unused here.

    Raises:
        Exception: if centers, the histogram counts and the cluster variances
            do not all have the same length.
    """
    # Score the parsed data with the model on the first node (model_key is undefined here, see NOTE above).
    predictResult = h2o_nodes.nodes[0].generate_predictions(data_key=parseResult['destination_key'], model_key=model_key, destination_key=predictKey)
    # Summarize the prediction frame; the histogram counts of its first summary are used as rows per cluster.
    summaryResult = h2o_nodes.nodes[0].summary_page(key=predictKey, timeoutSecs=120)
    hcnt = summaryResult['summaries'][0]['hcnt'] # histogram
    rows_per_cluster = hcnt
    # FIX! does the cluster order/naming match, compared to cluster variances
    sqr_error_per_cluster = cluster_variances
    # Only the lengths are compared; the values themselves are not checked.
    if (len(centers)!=len(rows_per_cluster) or len(centers)!=len(sqr_error_per_cluster)):
        raise Exception("centers, rows_per_cluster, sqr_error_per_cluster should all be same length %s, %s, %s" % \
            (len(centers), len(rows_per_cluster), len(sqr_error_per_cluster)))
    print "Did iterations: %s given max_iter: %s" % (iterations, max_iter)
    # shouldn't have to return a tuplesList from here any more
def compareResultsToExpected(tupleResultList, expected=None, allowedDelta=None, allowError=False, allowRowError=False):
    """Print the actual (and expected) cluster tuples and check that they agree.

    Every tuple is ``(cluster_id, center, rows, error)``. Both lists are
    sorted IN PLACE by the sum of the center's coordinates and are then
    compared position by position.

    Args:
        tupleResultList: list of actual cluster tuples; always printed, in a
            form that can be pasted back into an ``expected = [...]`` list.
        expected: optional list of expected cluster tuples.
        allowedDelta: sequence of three relative tolerances, for
            (center coordinates, rows, error). Must be given whenever
            ``expected`` is compared. Each tolerance is also used as the
            minimum absolute tolerance, so expected values near 0 still get
            a usable window.
        allowError: when true, nothing is compared (print only).
        allowRowError: when true, the row-count check and the error check are
            both skipped; only centers are compared.

    Raises:
        Whatever ``h2o_util.assertApproxEqual`` raises on a mismatch, and
        IndexError if ``expected`` has more entries than ``tupleResultList``.
    """
    def centerSum(clusterTuple):
        return sum(clusterTuple[1])

    # should be sorted already by center sum, but just in case...
    tupleResultList.sort(key=centerSum)

    if expected is not None:
        # sort expected, just in case, for the comparison
        expected.sort(key=centerSum)
        print("\nExpected:")
        for e in expected:
            print(e)

    print("\nActual:")
    for t in tupleResultList:
        # trailing " ," so the output can be cut and pasted into an expected = [..] list
        print("%s ," % (t,))

    if expected is None or allowError:
        return

    # now compare to expected, with some delta allowed
    for i, (expCid, expCenter, expRows, expError) in enumerate(expected):
        (actCid, actCenter, actRows, actError) = tupleResultList[i]

        for (a, b) in zip(expCenter, actCenter):  # compare list of floats
            absAllowedDelta = max(abs(allowedDelta[0] * a), allowedDelta[0])
            h2o_util.assertApproxEqual(a, b, tol=absAllowedDelta,
                msg="Center value expected: %s actual: %s delta > %s" % (a, b, absAllowedDelta))

        if not allowRowError and expRows:
            absAllowedDelta = max(abs(allowedDelta[1] * expRows), allowedDelta[1])
            h2o_util.assertApproxEqual(expRows, actRows, tol=absAllowedDelta,
                msg="Rows expected: %s actual: %s delta > %s" % (expRows, actRows, absAllowedDelta))

        if not allowRowError and expError:
            absAllowedDelta = max(abs(allowedDelta[2] * expError), allowedDelta[2])
            # Compare the errors themselves. The row counts were passed here
            # before, so an error mismatch could never be detected.
            h2o_util.assertApproxEqual(expError, actError, tol=absAllowedDelta,
                msg="Error expected: %s actual: %s delta > %s" % (expError, actError, absAllowedDelta))
# just print info on the distribution
def showClusterDistribution(tupleResultList, expected=None, allowedDelta=None, allowError=False, trial=0):
    """Print how rows and error are distributed over the clusters.

    Purely informational: nothing is compared or asserted. Every tuple is
    ``(cluster_id, center, rows, error)``.

    Side effect: each list that is not None is sorted IN PLACE by cluster id
    (the first tuple element), and that reordering is visible to the caller.

    Args:
        tupleResultList: list of actual cluster tuples, or None to skip.
        expected: optional list of expected cluster tuples.
        allowedDelta: unused; kept for signature compatibility.
        allowError: unused; kept for signature compatibility.
        trial: unused; kept for signature compatibility.
    """
    if expected is not None:
        _printClusterDistribution("Expected", expected)

    if tupleResultList is not None:
        _printClusterDistribution("Actual", tupleResultList)


def _printClusterDistribution(label, clusterTuples):
    """Sort clusterTuples by cluster id in place, then print each cluster's share of rows and error."""
    from operator import itemgetter

    clusterTuples.sort(key=itemgetter(0))

    print("\n%s distribution, rows and error:" % label)
    totalRows = 0
    totalError = 0
    for (_, _, rows, error) in clusterTuples:
        totalRows += rows
        totalError += error

    # now go thru again and print each share (a 0..1 fraction, not 0..100)
    print("totalRows: %s totalError: %s" % (totalRows, totalError))
    for (_, center, rows, error) in clusterTuples:
        # A zero total (e.g. every cluster has zero error) would divide by
        # zero; report a share of 0 instead of crashing an info-only printout.
        rowShare = rows / float(totalRows) if totalRows else 0.0
        errorShare = error / float(totalError) if totalError else 0.0
        print("%s pctRows: %0.2f pctError: %0.2f" % (center, rowShare, errorShare))
# compare this cluster centers to last one. since the files are concatenations,
# the results should be similar? 10% of first is allowed delta
def compareToFirstKMeans(self, centers, firstcenters):
    """Assert the current cluster centers are within 10% of the first run's.

    Meant for runs over files that are concatenations of the same data,
    where the centers should stay similar from one run to the next.

    Args:
        self: object providing ``assertAlmostEqual`` and
            ``assertGreaterEqual`` (presumably the calling unittest.TestCase;
            confirm against the callers).
        centers: list of centers, each an iterable of coordinates. Any other
            non-dict value is treated as one single center.
        firstcenters: centers of the first run, shaped like ``centers``.

    Raises:
        Exception: if ``centers`` is a dict.
        Whatever the two assert methods of ``self`` raise on a failed check.
    """
    # centers could be a list or not; a list is used as is, anything else is
    # wrapped so that both shapes can be walked the same way.
    if isinstance(centers, list):
        kList = centers
        firstkList = firstcenters
    elif isinstance(centers, dict):
        # The message used to append an undefined name, which raised
        # NameError instead of this exception.
        raise Exception("compareToFirstKMeans: Not expecting dict for centers")
    else:
        kList = [centers]
        firstkList = [firstcenters]

    print("kList: %s firstkList: %s" % (kList, firstkList))
    for k, firstk in zip(kList, firstkList):
        # too bad we can't do an assertAlmostEqual on the list directly..have to break them out
        for k1, firstk1 in zip(k, firstk):
            # 10% of the first run's value is the allowed delta
            delta = .1 * abs(float(firstk1))
            print("k1: %s firstk1: %s" % (k1, firstk1))
            msg = "Too large a delta (>" + str(delta) + ") comparing current and first cluster centers: " + \
                str(float(k1)) + ", " + str(float(firstk1))
            self.assertAlmostEqual(float(k1), float(firstk1), delta=delta, msg=msg)
            # abs() of any real number is >= 0.0, so this can only fail for NaN
            self.assertGreaterEqual(abs(float(k1)), 0.0, str(k1) + " abs not >= 0.0 in current")