diff --git a/Makefile.am b/Makefile.am index f61f8c3..fdd7a1a 100755 --- a/Makefile.am +++ b/Makefile.am @@ -4,9 +4,9 @@ default: pythonlibdir=$(libdir)/python -pythonlib_SCRIPTS= cbworkloadgen +pythonlib_SCRIPTS= healthChecker -PYTHON_TOOLS= wrapper/cbworkloadgen +PYTHON_TOOLS= wrapper/healthChecker ${PYTHON_TOOLS}: wrapper/wrapper cp $< $@ diff --git a/README b/README deleted file mode 100644 index e69de29..0000000 diff --git a/analyzer.py b/analyzer.py index a90b5b7..3a15183 100755 --- a/analyzer.py +++ b/analyzer.py @@ -1,12 +1,11 @@ +import sys import datetime -import dbaccessor -import util +import logging +import util_cli as util import cluster_stats -import bucket_stats import diskqueue_stats import node_stats - import stats_buffer from Cheetah.Template import Template @@ -29,68 +28,45 @@ cluster_symptoms = {} bucket_symptoms = {} bucket_node_symptoms = {} +bucket_node_status = {} node_symptoms = {} indicator_error = {} indicator_warn = {} node_disparate = {} -def format_output(counter, result): - if len(result) == 1: - if counter.has_key("unit") and counter["unit"] == "GB": - return util.pretty_float(result[0]) - else: - return result[0] - else: - return result - class StatsAnalyzer: - def __init__(self): - self.accessor = dbaccessor.DbAccesor() + def __init__(self, log): + self.log = log def run_analysis(self): - self.accessor.connect_db() - self.accessor.browse_db() for bucket in stats_buffer.buckets.iterkeys(): bucket_list.append(bucket) bucket_symptoms[bucket] = [] bucket_node_symptoms[bucket] = {} + bucket_node_status[bucket] = {} for capsule, package_name in capsules: for pill in capsule: - #print pill['name'] + self.log.debug(pill['name']) for counter in pill['ingredients']: - if counter['type'] == 'SQL': - result = eval("{0}.{1}().run(self.accessor, \"{2}\")".format(package_name, counter['code'], counter['stmt'])) - elif counter['type'] == 'pythonSQL': - result = eval("{0}.{1}().run(self.accessor)".format(package_name, counter['code'])) - elif counter['type'] == 'python': - result = eval("{0}.{1}().run(counter)".format(package_name, counter['code'])) - - #if counter.has_key("unit") and counter["unit"] == "GB": - # util.pretty_print({counter["description"] : result}) - #else: - # util.pretty_print({counter["description"] : result}) + result = eval("{0}.{1}().run(counter)".format(package_name, counter['code'])) - #print counter + self.log.debug(counter) if pill.has_key("clusterwise") and pill["clusterwise"] : if isinstance(result, dict): if result.has_key("cluster"): - if counter.has_key("unit") and counter["unit"] == "GB": - cluster_symptoms[counter["name"]] = {"description" : counter["description"], "value": util.humanize_bytes(result["cluster"])} - else: - cluster_symptoms[counter["name"]] = {"description" : counter["description"], "value":result["cluster"]} + cluster_symptoms[counter["name"]] = {"description" : counter["description"], "value":result["cluster"]} else: cluster_symptoms[counter["name"]] = {"description" : counter["description"], "value":result} else: cluster_symptoms[counter["name"]] = {"description" : counter["description"], "value":result} if pill.has_key("perBucket") and pill["perBucket"] : - #bucket_symptoms[counter["name"]] = {"description" : counter["description"], "value":result} for bucket, values in result.iteritems(): if bucket == "cluster": continue for val in values: - if val[0] == "variance": + if val[0] == "variance" or val[0] == "error": continue elif val[0] == "total": bucket_symptoms[bucket].append({"description" : counter["description"], "value" : values[-1][1]}) @@ -104,14 +80,49 @@ def run_analysis(self): if pill.has_key("nodewise") and pill["nodewise"]: node_list[counter["name"]] = {"description" : counter["description"], "value":result} - if pill.has_key("indicator") and pill["indicator"] : + if pill.has_key("indicator"): if len(result) > 0: for bucket,values in result.iteritems(): - if values.has_key("error"): - indicator_error[counter["name"]] = {"description" : counter["description"], "bucket": bucket, "value":values["error"]} - if values.has_key("warn"): - indicator_warn[counter["name"]] = {"description" : counter["description"], "bucket": bucket, "value":values["warn"]} - + if type(values) is dict: + if values.has_key("error"): + indicator_error[counter["name"]] = {"description" : counter["description"], + "bucket": bucket, + "value":values["error"], + "cause" : pill["indicator"]["cause"], + "impact" : pill["indicator"]["impact"], + "action" : pill["indicator"]["action"], + } + for val in values["error"]: + bucket_node_status[bucket][val["node"]] = "error" + + if values.has_key("warn"): + indicator_warn[counter["name"]] = {"description" : counter["description"], + "bucket": bucket, + "value":values["warn"], + "cause" : pill["indicator"]["cause"], + "impact" : pill["indicator"]["impact"], + "action" : pill["indicator"]["action"], + } + elif type(values) is list: + for val in values: + if val[0] == "error": + indicator_error[counter["name"]] = {"description" : counter["description"], + "bucket": bucket, + "value":val[1], + "cause" : pill["indicator"]["cause"], + "impact" : pill["indicator"]["impact"], + "action" : pill["indicator"]["action"], + } + for val in values["error"]: + bucket_node_status[bucket][val["node"]] = "error" + elif val[0] == "warn": + indicator_warn[counter["name"]] = {"description" : counter["description"], + "bucket": bucket, + "value":val[1], + "cause" : pill["indicator"]["cause"], + "impact" : pill["indicator"]["impact"], + "action" : pill["indicator"]["action"], + } if pill.has_key("nodeDisparate") and pill["nodeDisparate"] : for bucket,values in result.iteritems(): if bucket == "cluster": @@ -121,42 +132,49 @@ def run_analysis(self): continue; if val[0] == "variance" and val[1] != 0: node_disparate[counter["name"]] = {"description" : counter["description"], "bucket": bucket, "value":values} - - self.accessor.close() - self.accessor.remove_db() - - def run_report(self): + + if len(indicator_error) > 0: + globals["cluster_health"] = "error" + elif len(indicator_warn) > 0: + globals["cluster_health"] = "warning" + + def run_report(self, txtfile, htmlfile, verbose): dict = { "globals" : globals, "cluster_symptoms" : cluster_symptoms, "bucket_symptoms" : bucket_symptoms, "bucket_node_symptoms" : bucket_node_symptoms, + "bucket_node_status" : bucket_node_status, "node_symptoms" : node_symptoms, "node_list" : node_list, "bucket_list" : bucket_list, "indicator_warn" : indicator_warn, "indicator_error" : indicator_error, + "verbose" : verbose, } - debug = True - if debug: - print "Nodelist Overview" - util.pretty_print(node_list) + f = open(txtfile, 'w') + report = {} + report["Report Time"] = globals["report_time"].strftime("%Y-%m-%d %H:%M:%S") + + report["Nodelist Overview"] = node_list - print "Cluster Overview" - util.pretty_print(cluster_symptoms) - - print "Bucket Metrics" - util.pretty_print(bucket_symptoms) - - print "Bucket Node Metrics" - util.pretty_print(bucket_node_symptoms) + report["Cluster Overview"] = cluster_symptoms + + report["Bucket Metrics"] = bucket_symptoms + + report["Bucket Node Metrics"] = bucket_node_symptoms - print "Key indicators" - util.pretty_print(indicator_error) - util.pretty_print(indicator_warn) + report["Key indicators"] = (indicator_error, indicator_warn) - print "Node disparate" - util.pretty_print(node_disparate) - #print Template(file="report-htm.tmpl", searchList=[dict]) \ No newline at end of file + report["Node disparate"] = node_disparate + + print >> f, util.pretty_print(report) + f.close() + + f = open(htmlfile, 'w') + print >> f, Template(file="report-htm.tmpl", searchList=[dict]) + f.close() + + sys.stderr.write("\nThis run finishes successfully. Please find output result at " + htmlfile) \ No newline at end of file diff --git a/buckets.py b/buckets.py index 7ab5c28..2bfa108 100755 --- a/buckets.py +++ b/buckets.py @@ -7,11 +7,21 @@ rest_cmds = { 'bucket-list': '/pools/default/buckets', + 'bucket-flush': '/pools/default/buckets/', + 'bucket-delete': '/pools/default/buckets/', + 'bucket-create': '/pools/default/buckets/', + 'bucket-edit': '/pools/default/buckets/', + 'bucket-get': '/pools/default/buckets', 'bucket-stats': '/pools/default/buckets/{0}/stats?zoom=hour', 'bucket-node-stats': '/pools/default/buckets/{0}/stats/{1}?zoom={2}' } methods = { 'bucket-list': 'GET', + 'bucket-delete': 'DELETE', + 'bucket-create': 'POST', + 'bucket-edit': 'POST', + 'bucket-flush': 'POST', + 'bucket-get': 'GET', 'bucket-stats': 'GET', 'bucket-node-stats': 'GET', } @@ -58,13 +68,68 @@ def runCmd(self, cmd, server, port, # get the parameters straight + if cmd in ('bucket-create', 'bucket-edit'): + if bucketname: + rest.setParam('name', bucketname) + if bucketname == "default": + if bucketport and bucketport != "11211": + usage("default bucket must be on port 11211.") + if bucketpassword: + usage("default bucket should only have empty password.") + authtype = 'sasl' + else: + if bucketport == "11211": + authtype = 'sasl' + else: + authtype = 'none' + if bucketpassword: + usage("a sasl bucket is supported only on port 11211.") + if buckettype: + rest.setParam('bucketType', buckettype) + if authtype: + rest.setParam('authType', authtype) + if bucketport: + rest.setParam('proxyPort', bucketport) + if bucketpassword: + rest.setParam('saslPassword', bucketpassword) + if bucketramsize: + rest.setParam('ramQuotaMB', bucketramsize) + if bucketreplication: + rest.setParam('replicaNumber', bucketreplication) + if cmd in ('bucket-delete', 'bucket-flush', 'bucket-edit'): + self.rest_cmd = self.rest_cmd + bucketname + if cmd == 'bucket-flush': + self.rest_cmd = self.rest_cmd + '/controller/doFlush' + opts = {} - opts['error_msg'] = "unable to %s" % cmd + opts['error_msg'] = "unable to %s; please check your username (-u) and password (-p);" % cmd opts['success_msg'] = "%s" % cmd data = rest.restCmd(methods[cmd], self.rest_cmd, self.user, self.password, opts) - return rest.getJson(data) + if cmd in("bucket-get", "bucket-stats", "bucket-node-stats"): + return rest.getJson(data) + elif cmd == "bucket-list": + if output == 'json': + print data + else: + json = rest.getJson(data) + for bucket in json: + print '%s' % bucket['name'] + print ' bucketType: %s' % bucket['bucketType'] + print ' authType: %s' % bucket['authType'] + if bucket['authType'] == "sasl": + print ' saslPassword: %s' % bucket['saslPassword'] + else: + print ' proxyPort: %s' % bucket['proxyPort'] + print ' numReplicas: %s' % bucket['replicaNumber'] + print ' ramQuota: %s' % bucket['quota']['ram'] + print ' ramUsed: %s' % bucket['basicStats']['memUsed'] + else: + if output == 'json': + print rest.jsonMessage(data) + else: + print data class BucketStats: def __init__(self, bucket_name): @@ -102,4 +167,3 @@ def runCmd(self, cmd, server, port, data = rest.restCmd(methods[cmd], self.rest_cmd, user, password, opts) return rest.getJson(data) - diff --git a/cluster_stats.py b/cluster_stats.py index e7ee4da..5d476c2 100755 --- a/cluster_stats.py +++ b/cluster_stats.py @@ -1,18 +1,22 @@ -import dbaccessor import stats_buffer -import util +import util_cli as util -class ExecSQL: - def run(self, accessor, stmt): - result = accessor.execute(stmt) - return result[0] +class BucketSummary: + def run(self, accessor): + return stats_buffer.bucket_info class DGMRatio: def run(self, accessor): - hdd = accessor.execute("SELECT sum(usedbyData) FROM StorageInfo WHERE type='hdd'") - ram = accessor.execute("SELECT sum(usedbyData) FROM StorageInfo WHERE type='ram'") - if ram[0] > 0: - ratio = hdd[0] / ram[0] + result = [] + hdd_total = 0 + ram_total = 0 + for node, nodeinfo in stats_buffer.nodes.iteritems(): + if nodeinfo["StorageInfo"].has_key("hdd"): + hdd_total += nodeinfo['StorageInfo']['hdd']['usedByData'] + if nodeinfo["StorageInfo"].has_key("ram"): + ram_total += nodeinfo['StorageInfo']['ram']['usedByData'] + if ram_total > 0: + ratio = hdd_total / ram_total else: ratio = 0 return ratio @@ -26,6 +30,7 @@ def run(self, accessor): "curr_items": [], "vb_replica_curr_items": [], } + num_error = [] for counter in accessor["counter"]: values = stats_info[accessor["scale"]][counter] nodeStats = values["nodeStats"] @@ -39,8 +44,10 @@ def run(self, accessor): if replica[1] == 0: res.append((active[0], "No replica")) else: - ratio = 1.0 * active[1] / replica[1] + ratio = 1.0 * active[1] / replica[1] res.append((active[0], util.pretty_float(ratio))) + if ratio < accessor["threshold"]: + num_error.append({"node":active[0], "value": ratio}) active_total += active[1] replica_total += replica[1] if replica_total == 0: @@ -49,12 +56,16 @@ def run(self, accessor): ratio = active_total * 1.0 / replica_total cluster += ratio res.append(("total", util.pretty_float(ratio))) + if ratio != accessor["threshold"]: + num_error.append({"node":"total", "value": ratio}) + if len(num_error) > 0: + res.append(("error", num_error)) result[bucket] = res result["cluster"] = util.pretty_float(cluster / len(stats_buffer.buckets)) return result class OpsRatio: - def run(self, accessor): + def run(self, accessor): result = {} for bucket, stats_info in stats_buffer.buckets.iteritems(): ops_avg = { @@ -82,11 +93,11 @@ def run(self, accessor): write_total += write_ratio del_ratio = delete[1] * 100 / count del_total += del_ratio - res.append((read[0], "{0}:{1}:{2}".format(read_ratio, write_ratio, del_ratio))) + res.append((read[0], "{0}:{1}:{2}".format(int(read_ratio+.5), int(write_ratio+.5), int(del_ratio+.5)))) read_total /= len(ops_avg['cmd_get']) write_total /= len(ops_avg['cmd_set']) del_total /= len(ops_avg['delete_hits']) - res.append(("total", "{0}:{1}:{2}".format(read_total, write_total, del_total))) + res.append(("total", "{0}:{1}:{2}".format(int(read_total+.5), int(write_total+.5), int(del_total+.5)))) result[bucket] = res return result @@ -104,18 +115,24 @@ def run(self, accessor): trend = [] total = 0 data = [] + num_error = [] for node, vals in nodeStats.iteritems(): - a, b = util.linreg(timestamps, vals) - value = a * timestamps[-1] + b + #a, b = util.linreg(timestamps, vals) + value = sum(vals) / samplesCount total += value + if value > accessor["threshold"]: + num_error.append({"node":node, "value":value}) trend.append((node, util.pretty_float(value))) data.append(value) total /= len(nodeStats) trend.append(("total", util.pretty_float(total))) trend.append(("variance", util.two_pass_variance(data))) + if len(num_error) > 0: + trend.append(("error", num_error)) cluster += total result[bucket] = trend - result["cluster"] = util.pretty_float(cluster / len(stats_buffer.buckets)) + if len(stats_buffer.buckets) > 0: + result["cluster"] = util.pretty_float(cluster / len(stats_buffer.buckets)) return result class MemUsed: @@ -133,8 +150,9 @@ def run(self, accessor): data = [] for node, vals in nodeStats.iteritems(): avg = sum(vals) / samplesCount - trend.append((node, util.pretty_float(avg))) + trend.append((node, util.size_label(avg))) data.append(avg) + #print data trend.append(("variance", util.two_pass_variance(data))) result[bucket] = trend return result @@ -142,6 +160,8 @@ def run(self, accessor): class ItemGrowth: def run(self, accessor): result = {} + start_cluster = 0 + end_cluster = 0 for bucket, stats_info in stats_buffer.buckets.iteritems(): trend = [] values = stats_info[accessor["scale"]][accessor["counter"]] @@ -155,16 +175,17 @@ def run(self, accessor): trend.append((node, 0)) else: start_val = b + start_cluster += b end_val = a * timestamps[-1] + b + end_cluster += end_val rate = (end_val * 1.0 / b - 1.0) * 100 - trend.append((node, util.pretty_float(rate))) + trend.append((node, util.pretty_float(rate) + "%")) result[bucket] = trend + if len(stats_buffer.buckets) > 0: + rate = (end_cluster * 1.0 / start_cluster - 1.0) * 100 + result["cluster"] = util.pretty_float(rate) + "%" return result -class AvgItemSize: - def run(self, accessor): - return 0 - class NumVbuckt: def run(self, accessor): result = {} @@ -174,21 +195,98 @@ def run(self, accessor): nodeStats = values["nodeStats"] for node, vals in nodeStats.iteritems(): if vals[-1] < accessor["threshold"]: - num_error.append({"node":node, "value":vals[-1]}) + num_error.append({"node":node, "value": int(vals[-1])}) if len(num_error) > 0: result[bucket] = {"error" : num_error} return result +class RebalanceStuck: + def run(self, accessor): + result = {} + for bucket, bucket_stats in stats_buffer.node_stats.iteritems(): + num_error = [] + for node, stats_info in bucket_stats.iteritems(): + for key, value in stats_info.iteritems(): + if key.find(accessor["counter"]) >= 0: + if accessor.has_key("threshold"): + if int(value) > accessor["threshold"]: + num_error.append({"node":node, "value": (key, value)}) + else: + num_error.append({"node":node, "value": (key, value)}) + if len(num_error) > 0: + result[bucket] = {"error" : num_error} + return result + +class MemoryFramentation: + def run(self, accessor): + result = {} + for bucket, bucket_stats in stats_buffer.node_stats.iteritems(): + num_error = [] + for node, stats_info in bucket_stats.iteritems(): + for key, value in stats_info.iteritems(): + if key.find(accessor["counter"]) >= 0: + if accessor.has_key("threshold"): + if int(value) > accessor["threshold"]: + if accessor.has_key("unit"): + if accessor["unit"] == "time": + num_error.append({"node":node, "value": (key, util.time_label(value))}) + elif accessor["unit"] == "size": + num_error.append({"node":node, "value": (key, util.size_label(value))}) + else: + num_error.append({"node":node, "value": (key, value)}) + else: + num_error.append({"node":node, "value": (key, value)}) + if len(num_error) > 0: + result[bucket] = {"error" : num_error} + return result + +class EPEnginePerformance: + def run(self, accessor): + result = {} + for bucket, bucket_stats in stats_buffer.node_stats.iteritems(): + num_error = [] + for node, stats_info in bucket_stats.iteritems(): + for key, value in stats_info.iteritems(): + if key.find(accessor["counter"]) >= 0: + if accessor.has_key("threshold"): + if accessor["counter"] == "flusherState" and value != accessor["threshold"]: + num_error.append({"node":node, "value": (key, value)}) + elif accessor["counter"] == "flusherCompleted" and value == accessor["threshold"]: + num_error.append({"node":node, "value": (key, value)}) + else: + if value > accessor["threshold"]: + num_error.append({"node":node, "value": (key, value)}) + if len(num_error) > 0: + result[bucket] = {"error" : num_error} + return result + +class TotalDataSize: + def run(self, accessor): + result = [] + total = 0 + for node, nodeinfo in stats_buffer.nodes.iteritems(): + if nodeinfo["StorageInfo"].has_key("hdd"): + total += nodeinfo['StorageInfo']['hdd']['usedByData'] + result.append(util.size_label(total)) + return result + +class AvailableDiskSpace: + def run(self, accessor): + result = [] + total = 0 + for node, nodeinfo in stats_buffer.nodes.iteritems(): + if nodeinfo["StorageInfo"].has_key("hdd"): + total += nodeinfo['StorageInfo']['hdd']['free'] + result.append(util.size_label(total)) + return result + ClusterCapsule = [ {"name" : "TotalDataSize", "ingredients" : [ { "name" : "totalDataSize", "description" : "Total Data Size across cluster", - "type" : "SQL", - "stmt" : "SELECT sum(usedbyData) FROM StorageInfo WHERE type='hdd'", - "code" : "ExecSQL", - "unit" : "GB", + "code" : "TotalDataSize", } ], "clusterwise" : True, @@ -200,10 +298,7 @@ def run(self, accessor): { "name" : "availableDiskSpace", "description" : "Available disk space", - "type" : "SQL", - "stmt" : "SELECT sum(free) FROM StorageInfo WHERE type='hdd'", - "code" : "ExecSQL", - "unit" : "GB", + "code" : "AvailableDiskSpace", } ], "clusterwise" : True, @@ -216,17 +311,19 @@ def run(self, accessor): "name" : "cacheMissRatio", "description" : "Cache miss ratio", "counter" : "ep_cache_miss_rate", - "type" : "python", "scale" : "hour", "code" : "CacheMissRatio", - "unit" : "percentage", "threshold" : 2, }, ], "clusterwise" : True, "perNode" : True, "perBucket" : True, - "indicator" : False, + "indicator" : { + "cause" : "blah", + "impact" : "blah", + "action" : "blah", + }, "nodeDisparate" : True, }, {"name" : "DGM", @@ -234,7 +331,6 @@ def run(self, accessor): { "name" : "dgm", "description" : "Disk to Memory Ratio", - "type" : "pythonSQL", "code" : "DGMRatio" }, ], @@ -246,28 +342,33 @@ def run(self, accessor): "ingredients" : [ { "name" : "activeReplicaResidencyRatio", - "description" : "Active and Replica Residentcy Ratio", - "type" : "python", + "description" : "Active and Replica Resident Ratio", "counter" : ["curr_items", "vb_replica_curr_items"], "scale" : "minute", "code" : "ARRatio", + "threshold" : 1, }, ], "clusterwise" : True, "perNode" : True, "perBucket" : True, + "indicator" : { + "cause" : "blah", + "impact" : "blah", + "action" : "blah", + }, }, {"name" : "OPSPerformance", "ingredients" : [ { "name" : "opsPerformance", "description" : "Read/Write/Delete ops ratio", - "type" : "python", "scale" : "minute", "counter" : ["cmd_get", "cmd_set", "delete_hits"], "code" : "OpsRatio", }, - ] + ], + "perBucket" : True, }, {"name" : "GrowthRate", "ingredients" : [ @@ -275,23 +376,12 @@ def run(self, accessor): "name" : "dataGrowthRateForItems", "description" : "Data Growth rate for items", "counter" : "curr_items", - "type" : "python", "scale" : "day", "code" : "ItemGrowth", "unit" : "percentage", }, - ] - }, - {"name" : "AverageDocumentSize", - "ingredients" : [ - { - "name" : "averageDocumentSize", - "description" : "Average Document Size", - "type" : "python", - "code" : "AvgItemSize", - "unit" : "KB", - }, - ] + ], + "clusterwise" : True, }, {"name" : "VBucketNumber", "ingredients" : [ @@ -299,7 +389,6 @@ def run(self, accessor): "name" : "activeVbucketNumber", "description" : "Active VBucket number is less than expected", "counter" : "vb_active_num", - "type" : "python", "scale" : "hour", "code" : "NumVbuckt", "threshold" : 1024, @@ -308,28 +397,136 @@ def run(self, accessor): "name" : "replicaVBucketNumber", "description" : "Replica VBucket number is less than expected", "counter" : "vb_replica_num", - "type" : "python", "scale" : "hour", "code" : "NumVbuckt", "threshold" : 1024, }, ], - "indicator" : True, + "indicator" : { + "cause" : "blah", + "impact" : "blah", + "action" : "blah", + }, }, {"name" : "MemoryUsage", "ingredients" : [ { "name" : "memoryUsage", - "description" : "Check if memory usage and/or fragmentaion", - "type" : "python", + "description" : "Check memory usage", "counter" : "mem_used", "scale" : "hour", "code" : "MemUsed", }, ], - "perNode" : True, "nodeDisparate" : True, }, + {"name" : "RebalancePerformance", + "ingredients" : [ + { + "name" : "rebalanceStuck", + "description" : "Check if rebalance is stuck", + "counter" : "idle", + "code" : "RebalanceStuck", + }, + { + "name" : "highBackfillRemaing", + "description" : "Tap queue backfilll remaining is too high", + "counter" : "ep_tap_queue_backfillremaining", + "code" : "RebalanceStuck", + "threshold" : 1000, + }, + ], + "indicator" : { + "cause" : "blah", + "impact" : "blah", + "action" : "blah", + } + }, + {"name" : "MemoryFragmentation", + "ingredients" : [ + { + "name" : "totalFragmentation", + "description" : "Total memory fragmentation", + "counter" : "total_fragmentation_bytes", + "code" : "MemoryFramentation", + "unit" : "size", + "threshold" : 1073741824, # 1GB + }, + { + "name" : "diskDelete", + "description" : "Averge disk delete time", + "counter" : "disk_del", + "code" : "MemoryFramentation", + "unit" : "time", + "threshold" : 1000 #1ms + }, + { + "name" : "diskUpdate", + "description" : "Averge disk update time", + "counter" : "disk_update", + "code" : "MemoryFramentation", + "unit" : "time", + "threshold" : 1000 #1ms + }, + { + "name" : "diskInsert", + "description" : "Averge disk insert time", + "type" : "python", + "counter" : "disk_insert", + "code" : "MemoryFramentation", + "unit" : "time", + "threshold" : 1000 #1ms + }, + { + "name" : "diskCommit", + "description" : "Averge disk commit time", + "counter" : "disk_commit", + "code" : "MemoryFramentation", + "unit" : "time", + "threshold" : 5000000 #10s + }, + ], + "indicator" : { + "cause" : "blah", + "impact" : "blah", + "action" : "blah", + }, + }, + {"name" : "EPEnginePerformance", + "ingredients" : [ + { + "name" : "flusherState", + "description" : "Engine flusher state", + "counter" : "ep_flusher_state", + "code" : "EPEnginePerformance", + "threshold" : "running", + }, + { + "name" : "flusherCompleted", + "description" : "Flusher completed", + "counter" : "ep_flusher_num_completed", + "code" : "EPEnginePerformance", + "threshold" : 0 + }, + { + "name" : "avgItemLoadTime", + "description" : "Average item loaded time", + "counter" : "ep_bg_load_avg", + "code" : "EPEnginePerformance", + "threshold" : 100, + }, + { + "name" : "avgItemWaitTime", + "description" : "Averge item waited time", + "counter" : "ep_bg_wait_avg", + "code" : "EPEnginePerformance", + "threshold" : 100 + }, + ], + "indicator" : { + "cause" : "blah", + "impact" : "blah", + "action" : "blah", + }, + }, ] - - diff --git a/configure.ac b/configure.ac index aeda7d6..7a97088 100755 --- a/configure.ac +++ b/configure.ac @@ -1,10 +1,10 @@ -# workload-generator -# Copyright (C) 2011 Couchbase, INC +# health-checker +# Copyright (C) 2012 Couchbase, INC # All rights reserved. # AC_PREREQ(2.59) m4_include([m4/version.m4]) -AC_INIT(workload-generator, VERSION_NUMBER, bin@couchbase.com) +AC_INIT(healthChecker, VERSION_NUMBER, bin@couchbase.com) AC_CONFIG_AUX_DIR(config) AM_INIT_AUTOMAKE AC_CONFIG_FILES(Makefile wrapper/wrapper) diff --git a/dbaccessor.py b/dbaccessor.py index c39fb83..4c134da 100755 --- a/dbaccessor.py +++ b/dbaccessor.py @@ -41,14 +41,6 @@ def create_databases(self): self.cursor.execute(""" CREATE UNIQUE INDEX IF NOT EXISTS server_idx on ServerNode(host, port, master) """) - self.cursor.execute(""" CREATE TABLE IF NOT EXISTS DiskInfo ( - diskInfoId INTEGER PRIMARY KEY, - path TEXT NOT NULL, - sizeBytes INTEGER, - usagePercent INTEGER, - serverId INTEGER, - FOREIGN KEY(serverId) REFERENCES ServerNode(serverId))""") - self.cursor.execute(""" CREATE TABLE IF NOT EXISTS MemoryInfo ( memoryInfoId INTEGER PRIMARY KEY, allocated INTEGER, @@ -163,7 +155,7 @@ def process_node_stats(self, nodeId, nodeInfo): hdd['usedByData'], nodeId)); ram = nodeInfo['storageTotals']['ram'] - if hdd is not None: + if ram is not None: self.cursor.execute(sqlstmt.format('ram', hdd['free'], hdd['quotaTotal'], diff --git a/diskqueue_stats.py b/diskqueue_stats.py index 9ca6839..38555c9 100755 --- a/diskqueue_stats.py +++ b/diskqueue_stats.py @@ -1,7 +1,5 @@ -import dbaccessor import stats_buffer -import util -counter_name = 'disk_write_queue' +import util_cli as util class AvgDiskQueue: def run(self, accessor): @@ -102,20 +100,18 @@ def run(self, accessor): "counter" : "disk_write_queue", "pernode" : True, "scale" : "minute", - "type" : "python", "code" : "AvgDiskQueue", "threshold" : { "low" : 50000000, "high" : 1000000000 }, - }, + }, { "name" : "diskQueueTrend", "description" : "Persistence severely behind - disk write queue continues growing", "counter" : "disk_write_queue", "pernode" : True, "scale" : "hour", - "type" : "python", "code" : "DiskQueueTrend", "threshold" : { "low" : 0, @@ -123,7 +119,11 @@ def run(self, accessor): }, }, ], - "indicator" : True, + "indicator" : { + "cause" : "blah", + "impact" : "blah", + "action" : "blah", + }, }, {"name" : "ReplicationTrend", "ingredients" : [ @@ -133,7 +133,6 @@ def run(self, accessor): "counter" : "ep_tap_total_total_backlog_size", "pernode" : True, "scale" : "hour", - "type" : "python", "code" : "TapQueueTrend", "threshold" : { "low" : 0, @@ -141,7 +140,11 @@ def run(self, accessor): }, } ], - "indicator" : True, + "indicator" : { + "cause" : "blah", + "impact" : "blah", + "action" : "blah", + }, }, {"name" : "DiskQueueDrainingAnalysis", "description" : "", @@ -152,20 +155,18 @@ def run(self, accessor): "counter" : ["vb_active_queue_drain", "disk_write_queue"], "pernode" : True, "scale" : "minute", - "type" : "python", "code" : "DiskQueueDrainingRate", "threshold" : { "drainRate" : 0, "diskLength" : 100000, }, - }, + }, { "name" : "replicaDiskQueueDrainRate", "description" : "Persistence severely behind - replica disk queue draining rate is below threshold", "counter" : ["vb_replica_queue_drain", "disk_write_queue"], "pernode" : True, "scale" : "minute", - "type" : "python", "code" : "DiskQueueDrainingRate", "threshold" : { "drainRate" : 0, @@ -173,6 +174,10 @@ def run(self, accessor): }, }, ], - "indicator" : True, + "indicator" : { + "cause" : "blah", + "impact" : "blah", + "action" : "blah", + } }, ] \ No newline at end of file diff --git a/healthChecker.py b/healthChecker.py index c1930b9..c99585d 100644 --- a/healthChecker.py +++ b/healthChecker.py @@ -6,32 +6,32 @@ import os import traceback import copy +import logging -import dbaccessor +import collector import analyzer import stats_buffer -import util - -import listservers -import buckets -import node -import info import util_cli as util -import mc_bin_client -import simplejson import node_map +log = logging.getLogger('healthChecker') +log.setLevel(logging.INFO) +log.addHandler(logging.StreamHandler()) + def parse_opt(): - (cluster, user, password) = ('', '','') + (cluster, user, password, txtfile, htmlfile, verbose) = ('', '', '', 'kpi_report.txt', 'health_report.html', True) try: (opts, _args) = getopt.getopt(sys.argv[1:], - 'c:dp:u:', [ + 'c:dvp:u:t:h:', [ 'cluster=', 'debug', + 'verbose', 'password=', - 'user=' + 'user=', + 'txt=', + 'html=', ]) except getopt.GetoptError, err: usage(err) @@ -44,129 +44,47 @@ def parse_opt(): if opt in ('-p', '--password'): password = arg if opt in ('-d', '--debug'): - debug = True + log.setLevel(logging.DEBUG) + if opt in ('-t', '--txt'): + txtfile = arg + if opt in ('-h', '--html'): + htmlfile = arg + if not cluster: - usage("please provide a CLUSTER, or use -h for more help.") - return (cluster, user, password, opts) + usage() + return (cluster, user, password, txtfile, htmlfile, verbose, opts) -def get_stats(mc, stats): - try: - node_stats = mc.stats('') - if node_stats: - for key, val in node_stats.items(): - stats[key] = val - except Exception, err: - #print "ERROR: command: %s: %s:%d, %s" % ('stats all', server, port, err) - traceback.print_exc() - #sys.exit(1) - - try: - node_stats = mc.stats('tap') - if node_stats: - for key, val in node_stats.items(): - stats[key] = val - except Exception, err: - #print "ERROR: command: %s: %s:%d, %s" % ('stats tap', server, port, err) - traceback.print_exc() - #sys.exit(1) - -def stats_formatter(stats, prefix=" ", cmp=None): - if stats: - longest = max((len(x) + 2) for x in stats.keys()) - for stat, val in sorted(stats.items(), cmp=cmp): - s = stat + ":" - print "%s%s%s" % (prefix, s.ljust(longest), val) - -def collect_data(): - - (cluster, user, password, opts) = parse_opt() - server, port = util.hostport(cluster) - - nodes = [] - commands = { - 'host-list' : listservers.ListServers, - 'server-info' : info.Info, - 'bucket-list' : buckets.Buckets, - 'bucket-stats' : buckets.BucketStats, - 'bucket-node-stats' : buckets.BucketNodeStats, - } - - accessor = dbaccessor.DbAccesor() - - accessor.connect_db() - accessor.create_databases(); - - #get node list and its status - try: - cmd = 'host-list' - c = commands[cmd]() - nodes = c.runCmd(cmd, server, port, user, password, opts) - except Exception, err: - print "ERROR: command: %s: %s:%d, %s" % (cmd, server, port, err) - sys.exit(1) - - #get each node information - try: - cmd = 'server-info' - c = commands[cmd]() - for node in nodes: - (node_server, node_port) = util.hostport(node['hostname']) - if node_map.address_map.has_key(node_server): - node_server = node_map.address_map[node_server] - nodeid = accessor.create_or_update_node(node_server, node_port, node['status'], server) - if node['status'] == 'healthy': - node_info = c.runCmd(cmd, node_server, node_port, user, password, opts) - accessor.process_node_stats(nodeid, node_info) - #stats = {} - #mc = mc_bin_client.MemcachedClient(node_server, node['ports']['direct']) - #get_stats(mc, stats) - else: - print "Unhealthy node: %s:%s" %(node_server, node['status']) - except Exception, err: - traceback.print_exc() - #print "ERROR: command: %s: %s:%d, %s" % (cmd, server, port, err) - sys.exit(1) - - #get each bucket information - try: - cmd = 'bucket-list' - c = commands[cmd]() - json = c.runCmd(cmd, server, port, user, password, opts) - for bucket in json: - (bucket_name, bucket_id) = accessor.process_bucket(bucket, server) - - # get bucket related stats - cmd = 'bucket-stats' - c = buckets.BucketStats(bucket_name) - json = c.runCmd(cmd, server, port, user, password, opts) - stats_buffer.buckets_summary[bucket_name] = json - - #retrieve bucket stats per node - stats_buffer.buckets[bucket_name] = copy.deepcopy(stats_buffer.stats) - cmd = 'bucket-node-stats' - for scale, stat_set in stats_buffer.buckets[bucket_name].iteritems(): - for stat in stat_set.iterkeys(): - print "retieving: ", stat, " scale:", scale - c = buckets.BucketNodeStats(bucket_name, stat, scale) - json = c.runCmd(cmd, server, port, user, password, opts) - stats_buffer.buckets[bucket_name][scale][stat] = json - #accessor.process_bucket_node_stats(bucket_id, server, stat, json) - except Exception, err: - traceback.print_exc() - #print "ERROR: command: %s: %s:%d, %s" % (cmd, server, port, err) - sys.exit(1) - - accessor.close() +def usage(error_msg=''): + if error_msg: + print "ERROR: %s" % error_msg + sys.exit(2) + + print """healthChecker - cluster key performance indicator stats + +usage: healthChecker CLUSTER OPTIONS + +CLUSTER: + --cluster=HOST[:PORT] or -c HOST[:PORT] + +OPTIONS: + -u USERNAME, --user=USERNAME admin username of the cluster + -p PASSWORD, --password=PASSWORD admin password of the cluster + -o FILENAME, --output=FILENAME Default output filename is 'kpi_report.txt' + -d --debug + -v --verbose Display detailed node level information +""" + sys.exit(2) def main(): - + (cluster, user, password, txtfile, htmlfile, verbose, opts) = parse_opt() #make snapshot for the current cluster status - collect_data() + retriever = collector.StatsCollector(log) + retriever.collect_data(cluster, user, password, opts) #analyze the snapshot and historic data - performer = analyzer.StatsAnalyzer() + performer = analyzer.StatsAnalyzer(log) performer.run_analysis() - performer.run_report() + performer.run_report(txtfile, htmlfile, verbose) if __name__ == '__main__': main() diff --git a/htmlreport.tmpl b/htmlreport.tmpl deleted file mode 100755 index 7f67a72..0000000 --- a/htmlreport.tmpl +++ /dev/null @@ -1,237 +0,0 @@ - -
Couchbase Cluster Health Check Report
-Tool Version: $globals['versions']
-Execution Time: $globals['report_time']
-Overall cluster health: $globals.cluster_health
-Section 1 - Couchbase – Alerts
-Cluster-wide metrics
- -1. Persistence severely behind - Immediate Action Needed
-• | -Symptomo | -1 million items | -
- | Disk write queue has reached | -- |
oDrain rate has slowed down to
-•Causes - Disk write queue is backed-up, I/O rates unable to sustain write rates
-•Impact - If the node goes down, data will be lost
-•Action -
-Section 2 - Couchbase Cluster Overview
-• Node list
-Node IP | -Couchbase Server Version | -Status | -
$node["ip"] | -$node["version"] | -$node["status"] | -
• Total number of nodes in the cluster: $node_list["numNodes"]["value"] o Number of nodes down: $node_list["numDownNodes"]["value"]
-o Number of nodes warming up: $node_list["numWarmupNodes"]["value"] o Number of nodes failed over: $node_list["numFailedOverNodes"]["value"]
-Cluster-wide metrics
-$value["description"] | -$value["value"] | -
Bucket metrics
-#for $bucket in $bucket_list -Bucket name: $bucket
-Status – Attention needed
-- | $symptom["description"] | -$symptom["value"] | -
- | Node-level information | -- |
- | IP address: $node | -- |
- | Status – OK | -- |
- | - | - |
- | $node_value["description"] | -$node_value["value"] | -
Section 3 - Couchbase – Warning Indicators
-Cluster-wide metrics
-1.Replica Resident ratio approaching alert levels
-•Symptom - Replica Resident ratio decreased over 24 hours to 0.50
-•Cause -
-•Impact - Failing over a node will slow down cluster severely because a backfill from disk will be required and will result in eviction of active items on node)
-•Action -
-Section 1 - Couchbase – Alerts
Cluster-wide metrics
#for $counter, $error_values in $indicator_error.iteritems(): -1. $error_values["description"]
+. $error_values["description"]
• | Symptomo | -+ | + | |
+ | Node: | +$err_val["node"] | ||
- | Disk write queue has reached | -+ | Value: | +$err_val["value"] | +
• | +Causes - | +$error_values["cause"] | +||
• | +Impact - | +$error_values["impact"] | +||
• | +Action - | +$error_values["action"] |
oDrain rate has slowed down to
-•Causes -
-•Impact -
-•Action -
#end forSection 2 - Couchbase Cluster Overview
• Node list
@@ -168,8 +182,10 @@ body {margin-top: 0px;margin-left: 0px;} #end for -• Total number of nodes in the cluster: $node_list["numNodes"]["value"] o Number of nodes down: $node_list["numDownNodes"]["value"]
-o Number of nodes warming up: $node_list["numWarmupNodes"]["value"] o Number of nodes failed over: $node_list["numFailedOverNodes"]["value"]
+• Total number of nodes in the cluster: $node_list["numNodes"]["value"]
+• Number of down nodes: $node_list["numDownNodes"]["value"]
+• Number of warmingup nodes: $node_list["numWarmupNodes"]["value"]
+• Number of failed over nodes: $node_list["numFailedOverNodes"]["value"]
Section 3 - Couchbase – Warning Indicators
Cluster-wide metrics
#for $counter, $warn_values in $indicator_warn.iteritems(): -1.$warn_values["description"]
-•Symptom -
+ . $warn_values["description"] •Cause - •Impact - •Action -
+
#end for
+
#for $warn_val in $warn_values["value"]
-($warn_val["node"], $warn_val["value"])
+•
+ Symptomo
+
+
+
+
+ Node:
+ $warn_val["node"]
+
+
#end for
-
-
+ Value:
+ $warn_val["value"]
+
+
+•
+ Causes -
+ $warn_values["cause"]
+
+
+•
+ Impact -
+ $warn_values["impact"]
+
+
+•
+ Action -
+ $warn_values["action"]
+