
Create low/high threshold for performance counters

Treat it as a warning when a counter only crosses the low threshold, and as an
error when it goes above the high watermark.
1 parent 5f63260, commit 655fe26eb3dbc60fb508b2fd5e28af6ec282cfc6, @bcui6611 committed Aug 17, 2012
Showing with 113 additions and 39 deletions.
  1. +64 −19 cluster_stats.py
  2. +0 −8 node_stats.py
  3. +1 −1 prescription.py
  4. +28 −7 threshold.py
  5. +20 −4 util_cli.py
83 cluster_stats.py
@@ -461,25 +461,31 @@ def run(self, accessor, scale, threshold=None):
threshold_val = accessor["threshold"]
for bucket, bucket_stats in stats_buffer.node_stats.iteritems():
num_error = []
+ num_warn = []
trend = []
for node, stats_info in bucket_stats.iteritems():
for key, value in stats_info.iteritems():
if key == accessor["counter"]:
if accessor.has_key("threshold") and not isinstance(value, dict):
value = int(value)
- if value > threshold_val:
+ if value > threshold_val["low"]:
+ val_threshold = threshold_val["low"]
+ if value > threshold_val["high"]:
+ val_threshold = threshold_val["high"]
symptom = ""
if accessor.has_key("unit"):
if accessor["unit"] == "time":
- symptom = accessor["symptom"].format(util.time_label(value), util.time_label(threshold_val))
+ symptom = accessor["symptom"].format(util.time_label(value), util.time_label(val_threshold))
elif accessor["unit"] == "size":
- symptom = accessor["symptom"].format(util.size_label(value), util.size_label(threshold_val))
+ symptom = accessor["symptom"].format(util.size_label(value), util.size_label(val_threshold))
else:
- symptom = accessor["symptom"].format(value, threshold_val)
- num_error.append({"node":node, "value": symptom})
+ symptom = accessor["symptom"].format(value, val_threshold)
else:
- symptom = accessor["symptom"].format(value, threshold_val)
+ symptom = accessor["symptom"].format(value, val_threshold)
+ if value > threshold_val["high"]:
num_error.append({"node":node, "value": symptom})
+ else:
+ num_warn.append({"node":node, "value": symptom})
if accessor.has_key("unit"):
if accessor["unit"] == "time":
trend.append((node, {"value":util.time_label(value), "raw":value}))
@@ -490,6 +496,10 @@ def run(self, accessor, scale, threshold=None):
if len(num_error) > 0:
trend.append(("error", num_error))
result[bucket] = trend
+ elif len(num_warn) > 0:
+ trend.append(("warn", num_warn))
+ result[bucket] = trend
+
return result
class EPEnginePerformance:
@@ -501,6 +511,7 @@ def run(self, accessor, scale, threshold=None):
threshold_val = accessor["threshold"]
for bucket, bucket_stats in stats_buffer.node_stats.iteritems():
num_error = []
+ num_warn = []
for node, stats_info in bucket_stats.iteritems():
for key, value in stats_info.iteritems():
if key.find(accessor["counter"]) >= 0:
@@ -512,19 +523,29 @@ def run(self, accessor, scale, threshold=None):
if int(value) == threshold_val:
num_error.append({"node":node, "value": accessor["symptom"]})
else:
- if value > threshold_val:
+ value = int(value)
+ if value > threshold_val["low"]:
+ val_threshold = threshold_val["low"]
+ if value > threshold_val["high"]:
+ val_threshold = threshold_val["high"]
if accessor.has_key("unit"):
if accessor["unit"] == "time":
- symptom = accessor["symptom"].format(util.time_label(int(value)), util.time_label(threshold_val))
+ symptom = accessor["symptom"].format(util.time_label(value), util.time_label(val_threshold))
elif accessor["unit"] == "size":
- symptom = accessor["symptom"].format(util.size_label(int(value)), util.size_label(threshold_val))
+ symptom = accessor["symptom"].format(util.size_label(value), util.size_label(val_threshold))
else:
- symptom = accessor["symptom"].format(value, threshold_val)
+ symptom = accessor["symptom"].format(value, val_threshold)
else:
- symptom = accessor["symptom"].format(value, threshold_val)
- num_error.append({"node":node, "value": symptom})
+ symptom = accessor["symptom"].format(value, val_threshold)
+ if value > threshold_val["high"]:
+ num_error.append({"node":node, "value": symptom})
+ else:
+ num_warn.append({"node":node, "value": symptom})
if len(num_error) > 0:
result[bucket] = {"error" : num_error}
+ elif len(num_warn) > 0:
+ result[bucket] = {"warn" : num_warn}
+
return result
class TotalDataSize:
@@ -793,12 +814,17 @@ def run(self, accessor, scale, threshold=None):
"counter" : "total_fragmentation_bytes",
"code" : "CalcFragmentation",
"unit" : "size",
- "threshold" : 1073741824, # 1GB
+ "threshold" : {
+ "low" : 1073741824, # 1GB
+ "high" : 2147483648, # 2GB
+ },
"symptom" : "Total memory fragmentation '{0}' is larger than '{1}'",
"formula" : "total_fragmentation_bytes > threshold",
},
],
"indicator" : True,
+ "perNode" : True,
+ "perBucket" : True,
},
{"name" : "DiskFragmentation",
"ingredients" : [
@@ -808,7 +834,10 @@ def run(self, accessor, scale, threshold=None):
"counter" : "disk_del",
"code" : "CalcFragmentation",
"unit" : "time",
- "threshold" : 1000, #1ms
+ "threshold" : {
+ "low" : 1000, #1ms
+ "high" : 5000,
+ },
"symptom" : "Average disk delete time '{0}' is slower than '{1}'",
"formula" : "Avg(disk_del) > threshold",
},
@@ -818,7 +847,10 @@ def run(self, accessor, scale, threshold=None):
"counter" : "disk_update",
"code" : "CalcFragmentation",
"unit" : "time",
- "threshold" : 1000, #1ms
+ "threshold" : {
+ "low" : 1000, #1ms
+ "high" : 5000,
+ },
"symptom" : "Average disk update time '{0}' is slower than '{1}'",
"formula" : "Avg(disk_update) > threshold",
},
@@ -829,7 +861,10 @@ def run(self, accessor, scale, threshold=None):
"counter" : "disk_insert",
"code" : "CalcFragmentation",
"unit" : "time",
- "threshold" : 1000, #1ms
+ "threshold" : {
+ "low" : 1000, #1ms
+ "high" : 5000,
+ },
"symptom" : "Average disk insert time '{0}' is slower than '{1}'",
"formula" : "Avg(disk_insert) > threshold",
},
@@ -839,13 +874,17 @@ def run(self, accessor, scale, threshold=None):
"counter" : "disk_commit",
"code" : "CalcFragmentation",
"unit" : "time",
- "threshold" : 5000000, #10s
+ "threshold" : {
+ "low" : 5000000,
+ "high" : 10000000,
+ },
"symptom" : "Average disk commit time '{0}' is slower than '{1}'",
"formula" : "Avg(disk_commit) > threshold",
},
],
"indicator" : True,
"perBucket" : True,
+ "perNode" : True,
},
{"name" : "EPEnginePerformance",
"ingredients" : [
@@ -873,7 +912,10 @@ def run(self, accessor, scale, threshold=None):
"counter" : "ep_bg_load_avg",
"code" : "EPEnginePerformance",
"unit" : "time",
- "threshold" : 100,
+ "threshold" : {
+ "low" : 100,
+ "high" : 500,
+ },
"symptom" : "Average time '{0}' for items to be loaded is slower than '{1}'",
"formula" : "Avg(ep_bg_load_avg) > threshold",
},
@@ -883,7 +925,10 @@ def run(self, accessor, scale, threshold=None):
"counter" : "ep_bg_wait_avg",
"code" : "EPEnginePerformance",
"unit" : "time",
- "threshold" : 100,
+ "threshold" : {
+ "low" : 100,
+ "high" : 500,
+ },
"symptom" : "Average waiting time '{0}' for items serviced by dispatcher is slower than '{1}'",
"formula" : "Avg(ep_bg_wait_avg) > threshold",
},
8 node_stats.py
@@ -278,14 +278,6 @@ def run(self, accessor, scale, threshold=None):
"formula" : "total_heap_bytes",
},
{
- "name" : "totalFragmentation",
- "description" : "Total memory fragmentation",
- "counter" : "total_fragmentation_bytes",
- "code" : "NodePerformanceStats",
- "unit" : "size",
- "formula" : "total_fragmentation_bytes",
- },
- {
"name" : "totalInternalMemory",
"description" : "Total internal memory usage",
"counter" : "mem_used",
2 prescription.py
@@ -39,7 +39,7 @@
},
"EPEnginePerformance" : {
"cause" : "Poor ep-engine key performance indicators",
- "impact" : "To be defined",
+ "impact" : "Server performance is below expectation.",
"action" : "Please contact support@couchbase.com",
},
"DiskQueueDiagnosis" : {
35 threshold.py
@@ -17,17 +17,38 @@
"tapNack" : 500,
},
"MemoryFragmentation" : {
- "totalFragmentation" : 1073741824,
- "diskDelete" : 1000,
- "diskUpdate" : 1000,
- "diskInsert" : 1000,
- "diskCommit" : 5000000,
+ "totalFragmentation" : {
+ "low" : 1073741824, # 1GB
+ "high" : 2147483648, # 2GB
+ },
+ "diskDelete" : {
+ "low" : 1000, #1ms
+ "high" : 5000,
+ },
+ "diskUpdate" : {
+ "low" : 1000, #1ms
+ "high" : 5000,
+ },
+ "diskInsert" : {
+ "low" : 1000, #1ms
+ "high" : 5000,
+ },
+ "diskCommit" : {
+ "low" : 5000000,
+ "high" : 10000000,
+ },
},
"EPEnginePerformance" : {
"flusherState" : "running",
"flusherCompleted" : 0,
- "avgItemLoadTime" : 100,
- "avgItemWaitTime" : 100,
+ "avgItemLoadTime" : {
+ "low" : 100,
+ "high" : 500,
+ },
+ "avgItemWaitTime" : {
+ "low" : 100,
+ "high" : 500,
+ },
},
}
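Note that the table now mixes two shapes: the timing and fragmentation counters carry {"low", "high"} pairs, while flusherState and flusherCompleted stay scalar. A small sanity check, purely illustrative and not part of the commit, can confirm every pair is well-formed:

thresholds = {
    "MemoryFragmentation": {
        "diskCommit": {"low": 5000000, "high": 10000000},
    },
    "EPEnginePerformance": {
        "flusherCompleted": 0,
        "avgItemLoadTime": {"low": 100, "high": 500},
    },
}

for group, counters in thresholds.items():
    for name, t in counters.items():
        # Scalar entries are matched exactly by the checkers; only dicts need ordering.
        if isinstance(t, dict):
            assert t["low"] < t["high"], "%s.%s: low must stay below high" % (group, name)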
24 util_cli.py
@@ -8,6 +8,11 @@
BIG_VALUE = 2 ** 60
SMALL_VALUE = - (2 ** 60)
+def devisible(a, b):
+ if b == 0:
+ return False
+ return a % b == 0
+
def hostport(hoststring, default_port=8091):
""" finds the host and port given a host:port string """
try:
@@ -39,7 +44,10 @@ def time_label(s):
product = sz * product
sizeMap.insert(0, (l, product))
lbl, factor = itertools.dropwhile(lambda x: x[1] > s, sizeMap).next()
- return '%.*f %s' % (2, s * 1.0/factor, lbl)
+ if devisible(s, factor):
+ return '%d %s' % (s / factor, lbl)
+ else:
+ return '%.*f %s' % (3, s * 1.0/factor, lbl)
def size_label(s):
if type(s) in (int, long, float, complex) :
@@ -48,18 +56,26 @@ def size_label(s):
sizes=['', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB']
e = math.floor(math.log(abs(s), 1024))
suffix = sizes[int(e)]
- return "%.*f %s" % (2, s *1.0/(1024 ** math.floor(e)), suffix)
+ if devisible(s, 1024 ** math.floor(e)):
+ return '%d %s' % ( s / (1024 ** math.floor(e)), suffix)
+ else:
+ return "%.*f %s" % (3, s *1.0/(1024 ** math.floor(e)), suffix)
else:
return s
def number_label(s):
if type(s) in (int, long, float, complex) :
- if s == 0:
+ if s < 1:
return "0"
sizes=['', 'thousand', 'million', 'billion', 'trillion', 'quadrillion', 'quintillion']
e = math.floor(math.log(abs(s), 1000))
+ if e < 0:
+ e = 0
suffix = sizes[int(e)]
- return "%.*f %s" % (2, s *1.0/(1000 ** math.floor(e)), suffix)
+ if devisible(s, 1000 ** math.floor(e)):
+ return "%d %s" % (s / (1000 ** match.floor(e)), suffix)
+ else:
+ return "%.*f %s" % (2, s *1.0/(1000 ** math.floor(e)), suffix)
else:
return s
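With the devisible helper, the label functions above now print whole multiples of a unit without a fractional part and fall back to three decimals otherwise (two for number_label). A short sketch of the rule with made-up inputs:

def fmt(value, factor, unit, precision=3):
    # Whole multiples of the factor drop the decimals entirely.
    if factor and value % factor == 0:
        return "%d %s" % (value // factor, unit)
    return "%.*f %s" % (precision, value * 1.0 / factor, unit)

print(fmt(2048, 1024, "KB"))  # 2 KB
print(fmt(1500, 1024, "KB"))  # 1.465 KB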
