Browse files

Add fine grained analysis for cache miss ratio

Since the cache miss ratio tends to be volatile, it is quite misleading
to just use the average value over a time span, say, one day of samples.
Here we use an approach that finds abnormal segments and correlates
them with current items and the active resident ratio. If all of them
are beyond their thresholds, the abnormal segment is marked as a
questionable one. As such, a time notion is introduced.
  • Loading branch information...
1 parent fdc6312 commit cb68ffaab32cf80a7a53a69cad82db50062e03f3 @bcui6611 bcui6611 committed Aug 31, 2012
Showing with 83 additions and 37 deletions.
  1. +46 −34 cluster_stats.py
  2. +0 −1 node_stats.py
  3. +1 −1 prescription.py
  4. +5 −1 threshold.py
  5. +31 −0 util_cli.py
View
80 cluster_stats.py
@@ -161,49 +161,58 @@ def run(self, accessor, scale, threshold=None):
class CacheMissRatio:
def run(self, accessor, scale, threshold=None):
result = {}
- cluster = []
thresholdval = accessor["threshold"]
if threshold.has_key("CacheMissRatio"):
thresholdval = threshold["CacheMissRatio"]
+
for bucket, stats_info in stats_buffer.buckets.iteritems():
- values = stats_info[scale][accessor["counter"]]
+ values = stats_info[scale][accessor["counter"][0]]
+ arr_values = stats_info[scale][accessor["counter"][1]]
+ curr_values = stats_info[scale][accessor["counter"][2]]
+
timestamps = values["timestamp"]
- timestamps = [x - timestamps[0] for x in timestamps]
nodeStats = values["nodeStats"]
samplesCount = values["samplesCount"]
+
trend = []
- total = []
- data = []
- num_error = []
+ num_warn = []
for node, vals in nodeStats.iteritems():
- #a, b = util.linreg(timestamps, vals)
+ arr_vals = arr_values["nodeStats"][node]
+ curr_vals = curr_values["nodeStats"][node]
if samplesCount > 0:
- value = sum(vals) / samplesCount
+ node_avg_curr = sum(curr_vals) / samplesCount
else:
- value = 0
- value = max(0, value)
- total.append(value)
- if value > thresholdval:
- symptom = accessor["symptom"].format(util.pretty_float(value), thresholdval)
- num_error.append({"node":node, "value":symptom})
- trend.append((node, {"value" : util.pretty_float(value) + "%",
- "raw" : vals,
- }))
- data.append(value)
- if len(nodeStats) > 0:
- trend.append(("total", {"value" : util.pretty_float(sum(total) / len(nodeStats)) + "%",
- "raw" : total}))
- else:
- trend.append(("total", util.pretty_float(sum(total)) + "%"))
- trend.append(("variance", util.two_pass_variance(data)))
- if len(num_error) > 0:
- trend.append(("error", num_error))
+ node_avg_curr = 0
+ # Fine grained analysis
+ abnormal_segs = util.abnormal_extract(vals, thresholdval["CacheMissRate"])
+ abnormal_vals = []
+ for seg in abnormal_segs:
+ begin_index = seg[0]
+ seg_total = seg[1]
+ if seg_total < thresholdval["recurrence"]:
+ continue
+ end_index = begin_index + seg_total
- cluster.append(sum(total))
+ cmr_avg = sum(vals[begin_index : end_index]) / seg_total
+ arr_avg = sum(arr_vals[begin_index : end_index]) / seg_total
+ curr_avg = sum(curr_vals[begin_index : end_index]) / seg_total
+
+ if arr_avg < thresholdval["ActiveResidentItemsRatio"] and curr_avg > node_avg_curr:
+ symptom = accessor["symptom"].format(util.pretty_datetime(timestamps[begin_index]),
+ util.pretty_datetime(timestamps[end_index], True),
+ util.number_label(int(curr_avg)),
+ util.pretty_float(cmr_avg),
+ util.pretty_float(arr_avg))
+ num_warn.append({"node":node, "value":symptom})
+ abnormal_vals.append(cmr_avg)
+ if len(abnormal_vals) > 0:
+ trend.append((node, {"value" : util.pretty_float(sum(abnormal_vals)/len(abnormal_vals)) + "%",
+ "raw" : abnormal_vals}
+ ))
+ if len(num_warn) > 0:
+ trend.append(("warn", num_warn))
result[bucket] = trend
- if len(stats_buffer.buckets) > 0:
- result["cluster"] = {"value" : util.pretty_float(sum(cluster) / len(stats_buffer.buckets)) + "%",
- "raw" : cluster}
+
return result
class ResidentItemRatio:
@@ -628,11 +637,14 @@ def run(self, accessor, scale, threshold=None):
{
"name" : "cacheMissRatio",
"description" : "Cache miss ratio",
- "symptom" : "Cache miss ratio '{0}%' is higher than threshold '{1}%'",
- "counter" : "ep_cache_miss_rate",
- "scale" : "hour",
+ "symptom" : "From {0} to {1}, a higher item count '{2}' leads to high cache miss ratio '{3}%' and low residential ratio '{4}%'",
+ "counter" : ["ep_cache_miss_rate", "vb_active_resident_items_ratio", "curr_items"],
"code" : "CacheMissRatio",
- "threshold" : 2,
+ "threshold" : {
+ "CacheMissRate" : 2, # 2%
+ "ActiveResidentItemsRatio" : 30, # 30%
+ "recurrence" : 10
+ },
"formula" : "Avg(ep_cache_miss_rate)",
},
],
View
1 node_stats.py
@@ -108,7 +108,6 @@ def run(self, accessor, scale, threshold=None):
samplesCount = values["samplesCount"]
trend = []
for node, vals in nodeStats.iteritems():
- #a, b = util.linreg(timestamps, vals)
if samplesCount > 0:
avg = sum(vals) / samplesCount
else:
View
2 prescription.py
@@ -3,7 +3,7 @@
Capsules = {
"CacheMissRatio" : {
- "cause" : "Too many requests for information that has already been ejected to disk.",
+ "cause" : "Too many requests for information that has already been ejected to disk",
"impact" : "Results in too many fetches from disk, causing poor performance and slower I/O.",
"action" : "Increase disk quota for buckets, or add nodes to cluster. If issue persists please contact support@couchbase.com",
},
View
6 threshold.py
@@ -2,7 +2,11 @@
# -*- coding: utf-8 -*-
ClusterCapsule = {
- "CacheMissRatio" : 2,
+ "CacheMissRatio" : {
+ "CacheMissRate" : 2,
+ "ActiveResidentItemsRatio" : 30,
+ "recurrence": 8,
+ },
"ActiveReplicaResidentRatio" : {
"activeReplicaResidentRatio" : 4,
"activeResidentRatio" : 30,
View
31 util_cli.py
@@ -3,7 +3,11 @@
import json
import math
+import datetime
import itertools
+import locale
+
+locale.setlocale(locale.LC_ALL, '')
BIG_VALUE = 2 ** 60
SMALL_VALUE = - (2 ** 60)
@@ -120,8 +124,35 @@ def two_pass_variance(data):
variance = sum2/(n - 1)
return variance
def abnormal_extract(vals, threshold):
    """Locate maximal runs of consecutive samples exceeding *threshold*.

    Returns a list of ``(start_index, length)`` tuples, one per maximal
    run of values strictly greater than the threshold. An empty input,
    or one with no sample above the threshold, yields an empty list.
    """
    segments = []
    run_start = None  # index where the current above-threshold run began
    for pos, sample in enumerate(vals):
        if sample > threshold:
            if run_start is None:
                run_start = pos
        elif run_start is not None:
            # Run just ended at pos-1; record it and reset.
            segments.append((run_start, pos - run_start))
            run_start = None
    # A run reaching the end of the data is closed here.
    if run_start is not None:
        segments.append((run_start, len(vals) - run_start))
    return segments
+
def pretty_float(number, precision=2):
    """Format *number* as a fixed-point string with *precision* decimal places."""
    return format(number, '.%df' % precision)
def pretty_print(obj):
    """Serialize *obj* to an indented, key-sorted JSON string for display."""
    return json.dumps(obj, sort_keys=True, indent=4)
+
+def pretty_datetime(number, timeonly=False):
+ if timeonly:
+ return str(datetime.datetime.fromtimestamp(number/1000).time())
+ else:
+ timestamp = datetime.datetime.fromtimestamp(number/1000)
+ return timestamp.strftime('%x') + ' ' + str(timestamp.time())

0 comments on commit cb68ffa

Please sign in to comment.