Permalink
Browse files

Add missing counter definitions

  • Loading branch information...
1 parent b4506f9 commit 7821500c11c6a52bb550b3e97e2cd65006c5a275 @bcui6611 bcui6611 committed Dec 7, 2012
Showing with 330 additions and 1 deletion.
  1. +330 −1 cluster_stats.py
View
@@ -865,7 +865,336 @@ def run(self, accessor, scale, threshold=None):
"perNode" : False,
"perBucket" : False,
},
-
+ {"name" : "CacheMissRatio",
+ "ingredients" : [
+ {
+ "name" : "cacheMissRatio",
+ "description" : "Cache miss ratio",
+ "symptom" : "From {0} to {1}, a higher item count '{2}' leads to high cache miss ratio '{3}%' and low residential ratio '{4}%'",
+ "counter" : ["ep_cache_miss_rate", "vb_active_resident_items_ratio", "curr_items"],
+ "code" : "CacheMissRatio",
+ "threshold" : {
+ "CacheMissRate" : 3, # 3%
+ "ActiveResidentItemsRatio" : 25, # 25%
+ "recurrence" : 10
+ },
+ "formula" : "Avg(ep_cache_miss_rate)",
+ },
+ ],
+ "clusterwise" : False,
+ "perNode" : True,
+ "perBucket" : True,
+ "indicator" : True,
+ "nodeDisparate" : True,
+ },
+ {"name" : "DGM",
+ "ingredients" : [
+ {
+ "name" : "dgm",
+ "description" : "Disk to memory ratio",
+ "code" : "DGMRatio",
+ "formula" : "Total(Storage['hdd']['usedByData']) / Total(Storage['ram']['usedByData'])",
+ },
+ ],
+ "clusterwise" : True,
+ "perNode" : False,
+ "perBucket" : False,
+ },
+ {"name" : "ActiveReplicaResidentRatio",
+ "ingredients" : [
+ {
+ "name" : "activeReplicaResidentRatio",
+ "description" : "Active to replica resident ratio",
+ "counter" : ["curr_items", "vb_replica_curr_items"],
+ "scale" : "minute",
+ "code" : "ARRatio",
+ "threshold" : 5,
+ "symptom" : "Active to replica resident ratio '{0}%' is bigger than '{1}%'",
+ "formula" : "Avg(curr_items) / Avg(vb_replica_curr_items)",
+ },
+ ],
+ "clusterwise" : True,
+ "perNode" : True,
+ "perBucket" : True,
+ "indicator" : True,
+ },
+ {"name" : "ResidentRatio",
+ "ingredients" : [
+ {
+ "name" : "activeResidentRatio",
+ "description" : "Active resident ratio",
+ "counter" : "vb_active_resident_items_ratio",
+ "scale" : "minute",
+ "code" : "ResidentItemRatio",
+ "threshold" : 30,
+ "symptom" : "Active resident item ratio '{0}' is below '{1}'",
+ "formula" : "Last(vb_active_resident_items_ratio)",
+ },
+ {
+ "name" : "replicaResidentRatio",
+ "description" : "Replica resident ratio",
+ "counter" : "vb_replica_resident_items_ratio",
+ "scale" : "minute",
+ "code" : "ResidentItemRatio",
+ "threshold" : 20,
+ "symptom" : "Replica resident item ratio '{0}' is below '{1}'",
+ "formula" : "Last(vb_replica_resident_items_ratio)",
+ },
+ ],
+ "clusterwise" : True,
+ "perNode" : True,
+ "perBucket" : True,
+ "indicator" : True,
+ },
+ {"name" : "OPSPerformance",
+ "ingredients" : [
+ {
+ "name" : "opsPerformance",
+ "description" : "Read/Write/Delete ops ratio",
+ "scale" : "week",
+ "counter" : ["cmd_get", "cmd_set", "delete_hits"],
+ "code" : "OpsRatio",
+ "formula" : "Avg(cmd_get) : Avg(cmd_set) : Avg(delete_hits)",
+ },
+ ],
+ "perBucket" : True,
+ "clusterwise" : True,
+ },
+ {"name" : "GrowthRate",
+ "ingredients" : [
+ {
+ "name" : "dataGrowthRateForItems",
+ "description" : "Average data growth rate for items",
+ "counter" : "curr_items",
+ "scale" : "day",
+ "code" : "ItemGrowth",
+ "formula" : "Linear(curr_items)",
+ },
+ ],
+ "clusterwise" : True,
+ "perBucket" : True,
+ "perNode" : True,
+ },
+ {"name" : "VBucketNumber",
+ "ingredients" : [
+ {
+ "name" : "activeVbucketNumber",
+ "description" : "Active VBucket number",
+ "counter" : "vb_active_num",
+ "scale" : "hour",
+ "code" : "NumVbuckt",
+ "threshold" : 1024,
+ "symptom" : "Number of active vBuckets '{0}' is less than '{1}' per node",
+ "formula" : "Avg(vb_active_num)",
+ },
+ {
+ "name" : "replicaVBucketNumber",
+ "description" : "Replica VBucket number",
+ "counter" : "vb_replica_num",
+ "scale" : "hour",
+ "code" : "NumVbuckt",
+ "threshold" : 1024,
+ "symptom" : "Number of replica vBuckets '{0}' is less than '{1}' per node",
+ "formula" : "Avg(vb_replica_num)",
+ },
+ ],
+ "indicator" : True,
+ "perBucket" : True,
+ "perNode" : True,
+ },
+ {"name" : "VBucketServerMap",
+ "ingredients" : [
+ {
+ "name" : "vbucketMap",
+ "description" : "Sanity checks for vBucket map",
+ "code" : "VbucketMapSanity",
+ "threshold" : 1024,
+ "formula" : "",
+ },
+ {
+ "name" : "vbucketServerList",
+ "description" : "Sanity checks for vBucket server list",
+ "code" : "VbucketServerListSanity",
+ "formula" : "",
+ },
+ ],
+ "indicator" : True,
+ "perBucket" : True,
+ },
+ {"name" : "MemoryUsage",
+ "ingredients" : [
+ {
+ "name" : "memoryUsage",
+ "description" : "Memory usage",
+ "counter" : "mem_used",
+ "scale" : "hour",
+ "code" : "MemUsed",
+ "formula" : "Avg(mem_used)",
+ },
+ ],
+ "perNode" : True,
+ "perBucket" : True,
+ "nodeDisparate" : True,
+ },
+ {"name" : "RebalancePerformance",
+ "ingredients" : [
+ {
+ "name" : "highBackfillRemaing",
+ "description" : "Tap queue backfill remaining is too high",
+ "counter" : "ep_tap_queue_backfillremaining",
+ "code" : "RebalanceStuck",
+ "threshold" : 10000,
+ "symptom" : "'{0}' occurrences showing tap queue backfill remainings higher than threshold '{1}'",
+ "formula" : "Total(ep_tap_queue_backfillremaining > threshold)",
+ },
+ {
+ "name" : "tapNack",
+ "description" : "Number of Tap stream backoff",
+ "counter" : "num_tap_nack",
+ "code" : "RebalanceStuck",
+ "threshold" : 500,
+ "symptom" : "'{0}' occurrences showing tap stream backoffs received higher than threshold '{1}'",
+ "formula" : "Total(num_tap_nack > threshold)",
+ },
+ ],
+ "indicator" : True,
+ "perBucket" : True,
+ },
+ {"name" : "MemoryFragmentation",
+ "ingredients" : [
+ {
+ "name" : "totalFragmentation",
+ "description" : "Total memory fragmentation",
+ "counter" : "total_fragmentation_bytes",
+ "code" : "CalcFragmentation",
+ "unit" : "size",
+ "threshold" : {
+ "low" : 1073741824, # 1GB
+ "high" : 2147483648, # 2GB
+ },
+ "symptom" : "Total memory fragmentation '{0}' is larger than '{1}'",
+ "formula" : "total_fragmentation_bytes > threshold",
+ },
+ ],
+ "indicator" : True,
+ "perNode" : True,
+ "perBucket" : True,
+ },
+ {"name" : "DiskPerformance",
+ "ingredients" : [
+ {
+ "name" : "diskPerformance",
+ "description" : "Disk IO Performance",
+ "counter" : ["disk_del", "disk_update", "disk_insert", "disk_commit"],
+ "code" : "DiskPerformance",
+ "unit" : "time",
+ "threshold" : {
+ "disk_del" : {"low": 1000, "high": 5000},
+ "disk_update" : {"low": 1000, "high": 5000},
+ "disk_insert" : {"low": 1000, "high": 5000},
+ "disk_commit" : {"low": 5000000, "high": 10000000},
+ },
+ "symptom" : {
+ "disk_del": "Average disk delete time '{0}' is slower than '{1}'",
+ "disk_update": "Average disk update time '{0}' is slower than '{1}'",
+ "disk_insert": "Average disk insert time '{0}' is slower than '{1}'",
+ "disk_commit": "Average disk commit time '{0}' is slower than '{1}'",
+ },
+ "formula" : "Avg(%counter) > threshold",
+ },
+ ],
+ "clusterwise" : False,
+ "perNode" : True,
+ "perBucket" : True,
+ "indicator" : True,
+ },
+ {"name" : "EPEnginePerformance",
+ "ingredients" : [
+ {
+ "name" : "flusherState",
+ "description" : "Engine flusher state",
+ "counter" : "ep_flusher_state",
+ "code" : "EPEnginePerformance",
+ "threshold" : "running",
+ "symptom" : "The flusher is not running",
+ "formula" : "ep_flusher_state == True",
+ },
+ {
+ "name" : "flusherCompleted",
+ "description" : "Flusher completed",
+ "counter" : "ep_flusher_num_completed",
+ "code" : "EPEnginePerformance",
+ "threshold" : 0,
+ "symptom" : "The flusher is not persisting any items",
+ "formula" : "ep_flusher_num_completed == 0",
+ },
+ {
+ "name" : "avgItemLoadTime",
+ "description" : "Average item loaded time",
+ "counter" : "ep_bg_load_avg",
+ "code" : "EPEnginePerformance",
+ "unit" : "time",
+ "threshold" : {
+ "low" : 100,
+ "high" : 500,
+ },
+ "symptom" : "Average item loaded time '{0}' is slower than '{1}'",
+ "formula" : "Avg(ep_bg_load_avg) > threshold",
+ },
+ {
+ "name" : "avgItemWaitTime",
+ "description" : "Average item waited time",
+ "counter" : "ep_bg_wait_avg",
+ "code" : "EPEnginePerformance",
+ "unit" : "time",
+ "threshold" : {
+ "low" : 100,
+ "high" : 500,
+ },
+ "symptom" : "Average waiting time '{0}' for items serviced by dispatcher is slower than '{1}'",
+ "formula" : "Avg(ep_bg_wait_avg) > threshold",
+ },
+ ],
+ "indicator" : True,
+ },
+ {"name" : "OutgoingXDCRPerformance",
+ "ingredients" : [
+ {
+ "name" : "outgoingXdrOps",
+ "description" : "Cross data center replication operation per sec",
+ "counter" : "xdc_ops",
+ "code" : "CalcTrend",
+ "unit" : "number",
+ },
+ {
+ "name" : "xdcrReplicationQueue",
+ "description" : "XDCR replication queue",
+ "counter" : "replication_changes_left",
+ "code" : "CalcTrend",
+ "unit" : "size",
+ },
+ ],
+ "perNode" : True,
+ "perBucket" : True,
+ },
+ {"name" : "IncomingXDCRPerformance",
+ "ingredients" : [
+ {
+ "name" : "incomingXdrPerformance",
+ "description" : "Incoming XDCR Get/Set ops ratio",
+ "counter" : ["ep_num_ops_get_meta", "ep_num_ops_set_meta"],
+ "code" : "XdrOpsPerformance",
+ "threshold" : {
+ "low" : 2,
+ "high" : 10
+ },
+ "symptom" : "Get to Set ops ratio '{0}' is bigger than '{1}'. Too few set operations.",
+ "formula" : "Avg(ep_num_ops_get_meta) / Avg(ep_num_ops_set_meta)",
+ },
+ ],
+ "perNode" : True,
+ "perBucket" : True,
+ },
{"name" : "CompactionPerformance",
"ingredients" : [
{

0 comments on commit 7821500

Please sign in to comment.