Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

Finish Syndrome detector

It can simply be applied to view-fragmentation and doc-fragmentation
analysis
  • Loading branch information...
commit b4506f94cabe22a6e8ade5c336d6f3be42386122 1 parent 2f4c0b6
Bin Cui authored
373  cluster_stats.py
@@ -18,16 +18,14 @@ def run(self, accessor, scale, threshold=None):
18 18
             #First one is the main counter we run against
19 19
             timestamps = values[main_counter]["timestamp"]
20 20
             nodeStats = values[main_counter]["nodeStats"]
21  
-            samplesCount = values[main_counter]["samplesCount"]
  21
+            #samplesCount = values[main_counter]["samplesCount"]
22 22
 
23 23
             trend = []
24 24
             num_warn = []
25 25
             for node, vals in nodeStats.iteritems():
26  
-                #arr_vals = arr_values["nodeStats"][node]
27  
-                #curr_vals = curr_values["nodeStats"][node]
28 26
                 vals = {}
29 27
                 for counter in accessor["counter"]:
30  
-                    vals[counter] = values["nodeStats"][node]
  28
+                    vals[counter] = values[counter]["nodeStats"][node]
31 29
 
32 30
                 node_avg = {}
33 31
                 #if samplesCount > 0:
@@ -36,7 +34,7 @@ def run(self, accessor, scale, threshold=None):
36 34
                 #    node_avg_curr = 0
37 35
 
38 36
                 # Fine grained analysis
39  
-                abnormal_segs = util.abnormal_extract(vals[main_counter], thresholdval[main_counter], accessor.get("op", ">="))
  37
+                abnormal_segs = util.abnormal_extract(vals[main_counter], thresholdval[main_counter][1], thresholdval[main_counter][0])
40 38
                 abnormal_vals = []
41 39
                 for seg in abnormal_segs:
42 40
                     begin_index = seg[0]
@@ -47,23 +45,21 @@ def run(self, accessor, scale, threshold=None):
47 45
 
48 46
                     seg_avg = {}
49 47
                     b = True
  48
+                    seg_tuple = ()
50 49
                     for counter in accessor["counter"]:
51  
-                        #cmr_avg = sum(vals[begin_index : end_index]) / seg_total
52  
-                        #arr_avg = sum(arr_vals[begin_index : end_index]) / seg_total
53  
-                        #curr_avg = sum(curr_vals[begin_index : end_index]) / seg_total
54 50
                         seg_avg[counter] = sum(vals[counter][begin_index : end_index]) / seg_total
55  
-                        b &= util.evalfunc(thresholdval[counter][0], seg_avg[counter], thresholdval[counter][1])
  51
+                        seg_tuple += (util.pretty_float(seg_avg[counter]), )
  52
+                        b &= util.evalfunc(thresholdval[counter][1], seg_avg[counter], thresholdval[counter][0])
56 53
                         if not b:
57 54
                             break
58 55
 
59 56
                     if b:
60  
-                        symptom = accessor["symptom"].format(util.pretty_datetime(timestamps[begin_index]), 
61  
-                                                             util.pretty_datetime(timestamps[end_index-1], True), 
62  
-                                                             util.number_label(int(curr_avg)), 
63  
-                                                             util.pretty_float(cmr_avg), 
64  
-                                                             util.pretty_float(arr_avg))
  57
+                        seg_tuple = (util.pretty_datetime(timestamps[begin_index]),) + seg_tuple
  58
+                        seg_tuple = (util.pretty_datetime(timestamps[end_index-1], True),) + seg_tuple
  59
+                        symptom = accessor["symptom"] % seg_tuple
  60
+
65 61
                         num_warn.append({"node":node, "value":symptom})
66  
-                        abnormal_vals.append(cmr_avg)
  62
+                        abnormal_vals.append(seg_avg[main_counter])
67 63
                 if len(abnormal_vals) > 0:
68 64
                     trend.append((node, {"value" : util.pretty_float(sum(abnormal_vals)/len(abnormal_vals)) + "%",
69 65
                                          "raw" : abnormal_vals}
@@ -869,360 +865,31 @@ def run(self, accessor, scale, threshold=None):
869 865
      "perNode" : False,
870 866
      "perBucket" : False,
871 867
     },
872  
-   {"name" : "CacheMissRatio",
873  
-     "ingredients" : [
874  
-        {
875  
-            "name" : "cacheMissRatio",
876  
-            "description" : "Cache miss ratio",
877  
-            "symptom" : "From {0} to {1}, a higher item count '{2}' leads to high cache miss ratio '{3}%' and low residential ratio '{4}%'",
878  
-            "counter" : ["ep_cache_miss_rate", "vb_active_resident_items_ratio", "curr_items"],
879  
-            "code" : "CacheMissRatio",
880  
-            "threshold" : {
881  
-                "CacheMissRate" : 3, # 2%
882  
-                "ActiveResidentItemsRatio" : 25, # 30%
883  
-                "recurrence" : 10
884  
-            },
885  
-            "formula" : "Avg(ep_cache_miss_rate)",
886  
-        },
887  
-     ],
888  
-     "clusterwise" : False,
889  
-     "perNode" : True,
890  
-     "perBucket" : True,
891  
-     "indicator" : True,
892  
-     "nodeDisparate" : True,
893  
-    },
894  
-    {"name" : "DGM",
895  
-     "ingredients" : [
896  
-        {
897  
-            "name" : "dgm",
898  
-            "description" : "Disk to memory ratio",
899  
-            "code" : "DGMRatio",
900  
-            "formula" : "Total(Storage['hdd']['usedByData']) / Total(Storage['ram']['usedByData'])",
901  
-        },
902  
-     ],
903  
-     "clusterwise" : True,
904  
-     "perNode" : False,
905  
-     "perBucket" : False,
906  
-    },
907  
-    {"name" : "ActiveReplicaResidentRatio",
908  
-     "ingredients" : [
909  
-        {
910  
-            "name" : "activeReplicaResidentRatio",
911  
-            "description" : "Active to replica resident ratio",
912  
-            "counter" : ["curr_items", "vb_replica_curr_items"],
913  
-            "scale" : "minute",
914  
-            "code" : "ARRatio",
915  
-            "threshold" : 5,
916  
-            "symptom" : "Active to replica resident ratio '{0}%' is bigger than '{1}%'",
917  
-            "formula" : "Avg(curr_items) / Avg(vb_replica_curr_items)",
918  
-        },
919  
-     ],
920  
-     "clusterwise" : True,
921  
-     "perNode" : True,
922  
-     "perBucket" : True,
923  
-     "indicator" : True,
924  
-    },
925  
-    {"name" : "ResidentRatio",
926  
-     "ingredients" : [
927  
-        {
928  
-            "name" : "activeResidentRatio",
929  
-            "description" : "Active resident ratio",
930  
-            "counter" : "vb_active_resident_items_ratio",
931  
-            "scale" : "minute",
932  
-            "code" : "ResidentItemRatio",
933  
-            "threshold" : 30,
934  
-            "symptom" : "Active resident item ratio '{0}' is below '{1}'",
935  
-            "formula" : "Last(vb_active_resident_items_ratio)",
936  
-        },
937  
-        {
938  
-            "name" : "replicaResidentRatio",
939  
-            "description" : "Replica resident ratio",
940  
-            "counter" : "vb_replica_resident_items_ratio",
941  
-            "scale" : "minute",
942  
-            "code" : "ResidentItemRatio",
943  
-            "threshold" : 20,
944  
-            "symptom" : "Replica resident item ratio '{0}' is below '{1}'",
945  
-            "formula" : "Last(vb_replica_resident_items_ratio)",
946  
-        },
947  
-     ],
948  
-     "clusterwise" : True,
949  
-     "perNode" : True,
950  
-     "perBucket" : True,
951  
-     "indicator" : True,
952  
-    },
953  
-    {"name" : "OPSPerformance",
954  
-     "ingredients" : [
955  
-        {
956  
-            "name" : "opsPerformance",
957  
-            "description" : "Read/Write/Delete ops ratio",
958  
-            "scale" : "week",
959  
-            "counter" : ["cmd_get", "cmd_set", "delete_hits"],
960  
-            "code" : "OpsRatio",
961  
-            "formula" : "Avg(cmd_get) : Avg(cmd_set) : Avg(delete_hits)",
962  
-        },
963  
-     ],
964  
-     "perBucket" : True,
965  
-     "clusterwise" : True,
966  
-    },
967  
-    {"name" : "GrowthRate",
968  
-     "ingredients" : [
969  
-        {
970  
-            "name" : "dataGrowthRateForItems",
971  
-            "description" : "Average data growth rate for items",
972  
-            "counter" : "curr_items",
973  
-            "scale" : "day",
974  
-            "code" : "ItemGrowth",
975  
-            "formula" : "Linear(curr_items)",
976  
-        },
977  
-     ],
978  
-     "clusterwise" : True,
979  
-     "perBucket" : True,
980  
-     "perNode" : True,
981  
-    },
982  
-    {"name" : "VBucketNumber",
983  
-     "ingredients" : [
984  
-        {
985  
-            "name" : "activeVbucketNumber",
986  
-            "description" : "Active VBucket number",
987  
-            "counter" : "vb_active_num",
988  
-            "scale" : "hour",
989  
-            "code" : "NumVbuckt",
990  
-            "threshold" : 1024,
991  
-            "symptom" : "Number of active vBuckets '{0}' is less than '{1}' per node",
992  
-            "formula" : "Avg(vb_active_num)",
993  
-        },
994  
-        {
995  
-            "name" : "replicaVBucketNumber",
996  
-            "description" : "Replica VBucket number",
997  
-            "counter" : "vb_replica_num",
998  
-            "scale" : "hour",
999  
-            "code" : "NumVbuckt",
1000  
-            "threshold" : 1024,
1001  
-            "symptom" : "Number of replica vBuckets '{0}' is less than '{1}' per node", 
1002  
-            "formula" : "Avg(vb_replica_num)",
1003  
-        },
1004  
-     ],
1005  
-     "indicator" : True,
1006  
-     "perBucket" : True,
1007  
-     "perNode" : True,
1008  
-    },
1009  
-    {"name" : "VBucketServerMap",
1010  
-     "ingredients" : [
1011  
-        {
1012  
-            "name" : "vbucketMap",
1013  
-            "description" : "Sanity checks for vBucket map",
1014  
-            "code" : "VbucketMapSanity",
1015  
-            "threshold" : 1024,
1016  
-            "formula" : "",
1017  
-        },
1018  
-        {
1019  
-            "name" : "vbucketServerList",
1020  
-            "description" : "Sanity checks for vBucket server list",
1021  
-            "code" : "VbucketServerListSanity",
1022  
-            "formula" : "",
1023  
-        },
1024  
-     ],
1025  
-     "indicator" : True,
1026  
-     "perBucket" : True,
1027  
-    },
1028  
-    {"name" : "MemoryUsage",
1029  
-     "ingredients" : [
1030  
-        {
1031  
-            "name" : "memoryUsage",
1032  
-            "description" : "Memory usage",
1033  
-            "counter" : "mem_used",
1034  
-            "scale" : "hour",
1035  
-            "code" : "MemUsed",
1036  
-            "formula" : "Avg(mem_used)",
1037  
-        },
1038  
-     ],
1039  
-     "perNode" : True,
1040  
-     "perBucket" : True,
1041  
-     "nodeDisparate" : True,
1042  
-    },
1043  
-    {"name" : "RebalancePerformance",
1044  
-     "ingredients" : [
1045  
-        {
1046  
-            "name" : "highBackfillRemaing",
1047  
-            "description" : "Tap queue backfill remaining is too high",
1048  
-            "counter" : "ep_tap_queue_backfillremaining",
1049  
-            "code" : "RebalanceStuck",
1050  
-            "threshold" : 10000,
1051  
-            "symptom" : "'{0}' occurrences showing tap queue backfill remainings higher than threshold '{1}'",
1052  
-            "formula" : "Total(ep_tap_queue_backfillremaining > threshold)",
1053  
-        },
1054  
-        {
1055  
-            "name" : "tapNack",
1056  
-            "description" : "Number of Tap stream backoff",
1057  
-            "counter" : "num_tap_nack",
1058  
-            "code" : "RebalanceStuck",
1059  
-            "threshold" : 500,
1060  
-            "symptom" : "'{0}' occurrences showing tap stream backoffs received higher than threshold '{1}'",
1061  
-            "formula" : "Total(num_tap_nack > threshold)",
1062  
-        },
1063  
-     ],
1064  
-     "indicator" : True,
1065  
-     "perBucket" : True,
1066  
-    },
1067  
-    {"name" : "MemoryFragmentation",
1068  
-     "ingredients" : [
1069  
-        {
1070  
-            "name" : "totalFragmentation",
1071  
-            "description" : "Total memory fragmentation",
1072  
-            "counter" : "total_fragmentation_bytes",
1073  
-            "code" : "CalcFragmentation",
1074  
-            "unit" : "size",
1075  
-            "threshold" : {
1076  
-                "low" : 1073741824, # 1GB
1077  
-                "high" : 2147483648, # 2GB
1078  
-            },
1079  
-            "symptom" : "Total memory fragmentation '{0}' is larger than '{1}'",
1080  
-            "formula" : "total_fragmentation_bytes > threshold",
1081  
-        },
1082  
-      ],
1083  
-      "indicator" : True,
1084  
-      "perNode" : True,
1085  
-      "perBucket" : True,
1086  
-    },
1087  
-    {"name" : "DiskPerformance",
1088  
-     "ingredients" : [
1089  
-        {
1090  
-            "name" : "diskPerformance",
1091  
-            "description" : "Disk IO Performance",
1092  
-            "counter" : ["disk_del", "disk_update", "disk_insert", "disk_commit"],
1093  
-            "code" : "DiskPerformance",
1094  
-            "unit" : "time",
1095  
-            "threshold" : {
1096  
-                "disk_del" : {"low": 1000, "high": 5000},
1097  
-                "disk_update" : {"low": 1000, "high": 5000},
1098  
-                "disk_insert" : {"low": 1000, "high": 5000},
1099  
-                "disk_commit" : {"low": 5000000, "high": 10000000},
1100  
-            },
1101  
-            "symptom" : {
1102  
-                "disk_del": "Average disk delete time '{0}' is slower than '{1}'",
1103  
-                "disk_update": "Average disk update time '{0}' is slower than '{1}'",
1104  
-                "disk_insert": "Average disk insert time '{0}' is slower than '{1}'",
1105  
-                "disk_commit": "Average disk commit time '{0}' is slower than '{1}'",
1106  
-            },
1107  
-            "formula" : "Avg(%counter) > threshold",
1108  
-        },
1109  
-     ],
1110  
-     "clusterwise" : False,
1111  
-     "perNode" : True,
1112  
-     "perBucket" : True,
1113  
-     "indicator" : True,
1114  
-    },
1115  
-    {"name" : "EPEnginePerformance",
1116  
-     "ingredients" : [
1117  
-        {
1118  
-            "name" : "flusherState",
1119  
-            "description" : "Engine flusher state",
1120  
-            "counter" : "ep_flusher_state",
1121  
-            "code" : "EPEnginePerformance",
1122  
-            "threshold" : "running",
1123  
-            "symptom" : "The flusher is not running",
1124  
-            "formula" : "ep_flusher_state == True",
1125  
-        },
1126  
-        {
1127  
-            "name" : "flusherCompleted",
1128  
-            "description" : "Flusher completed",
1129  
-            "counter" : "ep_flusher_num_completed",
1130  
-            "code" : "EPEnginePerformance",
1131  
-            "threshold" : 0,
1132  
-            "symptom" : "The flusher is not persisting any items",
1133  
-            "formula" : "ep_flusher_num_completed == 0",
1134  
-        },
1135  
-        {
1136  
-            "name" : "avgItemLoadTime",
1137  
-            "description" : "Average item loaded time",
1138  
-            "counter" : "ep_bg_load_avg",
1139  
-            "code" : "EPEnginePerformance",
1140  
-            "unit" : "time",
1141  
-            "threshold" : {
1142  
-                "low" : 100,
1143  
-                "high" : 500,
1144  
-            },
1145  
-            "symptom" : "Average item loaded time '{0}' is slower than '{1}'",
1146  
-            "formula" : "Avg(ep_bg_load_avg) > threshold",
1147  
-        },
1148  
-        {
1149  
-            "name" : "avgItemWaitTime",
1150  
-            "description" : "Average item waited time",
1151  
-            "counter" : "ep_bg_wait_avg",
1152  
-            "code" : "EPEnginePerformance",
1153  
-            "unit" : "time",
1154  
-            "threshold" : {
1155  
-                "low" : 100,
1156  
-                "high" : 500,
1157  
-            },
1158  
-            "symptom" : "Average waiting time '{0}' for items serviced by dispatcher is slower than '{1}'",
1159  
-            "formula" : "Avg(ep_bg_wait_avg) > threshold",
1160  
-        },
1161  
-     ],
1162  
-     "indicator" : True,
1163  
-    },
1164  
-    {"name" : "OutgoingXDCRPerformance",
1165  
-     "ingredients" : [
1166  
-        {
1167  
-            "name" : "outgoingXdrOps",
1168  
-            "description" : "Cross data center replication operation per sec",
1169  
-            "counter" : "xdc_ops",
1170  
-            "code" : "CalcTrend",
1171  
-            "unit" : "number",
1172  
-        },
1173  
-        {
1174  
-            "name" : "xdcrReplicationQueue",
1175  
-            "description" : "XDCR replication queue",
1176  
-            "counter" : "replication_changes_left",
1177  
-            "code" : "CalcTrend",
1178  
-            "unit" : "size",
1179  
-        },
1180  
-     ],
1181  
-     "perNode" : True,
1182  
-     "perBucket" : True,
1183  
-    },
1184  
-    {"name" : "IncomingXDCRPerformance",
1185  
-     "ingredients" : [
1186  
-        {
1187  
-            "name" : "incomingXdrPerformance",
1188  
-            "description" : "Incoming XDCR Get/Set ops ratio",
1189  
-            "counter" : ["ep_num_ops_get_meta", "ep_num_ops_set_meta"],
1190  
-            "code" : "XdrOpsPerformance",
1191  
-            "threshold" : {
1192  
-                "low" : 2,
1193  
-                "high" : 10
1194  
-            },
1195  
-            "symptom" : "Get to Set ops ratio '{0}' is bigger than '{1}'. Too few set operations.",
1196  
-            "formula" : "Avg(ep_num_ops_get_meta) / Avg(ep_num_ops_set_meta)",
1197  
-        },
1198  
-     ],
1199  
-     "perNode" : True,
1200  
-     "perBucket" : True,
1201  
-    },
  868
+
1202 869
     {"name" : "CompactionPerformance",
1203 870
      "ingredients" : [
1204 871
         {
1205 872
             "name" : "viewCompactPercentage",
1206 873
             "description" : "Views fragmentation %",
1207  
-            "counter" : "couch_views_fragmentation",
1208  
-            "code" : "CalcTrend",
  874
+            "counter" : ["couch_views_fragmentation"],
  875
+            "code" : "SyndromeDetector",
1209 876
             "threshold" : {
1210  
-                "couch_views_fragmentation" : 90,
  877
+                "couch_views_fragmentation" : [">=", 90],
1211 878
                 "recurrence" : 15,
1212 879
             },
1213  
-            "symptom" : "From {0} to {1}, views fragmentation '{2}%' is contineously higher than '{3}%'.",
  880
+            "symptom" : "From %s to %s, views fragmentation '%.2f%' is contineously higher than '%d%'.",
1214 881
             "formula" : "Avg(couch_views_fragmentation) > threshold",
1215 882
         },
1216 883
         {
1217 884
             "name" : "docCompactPercentage",
1218 885
             "description" : "Docs fragmentation %",
1219  
-            "counter" : "couch_docs_fragmentation",
1220  
-            "code" : "CalcTrend",
  886
+            "counter" : ["couch_docs_fragmentation"],
  887
+            "code" : "SyndromeDetector",
1221 888
             "threshold" : {
1222  
-                "couch_views_fragmentation" : 50,
  889
+                "couch_docs_fragmentation" : [">=", 50],
1223 890
                 "recurrence" : 15,
1224 891
             },
1225  
-            "symptom" : "From {0} to {1}, docs fragmentation '{2}%' is contineously higher than '{3}%'.",
  892
+            "symptom" : "From %s to %s, docs fragmentation '%.2f%' is contineously higher than '%d%'.",
1226 893
             "formula" : "Avg(couch_docs_fragmentation) > threshold",
1227 894
         },
1228 895
      ],
6  diskqueue_stats.py
@@ -274,14 +274,14 @@ def run(self, accessor, scale, threshold=None):
274 274
                         symptom = accessor["symptom"].format(util.pretty_datetime(timestamps[begin_index]), 
275 275
                                                              util.pretty_datetime(timestamps[end_index-1]),
276 276
                                                              util.number_label(int(curr_avg)),
277  
-                                                             util.number_label(int(mem_avg)),
  277
+                                                             util.size_label(int(mem_avg)),
278 278
                                                              util.pretty_float(cmr_avg), 
279 279
                                                              util.pretty_float(arr_avg),
280 280
                                                              util.number_label(int(diskread_avg)))
281 281
                         num_warn.append({"node":node, "value":symptom})
282 282
                         abnormal_vals.append(diskread_avg)
283 283
                 if len(abnormal_vals) > 0:
284  
-                    trend.append((node, {"value" : util.pretty_float(sum(abnormal_vals)/len(abnormal_vals)) + "%",
  284
+                    trend.append((node, {"value" : util.pretty_float(sum(abnormal_vals)/len(abnormal_vals)),
285 285
                                          "raw" : abnormal_vals}
286 286
                                     ))
287 287
             if len(num_warn) > 0:
@@ -408,7 +408,7 @@ def run(self, accessor, scale, threshold=None):
408 408
      "ingredients" : [
409 409
         {
410 410
             "name" : "performanceDiagnosis_diskread",
411  
-            "description" : "Diagnosis lots of disk reads",
  411
+            "description" : "Lots of disk reads",
412 412
             "symptom" : "From {0} to {1}, a high item count '{2}', high memory used '{3}', " \
413 413
                         "high cache miss ratio '{4}%', and low residential ratio '{5}%' lead to above average disk reads '{6}'.",
414 414
             "counter" : ["ep_bg_fetched","ep_cache_miss_rate", "vb_active_resident_items_ratio", "mem_used", "curr_items", "cmd_set"],

0 notes on commit b4506f9

Please sign in to comment.
Something went wrong with that request. Please try again.