Track query stats bug (#70273)

The query cache stats can currently report negative memory usage, which breaks REST responses for indices stats, node stats, etc, see #55434 as one such example. The reason why negative memory usage is reported is because of the following trappy calculation: https://github.com/elastic/elasticsearch/blob/1de0b616ebb92de6060510e92269ef87fc6a8649/server/src/main/java/org/elasticsearch/indices/IndicesQueryCache.java#L98-L101 - weight can be `Double.POSITIVE_INFINITY` when `totalSize == 0` and `stats.size() == 0` - if we then also have `sharedRamBytesUsed > 0`, then `additionalRamBytesUsed` will be `Long.MAX_VALUE` - if you then sum up multiple of these `Long.MAX_VALUE` values (one for each shard), this leads to an overflow, resulting in negative numbers as seen in #55434. The reason sharedRamBytesUsed can be > 0 when there are no shard stats at all is because of the memory occupied by `LRUQueryCache.uniqueQueries`, where the lifetime can extend even closing of shards (where they are removed from shardStats). Note that a workaround to the above bug is to [clear the cache](https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-clearcache.html), as it makes `sharedRamBytesUsed == 0`, see https://github.com/elastic/elasticsearch/blob/1de0b616ebb92de6060510e92269ef87fc6a8649/server/src/main/java/org/elasticsearch/indices/IndicesQueryCache.java#L168-L174 Closes #55434
elastic · Mar 11, 2021 · 4675711 · 4675711
1 parent 93918a3
commit 4675711
Show file tree

Hide file tree

Showing 2 changed files with 16 additions and 9 deletions.
diff --git a/server/src/main/java/org/elasticsearch/indices/IndicesQueryCache.java b/server/src/main/java/org/elasticsearch/indices/IndicesQueryCache.java
@@ -89,17 +89,22 @@ public QueryCacheStats getStats(ShardId shard) {
         }
         shardStats.add(info);
 
-        // We also have some shared ram usage that we try to distribute to
-        // proportionally to their number of cache entries of each shard
-        long totalSize = 0;
-        for (QueryCacheStats s : stats.values()) {
-            totalSize += s.getCacheSize();
-        }
-        final double weight = totalSize == 0
+        // We also have some shared ram usage that we try to distribute
+        // proportionally to their number of cache entries of each shard.
+        // Sometimes it's not possible to do this when there are no shard entries at all,
+        // which can happen as the shared ram usage can extend beyond the closing of all shards.
+        if (stats.isEmpty() == false) {
+            long totalSize = 0;
+            for (QueryCacheStats s : stats.values()) {
+                totalSize += s.getCacheSize();
+            }
+            final double weight = totalSize == 0
                 ? 1d / stats.size()
                 : ((double) shardStats.getCacheSize()) / totalSize;
-        final long additionalRamBytesUsed = Math.round(weight * sharedRamBytesUsed);
-        shardStats.add(new QueryCacheStats(additionalRamBytesUsed, 0, 0, 0, 0));
+            final long additionalRamBytesUsed = Math.round(weight * sharedRamBytesUsed);
+            assert additionalRamBytesUsed >= 0L : additionalRamBytesUsed;
+            shardStats.add(new QueryCacheStats(additionalRamBytesUsed, 0, 0, 0, 0));
+        }
         return shardStats;
     }
 

diff --git a/server/src/test/java/org/elasticsearch/indices/IndicesQueryCacheTests.java b/server/src/test/java/org/elasticsearch/indices/IndicesQueryCacheTests.java
@@ -275,12 +275,14 @@ public void testTwoShards() throws IOException {
         assertEquals(0L, stats1.getCacheCount());
         assertEquals(0L, stats1.getHitCount());
         assertEquals(0L, stats1.getMissCount());
+        assertEquals(0L, stats1.getMemorySizeInBytes());
 
         stats2 = cache.getStats(shard2);
         assertEquals(0L, stats2.getCacheSize());
         assertEquals(0L, stats2.getCacheCount());
         assertEquals(0L, stats2.getHitCount());
         assertEquals(0L, stats2.getMissCount());
+        assertEquals(0L, stats2.getMemorySizeInBytes());
 
         cache.close(); // this triggers some assertions
     }