diff --git a/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/JvmErgonomics.java b/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/JvmErgonomics.java index 1160589e43966..1970dd82b8ebe 100644 --- a/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/JvmErgonomics.java +++ b/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/JvmErgonomics.java @@ -28,6 +28,8 @@ */ final class JvmErgonomics { + static final double DIRECT_MEMORY_TO_HEAP_FACTOR = 0.5; + private JvmErgonomics() { throw new AssertionError("No instances intended"); } @@ -44,7 +46,7 @@ static List choose(final List userDefinedJvmOptions, Settings no final long heapSize = JvmOption.extractMaxHeapSize(finalJvmOptions); final long maxDirectMemorySize = JvmOption.extractMaxDirectMemorySize(finalJvmOptions); if (maxDirectMemorySize == 0) { - ergonomicChoices.add("-XX:MaxDirectMemorySize=" + heapSize / 2); + ergonomicChoices.add("-XX:MaxDirectMemorySize=" + (long) (DIRECT_MEMORY_TO_HEAP_FACTOR * heapSize)); } final boolean tuneG1GCForSmallHeap = tuneG1GCForSmallHeap(heapSize); diff --git a/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/MachineDependentHeap.java b/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/MachineDependentHeap.java index 26fd7294ed557..b68e374bbdb94 100644 --- a/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/MachineDependentHeap.java +++ b/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/MachineDependentHeap.java @@ -11,6 +11,7 @@ import org.elasticsearch.cluster.node.DiscoveryNodeRole; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.util.FeatureFlag; import org.elasticsearch.node.NodeRoleSettings; import java.io.IOException; @@ -37,6 +38,8 @@ public class MachineDependentHeap { protected static final long MAX_HEAP_SIZE = GB * 31; // 31GB protected static final long MIN_HEAP_SIZE = 1024 * 1024 * 128; // 128MB + private static final FeatureFlag NEW_ML_MEMORY_COMPUTATION_FEATURE_FLAG = new FeatureFlag("new_ml_memory_computation"); + public MachineDependentHeap() {} /** @@ -76,12 +79,16 @@ protected int getHeapSizeMb(Settings nodeSettings, MachineNodeRole role, long av /* * Machine learning only node. * - *

Heap is computed as: - *

+ * The memory reserved for Java is computed as: + * - 40% of total system memory when total system memory 16 gigabytes or less. + * - 40% of the first 16 gigabytes plus 10% of memory above that when total system memory is more than 16 gigabytes. + * - The absolute maximum heap size is 31 gigabytes. + * + * This Java memory is divided as follows: + * - 2/3 of the Java memory is reserved for the Java heap. + * - 1/3 of the Java memory is reserved for the Java direct memory. + * + * The direct memory being half of the heap is set by the JvmErgonomics class. * * In all cases the result is rounded down to the next whole multiple of 4 megabytes. * The reason for doing this is that Java will round requested heap sizes to a multiple @@ -95,13 +102,22 @@ protected int getHeapSizeMb(Settings nodeSettings, MachineNodeRole role, long av * * If this formula is changed then corresponding changes must be made to the {@code NativeMemoryCalculator} and * {@code MlAutoscalingDeciderServiceTests} classes in the ML plugin code. Failure to keep the logic synchronized - * could result in repeated autoscaling up and down. + * could result in ML processes crashing with OOM errors or repeated autoscaling up and down. */ case ML_ONLY -> { - if (availableMemory <= (GB * 16)) { - yield mb((long) (availableMemory * .4), 4); + double heapFractionBelow16GB = 0.4; + double heapFractionAbove16GB = 0.1; + if (NEW_ML_MEMORY_COMPUTATION_FEATURE_FLAG.isEnabled()) { + heapFractionBelow16GB = 0.4 / (1.0 + JvmErgonomics.DIRECT_MEMORY_TO_HEAP_FACTOR); + heapFractionAbove16GB = 0.1 / (1.0 + JvmErgonomics.DIRECT_MEMORY_TO_HEAP_FACTOR); + } + if (availableMemory <= GB * 16) { + yield mb((long) (availableMemory * heapFractionBelow16GB), 4); } else { - yield mb((long) min((GB * 16) * .4 + (availableMemory - GB * 16) * .1, MAX_HEAP_SIZE), 4); + yield mb( + (long) min(GB * 16 * heapFractionBelow16GB + (availableMemory - GB * 16) * heapFractionAbove16GB, MAX_HEAP_SIZE), + 4 + ); } } /* diff --git a/distribution/tools/server-cli/src/test/java/org/elasticsearch/server/cli/MachineDependentHeapTests.java b/distribution/tools/server-cli/src/test/java/org/elasticsearch/server/cli/MachineDependentHeapTests.java index e2dc9f32647e0..2611791563b6f 100644 --- a/distribution/tools/server-cli/src/test/java/org/elasticsearch/server/cli/MachineDependentHeapTests.java +++ b/distribution/tools/server-cli/src/test/java/org/elasticsearch/server/cli/MachineDependentHeapTests.java @@ -58,13 +58,13 @@ public void testMasterOnlyOptions() throws Exception { } public void testMlOnlyOptions() throws Exception { - assertHeapOptions(1, containsInAnyOrder("-Xmx408m", "-Xms408m"), "ml"); - assertHeapOptions(4, containsInAnyOrder("-Xmx1636m", "-Xms1636m"), "ml"); - assertHeapOptions(32, containsInAnyOrder("-Xmx8192m", "-Xms8192m"), "ml"); - assertHeapOptions(64, containsInAnyOrder("-Xmx11468m", "-Xms11468m"), "ml"); + assertHeapOptions(1, containsInAnyOrder("-Xmx272m", "-Xms272m"), "ml"); + assertHeapOptions(4, containsInAnyOrder("-Xmx1092m", "-Xms1092m"), "ml"); + assertHeapOptions(32, containsInAnyOrder("-Xmx5460m", "-Xms5460m"), "ml"); + assertHeapOptions(64, containsInAnyOrder("-Xmx7644m", "-Xms7644m"), "ml"); // We'd never see a node this big in Cloud, but this assertion proves that the 31GB absolute maximum // eventually kicks in (because 0.4 * 16 + 0.1 * (263 - 16) > 31) - assertHeapOptions(263, containsInAnyOrder("-Xmx31744m", "-Xms31744m"), "ml"); + assertHeapOptions(263, containsInAnyOrder("-Xmx21228m", "-Xms21228m"), "ml"); } public void testDataNodeOptions() throws Exception { diff --git a/docs/changelog/128742.yaml b/docs/changelog/128742.yaml new file mode 100644 index 0000000000000..ce974301f2dfc --- /dev/null +++ b/docs/changelog/128742.yaml @@ -0,0 +1,5 @@ +pr: 128742 +summary: "Account for Java direct memory on machine learning nodes to prevent out-of-memory crashes." +area: Machine Learning +type: bug +issues: [] diff --git a/server/src/main/java/org/elasticsearch/monitor/jvm/JvmInfo.java b/server/src/main/java/org/elasticsearch/monitor/jvm/JvmInfo.java index a3639214a1b9d..c1bf7fd7bb0d8 100644 --- a/server/src/main/java/org/elasticsearch/monitor/jvm/JvmInfo.java +++ b/server/src/main/java/org/elasticsearch/monitor/jvm/JvmInfo.java @@ -46,14 +46,7 @@ public class JvmInfo implements ReportingService.Info { long nonHeapInit = memoryMXBean.getNonHeapMemoryUsage().getInit() < 0 ? 0 : memoryMXBean.getNonHeapMemoryUsage().getInit(); long nonHeapMax = memoryMXBean.getNonHeapMemoryUsage().getMax() < 0 ? 0 : memoryMXBean.getNonHeapMemoryUsage().getMax(); long directMemoryMax = 0; - try { - Class vmClass = Class.forName("sun.misc.VM"); - directMemoryMax = (Long) vmClass.getMethod("maxDirectMemory").invoke(null); - } catch (Exception t) { - // ignore - } String[] inputArguments = runtimeMXBean.getInputArguments().toArray(new String[runtimeMXBean.getInputArguments().size()]); - Mem mem = new Mem(heapInit, heapMax, nonHeapInit, nonHeapMax, directMemoryMax); String bootClassPath; try { @@ -133,6 +126,11 @@ public class JvmInfo implements ReportingService.Info { configuredMaxHeapSize = Long.parseLong((String) valueMethod.invoke(maxHeapSizeVmOptionObject)); } catch (Exception ignored) {} + try { + Object maxDirectMemorySizeVmOptionObject = vmOptionMethod.invoke(hotSpotDiagnosticMXBean, "MaxDirectMemorySize"); + directMemoryMax = Long.parseLong((String) valueMethod.invoke(maxDirectMemorySizeVmOptionObject)); + } catch (Exception ignored) {} + try { Object useSerialGCVmOptionObject = vmOptionMethod.invoke(hotSpotDiagnosticMXBean, "UseSerialGC"); useSerialGC = (String) valueMethod.invoke(useSerialGCVmOptionObject); @@ -142,6 +140,8 @@ public class JvmInfo implements ReportingService.Info { } + Mem mem = new Mem(heapInit, heapMax, nonHeapInit, nonHeapMax, directMemoryMax); + INSTANCE = new JvmInfo( ProcessHandle.current().pid(), System.getProperty("java.version"), @@ -510,5 +510,8 @@ public ByteSizeValue getHeapMax() { return ByteSizeValue.ofBytes(heapMax); } + public ByteSizeValue getTotalMax() { + return ByteSizeValue.ofBytes(heapMax + nonHeapMax + directMemoryMax); + } } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java index 6d21654f9e161..19315f00ec83a 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java @@ -43,6 +43,7 @@ import org.elasticsearch.common.settings.SettingsModule; import org.elasticsearch.common.unit.ByteSizeValue; import org.elasticsearch.common.unit.Processors; +import org.elasticsearch.common.util.FeatureFlag; import org.elasticsearch.common.util.concurrent.EsExecutors; import org.elasticsearch.core.TimeValue; import org.elasticsearch.env.Environment; @@ -562,6 +563,8 @@ public class MachineLearning extends Plugin License.OperationMode.PLATINUM ); + private static final FeatureFlag NEW_ML_MEMORY_COMPUTATION_FEATURE_FLAG = new FeatureFlag("new_ml_memory_computation"); + @Override public Map getProcessors(Processor.Parameters parameters) { if (this.enabled == false) { @@ -848,7 +851,12 @@ public Settings additionalSettings() { machineMemoryAttrName, Long.toString(OsProbe.getInstance().osStats().getMem().getAdjustedTotal().getBytes()) ); - addMlNodeAttribute(additionalSettings, jvmSizeAttrName, Long.toString(Runtime.getRuntime().maxMemory())); + + long jvmSize = Runtime.getRuntime().maxMemory(); + if (NEW_ML_MEMORY_COMPUTATION_FEATURE_FLAG.isEnabled()) { + jvmSize = JvmInfo.jvmInfo().getMem().getTotalMax().getBytes(); + } + addMlNodeAttribute(additionalSettings, jvmSizeAttrName, Long.toString(jvmSize)); addMlNodeAttribute( additionalSettings, deprecatedAllocatedProcessorsAttrName,