Skip to content

Commit

Permalink
[ML] Detect 8.x ML nodes correctly (#105653)
Browse files Browse the repository at this point in the history
It's possible that a 7.17 master node needs to assign ML jobs
to 8.x ML nodes. Currently this does not work, because the 7.x
mechanism for detecting ML nodes does not recognize 8.x ML nodes.
As a result, a 7.17 master node will not assign jobs to 8.x ML
nodes, so the jobs sit in limbo until the master node is upgraded
to 8.x, at which point they get assigned and pick up where they
left off.

This change allows the 7.17 master node to correctly identify 8.x
ML nodes in the cluster, allowing the ML jobs to be reassigned
more quickly during a rolling upgrade from 7.17 to 8.x where
master nodes are upgraded last (as recommended).
  • Loading branch information
droberts195 committed Feb 20, 2024
1 parent 85259a1 commit edea203
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 0 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/105653.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 105653
summary: Detect 8.x ML nodes correctly
area: Machine Learning
type: bug
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -627,6 +627,12 @@ protected XPackLicenseState getLicenseState() {
}

public static boolean isMlNode(DiscoveryNode node) {
// Post 7.3.0 nodes will have this role, so if the ML nodes have been upgraded we can use this information.
if (node.getRoles().contains(ML_ROLE)) {
return true;
}
// Pre 7.3.0 nodes might still be ML nodes despite not having the ML role (as pluggable roles didn't exist
// then). So we use the old method to detect ML nodes for this case.
Map<String, String> nodeAttributes = node.getAttributes();
try {
return Integer.parseInt(nodeAttributes.get(MAX_OPEN_JOBS_NODE_ATTR)) > 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,17 @@
package org.elasticsearch.xpack.ml;

import org.apache.lucene.util.SetOnce;
import org.elasticsearch.Version;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.support.master.AcknowledgedResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.cluster.ClusterName;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.metadata.Metadata;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.TransportAddress;
import org.elasticsearch.license.XPackLicenseState;
import org.elasticsearch.monitor.os.OsStats;
import org.elasticsearch.test.ESTestCase;
Expand All @@ -24,6 +27,7 @@
import org.elasticsearch.xpack.core.ml.action.SetUpgradeModeAction;

import java.io.IOException;
import java.net.InetAddress;
import java.util.Collections;
import java.util.Map;

Expand Down Expand Up @@ -227,6 +231,51 @@ public void testMachineMemory_givenCgroupLowLimit() throws IOException {
assertEquals(7_516_192_768L, MachineLearning.machineMemoryFromStats(stats));
}

/**
 * A node reporting version 8.12.0 that carries the ML role but no node attributes
 * must be recognised as an ML node.
 */
public void testIsMlNode_given812MlNode() {
    // Any non-privileged port will do; the address is never connected to.
    TransportAddress address = new TransportAddress(InetAddress.getLoopbackAddress(), randomIntBetween(1024, 65535));
    // No attributes at all: detection has to rely on the role alone.
    Map<String, String> noAttributes = Collections.emptyMap();
    Version nodeVersion = Version.fromString("8.12.0");

    DiscoveryNode node = new DiscoveryNode(
        "name",
        "id",
        "ephemeralId",
        "hostName",
        "hostAddress",
        address,
        noAttributes,
        Collections.singleton(MachineLearning.ML_ROLE),
        nodeVersion
    );

    assertTrue(MachineLearning.isMlNode(node));
}

/**
 * A 7.17.0 node that has both the ML role and the max-open-jobs node attribute
 * must be recognised as an ML node.
 */
public void testIsMlNode_given717MlNode() {
    // Any non-privileged port will do; the address is never connected to.
    TransportAddress address = new TransportAddress(InetAddress.getLoopbackAddress(), randomIntBetween(1024, 65535));
    // This node advertises the max-open-jobs attribute in addition to the role.
    Map<String, String> attributes = Collections.singletonMap(MachineLearning.MAX_OPEN_JOBS_NODE_ATTR, "512");

    DiscoveryNode node = new DiscoveryNode(
        "name",
        "id",
        "ephemeralId",
        "hostName",
        "hostAddress",
        address,
        attributes,
        Collections.singleton(MachineLearning.ML_ROLE),
        Version.V_7_17_0
    );

    assertTrue(MachineLearning.isMlNode(node));
}

/**
 * A 7.0.0 node with an empty role set but a positive max-open-jobs node attribute
 * must still be recognised as an ML node via the attribute-based fallback.
 */
public void testIsMlNode_given70MlNode() {
    // Any non-privileged port will do; the address is never connected to.
    TransportAddress address = new TransportAddress(InetAddress.getLoopbackAddress(), randomIntBetween(1024, 65535));
    // With no roles present, this attribute is the only signal that the node runs ML.
    Map<String, String> attributes = Collections.singletonMap(MachineLearning.MAX_OPEN_JOBS_NODE_ATTR, "512");

    DiscoveryNode node = new DiscoveryNode(
        "name",
        "id",
        "ephemeralId",
        "hostName",
        "hostAddress",
        address,
        attributes,
        Collections.emptySet(),
        Version.V_7_0_0
    );

    assertTrue(MachineLearning.isMlNode(node));
}

private MachineLearning createMachineLearning(Settings settings) {
XPackLicenseState licenseState = mock(XPackLicenseState.class);

Expand Down

0 comments on commit edea203

Please sign in to comment.