Skip to content

Commit

Permalink
[ML] add version information in case of crash of native ML process (#30674)
Browse files Browse the repository at this point in the history

This change adds version information to the error reported when a native
ML process crashes. The version is important for choosing the right symbol
files when analyzing the crash, and including it puts all the necessary
information on one line.

relates elastic/ml-cpp#94
  • Loading branch information
Hendrik Muhs committed May 18, 2018
1 parent cfd239a commit 7f48df0
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeoutException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
Expand Down Expand Up @@ -84,20 +82,7 @@ public long getPid() throws TimeoutException {
}

/**
 * Returns the native code version information as a map.
 *
 * NOTE(review): this span is rendered from a diff hunk (-84,20 +82,7) without +/- markers, so it
 * appears to show both sides of the change: the copyright-parsing body (up to the closing brace of
 * the else branch) looks like the removed implementation, and the single delegating return after it
 * looks like the replacement. Confirm against the file at commit 7f48df0.
 *
 * @return the map produced either by the parsing code ("version" and "build_hash" keys) or by
 *         cppLogHandler.getNativeCodeInfo, depending on which side of the diff applies
 * @throws TimeoutException declared by this method; presumably raised when the native controller
 *         does not respond within CONTROLLER_CONNECT_TIMEOUT (verify against cppLogHandler)
 */
public Map<String, Object> getNativeCodeInfo() throws TimeoutException {
String copyrightMessage = cppLogHandler.getCppCopyright(CONTROLLER_CONNECT_TIMEOUT);
Matcher matcher = Pattern.compile("Version (.+) \\(Build ([^)]+)\\) Copyright ").matcher(copyrightMessage);
if (matcher.find()) {
Map<String, Object> info = new HashMap<>(2);
info.put("version", matcher.group(1));
info.put("build_hash", matcher.group(2));
return info;
} else {
// If this happens it probably means someone has changed the format in lib/ver/CBuildInfo.cc
// in the machine-learning-cpp repo without changing the pattern above to match
String msg = "Unexpected native controller process copyright format: " + copyrightMessage;
LOGGER.error(msg);
throw new ElasticsearchException(msg);
}
// Delegates to the log handler, passing the controller connect timeout.
return cppLogHandler.getNativeCodeInfo(CONTROLLER_CONNECT_TIMEOUT);
}

public void startProcess(List<String> command) throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import org.apache.logging.log4j.Level;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.elasticsearch.common.ParsingException;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.bytes.BytesReference;
Expand All @@ -30,10 +30,15 @@
import java.time.Instant;
import java.time.temporal.ChronoUnit;
import java.util.Deque;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
* Handle a stream of C++ log messages that arrive via a named pipe in JSON format.
Expand Down Expand Up @@ -181,6 +186,26 @@ public String getCppCopyright(Duration timeout) throws TimeoutException {
return cppCopyright;
}

/**
 * Pattern that captures the version (group 1) and build hash (group 2) from the native code
 * copyright message. Compiled once because {@link Pattern} instances are immutable and reusable.
 */
private static final Pattern NATIVE_CODE_INFO_PATTERN =
    Pattern.compile("Version (.+) \\(Build ([^)]+)\\) Copyright ");

/**
 * Extracts version information from the copyright string, which is assumed to have a certain format.
 *
 * @param timeout passed through to {@code getCppCopyright(Duration)} when fetching the copyright message
 * @return a map with two entries: {@code "version"} and {@code "build_hash"}
 * @throws TimeoutException if thrown by {@code getCppCopyright(Duration)}
 * @throws ElasticsearchException if the copyright message does not match the expected format
 */
public Map<String, Object> getNativeCodeInfo(Duration timeout) throws TimeoutException {
    String copyrightMessage = getCppCopyright(timeout);
    Matcher matcher = NATIVE_CODE_INFO_PATTERN.matcher(copyrightMessage);
    if (matcher.find() == false) {
        // If this happens it probably means someone has changed the format in lib/ver/CBuildInfo.cc
        // in the ml-cpp repo without changing the pattern above to match
        String msg = "Unexpected native process copyright format: " + copyrightMessage;
        LOGGER.error(msg);
        throw new ElasticsearchException(msg);
    }

    Map<String, Object> info = new HashMap<>(2);
    info.put("version", matcher.group(1));
    info.put("build_hash", matcher.group(2));
    return info;
}

/**
* Expected to be called very infrequently.
*/
Expand Down Expand Up @@ -281,8 +306,18 @@ private void parseMessage(XContent xContent, BytesReference bytesRef) {
} catch (XContentParseException e) {
String upstreamMessage = "Fatal error: '" + bytesRef.utf8ToString() + "'";
if (upstreamMessage.contains("bad_alloc")) {
upstreamMessage += ", process ran out of memory.";
upstreamMessage += ", process ran out of memory";
}

// add version information, so it's conveniently next to the crash log
upstreamMessage += ", version: ";
try {
Map<String, Object> versionInfo = getNativeCodeInfo(Duration.ofMillis(10));
upstreamMessage += String.format(Locale.ROOT, "%s (build %s)", versionInfo.get("version"), versionInfo.get("build_hash"));
} catch (TimeoutException timeoutException) {
upstreamMessage += "failed to retrieve";
}

storeError(upstreamMessage);
seenFatalError = true;
} catch (IOException e) {
Expand Down

0 comments on commit 7f48df0

Please sign in to comment.