Permalink
Browse files

Saving and restoring the NodeManager state

Summary: We are able to save and restore the NodeManager state now.

Test Plan:
I couldn't find a way to test the code completely. I tested it in two basic ways: (a) printing out what was being read, and (b) after recovering from safe mode and
reconstructing the state, persisting the state again to a different file and comparing the two. The second method, however, requires turning off compression and turning on pretty
printing.

Reviewers: dms, rvadali, aching

Reviewed By: dms

CC: security-diffs@lists

Task ID: 1112019
  • Loading branch information...
1 parent 603e1b6 commit 80128bb1fc187f29a1d7dd9b5f6ee9318c8f5f7b gauravmenghani committed with Alex Feinberg Jul 31, 2012
View
@@ -274,7 +274,7 @@
conf="common->master"/>
<dependency org="org.codehaus.jackson"
name="jackson-mapper-asl"
- rev="1.0.1"
+ rev="1.7.9"
conf="common->default"/>
</dependencies>
@@ -40,7 +40,7 @@
conf="common->default"/>
<dependency org="org.codehaus.jackson"
name="jackson-mapper-asl"
- rev="1.0.1"
+ rev="1.7.9"
conf="common->default"/>
</dependencies>
</ivy-module>
@@ -23,7 +23,7 @@ checkstyle.version=5.0
guava.version=r09
-jackson.version=1.0.1
+jackson.version=1.7.9
json.version=20090211
@@ -17,7 +17,7 @@
*/
package org.apache.hadoop.corona;
-import java.io.*;
+import java.io.IOException;
import java.net.InetSocketAddress;
import java.util.*;
@@ -27,9 +27,11 @@
import org.apache.hadoop.http.HttpServer;
import org.apache.hadoop.mapred.Clock;
import org.apache.hadoop.net.NetUtils;
+import org.apache.hadoop.util.CoronaSerializer;
import org.apache.hadoop.util.HostsFileReader;
import org.apache.thrift.TApplicationException;
import org.apache.thrift.TException;
+import org.codehaus.jackson.JsonGenerator;
/**
* Manager of all the resources of the cluster.
@@ -87,20 +89,48 @@ public ClusterManager() { }
* Primary constructor.
*
* @param conf Configuration to be used
+ * @param recoverFromDisk True if we are restarting after going down while
+ * in Safe Mode
+ * @throws IOException
+ */
+ public ClusterManager(Configuration conf, boolean recoverFromDisk)
+ throws IOException {
+ this(new CoronaConf(conf), recoverFromDisk);
+ }
+
+ /**
+ * Constructor for ClusterManager, when it is not specified if we are
+ * restarting after persisting the state. In this case we assume the
+ * recoverFromDisk flag to be false.
+ *
+ * @param conf Configuration to be used
* @throws IOException
*/
public ClusterManager(Configuration conf) throws IOException {
- this(new CoronaConf(conf));
+ this(new CoronaConf(conf), false);
}
/**
* Construct ClusterManager given {@link CoronaConf}
*
* @param conf the configuration for the ClusterManager
+ * @param recoverFromDisk true if we are restarting after going down while
+ * in Safe Mode
* @throws IOException
*/
- public ClusterManager(CoronaConf conf) throws IOException {
+ public ClusterManager(CoronaConf conf, boolean recoverFromDisk)
+ throws IOException {
this.conf = conf;
+ HostsFileReader hostsReader =
+ new HostsFileReader(conf.getHostsFile(), conf.getExcludesFile());
+
+ if (recoverFromDisk) {
+ recoverClusterManagerFromDisk(hostsReader);
+ } else {
+ nodeManager = new NodeManager(this, hostsReader);
+ nodeManager.setConf(conf);
+ }
+
initLegalTypes();
metrics = new ClusterManagerMetrics(getTypes());
@@ -111,11 +141,6 @@ public ClusterManager(CoronaConf conf) throws IOException {
sessionHistoryManager = new SessionHistoryManager();
sessionHistoryManager.setConf(conf);
- HostsFileReader hostsReader =
- new HostsFileReader(conf.getHostsFile(), conf.getExcludesFile());
- nodeManager = new NodeManager(this, hostsReader);
- nodeManager.setConf(conf);
-
sessionNotifier = new SessionNotifier(sessionManager, this, metrics);
sessionNotifier.setConf(conf);
@@ -134,7 +159,34 @@ public ClusterManager(CoronaConf conf) throws IOException {
startTime = clock.getTime();
hostName = infoSocAddr.getHostName();
- safeMode = false;
+ setSafeMode(false);
+ }
+
+ /**
+ * This method starts the process to restore the CM state by reading back
+ * the serialized state from the CM state file.
+ * @param hostsReader The HostsReader instance
+ * @throws IOException
+ */
+ private void recoverClusterManagerFromDisk(HostsFileReader hostsReader)
+ throws IOException {
+ LOG.info("Recovering from Safe Mode");
+
+ // This will prevent the expireNodes thread from expiring the nodes
+ safeMode = true;
+
+ CoronaSerializer coronaSerializer = new CoronaSerializer(conf);
+
+ // Expecting the START_OBJECT token for ClusterManager
+ coronaSerializer.readStartObjectToken("ClusterManager");
+
+ coronaSerializer.readField("nodeManager");
+ nodeManager = new NodeManager(this, hostsReader, coronaSerializer);
+ nodeManager.setConf(conf);
+ nodeManager.restoreAfterSafeModeRestart();
+
+ // Expecting the END_OBJECT token for ClusterManager
+ coronaSerializer.readEndObjectToken("ClusterManager");
}
/**
@@ -452,6 +504,10 @@ public synchronized boolean setSafeMode(boolean safeMode) {
return true;
}
+ /**
+ * This function saves the state of the ClusterManager to disk.
+ * @return A boolean. True if saving the state succeeded, false otherwise.
+ */
@Override
public boolean persistState() {
if (!safeMode) {
@@ -460,6 +516,21 @@ public boolean persistState() {
return false;
}
+ try {
+ JsonGenerator jsonGenerator =
+ CoronaSerializer.createJsonGenerator(conf);
+ jsonGenerator.writeStartObject();
+
+ jsonGenerator.writeFieldName("nodeManager");
+ nodeManager.write(jsonGenerator);
+ // TODO Write the sessionManager and other objects
+
+ jsonGenerator.writeEndObject();
+ jsonGenerator.close();
+ } catch (IOException e) {
+ LOG.info("Could not persist the state: ", e);
+ return false;
+ }
return true;
}
@@ -5,6 +5,12 @@
import java.net.ServerSocket;
import java.net.Socket;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
@@ -16,7 +22,7 @@
public class ClusterManagerServer extends Thread {
public static final Log LOG = LogFactory.getLog(ClusterManagerServer.class);
- static{
+ static {
Configuration.addDefaultResource("mapred-default.xml");
Configuration.addDefaultResource("mapred-site.xml");
Utilities.makeProcessExitOnUncaughtException(LOG);
@@ -62,10 +68,24 @@ public void run() {
}
public static void main(String[] args)
- throws IOException, TTransportException {
+ throws IOException, TTransportException, ParseException {
StringUtils.startupShutdownMessage(ClusterManager.class, args, LOG);
Configuration conf = new Configuration();
- ClusterManager cm = new ClusterManager(conf);
+ boolean recoverFromDisk = false;
+ // Check if we want to start the ClusterManager to restore the persisted
+ // state
+ Option recoverFromDiskOption =
+ new Option("recoverFromDisk",
+ "Used to restart the CM from the state persisted on disk");
+ Options options = new Options();
+ options.addOption(recoverFromDiskOption);
+ CommandLineParser parser = new GnuParser();
+ CommandLine line = parser.parse(options, args);
+
+ if (line.hasOption("recoverFromDisk")) {
+ recoverFromDisk = true;
+ }
+ ClusterManager cm = new ClusterManager(conf, recoverFromDisk);
try {
ClusterManagerServer server = new ClusterManagerServer(conf, cm);
server.start();
Oops, something went wrong.

0 comments on commit 80128bb

Please sign in to comment.