diff --git a/android-DL4JIrisClassifierDemo.md b/android-DL4JIrisClassifierDemo.md index 7e45c9193..34256775a 100644 --- a/android-DL4JIrisClassifierDemo.md +++ b/android-DL4JIrisClassifierDemo.md @@ -189,7 +189,7 @@ The next step is to build the neural network using *nccBuilder*. The parameters NeuralNetConfiguration.Builder nncBuilder = new NeuralNetConfiguration.Builder(); long seed = 6; nncBuilder.seed(seed); - nncBuilder.learningRate(0.1); + nncBuilder.updater(new Sgd(0.1)) nncBuilder.activation(Activation.TANH); nncBuilder.weightInit(WeightInit.XAVIER); nncBuilder.regularization(true).l2(1e-4); diff --git a/building-neural-net-with-dl4j.md b/building-neural-net-with-dl4j.md index 02d7e25ef..6938c327b 100644 --- a/building-neural-net-with-dl4j.md +++ b/building-neural-net-with-dl4j.md @@ -30,7 +30,7 @@ MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder() .seed(seed) .activation("tanh") .weightInit(WeightInit.XAVIER) - .learningRate(0.1) + .updater(new Sgd(0.1)) .regularization(true).l2(1e-4) .list() .layer(0, new DenseLayer.Builder().nIn(numInputs).nOut(3) @@ -61,8 +61,7 @@ Here is a configuration of a recurrent neural network taken, from our examples. ``` ComputationGraphConfiguration configuration = new NeuralNetConfiguration.Builder() .weightInit(WeightInit.XAVIER) - .learningRate(0.5) - .updater(Updater.RMSPROP) + .updater(new RmsProp(0.5)) .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT) .seed(seed) .graphBuilder() diff --git a/compgraph.md b/compgraph.md index 3ed80cd45..07fe4facd 100644 --- a/compgraph.md +++ b/compgraph.md @@ -96,7 +96,7 @@ For the sake of this example, lets assume our input data is of size 5. Our confi ```java ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder() - .learningRate(0.01) + .updater(new Sgd(0.01)) .graphBuilder() .addInputs("input") //can use any label for this .addLayer("L1", new GravesLSTM.Builder().nIn(5).nOut(5).build(), "input") @@ -123,7 +123,7 @@ To build the above network, we use the following configuration: ```java ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder() - .learningRate(0.01) + .updater(new Sgd(0.01)) .graphBuilder() .addInputs("input1", "input2") .addLayer("L1", new DenseLayer.Builder().nIn(3).nOut(4).build(), "input1") @@ -145,7 +145,7 @@ In this case, the network configuration is: ```java ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder() - .learningRate(0.01) + .updater(new Sgd(0.01)) .graphBuilder() .addInputs("input") .addLayer("L1", new DenseLayer.Builder().nIn(3).nOut(4).build(), "input") diff --git a/core-concepts.md b/core-concepts.md index a1b902792..4a5d1e3ea 100644 --- a/core-concepts.md +++ b/core-concepts.md @@ -68,8 +68,7 @@ DL4J gives data scientists and developers tools to build a deep neural networks MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder() .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT) - .updater(Updater.NESTEROVS).momentum(0.9) - .learningRate(learningRate) + .updater(new Nesterovs(learningRate, 0.9)) .list( new DenseLayer.Builder().nIn(numInputs).nOut(numHiddenNodes).activation("relu").build(), new OutputLayer.Builder(LossFunction.NEGATIVELOGLIKELIHOOD).activation("softmax").nIn(numHiddenNodes).nOut(numOutputs).build() diff --git a/dbn-iris-tutorial.md b/dbn-iris-tutorial.md index 047e3d0cc..7c04a2da0 100644 --- a/dbn-iris-tutorial.md +++ b/dbn-iris-tutorial.md @@ -119,13 +119,13 @@ With DL4J, creating a neural network of any kind involves several steps. 
First, we need to create a configuration object: NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder() - .hiddenUnit(RBM.HiddenUnit.RECTIFIED).momentum(5e-1f) + .hiddenUnit(RBM.HiddenUnit.RECTIFIED) .visibleUnit(RBM.VisibleUnit.GAUSSIAN).regularization(true) .regularizationCoefficient(2e-4f).dist(Distributions.uniform(gen)) .activationFunction(Activations.tanh()) .weightInit(WeightInit.DISTRIBUTION) .lossFunction(LossFunctions.LossFunction.RECONSTRUCTION_CROSSENTROPY).rng(gen) - .learningRate(1e-3f).nIn(4).nOut(3).build(); + .updater(new Nesterovs(1e-3f,0.5)).nIn(4).nOut(3).build(); This has everything that our DBN classifier will need. As you can see, there are a lot of parameters, or ‘knobs’, that you will learn to adjust over time to improve your nets’ performance. These are the pedals, clutch and steering wheel attached to DL4J's deep-learning engine. diff --git a/deep-learning-and-the-game-of-go.md b/deep-learning-and-the-game-of-go.md index 27277d371..ffeda3fd5 100644 --- a/deep-learning-and-the-game-of-go.md +++ b/deep-learning-and-the-game-of-go.md @@ -171,10 +171,9 @@ With data processed and ready to go, you can now turn to building the actual mod MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder() .seed(randomSeed) - .learningRate(.1) + .updater(new AdaGrad(0.1)) .weightInit(WeightInit.XAVIER) .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT) - .updater(Updater.ADAGRAD) .list() .layer(0, new ConvolutionLayer.Builder(3, 3) .nIn(featurePlanes).stride(1, 1).nOut(50).activation(Activation.RELU).build()) diff --git a/deprecated/XXiris-flower-dataset-tutorial.md b/deprecated/XXiris-flower-dataset-tutorial.md index 4d9cc5e26..e1e9183d5 100644 --- a/deprecated/XXiris-flower-dataset-tutorial.md +++ b/deprecated/XXiris-flower-dataset-tutorial.md @@ -125,7 +125,7 @@ public class DBNIrisExample { log.info("Build model...."); MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder() .seed(seed) // Locks in weight initialization for tuning - .learningRate(1e-6f) // Optimization step size + .updater(new Sgd(1e-6f)) // Optimization step size .optimizationAlgo(OptimizationAlgorithm.CONJUGATE_GRADIENT) // Backprop to calculate gradients .l1(1e-1).regularization(true).l2(2e-4) .useDropConnect(true) @@ -216,10 +216,10 @@ A *NeuralNetConfiguration* object is the fundamental object used to construct La ^ This line uses a specific, randomly generated weight initialization. If you run an example many times, and generate new, random weights each time, then your net's F1 score may vary a great deal, because different initial weights can lead algorithms to different local minima of the errorscape. Keeping the weights the same allows you see the effect of adjusting other hyperparameters more clearly. `seed` is a variable specified before we congifure the model. ``` java -.learningRate(1e-6f) +.updater(new Sgd(1e-6f)) ``` -^ This line sets the learning rate, which is the size of the adjustments made to the weights with each iteration. A high learning rate makes a net traverse the errorscape quickly, but makes it prone to overshoot the minima. A low learning rate is more likely to find the minimum, but it will do so very slowly. +^ This line sets the learning rate and specifies an updater to use. The learning rate is the size of the adjustments made to the weights with each iteration. A high learning rate makes a net traverse the errorscape quickly, but makes it prone to overshoot the minima. 
A low learning rate is more likely to find the minimum, but it will do so very slowly. ``` java .optimizationAlgo(OptimizationAlgorithm.LBFGS) @@ -228,7 +228,7 @@ A *NeuralNetConfiguration* object is the fundamental object used to construct La ^ This line specifies your optimization algorithm as Limited-memory BFGS, a backpropagation method that helps calculate gradients. ``` java -.l2(2e-4).regularization(true).momentum(0.9).constrainGradientToUnitNorm(true) +.l2(2e-4).regularization(true).updater(new Nesterovs(0.001,0.9)).constrainGradientToUnitNorm(true) ``` ^ This line sets several parameters: diff --git a/doc_templates/template_doc_model.md b/doc_templates/template_doc_model.md index 3b7c8b95a..f13736ae7 100644 --- a/doc_templates/template_doc_model.md +++ b/doc_templates/template_doc_model.md @@ -39,12 +39,9 @@ LearningRatePolicy provides decay alternatives during training. [[Explain what d double lr = 1e-2; double decayRate = 2; NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder() -.learningRate(lr) -.learningRateDecayPolicy(LearningRatePolicy.Exponential) -.lrPolicyDecayRate(decayRate) -.layer(new DenseLayer.Builder().nIn(nIn).nOut(nOut) -.updater(org.deeplearning4j.nn.conf.Updater.SGD).build()) -.build(); + .updater(new Sgd(new ExponentialSchedule(ScheduleType.ITERATION, lr, decayRate))) + .layer(new DenseLayer.Builder().nIn(nIn).nOut(nOut).build()) + .build(); ``` Inverse @@ -54,35 +51,30 @@ NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder() double lr = 1e-2; double decayRate = 2; double power = 3; -NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder().learningRate(lr) -.learningRateDecayPolicy(LearningRatePolicy.Inverse) -.lrPolicyDecayRate(decayRate).lrPolicyPower(power) -.layer(new DenseLayer.Builder().nIn(nIn).nOut(nOut) -.updater(org.deeplearning4j.nn.conf.Updater.SGD).build()) -.build(); +NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder() + .updater(new Sgd(new InverseSchedule(ScheduleType.ITERATION,lr,decayRate,power))) + .layer(new DenseLayer.Builder().nIn(nIn).nOut(nOut).build()) + .build(); ``` Poly [[Brief description]] ``` -NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder().learningRate(lr) -.learningRateDecayPolicy(LearningRatePolicy.Poly).lrPolicyPower(power) -.layer(new DenseLayer.Builder().nIn(nIn).nOut(nOut) -.updater(org.deeplearning4j.nn.conf.Updater.SGD).build()) -.build(); +NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder() + .updater(new Sgd(new PolySchedule(ScheduleType.ITERATION,lr,power,maxIter)) + .layer(new DenseLayer.Builder().nIn(nIn).nOut(nOut).build()) + .build(); ``` Sigmoid [[Brief description]] ``` -NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder().learningRate(lr) -.learningRateDecayPolicy(LearningRatePolicy.Sigmoid) -.lrPolicyDecayRate(decayRate).lrPolicySteps(steps) -.layer(new DenseLayer.Builder().nIn(nIn).nOut(nOut) -.updater(org.deeplearning4j.nn.conf.Updater.SGD).build()) -.build(); +NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder() + .updater(new Sgd(new SigmoidSchedule(ScheduleType.ITERATION,lr,decayRate,steps))) + .layer(new DenseLayer.Builder().nIn(nIn).nOut(nOut).build()) + .build(); ``` Step @@ -92,33 +84,22 @@ NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder().learningRate( double lr = 1e-2; double decayRate = 2; double steps = 3; -NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder().learningRate(lr) -.learningRateDecayPolicy(LearningRatePolicy.Step).lrPolicyDecayRate(decayRate) 
-.lrPolicySteps(steps) -.layer(new DenseLayer.Builder().nIn(nIn).nOut(nOut) -.updater(org.deeplearning4j.nn.conf.Updater.SGD).build()) -.build(); +NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder() + .updater(new Sgd(new StepSchedule(ScheduleType.ITERATION,lr,decayRate,steps))) + .layer(new DenseLayer.Builder().nIn(nIn).nOut(nOut).build()) + .build(); ``` Schedule Allows you to specify a schedule [[explain]]. ``` -MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT) -.learningRateDecayPolicy(LearningRatePolicy.Schedule) -.learningRateSchedule(learningRateSchedule) -``` - -Score - -[[Brief description]] -``` -MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder() -.learningRate(lr) -.learningRateDecayPolicy(LearningRatePolicy.Score).lrPolicyDecayRate(lrScoreDecay).list() +NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder() + .updater(new Sgd(new MapSchedule(ScheduleType.ITERATION,learningRateMapSchedule)) + .layer(new DenseLayer.Builder().nIn(nIn).nOut(nOut).build()) + .build(); ``` - ## How It Works 0. Prerequisites diff --git a/jcg.md b/jcg.md index 7835874a3..b8b82ce8f 100644 --- a/jcg.md +++ b/jcg.md @@ -75,8 +75,7 @@ Now that the data is ready, we can set up the configuration of the neural networ MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder() .seed(seed) .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT) - .learningRate(learningRate) - .updater(Updater.NESTEROVS) + .updater(new Nesterovs(learningRate)) .list() .layer(0, new DenseLayer.Builder().nIn(numInputs).nOut(numHiddenNodes) .weightInit(WeightInit.XAVIER) diff --git a/mnist-for-beginners.md b/mnist-for-beginners.md index b9845a04e..0e6507bd7 100644 --- a/mnist-for-beginners.md +++ b/mnist-for-beginners.md @@ -68,8 +68,7 @@ layout: default MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder() .seed(rngSeed) .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT) - .learningRate(0.006) - .updater(Updater.NESTEROVS).momentum(0.9) + .updater(new Nesterovs(0.006,0.9)) .regularization(true).l2(1e-4) .list() @@ -77,10 +76,8 @@ layout: default

This parameter uses a specific, randomly generated weight initialization. If you run an example many times and generate new, random weights each time you begin, then your net’s results -- accuracy and F1 score -- may vary a great deal, because different initial weights can lead algorithms to different local minima in the errorscape. Keeping the same random weights allows you to isolate the effect of adjusting other hyperparameters more clearly, while other conditions remain equal.
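As a quick, minimal sketch (not part of the tutorial code, and assuming `conf` is the seeded configuration shown above), you can see the effect of a fixed seed directly: two networks built from the same configuration start with identical parameters.

```java
// Two networks built from the same seeded configuration are initialized
// with the same weights, so later differences in results come only from
// the hyperparameters you change between runs.
MultiLayerNetwork netA = new MultiLayerNetwork(conf);
MultiLayerNetwork netB = new MultiLayerNetwork(conf);
netA.init();
netB.init();
System.out.println(netA.params().equals(netB.params())); // true while the seed is fixed
```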

.optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT)

Stochastic gradient descent (SGD) is a common method to optimize the cost function. To learn more about SGD and other optimization algorithms that help minimize error, we recommend Andrew Ng’s Machine Learning course and the SGD definition in our glossary.

-
.learningRate(0.006)
-

This line sets the learning rate, which is the size of the adjustments made to the weights with each iteration, the step size. A high learning rate makes a net traverse the errorscape quickly, but also makes it prone to overshoot the point of minimum error. A low learning rate is more likely to find the minimum, but it will do so very slowly, because it is taking small steps in adjusting the weights.

-
.updater(Updater.NESTEROVS).momentum(0.9)
-

Momentum is an additional factor in determining how fast an optimization algorithm converges on the optimum point. Momentum affects the direction that weights are adjusted in, so in the code we consider it a weight updater.

+
.updater(new Nesterovs(0.006,0.9))
+

This line sets the updater to Nesterovs and specifies both the learning rate and the momentum. The learning rate is the size of the adjustments made to the weights with each iteration, that is, the step size. A high learning rate makes a net traverse the errorscape quickly, but also makes it prone to overshoot the point of minimum error. A low learning rate is more likely to find the minimum, but it will do so very slowly, because it takes small steps in adjusting the weights. Momentum is an additional factor in determining how fast an optimization algorithm converges on the optimum point. Momentum affects the direction in which the weights are adjusted, so in the code we consider it part of the weight updater.
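To make the new call concrete, here is a minimal, self-contained sketch of a configuration that passes the learning rate and momentum to the updater object itself. It illustrates the API only; it is not the full network built in this tutorial, and the layer sizes are placeholder values:

```java
import org.deeplearning4j.nn.conf.MultiLayerConfiguration;
import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
import org.deeplearning4j.nn.conf.layers.OutputLayer;
import org.deeplearning4j.nn.weights.WeightInit;
import org.nd4j.linalg.activations.Activation;
import org.nd4j.linalg.learning.config.Nesterovs;
import org.nd4j.linalg.lossfunctions.LossFunctions;

MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
        .seed(123)
        .updater(new Nesterovs(0.006, 0.9)) // learning rate first, momentum second
        .list()
        .layer(0, new OutputLayer.Builder(LossFunctions.LossFunction.NEGATIVELOGLIKELIHOOD)
                .nIn(28 * 28).nOut(10)
                .weightInit(WeightInit.XAVIER)
                .activation(Activation.SOFTMAX)
                .build())
        .build();
```

One benefit of moving the learning rate into the updater object is that swapping optimizers becomes a one-line change, for example `new Adam(0.006)` in place of `new Nesterovs(0.006, 0.9)`.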

.regularization(true).l2(1e-4)

Regularization is a technique to prevent what’s called overfitting. Overfitting is when the model fits the training data really well, but performs poorly in real life as soon as it's exposed to data it hasn’t seen before.

We use L2 regularization, which prevents individual weights from having too much influence on the overall results.
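As a rough sketch of how this is wired up (an illustration, not code from this tutorial; imports as in the sketch above, plus `DenseLayer`): the `.l2(1e-4)` value set on the builder acts as a default for every layer, and an individual layer can override it where you want a different penalty.

```java
// Sketch only: builder-level L2 is the default, a layer can override it.
// .regularization(true) is required in the API version this tutorial targets.
MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
        .regularization(true).l2(1e-4)   // default L2 penalty for all layers
        .list()
        .layer(0, new DenseLayer.Builder()
                .nIn(784).nOut(100)
                .l2(1e-3)                // this layer uses a stronger penalty
                .build())
        .layer(1, new OutputLayer.Builder(LossFunctions.LossFunction.NEGATIVELOGLIKELIHOOD)
                .nIn(100).nOut(10)
                .activation(Activation.SOFTMAX)
                .build())
        .build();
```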

diff --git a/multilayerperceptron.md b/multilayerperceptron.md index 3d4e27f37..7450e7bfa 100644 --- a/multilayerperceptron.md +++ b/multilayerperceptron.md @@ -64,8 +64,7 @@ Eclipse Deeplearning4j includes [several examples of multilayer perceptrons](htt MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder() .seed(seed) .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT) - .learningRate(learningRate) - .updater(Updater.NESTEROVS) //To configure: .updater(new Nesterovs(0.9)) + .updater(new Nesterovs(learningRate,0.9)) .list() .layer(0, new DenseLayer.Builder().nIn(numInputs).nOut(numHiddenNodes) .weightInit(WeightInit.XAVIER) diff --git a/programmingguide/03_code_structure.md b/programmingguide/03_code_structure.md index fd8784c21..8dc153d28 100644 --- a/programmingguide/03_code_structure.md +++ b/programmingguide/03_code_structure.md @@ -153,7 +153,7 @@ The `MultiLayerConfiguration` can be used to define the parameter and structure MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder() .activation("relu") .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT) - .learningRate(0.05) + .updater(new Sgd(0.05)) // ... other hyperparameters .list() .backprop(true) @@ -189,11 +189,11 @@ To create a `ComputationGraphConfiguration` use the following line: ComputationGraphConfiguration config = new NeuralNetConfiguration.Builder() -Note that the code is slightly different for configuring a `ComputationGraph` rather than a `MultiLayerNetwork`. However, similar to before, `ComputationGraphConfiguration` can be used the define neural network parameters such as the learning rate, optimization algorithm, and network structure. For example, the following code chunk configures a `ComputationGraph` using stochastic gradient descent and a learning rate of 0.1. It also defines the network structure to include one hidden layer with 500 nodes and an output layer which outputs a probability. +Note that the code is slightly different for configuring a `ComputationGraph` rather than a `MultiLayerNetwork`. However, similar to before, `ComputationGraphConfiguration` can be used to define neural network parameters such as the learning rate, optimization algorithm, and network structure. For example, the following code chunk configures a `ComputationGraph` that uses stochastic gradient descent with a learning rate of 0.1, set via the `Sgd` updater. It also defines the network structure to include one hidden layer with 500 nodes and an output layer which outputs a probability. 
ComputationGraphConfiguration config = new NeuralNetConfiguration.Builder() .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT) - .learningRate(0.1) + .updater(new Sgd(0.1)) // other parameters .graphBuilder() .setInputTypes(InputType.feedForward(100)) diff --git a/programmingguide/04_convnet.md b/programmingguide/04_convnet.md index 56fbf45b8..736e067ce 100644 --- a/programmingguide/04_convnet.md +++ b/programmingguide/04_convnet.md @@ -109,7 +109,6 @@ MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder() .seed(seed) .regularization(false).l2(0.005) .activation(Activation.RELU) - .learningRate(0.0001) .weightInit(WeightInit.XAVIER) .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT) - .updater(new Nesterovs(0.9)) + .updater(new Nesterovs(0.0001, 0.9)) diff --git a/programmingguide/05_lstm.md b/programmingguide/05_lstm.md index 51d824f93..dee6f38d1 100644 --- a/programmingguide/05_lstm.md +++ b/programmingguide/05_lstm.md @@ -64,7 +64,7 @@ public static final int NB_INPUTS = 86; ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder() .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT) - .learningRate(LEARNING_RATE) + .updater(new Sgd(LEARNING_RATE)) .graphBuilder() .addInputs("trainFeatures") .setOutputs("predictMortality") diff --git a/programmingguide/06_feedforwardnet.md b/programmingguide/06_feedforwardnet.md index 6266aee2b..fbd63b531 100644 --- a/programmingguide/06_feedforwardnet.md +++ b/programmingguide/06_feedforwardnet.md @@ -51,8 +51,7 @@ Now that the data is ready, we can set up the configuration of the neural networ MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder() .seed(seed) .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT) - .learningRate(learningRate) - .updater(Updater.NESTEROVS) + .updater(new Nesterovs(learningRate)) .list() .layer(0, new DenseLayer.Builder().nIn(numInputs).nOut(numHiddenNodes) .weightInit(WeightInit.XAVIER) diff --git a/programmingguide/08_deploy.md b/programmingguide/08_deploy.md index befeebcbd..af487e15a 100644 --- a/programmingguide/08_deploy.md +++ b/programmingguide/08_deploy.md @@ -130,9 +130,8 @@ Setting up neural network configuration for a regression is similar to before. W MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder() .seed(seed) .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT) - .learningRate(learningRate) .weightInit(WeightInit.XAVIER) - .updater(new Nesterovs(0.9)) + .updater(new Nesterovs(learningRate,0.9)) .list() .layer(0, new DenseLayer.Builder().nIn(numInputs).nOut(numHiddenNodes) .activation(Activation.TANH).build()) @@ -172,7 +171,7 @@ The code for the first example is shown below. 
We can see that both layers take ``` ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder() - .learningRate(0.01) + .updater(new Sgd(0.01)) .graphBuilder() .addInputs("input") //can use any label for this .addLayer("L1", new GravesLSTM.Builder().nIn(5).nOut(5).build(), "input") @@ -188,7 +187,7 @@ The second example concatenates the two input arrays using a MergeVertex and the ``` ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder() - .learningRate(0.01) + .updater(new Sgd(0.01)) .graphBuilder() .addInputs("input1", "input2") .addLayer("L1", new DenseLayer.Builder().nIn(3).nOut(4).build(), "input1") @@ -205,7 +204,7 @@ The third example is of a multi-task learning where multiple independent predict ``` ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder() - .learningRate(0.01) + .updater(new Sgd(0.01)) .graphBuilder() .addInputs("input") .addLayer("L1", new DenseLayer.Builder().nIn(3).nOut(4).build(), "input") diff --git a/programmingguide/09_troubleshooting.md b/programmingguide/09_troubleshooting.md index 8e7218344..bc1b8ce61 100644 --- a/programmingguide/09_troubleshooting.md +++ b/programmingguide/09_troubleshooting.md @@ -47,7 +47,7 @@ Below is an example of a neural network configuration using the Xavier weight in ``` MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder() .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT) - .learningRate(learningRate) + .updater(new Sgd(learningRate)) .list() .layer(0, new DenseLayer.Builder().nIn(numInputs).nOut(numHiddenNodes) .weightInit(WeightInit.XAVIER) @@ -140,7 +140,7 @@ Next, we define a `MultiLayerSpace` instance, which is similar to a `MultiLayerN MultiLayerSpace hyperparameterSpace = new MultiLayerSpace.Builder() .weightInit(WeightInit.XAVIER) //Learning rate hyperparameter: search over different values, applied to all models - .learningRate(learningRateHyperparam) + .updater(new SgdSpace(learningRateHyperparam)) .addLayer( new DenseLayerSpace.Builder() .nIn(784) .activation(Activation.LEAKYRELU) diff --git a/quickstart.md b/quickstart.md index 2c9944f6f..e5331dd16 100644 --- a/quickstart.md +++ b/quickstart.md @@ -30,7 +30,7 @@ Hyperparameters are variables that determine how a neural network learns. They i .weightInit(WeightInit.XAVIER) .activation("relu") .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT) - .learningRate(0.05) + .updater(new Sgd(0.05)) // ... other hyperparameters .list() .backprop(true) diff --git a/spark-gpus.md b/spark-gpus.md index 47e88b4cc..ca0c212e1 100644 --- a/spark-gpus.md +++ b/spark-gpus.md @@ -121,9 +121,8 @@ Then we configure the neural network: MultiLayerConfiguration.Builder builder = new NeuralNetConfiguration.Builder() .seed(seed) .regularization(true).l2(0.0005) - .learningRate(0.1) .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT) - .updater(Updater.ADAGRAD) + .updater(new AdaGrad(0.1)) .list() .layer(0, new ConvolutionLayer.Builder(5, 5) .nIn(nChannels) diff --git a/transfer-learning.md b/transfer-learning.md index ee4fc9db0..c912132ea 100644 --- a/transfer-learning.md +++ b/transfer-learning.md @@ -40,9 +40,8 @@ ComputationGraph pretrainedNet = (ComputationGraph) zooModel.initPretrained(Pret #### II. 
Set up a fine-tune configuration ``` FineTuneConfiguration fineTuneConf = new FineTuneConfiguration.Builder() - .learningRate(5e-5) .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT) - .updater(Updater.NESTEROVS) + .updater(new Nesterovs(5e-5)) .seed(seed) .build(); ``` diff --git a/welldressed-recommendation-engine.md b/welldressed-recommendation-engine.md index c7e27acb6..cd5fb6c0a 100644 --- a/welldressed-recommendation-engine.md +++ b/welldressed-recommendation-engine.md @@ -88,7 +88,7 @@ Here's the current setup for store-oriented nets: MultiLayerConfiguration.Builder builder = new NeuralNetConfiguration.Builder() .seed(seed) .batchSize(batchSize) - .learningRate(1e-6) + .updater(new Sgd(1e-6)) .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT) .l1(1e-6) .regularization(true)