In [1]:
%use deeplearning4j

# Simple network from scrach and with Deeplearning4J
Inspired by 
- https://www.youtube.com/watch?v=kft1AJ9WVDk
- https://www.youtube.com/watch?v=SGZ6BttHMPw&list=PL6Xpj9I5qXYEcOhn7TqghAJ6NAPrNmUBH
- [deeplearning4j-examples:MultiClassLogit.java](https://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/dataexamples/MultiClassLogit.java)

### Prediction of an binary computation

#### Probleme set

Given some training data:

|           |  inputs ||| outputs |
|-----------|---|---|---|---|
| example 1 | 0 | 0 | 1 | 0 |
| example 2 | 1 | 1 | 1 | 1 |
| example 3 | 1 | 0 | 1 | 1 |
| example 4 | 0 | 1 | 1 | 0 |

What should the new output be for

|           |  inputs ||| outputs |
|-----------|---|---|---|---|
| new situation | 1 | 1 | 1 | ? |

Using a simple neural network

![simple neural network](./images/simple_neuron_classification.svg "simple neural network")


### Generate inputs / outputs

In [2]:
val training_inputs = Nd4j.create(arrayOf(
    floatArrayOf(0f, 0f, 1f), 
    floatArrayOf(1f, 1f, 1f), 
    floatArrayOf(1f, 0f, 1f), 
    floatArrayOf(0f, 1f, 1f)))

val training_outputs = Nd4j.create(arrayOf(
    floatArrayOf(0f), 
    floatArrayOf(1f), 
    floatArrayOf(1f), 
    floatArrayOf(0f)))

<hr>

## Starting from scrach
### Artificial neuron in action

<img align="right" width="400" src="./images/artificial_neuron.svg">
An articial neuron is a computational unit which will make a particular computation based on ohter units it's connected to.

- **x** : input
- **w** : connection weight
- **b** : neuron bias
- **g(.)** : activation function

<br/>
- Neuron pre-activation (or input activation):
\begin{equation*}
pa(x) = b + \sum_{n=1}^{d} w_i x_i 
\end{equation*}
- Neuron (output) activation:
\begin{equation*}
h(x) = g(pa(x)) = g(b + \sum_{n=1}^{d} w_i x_i)
\end{equation*}



In [3]:
import org.nd4j.linalg.ops.transforms.Transforms

interface ActivationFunction {
    operator fun invoke(a: INDArray): INDArray
}

In [4]:
fun pa(x: INDArray, w: INDArray, b: INDArray): INDArray {
    return b.add(x.mmul(w)) // matrix multiplication (apply weights for each x)
}

In [5]:
fun h(x: INDArray, w: INDArray, g: ActivationFunction, b: INDArray): INDArray {
    return g(pa(x,w,b)) // activation function
}

### Activation Function

In artificial neural networks, the activation function of a node defines the output of that node given an input or set of inputs. ([wikipedia](https://en.wikipedia.org/wiki/Activation_function))

#### Some activation functions:

<table align="left" border="0">
<tr>
    <td>Identity</td>
    <td><img src="./images/Activation_identity.svg.png"></td>
    <td>\begin{equation*}g(a) = a\end{equation*}</td>
</tr>
<tr>
    <td>Sigmoid</td>
    <td><img src="./images/Activation_logistic.svg.png"></td>
    <td>\begin{equation*}sigmoid(a) = \frac{\mathrm{1} }{\mathrm{1} + e^a }\end{equation*}</td>
</tr>
<tr>
    <td>TanH</td>
    <td><img src="./images/Activation_tanh.svg.png"></td>
    <td>\begin{equation*}tanh(a) = \frac{e^ a - e^{-a} }{e^a + e^{-a} }\end{equation*}</td>
</tr>
<tr>
    <td>ReLU</td>
    <td><img src="./images/Activation_rectified_linear.svg.png"></td>
    <td>\begin{equation*}relu(a) = max(0,a)\end{equation*}</td>
</tr>
</table>

In [6]:
class Sigmoid : ActivationFunction {
    override operator fun invoke(a: INDArray): INDArray {
        var a = a
        a = a.mul(-1.0)
        a = Transforms.exp(a, false)
        a = a.add(1.0)
        a = a.rdiv(1.0)
        return a
    }
}

### Update neuron parameters (weights and bias)

Once activation function is computed we need to update weights and bias parameters

#### Using stochastic gradient descent (SGD) 
<img align="right" width="400" src="./images/artificial_neuron_backprop.svg">

Iterative method for optimizing an objective function by performing updates after each example. It can be regarded as a stochastic approximation of [gradient descent](https://en.wikipedia.org/wiki/Gradient_descent).

<img align="right" width="330" src="./images/gradient_descent.svg">

Gradient descent is an iterative algorithm, that starts from a random point on a function and travels down its slope in steps until it reaches the lowest point of that function.

#### gradient descent:

\begin{equation*}
  gradiant() = (ŷ-y)x
\end{equation*}


\begin{equation*}
  w_{t+1} = w_t - learning\_rate * gradiant()
\end{equation*}


In [7]:
fun gradient(x: INDArray, y: INDArray,g: ActivationFunction, 
             weights: INDArray, bias: INDArray): INDArray {
    val m = x.size(0) //number of examples
    val pred = h(x, weights, g, bias) // prediction of the current network 
    val diff = pred.dup().sub(y) //diff between predicted and expected
    return x.dup()
            .transpose()
            .mmul(diff)
            //.mul(1.0 / m) // regularization 
}    

#### Algorithm that performs updates after each example (training)

##### training function:
 - for nIterations 
 - compute gradient
 - apply learningRate
 - propagate gradient to weights

In [8]:
Nd4j.getRandom().setSeed(1234)

val learningRate = 0.9.toDouble()
val nIterations = 10000
val bias = Nd4j.zeros(1,1) // no bias
val g = Sigmoid()

fun training(g: ActivationFunction, x: INDArray, y: INDArray, maxIterations: Int): INDArray {
    
    var weights = Nd4j.rand(x.size(1).toInt(), 1) // init random weights

    for (i in 0 until maxIterations) {
        var gradients = gradient(x, y, g, weights, bias)
        gradients = gradients.mul(learningRate)
        weights = weights.sub(gradients)
    }
    return weights
}

##### Train the data and retrieve the optimized_weights

In [9]:
val optimized_weights = training(g, training_inputs, training_outputs, nIterations)

##### compute using the optimized weights applied to the neural network

In [10]:
fun compute_neural_network(x1: Number, x2: Number, x3: Number) {
    val inputs = Nd4j.create(arrayOf(floatArrayOf(x1.toFloat(), x2.toFloat(), x3.toFloat())))
    val output = h(inputs, optimized_weights, g, bias)
    println("input: "+ x1 +", "+ x2 +", "+ x3 +" | output = "+ output)
}

In [11]:
compute_neural_network(0,0,1)
compute_neural_network(1,1,1)
compute_neural_network(1,0,1)
compute_neural_network(0,1,1)
println("Prediction:")
compute_neural_network(1,1,1)

input: 0, 0, 1 | output = [[0.0002]]
input: 1, 1, 1 | output = [[0.9999]]
input: 1, 0, 1 | output = [[0.9999]]
input: 0, 1, 1 | output = [[0.0001]]
Prediction:
input: 1, 1, 1 | output = [[0.9999]]


## loss func

loss function is a method of evaluating how well specific algorithm models the given data.

In [12]:
fun loss(oldParams: INDArray, newParams: INDArray): Double {
        val diffSum: Double = Transforms.abs(oldParams.sub(newParams)).sumNumber().toDouble()
        return diffSum / oldParams.size(0)
}

In [13]:
fun training_with_loss(g: ActivationFunction, x: INDArray, y: INDArray, maxIterations: Int): INDArray {
    
    var weights = Nd4j.rand(x.size(1).toInt(), 1) // init random weights

    for (i in 0 until maxIterations) {
        var gradients = gradient(x, y, g, weights, bias)
        gradients = gradients.mul(learningRate)
        val newWeights = weights.sub(gradients)
        println("Iteration " + (i+1) + " - loss: " + loss(weights, newWeights))
        weights = weights.sub(gradients)
    }
    return weights
}


In [14]:
val optimized_weights = training_with_loss(g, training_inputs, training_outputs, 1000)

Iteration 1 - loss: 0.5761516888936361
Iteration 2 - loss: 0.23400044441223145
Iteration 3 - loss: 0.23013293743133545
Iteration 4 - loss: 0.18769816557566324
Iteration 5 - loss: 0.16302559773127237
Iteration 6 - loss: 0.14307681719462076
Iteration 7 - loss: 0.12719833850860596
Iteration 8 - loss: 0.11430616180102031
Iteration 9 - loss: 0.10366164644559224
Iteration 10 - loss: 0.09474410613377889
Iteration 11 - loss: 0.08717785278956096
Iteration 12 - loss: 0.0806858241558075
Iteration 13 - loss: 0.07512431343396504
Iteration 14 - loss: 0.0703667402267456
Iteration 15 - loss: 0.06614719827969869
Iteration 16 - loss: 0.06238238016764323
Iteration 17 - loss: 0.059005339940389
Iteration 18 - loss: 0.055960655212402344
Iteration 19 - loss: 0.05320302645365397
Iteration 20 - loss: 0.050694803396860756
Iteration 21 - loss: 0.04840462406476339
Iteration 22 - loss: 0.04630579551060995
Iteration 23 - loss: 0.044375727574030556
Iteration 24 - loss: 0.0425956646601359
Iteration 25 - loss: 0.04094

Iteration 197 - loss: 0.00516401727994283
Iteration 198 - loss: 0.005137970050175984
Iteration 199 - loss: 0.005111604928970337
Iteration 200 - loss: 0.005086004734039307
Iteration 201 - loss: 0.005060454209645589
Iteration 202 - loss: 0.005034903685251872
Iteration 203 - loss: 0.005009939273198445
Iteration 204 - loss: 0.00498501459757487
Iteration 205 - loss: 0.004960745573043823
Iteration 206 - loss: 0.0049362679322560625
Iteration 207 - loss: 0.004912147919336955
Iteration 208 - loss: 0.004888176918029785
Iteration 209 - loss: 0.00486453374226888
Iteration 210 - loss: 0.004841357469558716
Iteration 211 - loss: 0.004817873239517212
Iteration 212 - loss: 0.004795153935750325
Iteration 213 - loss: 0.004772156476974487
Iteration 214 - loss: 0.004749963680903117
Iteration 215 - loss: 0.004727423191070557
Iteration 216 - loss: 0.004705369472503662
Iteration 217 - loss: 0.004683494567871094
Iteration 218 - loss: 0.0046618978182474775
Iteration 219 - loss: 0.004640330870946248
Iteration 22

Iteration 388 - loss: 0.002605944871902466
Iteration 389 - loss: 0.0025992393493652344
Iteration 390 - loss: 0.002592394749323527
Iteration 391 - loss: 0.0025856991608937583
Iteration 392 - loss: 0.002578993638356527
Iteration 393 - loss: 0.0025724669297536216
Iteration 394 - loss: 0.002566079298655192
Iteration 395 - loss: 0.0025593737761179605
Iteration 396 - loss: 0.002552678187688192
Iteration 397 - loss: 0.0025464495023091636
Iteration 398 - loss: 0.0025397340456644693
Iteration 399 - loss: 0.002533515294392904
Iteration 400 - loss: 0.002527127663294474
Iteration 401 - loss: 0.002520918846130371
Iteration 402 - loss: 0.0025143822034200034
Iteration 403 - loss: 0.0025079945723215737
Iteration 404 - loss: 0.0025017857551574707
Iteration 405 - loss: 0.002495567003885905
Iteration 406 - loss: 0.0024893383185068765
Iteration 407 - loss: 0.0024831295013427734
Iteration 408 - loss: 0.0024772187074025473
Iteration 409 - loss: 0.00247114896774292
Iteration 410 - loss: 0.0024649202823638916

Iteration 576 - loss: 0.0017505983511606853
Iteration 577 - loss: 0.001747588316599528
Iteration 578 - loss: 0.001744687557220459
Iteration 579 - loss: 0.001741488774617513
Iteration 580 - loss: 0.0017384688059488933
Iteration 581 - loss: 0.0017356276512145996
Iteration 582 - loss: 0.0017324189345041912
Iteration 583 - loss: 0.0017293890317281087
Iteration 584 - loss: 0.00172652800877889
Iteration 585 - loss: 0.0017236669858296712
Iteration 586 - loss: 0.0017204682032267253
Iteration 587 - loss: 0.001717597246170044
Iteration 588 - loss: 0.001714557409286499
Iteration 589 - loss: 0.0017117361227671306
Iteration 590 - loss: 0.0017088552316029866
Iteration 591 - loss: 0.0017059644063313801
Iteration 592 - loss: 0.001703113317489624
Iteration 593 - loss: 0.0017002324263254802
Iteration 594 - loss: 0.0016972124576568604
Iteration 595 - loss: 0.00169450044631958
Iteration 596 - loss: 0.0016914904117584229
Iteration 597 - loss: 0.0016886194547017415
Iteration 598 - loss: 0.001685897509256998

Iteration 765 - loss: 0.0013160506884257
Iteration 766 - loss: 0.0013142724831899006
Iteration 767 - loss: 0.0013127028942108154
Iteration 768 - loss: 0.0013108054796854656
Iteration 769 - loss: 0.001309057076772054
Iteration 770 - loss: 0.0013074278831481934
Iteration 771 - loss: 0.001305679480234782
Iteration 772 - loss: 0.0013040900230407715
Iteration 773 - loss: 0.0013023316860198975
Iteration 774 - loss: 0.0013007521629333496
Iteration 775 - loss: 0.0012988348801930745
Iteration 776 - loss: 0.0012973944346110027
Iteration 777 - loss: 0.0012955069541931152
Iteration 778 - loss: 0.0012940665086110432
Iteration 779 - loss: 0.0012921392917633057
Iteration 780 - loss: 0.0012906889120737712
Iteration 781 - loss: 0.0012888014316558838
Iteration 782 - loss: 0.0012873709201812744
Iteration 783 - loss: 0.0012854735056559246
Iteration 784 - loss: 0.00128402312596639
Iteration 785 - loss: 0.001282443602879842
Iteration 786 - loss: 0.001280665397644043
Iteration 787 - loss: 0.00127923488616943

Iteration 953 - loss: 0.0010553101698557537
Iteration 954 - loss: 0.0010543366273244221
Iteration 955 - loss: 0.0010529160499572754
Iteration 956 - loss: 0.001051942507425944
Iteration 957 - loss: 0.0010509888331095378
Iteration 958 - loss: 0.001049886147181193
Iteration 959 - loss: 0.0010485947132110596
Iteration 960 - loss: 0.0010476410388946533
Iteration 961 - loss: 0.0010466972986857097
Iteration 962 - loss: 0.0010452767213185628
Iteration 963 - loss: 0.0010442833105723064
Iteration 964 - loss: 0.0010433594385782878
Iteration 965 - loss: 0.0010422468185424805
Iteration 966 - loss: 0.0010409752527872722
Iteration 967 - loss: 0.001040021578470866
Iteration 968 - loss: 0.0010389089584350586
Iteration 969 - loss: 0.0010379552841186523
Iteration 970 - loss: 0.001036683718363444
Iteration 971 - loss: 0.0010356903076171875
Iteration 972 - loss: 0.0010347366333007812
Iteration 973 - loss: 0.0010336538155873616
Iteration 974 - loss: 0.0010323723157246907
Iteration 975 - loss: 0.001031418641

<hr>

## Deeplearning4J implementation

In [15]:
training_inputs

[[         0,         0,    1.0000], 
 [    1.0000,    1.0000,    1.0000], 
 [    1.0000,         0,    1.0000], 
 [         0,    1.0000,    1.0000]]

In [16]:
training_outputs

[[0], 
 [1.0000], 
 [1.0000], 
 [0]]

In [17]:
val ds = DataSet(training_inputs, training_outputs)

In [18]:
import org.nd4j.linalg.learning.config.Sgd
import org.nd4j.linalg.activations.Activation
import org.nd4j.linalg.learning.config.Nesterovs
import org.nd4j.linalg.lossfunctions.LossFunctions


val seed = 1234 // number used to initialize a pseudorandom number generator.
val nEpochs = 10000 // number of training epochs
val numHiddenNodes = 3


// https://medium.com/konvergen/momentum-method-and-nesterov-accelerated-gradient-487ba776c987
val conf = NeuralNetConfiguration.Builder()
        .weightInit(WeightInit.XAVIER)
        //.updater(Nesterovs(0.01, 0.9))
        .updater(Sgd(0.9))
        .seed(seed.toLong())
        .list()
        .layer(DenseLayer.Builder()
                .nIn(3)
                .nOut(numHiddenNodes)
                .activation(Activation.SIGMOID) // random initialize weights with values between 0 and 1
                .build())
        .layer(OutputLayer.Builder(LossFunctions.LossFunction.MSE) 
                .nIn(numHiddenNodes).nOut(1)
                .activation(Activation.IDENTITY)
                .build())
        .build()


In [19]:
import org.deeplearning4j.nn.api.OptimizationAlgorithm;

val net = MultiLayerNetwork(conf)
net.init()

// add an listener which outputs the error every 100 parameter updates
net.setListeners(ScoreIterationListener(100))

// C&P from LSTMCharModellingExample
// Print the number of parameters in the network (and for each layer)
println(net.summary())


LayerName (LayerType)   nIn,nOut   TotalParams   ParamsShape     
layer0 (DenseLayer)     3,3        12            W:{3,3}, b:{1,3}
layer1 (OutputLayer)    3,1        4             W:{3,1}, b:{1,1}
-----------------------------------------------------------------
            Total Parameters:  16
        Trainable Parameters:  16
           Frozen Parameters:  0



In [20]:
// here the actual learning takes place
for (i in 0 until nEpochs) {
    net.fit(ds)
}

In [21]:
fun compute_dl4j_neural_network(x1: Number, x2: Number, x3: Number) {
    val inputs = Nd4j.create(arrayOf(floatArrayOf(x1.toFloat(), x2.toFloat(), x3.toFloat())))
    val output = net.output(inputs)
    println("input: "+ x1 +", "+ x2 +", "+ x3 +" | dl4j output = "+ output)
}

In [22]:
compute_neural_network(0,0,1)
compute_neural_network(1,1,1)
compute_neural_network(1,0,1)
compute_neural_network(0,1,1)
println("Prediction:")
compute_neural_network(1,1,1)

input: 0, 0, 1 | output = [[0.0002]]
input: 1, 1, 1 | output = [[0.9999]]
input: 1, 0, 1 | output = [[0.9999]]
input: 0, 1, 1 | output = [[0.0001]]
Prediction:
input: 1, 1, 1 | output = [[0.9999]]
