In [1]:
suppressMessages({
    library(nnm)
})

### MNIST with DNN (Sequential)

In [2]:
# Load the MNIST data and keep handles to its train and test parts.
mnist <- LoadMnist()
train <- mnist$train
test <- mnist$test
# Sequential network: 784 inputs -> 128 hidden units -> 10 class scores.
layerSpec <- Sequential(
  # Hidden layer; no activation is passed, and the printed summary reports ReLU.
  Dense(784, 128),
  # Dropout on the 128 hidden units with keepProb = 0.8.
  Dropout(128, keepProb=0.8),
  # Output layer with identity activation; Softmax follows it.
  Dense(128, 10, Activation.Identity),
  # Softmax is passed without arguments here; the printed summary shows
  # numClasses = 10.
  Softmax)
# Display the layer structure.
layerSpec

 Sequential Layer 784 -> 10 
   | Dense: 784 -> 128 ReLU 
   | Dropout, keepProb =  0.8 
   | Dense: 128 -> 10 Identity 
   | Softmax, numClasses =  10 

In [3]:
# Fit the sequential network on the MNIST training data and record how long
# the fit takes. With verbose = 1 the run prints a loss line every so often.
modTime <- system.time({
  mod <- nnm(train$x, train$y, layerSpec, verbose = 1)
})

iter  1 total loss =  230.283 
iter  101 total loss =  62.13019 
iter  201 total loss =  44.5404 
iter  301 total loss =  34.62881 
iter  401 total loss =  37.23956 
iter  501 total loss =  21.77938 
iter  601 total loss =  16.0151 
iter  701 total loss =  29.71946 
iter  801 total loss =  38.90523 
iter  901 total loss =  17.81867 
iter  1001 total loss =  17.86292 
iter  1101 total loss =  13.97953 
iter  1201 total loss =  21.40744 
iter  1301 total loss =  17.84026 
iter  1401 total loss =  19.79315 
iter  1501 total loss =  12.29914 
iter  1601 total loss =  15.76222 
iter  1701 total loss =  17.89719 
iter  1801 total loss =  10.90802 
iter  1901 total loss =  12.09158 
iter  2001 total loss =  8.551515 
iter  2101 total loss =  14.64607 
iter  2201 total loss =  19.91964 
iter  2301 total loss =  10.36869 
iter  2401 total loss =  9.112068 
iter  2501 total loss =  19.04041 
iter  2601 total loss =  8.175551 
iter  2701 total loss =  9.280241 
iter  2801 total loss =  14.18195 


In [4]:
# Show the timing captured by system.time() for the sequential model fit.
print(modTime)

   user  system elapsed 
 96.803   1.084  97.901 


In [5]:
# Test-set accuracy of the sequential model: the share of test examples
# whose predicted label equals the true label.
cat(
  "accuracy = ",
  mean(test$y == predict(mod, test$x, type = "label")),
  "\n"
)

accuracy =  0.9692 


### MNIST with Directed acyclic graph (DAG)

We demonstrate a simple DAG with a residual (skip) connection. This DAG
performs similarly to the fully connected network above,
but with only about half the parameters.

In [6]:
# Nodes of the graph, in the order they are numbered by the edge list below.
layers <- list(
  # node 1: input layer, 784 -> 64
  Dense(784, 64),
  # node 2: dropout on the 64 units, keepProb = 0.8
  Dropout(64, keepProb=0.8),
  # node 3: 64 -> 32
  Dense(64, 32),
  # node 4: 32 -> 16
  Dense(32, 16),
  # node 5: fed by both node 3 and node 4; its input width is 48 = 32 + 16,
  # so the two inputs are presumably concatenated -- confirm against nnm docs.
  Dense(48, 10, Activation.Identity),
  # node 6: softmax over the 10 classes
  Softmax(10))
# Edges as a flat vector of (from, to) node-index pairs, one pair per row.
# The pair (3, 5) is the skip connection that bypasses node 4.
edges <- c(1, 2,
           2, 3,
           3, 4,
           4, 5,
           3, 5,
           5, 6)
# Assemble the graph and display its nodes and edges.
dag <- DAG(layers, edges)
dag

 Directed Acycle Graph 784 -> 10 
   | node 1:  Dense: 784 -> 64 ReLU 
   | node 2:  Dropout, keepProb =  0.8 
   | node 3:  Dense: 64 -> 32 ReLU 
   | node 4:  Dense: 32 -> 16 ReLU 
   | node 5:  Dense: 48 -> 10 Identity 
   | node 6:  Softmax, numClasses =  10 
   | edge: node 1 -> node 2 
   | edge: node 2 -> node 3 
   | edge: node 3 -> node 4 
   | edge: node 4 -> node 5 
   | edge: node 3 -> node 5 
   | edge: node 5 -> node 6 

In [7]:
# Fit the DAG network on the same MNIST training data and record how long
# the fit takes. With verbose = 1 the run prints a loss line every so often.
modTime2 <- system.time({
  dagMod <- nnm(train$x, train$y, dag, verbose = 1)
})

iter  1 total loss =  230.2648 
iter  101 total loss =  230.3848 
iter  201 total loss =  227.8851 
iter  301 total loss =  134.9185 
iter  401 total loss =  91.72305 
iter  501 total loss =  59.4139 
iter  601 total loss =  48.71088 
iter  701 total loss =  23.96979 
iter  801 total loss =  43.81935 
iter  901 total loss =  26.50421 
iter  1001 total loss =  36.47608 
iter  1101 total loss =  20.68648 
iter  1201 total loss =  26.20561 
iter  1301 total loss =  13.14982 
iter  1401 total loss =  25.17576 
iter  1501 total loss =  18.39282 
iter  1601 total loss =  22.62202 
iter  1701 total loss =  23.18602 
iter  1801 total loss =  16.77054 
iter  1901 total loss =  42.64111 
iter  2001 total loss =  8.084379 
iter  2101 total loss =  20.28254 
iter  2201 total loss =  11.74359 
iter  2301 total loss =  10.39085 
iter  2401 total loss =  19.70342 
iter  2501 total loss =  32.49685 
iter  2601 total loss =  11.35954 
iter  2701 total loss =  15.22414 
iter  2801 total loss =  20.78675

In [8]:
# Show the timing captured by system.time() for the DAG model fit.
print(modTime2)

   user  system elapsed 
 65.075   0.956  66.049 


In [9]:
# Test-set accuracy of the DAG model: the share of test examples whose
# predicted label equals the true label.
cat(
  "accuracy = ",
  mean(test$y == predict(dagMod, test$x, type = "label")),
  "\n"
)

accuracy =  0.9653 


### Demo of embedding columns

In [10]:
# Synthetic regression data: one numeric column (x1) and two letter-valued
# columns (x2, x3) that will be fed through embeddings.
n <- 1000
x <- data.frame(
  x1 = rnorm(n),
  x2 = sample(letters, size = n, replace = TRUE),
  x3 = sample(letters, size = n, replace = TRUE)
)

# Response: x1, plus 1 whenever x2 is "a" or "d", plus standard normal noise.
y <- x$x1 + (x$x2 %in% c("a", "d")) + rnorm(n)

# Columns to embed and the embedding width used for each of them.
embeddingCols <- c("x2", "x3")
embeddingDims <- c(2, 4)

# The first dense layer takes the numeric column plus every embedding output
# (1 + 2 + 4 = 7 inputs).
layerSpecs <- list(
  Dense(1 + sum(embeddingDims), 2),
  Dense(2, 1, Activation.Identity)
)

# Fit the model and display its summary.
mod2 <- nnm(x, y, layerSpecs, embeddingCols, embeddingDims)
mod2

EmbeddingCols: x2, x3.
numEmbeddingDims: 2, 4.

DNN, type = regression 
Loss:  MSE 
 Sequential Layer 3 -> 1 
   | Paralleled Layers 3 -> 7 
   |   | Identity 1 -> 1 
   |   | Paralleled Layers 2 -> 6 
   |   |   | Embedding: 1 -> 2 
   |   |   | Embedding: 1 -> 4 
   | Sequential Layer 7 -> 1 
   |   | Dense: 7 -> 2 ReLU 
   |   | Dense: 2 -> 1 Identity 
sample response and predictions
              y      fitted
1   1.634137657 -0.18547513
2  -0.717665895  0.69230233
3  -1.728895119 -2.14726326
4  -1.284986898 -1.26461826
5   0.005282155  0.24517230
6   1.057496575  0.53635903
7   1.336650211  0.22451976
8  -0.879975161 -0.16794991
9   1.146940030  1.10987720
10 -1.957243947 -1.77471453
11 -1.931963792 -1.37772841
12 -1.774479063 -0.23718287
13 -0.284182205  0.04057911
14 -0.736987279 -0.78516626
15 -0.697900694 -1.40473964
16 -0.985800362 -0.83597289
17  1.280547374  0.95029142
18  0.884801306  0.94554062
19 -0.243123261 -1.11471176
20 -0.793728848 -0.48605138

In [11]:
 # Correlation between the simulated response and the model's predictions
 # on the same data the model was fitted to.
 cor(y, predict(mod2, x))

0
0.7048263
