In [1]:
suppressMessages({
    library(nnm)
})

### MNIST with DNN (Sequential)

In [2]:
# Load MNIST and keep the train/test splits under their own names; later cells
# read the `x` and `y` components of each split.
mnist <- LoadMnist()
train <- mnist$train
test <- mnist$test
# Sequential network: 784 inputs -> 128 hidden units -> dropout -> 10 outputs,
# followed by a softmax over the 10 classes.
layerSpec <- Sequential(
  # no activation is given here; the printed spec below reports ReLU
  Dense(784, 128),
  Dropout(128, keepProb=0.8),
  # identity activation on the 10 outputs that feed the softmax
  Dense(128, 10, Activation.Identity),
  # NOTE(review): Softmax is passed uncalled here, while the DAG cell uses
  # Softmax(10); the printed spec below still reports numClasses = 10.
  Softmax)
# Echo the assembled specification.
layerSpec

 Sequential Layer 784 -> 10 
   | Dense: 784 -> 128 ReLU 
   | Dropout, keepProb =  0.8 
   | Dense: 128 -> 10 Identity 
   | Softmax, numClasses =  10 

In [3]:
# Fit the sequential network on the MNIST training split and capture how long
# the fit takes. With verbose = 1 the running loss is printed during training
# (every 100 iterations in the log below).
modTime <- system.time({
  mod <- nnm(train$x, train$y, layerSpec, verbose = 1)
})

iter  1 total loss =  230.4417 
iter  101 total loss =  54.82057 
iter  201 total loss =  52.64025 
iter  301 total loss =  36.74286 
iter  401 total loss =  27.63097 
iter  501 total loss =  21.57751 
iter  601 total loss =  30.33582 
iter  701 total loss =  20.93598 
iter  801 total loss =  20.97486 
iter  901 total loss =  23.93861 
iter  1001 total loss =  32.07045 
iter  1101 total loss =  25.03486 
iter  1201 total loss =  32.53601 
iter  1301 total loss =  13.55056 
iter  1401 total loss =  17.18514 
iter  1501 total loss =  14.77319 
iter  1601 total loss =  20.81168 
iter  1701 total loss =  17.39375 
iter  1801 total loss =  12.75528 
iter  1901 total loss =  14.53266 
iter  2001 total loss =  14.95641 
iter  2101 total loss =  16.14138 
iter  2201 total loss =  16.49437 
iter  2301 total loss =  5.623276 
iter  2401 total loss =  11.93723 
iter  2501 total loss =  7.739242 
iter  2601 total loss =  10.45309 
iter  2701 total loss =  17.25861 
iter  2801 total loss =  14.7828

In [4]:
# Show the user/system/elapsed timings captured by system.time() for the
# sequential model fit.
print(modTime)

   user  system elapsed 
 88.377   0.968  89.346 


In [5]:
# Model size: number of parameters in the fitted sequential model.
numParams <- NumParameters(mod)
cat("num of parameters: ", numParams, "\n")

# Test-set accuracy: share of test examples whose predicted label equals the
# true label.
predLabels <- predict(mod, test$x, type = "label")
testAccuracy <- mean(test$y == predLabels)
cat("accuracy = ", testAccuracy, "\n")

num of parameters:  101770 
accuracy =  0.9692 


### MNIST with a directed acyclic graph (DAG)

We demo a simple DAG with a residual (skip) connection. This DAG
performs similarly to the fully connected network above,
but with only about half as many parameters.

In [6]:
# Nodes of the graph. A node's position in this list is the id used in `edges`
# (the printed graph below numbers them the same way).
layers <- list(
  Dense(784, 64),                      # node 1
  Dropout(64, keepProb = 0.8),         # node 2
  Dense(64, 32),                       # node 3
  Dense(32, 16),                       # node 4
  # node 5 is fed by node 4 (16 units) and node 3 (32 units): 16 + 32 = 48.
  # NOTE(review): presumably the two inputs are concatenated -- confirm.
  Dense(48, 10, Activation.Identity),  # node 5
  Softmax(10)                          # node 6
)

# Edges as a flat vector of (from, to) pairs. Column k of the rbind() below is
# edge k; as.vector() reads the matrix column by column, which yields
# 1, 2, 2, 3, 3, 4, 4, 5, 3, 5, 5, 6. The 3 -> 5 edge is the skip connection.
edges <- as.vector(rbind(
  from = c(1, 2, 3, 4, 3, 5),
  to   = c(2, 3, 4, 5, 5, 6)
))

dag <- DAG(layers, edges)
# Echo the assembled graph.
dag

 Directed Acycle Graph 784 -> 10 
   | node 1:  Dense: 784 -> 64 ReLU 
   | node 2:  Dropout, keepProb =  0.8 
   | node 3:  Dense: 64 -> 32 ReLU 
   | node 4:  Dense: 32 -> 16 ReLU 
   | node 5:  Dense: 48 -> 10 Identity 
   | node 6:  Softmax, numClasses =  10 
   | edge: node 1 -> node 2 
   | edge: node 2 -> node 3 
   | edge: node 3 -> node 4 
   | edge: node 4 -> node 5 
   | edge: node 3 -> node 5 
   | edge: node 5 -> node 6 

In [7]:
# Fit the DAG network on the MNIST training split and capture how long the fit
# takes. With verbose = 1 the running loss is printed during training.
modTime2 <- system.time({
  dagMod <- nnm(train$x, train$y, dag, verbose = 1)
})

iter  1 total loss =  230.2588 
iter  101 total loss =  230.0958 
iter  201 total loss =  228.9939 
iter  301 total loss =  140.5326 
iter  401 total loss =  97.68631 
iter  501 total loss =  62.97778 
iter  601 total loss =  42.8144 
iter  701 total loss =  31.89813 
iter  801 total loss =  36.41382 
iter  901 total loss =  39.09989 
iter  1001 total loss =  22.98892 
iter  1101 total loss =  18.75162 
iter  1201 total loss =  19.77463 
iter  1301 total loss =  10.29939 
iter  1401 total loss =  34.59066 
iter  1501 total loss =  17.2062 
iter  1601 total loss =  26.68019 
iter  1701 total loss =  10.97525 
iter  1801 total loss =  23.90741 
iter  1901 total loss =  12.78519 
iter  2001 total loss =  21.66174 
iter  2101 total loss =  21.60756 
iter  2201 total loss =  28.62931 
iter  2301 total loss =  31.64293 
iter  2401 total loss =  20.0359 
iter  2501 total loss =  16.49298 
iter  2601 total loss =  25.57688 
iter  2701 total loss =  19.76294 
iter  2801 total loss =  8.166451 


In [8]:
# Show the user/system/elapsed timings captured by system.time() for the DAG
# model fit.
print(modTime2)

   user  system elapsed 
 61.422   0.828  62.248 


In [9]:
# Model size: number of parameters in the fitted DAG model.
dagNumParams <- NumParameters(dagMod)
cat("num of parameters: ", dagNumParams, "\n")

# Test-set accuracy: share of test examples whose predicted label equals the
# true label.
dagPredLabels <- predict(dagMod, test$x, type = "label")
dagTestAccuracy <- mean(test$y == dagPredLabels)
cat("accuracy = ", dagTestAccuracy, "\n")

num of parameters:  53338 
accuracy =  0.9604 


### Demo of embedding columns

In [10]:
# Simulated regression data: one numeric predictor and two categorical
# predictors drawn uniformly from the 26 lowercase letters.
n <- 1000
x <- data.frame(
  x1 = rnorm(n),
  x2 = sample(letters, size = n, replace = TRUE),
  x3 = sample(letters, size = n, replace = TRUE)
)

# Response: x1, plus 1 when x2 is "a" or "d", plus standard normal noise.
# x3 does not enter the response.
y <- with(x, x1 + (x2 %in% c("a", "d"))) + rnorm(n)

# Columns to embed and the embedding width for each (x2 -> 2, x3 -> 4).
embeddingCols <- c("x2", "x3")
embeddingDims <- c(2, 4)

# The first dense layer takes the numeric column plus both embeddings:
# 1 + 2 + 4 = 7 inputs.
layerSpecs <- list(
  Dense(1 + sum(embeddingDims), 2),
  Dense(2, 1, Activation.Identity)
)

mod2 <- nnm(x, y, layerSpecs, embeddingCols, embeddingDims)
# Echo the fitted model.
mod2

EmbeddingCols: x2, x3.
numEmbeddingDims: 2, 4.

DNN, type = regression 
Loss:  MSE 
 Sequential Layer 3 -> 1 
   | Paralleled Layers 3 -> 7 
   |   | Identity 1 -> 1 
   |   | Paralleled Layers 2 -> 6 
   |   |   | Embedding: 1 -> 2 
   |   |   | Embedding: 1 -> 4 
   | Sequential Layer 7 -> 1 
   |   | Dense: 7 -> 2 ReLU 
   |   | Dense: 2 -> 1 Identity 
sample response and predictions
            y       fitted
1   1.9071253  0.842755239
2   1.3252411  2.176316829
3   1.4663919 -0.103369807
4  -0.9201589 -0.507212289
5   0.5934152  1.322064758
6  -0.5463789  1.170592107
7  -1.1221130 -0.022001247
8  -1.7532354 -1.367446041
9   0.9627592  1.547368443
10 -1.2764272  0.008830899
11 -1.6930950 -0.352822402
12 -1.0440762 -0.234526676
13 -0.8179057  0.318121617
14 -1.2137760 -0.878978979
15 -1.2540897 -0.748568565
16 -0.2041840  0.968665061
17  0.5145738  0.950052371
18 -0.1714448 -1.798866429
19 -0.0995794 -0.703402988
20 -0.8884164 -0.942933604

In [11]:
 # Correlation between the simulated response and the model's predictions on
 # the same data frame the model was fitted on (an in-sample check).
 cor(y, predict(mod2, x))

0
0.7483163
