In [None]:
import TensorFlow

In [None]:
/// A small two-layer fully connected network used for the memory measurements in this notebook.
///
/// The input goes through `layer1` (dense + ReLU) and then through `layer2`
/// (dense, constructed without an explicit activation argument).
struct TestModel: Layer {
    /// Hidden dense layer mapping `nIn` features to `nHid` features, with ReLU activation.
    public var layer1: Dense<Float>
    /// Output dense layer mapping `nHid` features to `nOut` features.
    public var layer2: Dense<Float>

    /// Creates the model.
    /// - Parameters:
    ///   - nIn: Size of the input feature dimension.
    ///   - nHid: Size of the hidden feature dimension.
    ///   - nOut: Size of the output feature dimension.
    public init(nIn: Int, nHid: Int, nOut: Int) {
        self.layer1 = Dense<Float>(inputSize: nIn, outputSize: nHid, activation: relu)
        self.layer2 = Dense<Float>(inputSize: nHid, outputSize: nOut)
    }

    /// Runs the forward pass: `layer1` followed by `layer2`.
    /// - Parameter input: The tensor fed to `layer1`.
    /// - Returns: The output of `layer2`.
    @differentiable
    public func callAsFunction(_ input: Tensor<Float>) -> Tensor<Float> {
        let hidden = layer1(input)
        return layer2(hidden)
    }
}

In [None]:
// Warm-up: create a tiny tensor so that the GPU is initialized and the needed
// libraries are loaded before any model memory is measured.
let y = Tensor<Int32>(zeros: [1])

427 MB used at this point. PyTorch takes 787 MB for the same initialization, so S4TF is better there.

In [None]:
// Instantiate the test model: 4096 input features, 2048 hidden features, 512 output features.
// Declared `var` because the optimizer later updates it in place.
var model = TestModel(nIn: 4096, nHid: 2048, nOut: 512)

555 MB used, so a new usage of 128 MB. The model parameters take (4096×2048 + 2048 + 2048×512 + 512) × 4 bytes ≈ 36 MB, so we're using about 3.5 times the memory theoretically necessary.

PyTorch uses 52 MB at this step.

In [None]:
// Forward pass only: run 100 batches of 32 random inputs through the model and
// compute the loss, without computing any gradients.
for _ in 0..<100 {
    let x = Tensor<Float>(randomNormal: [32, 4096])
    let y = Tensor<Int32>(zeros: [32])
    let out = model(x)
    // The loss value itself is never read; discard it explicitly instead of
    // binding an unused local.
    _ = softmaxCrossEntropy(logits: out, labels: y)
}

659 MB used, so a new usage of 104 MB. Some of that may be library loading, so it is hard to tell exactly what our own usage is. Those 104 MB don't change with the size of the model (unless we go to much bigger sizes), which is consistent with the activation tensors being light compared to the model weights.

PyTorch's new usage is 64 MB, so 40 MB less, but that might be because it already loaded some of those libraries during the big initialization.

In [None]:
// Forward + backward pass: run 100 batches of 32 random inputs and compute the
// loss together with the gradients with respect to the model, without applying
// any optimizer step.
for _ in 0..<100 {
    let x = Tensor<Float>(randomNormal: [32, 4096])
    let y = Tensor<Int32>(zeros: [32])
    // The closure below already runs the forward pass (`$0(x)`), so no separate
    // `model(x)` call is made. The loss and gradients are not used here, so
    // the result is discarded explicitly.
    _ = model.valueWithGradient {
        softmaxCrossEntropy(logits: $0(x), labels: y)
    }
}

787 MB used, so a new usage of 128 MB (almost 4 times what's necessary for the gradients, which should normally take 36 MB).

PyTorch uses 64 MB more at this step.

In [None]:
// Full training step: forward pass, backward pass and an Adam update, repeated
// for 100 batches of 32 random inputs.
var opt = Adam(for: model)
for _ in 0..<100 {
    let x = Tensor<Float>(randomNormal: [32, 4096])
    let y = Tensor<Int32>(zeros: [32])
    // The closure below already runs the forward pass (`$0(x)`), so no separate
    // `model(x)` call is made. Only the gradients are needed; the loss value
    // is ignored.
    let (_, grads) = model.valueWithGradient {
        softmaxCrossEntropy(logits: $0(x), labels: y)
    }
    opt.update(&model, along: grads)
}

1043 MB used, so a new usage of 256 MB (again, almost 4 times what's necessary: Adam adds two new saved tensors per parameter, the running averages and the running squared averages, so normally 2 × 36 MB).

PyTorch uses 96 MB more at this step.