In [None]:
%install-location $cwd/swift-install
%install '.package(path: "$cwd/FastaiNotebook_01a_fastai_layers")' FastaiNotebook_01a_fastai_layers

Installing packages:
	.package(path: "/home/clattner/fastai_docs/dev_swift/FastaiNotebook_01a_fastai_layers")
		FastaiNotebook_01a_fastai_layers
With SwiftPM flags: []
Working in: /tmp/tmpms8xwjbz
/home/clattner/swift/usr/bin/swift-build: /home/clattner/anaconda3/envs/swift/lib/libuuid.so.1: no version information available (required by /home/clattner/swift/usr/lib/swift/linux/libFoundation.so)
Fetching https://github.com/mxcl/Path.swift
Fetching https://github.com/JustHTTP/Just
Completed resolution in 2.38s
Cloning https://github.com/JustHTTP/Just
Resolving https://github.com/JustHTTP/Just at 0.7.1
Cloning https://github.com/mxcl/Path.swift
Resolving https://github.com/mxcl/Path.swift at 0.16.2
/home/clattner/swift/usr/bin/swiftc: /home/clattner/anaconda3/envs/swift/lib/libuuid.so.1: no version information available (required by /home/clattner/swift/usr/bin/swiftc)
Compile Swift Module 'Path' (9 sources)
Compile Swift Module 'Just' (1 sources)
/home/clattner/swift/usr/bin/swiftc: /hom

: 

In [None]:
//export
import Path
import TensorFlow

In [None]:
import FastaiNotebook_01a_fastai_layers

## The forward and backward passes

In [None]:
// export

// Typing Tensor<Float> all the time is tedious.  The S4TF team expects to 
// make "Float" be the default so we can just say "Tensor".  Until that happens
// though, we can define our own alias.
/// Shorthand for `Tensor<Float>`, the tensor type used throughout this notebook.
public typealias TF=Tensor<Float>

In [None]:
// export
/// Standardizes `x` by subtracting `mean` and dividing the result by `std`.
///
/// - Parameters:
///   - x: The tensor to normalize.
///   - mean: The value subtracted from `x`.
///   - std: The value the centered tensor is divided by.
/// - Returns: `(x - mean) / std`.
public func normalize(_ x:TF, mean:TF, std:TF) -> TF {
    let centered = x - mean
    return centered / std
}

In [None]:
// Load the MNIST splits with `flat: true`. Declared `var` because xTrain and xValid
// are later overwritten with their normalized versions.
// NOTE(review): `loadMNIST` and `mnistPath` are not defined in this notebook —
// presumably they come from FastaiNotebook_01a_fastai_layers; confirm.
var (xTrain, yTrain, xValid, yValid) = loadMNIST(path: mnistPath, flat: true)

Normalize the training and validation sets.

In [None]:
// Statistics are computed on the training set only; the same two values are
// reused to normalize the validation set.
let trainMean = xTrain.mean()
let trainStd  = xTrain.standardDeviation()
print(trainMean, trainStd)

0.13066047 0.3081078


In [None]:
// Both splits are normalized with the *training* mean/std, so the validation
// set ends up only approximately at mean 0 / std 1.
xTrain = normalize(xTrain, mean: trainMean, std: trainStd)
xValid = normalize(xValid, mean: trainMean, std: trainStd)

In [None]:
//export
/// Asserts that `abs(a)` compares below `tolerance`.
///
/// - Parameters:
///   - a: The tensor expected to be (close to) zero.
///   - tolerance: Upper bound on the magnitude; defaults to `1e-3`.
///
/// The check uses `assert`, so it is only evaluated in builds where
/// assertions are enabled. On failure the message includes `a` itself.
public func testNearZero(_ a: TF, tolerance: Float = 1e-3) {
    assert(
        abs(a) < tolerance,
        "Near zero: \(a)"
    )
}

In [None]:
// After normalization the training set should have mean ~0 and std ~1.
testNearZero(xTrain.mean())
testNearZero(xTrain.standardDeviation() - 1.0)

In [None]:
// n: size of xTrain along its first dimension (rows), m: size along its second (columns).
// c: largest value in yTrain plus one — the number of classes, assuming labels start at 0.
let (n,m) = (xTrain.shape[0],xTrain.shape[1])
let c = yTrain.max()+1
print(n, m, c)

60000 784 10


## Foundations version

### Basic architecture

In [None]:
// num hidden: width of the hidden layer (second dim of w1, first dim of w2)
let nh = 50

In [None]:
// simplified kaiming init / he init: random-normal weights divided by sqrt(fan-in).
// w1 maps the m inputs to nh hidden units; w2 maps the nh hidden units to 1 output.
let w1 = TF(randomNormal: [m, nh]) / sqrt(Float(m))
let b1 = TF(repeating: 0.0, shape: [nh])           // could also use zeros:
let w2 = TF(randomNormal: [nh,1]) / sqrt(Float(nh))
let b2 = TF(zeros: [1])

In [None]:
// The scaled weights should have mean ~0 and std ~1/sqrt(m).
testNearZero(w1.mean())
testNearZero(w1.standardDeviation()-1/sqrt(Float(m)))

In [None]:
// This should be ~ (0,1) (mean,std)...
// (only approximately, since xValid was normalized with the training statistics)
(xValid.mean(),xValid.standardDeviation())

▿ 2 elements
  - .0 : 0.0060178353
  - .1 : 1.0077001


Instead of `@` in python we use `•` (or `matmul`) in Swift:

In [None]:
/// Linear (affine) layer: the matrix product of `x` and `w`, plus the bias `b`.
///
/// - Parameters:
///   - x: Input activations.
///   - w: Weight matrix.
///   - b: Bias added to the product.
/// - Returns: `matmul(x, w) + b`.
func lin(_ x: TF, _ w: TF, _ b: TF) -> TF {
    let product = matmul(x, w)
    return product + b
}

In [None]:
// Output of the first linear layer (before any activation) on the validation set.
let t = lin(xValid, w1, b1)

In [None]:
// ...so should this (mean ~0, std ~1), because we used kaiming init, which is designed to do this
print(t.mean(), t.standardDeviation())

0.36849484 0.5603862


In [None]:
/// Rectified linear unit: the maximum of `x` and zero.
///
/// - Parameter x: Input activations.
/// - Returns: `max(x, 0)`.
func myRelu(_ x:TF) -> TF {
    let rectified = max(x, 0)
    return rectified
}

In [None]:
// First linear layer followed by ReLU, on the validation set.
let t = myRelu(lin(xValid, w1, b1))

In [None]:
//...actually it really should be this!
// (`t` now holds the post-ReLU activations.)
print(t.mean(),t.standardDeviation())

0.36849484 0.5603862


In [None]:
// kaiming init / he init for relu: scale by sqrt(2 / fan-in) rather than 1/sqrt(fan-in)
let w1 = TF(randomNormal: [m,nh]) * sqrt(2.0/Float(m))

In [None]:
// Expect mean ~0 and std ~sqrt(2/m).
print(w1.mean(), w1.standardDeviation())

0.00019601914 0.05068941


In [None]:
// Post-ReLU statistics again, now with the re-initialized w1.
let t = myRelu(lin(xValid, w1, b1))
print(t.mean(), t.standardDeviation())

0.61064214 0.87908095


In [None]:
/// Runs the two-layer network on `xb`: linear (`w1`, `b1`) → ReLU → linear (`w2`, `b2`).
///
/// Reads the notebook-level parameters `w1`, `b1`, `w2` and `b2`.
///
/// - Parameter xb: A batch of inputs.
/// - Returns: The output of the second linear layer.
func model(_ xb: TF) -> TF {
    let hidden = myRelu(lin(xb, w1, b1))
    return lin(hidden, w2, b2)
}

In [None]:
// Time 10 forward passes over the validation set; the result is discarded.
// NOTE(review): `time(repeating:_:)` is not defined in this notebook — presumably it
// comes from an imported notebook module; confirm.
time(repeating: 10) { _ = model(xValid) }

average: 0.4166424999999999 ms,   min: 0.361097 ms,   max: 0.534531 ms


### Loss function

In [None]:
// Predictions of the freshly initialized (untrained) model on the training inputs.
let preds = model(xTrain)

In [None]:
// export
/// Mean squared error between predictions and targets.
///
/// - Parameters:
///   - out: Predictions; the last axis is squeezed away before comparing.
///   - targ: Target values.
/// - Returns: The mean of the squared differences.
public func mse(_ out: TF, _ targ: TF) -> TF {
    let residual = out.squeezingShape(at: -1) - targ
    return residual.squared().mean()
}

In [None]:
// Convert the labels to Float dtype so they can be passed as `targ` to `mse`.
// Declared with `let`: neither tensor is mutated after it is created.
let yTrainF = TF(yTrain)
let yValidF = TF(yValid)

In [None]:
// Loss of the untrained model against the Float labels.
mse(preds, yTrainF)

26.742607


# Gradients and backward pass

Here we show how to calculate gradients for a simple model the hard way: manually.

To store the gradients a bit like in PyTorch we introduce a `TensorWithGrad` class that has two attributes: the original tensor and the gradient. We choose a class to easily replicate the Python notebook: classes are reference types (which means they are mutable) while structures are value types.

In fact, since this is the first time we're discovering Swift classes, let's jump into a [sidebar discussion about Value Semantics vs Reference Semantics](https://docs.google.com/presentation/d/1dc6o2o-uYGnJeCeyvgsgyk05dBMneArxdICW5vF75oU/edit#slide=id.g5669969ead_0_145) since it is a pretty fundamental part of the programming model and a huge step forward that Swift takes.

When we get back, we'll keep charging on, even though this is very non-idiomatic Swift code!


In [None]:
/// WARNING: This is designed to be similar to the PyTorch 02_fully_connected lesson,
/// this isn't idiomatic Swift code.
///
/// A reference-type box that pairs a tensor with a slot for its gradient, so the
/// manual backward-pass functions can write gradients into the objects they receive.
class TensorWithGrad {
    /// The wrapped tensor value.
    var inner: TF
    /// The gradient slot; starts out as zeros with the same shape as the wrapped tensor.
    var grad: TF

    /// Wraps `x` and sets `grad` to zeros of `x`'s shape.
    init(_ x: TF) {
        self.inner = x
        self.grad = TF(zeros: x.shape)
    }
}

In [None]:
// Redefine our functions on TensorWithGrad.
/// Linear layer on boxed tensors: `x.inner • w.inner + b.inner`.
///
/// - Returns: A new `TensorWithGrad` wrapping the result (its `grad` starts as zeros).
func lin(_ x: TensorWithGrad, _ w: TensorWithGrad, _ b: TensorWithGrad) -> TensorWithGrad {
    let activations = x.inner • w.inner + b.inner
    return TensorWithGrad(activations)
}
/// ReLU on a boxed tensor: the maximum of `x.inner` and zero.
///
/// - Returns: A new `TensorWithGrad` wrapping the result (its `grad` starts as zeros).
func myRelu(_ x: TensorWithGrad) -> TensorWithGrad {
    let rectified = max(x.inner, 0)
    return TensorWithGrad(rectified)
}
/// Mean squared error between a boxed prediction and the targets.
///
/// This computes the loss value only; it does not touch `inp.grad`.
///
/// - Parameters:
///   - inp: Boxed predictions; the last axis of `inp.inner` is squeezed away before comparing.
///   - targ: Target values.
/// - Returns: The mean of the squared differences.
func mse(_ inp: TensorWithGrad, _ targ: TF) -> TF {
    let residual = inp.inner.squeezingShape(at: -1) - targ
    return residual.squared().mean()
}

In [None]:
// Define our gradient functions.
/// Stores the gradient of the MSE loss with respect to `inp.inner` in `inp.grad`.
///
/// The value is `2 * (prediction - target) / rowCount`, with the trailing axis that
/// was squeezed for the subtraction restored afterwards.
///
/// - Parameters:
///   - inp: Boxed predictions; its `grad` is overwritten.
///   - targ: Target values.
func mseGrad(_ inp: TensorWithGrad, _ targ: TF) {
    let rowCount = Float(inp.inner.shape[0])
    let residual = inp.inner.squeezingShape(at: -1) - targ
    inp.grad = 2.0 * residual.expandingShape(at: -1) / rowCount
}

/// Stores the gradient of ReLU with respect to its input activations in `inp.grad`.
///
/// Where `inp.inner` is greater than zero the gradient is taken from `out.grad`;
/// everywhere else it is zero.
///
/// - Parameters:
///   - inp: The ReLU input; its `grad` is overwritten.
///   - out: The ReLU output, whose `grad` must already be populated.
func reluGrad(_ inp: TensorWithGrad, _ out: TensorWithGrad) {
    let isPositive = inp.inner .> 0
    let zeros = TF(zeros: inp.inner.shape)
    inp.grad = isPositive.selecting(out.grad, zeros)
}

This is our python version (we've renamed the python `g` to `grad` for consistency):

```python
def lin_grad(inp, out, w, b):
    inp.grad = out.grad @ w.t()
    w.grad = (inp.unsqueeze(-1) * out.grad.unsqueeze(1)).sum(0)
    b.grad = out.grad.sum(0)
```

In Swift `@` is spelled `•`, which is <kbd>option</kbd>-<kbd>8</kbd> on Mac or <kbd>compose</kbd>-<kbd>.</kbd>-<kbd>=</kbd> elsewhere. Or just use the `matmul()` function we've seen already.

In [None]:
/// Stores the gradients of a linear layer with respect to its input activations,
/// weights and bias.
///
/// - Parameters:
///   - inp: The layer input; `inp.grad` is set to `out.grad` times the transposed weights.
///   - out: The layer output, whose `grad` must already be populated.
///   - w: The weights; `w.grad` is set to the transposed input times `out.grad`.
///   - b: The bias; `b.grad` is set to `out.grad` summed over axis 0.
func linGrad(_ inp:TensorWithGrad, _ out:TensorWithGrad, _ w:TensorWithGrad, _ b:TensorWithGrad){
    inp.grad = matmul(out.grad, w.inner.transposed())
    w.grad = matmul(inp.inner.transposed(), out.grad)
    b.grad = out.grad.sum(squeezingAxes: 0)
}

In [None]:
// Box each parameter so the manual backward pass can write its gradient into `.grad`.
let w1a = TensorWithGrad(w1)
let b1a = TensorWithGrad(b1)
let w2a = TensorWithGrad(w2)
let b2a = TensorWithGrad(b2)

In [None]:
/// Runs the forward pass and then the manual backward pass of the two-layer model.
///
/// Reads the notebook-level boxed parameters `w1a`, `b1a`, `w2a` and `b2a`. After the
/// call, `grad` has been written on `inp` and on each of those four parameters.
///
/// - Parameters:
///   - inp: Boxed input batch; receives the gradient with respect to the input.
///   - targ: Target values for the MSE loss.
func forwardAndBackward(_ inp:TensorWithGrad, _ targ:TF){
    //forward pass:
    let l1 = lin(inp, w1a, b1a)
    let l2 = myRelu(l1)
    let out = lin(l2, w2a, b2a)
    //we don't actually need the loss in backward! It is still computed so the forward
    //pass is complete, but discarded explicitly to avoid an unused-variable warning.
    _ = mse(out, targ)
    
    //backward pass: walk the layers in reverse, each step consuming the `grad`
    //written by the step before it.
    mseGrad(out, targ)
    linGrad(l2, out, w2a, b2a)
    reluGrad(l1, l2)
    linGrad(inp, l1, w1a, b1a)
}

In [None]:
// Box the training inputs too: linGrad also writes the gradient w.r.t. its input.
let inp = TensorWithGrad(xTrain)

In [None]:
// Populates `.grad` on inp, w1a, b1a, w2a and b2a.
forwardAndBackward(inp, yTrainF)

# Using the S4TF Language Integrated Autodiff

Let's compare to the language-integrated Swift for TensorFlow autodiff now. We have to mark the function as `@differentiable`.  This informs the compiler that we want it to automatically generate its gradients, and causes it to emit errors if there is anything contributing to the result of the function that cannot be differentiated.

The `@differentiable` attribute is normally optional in an S4TF standalone environment, but is currently required in Jupyter notebooks.  The S4TF team is planning to relax this limitation over time.

In [None]:
/// Forward pass of the two-layer model followed by the MSE loss, written on plain
/// tensors so that it can be differentiated automatically.
///
/// - Parameters:
///   - inp: Input batch.
///   - targ: Target values.
///   - w1: Weights of the first linear layer.
///   - b1: Bias of the first linear layer.
///   - w2: Weights of the second linear layer.
///   - b2: Bias of the second linear layer.
/// - Returns: The mean squared error between the squeezed model output and `targ`.
@differentiable
func forward(_ inp: TF, _ targ: TF, w1: TF, b1: TF, w2: TF, b2: TF) -> TF {
    let preActivation = matmul(inp, w1) + b1
    let hidden = relu(preActivation)
    let prediction = matmul(hidden, w2) + b2
    let residual = prediction.squeezingShape(at: -1) - targ
    return residual.squared().mean()
}

Then we can ask for the gradients w.r.t. any individual parameter like this:

In [None]:
// One gradient(at:) call per tensor. Each closure parameter shadows the notebook-level
// value of the same name, so `forward` is differentiated w.r.t. that one argument while
// the other arguments are captured as-is. Every call runs `forward` again.
let xGrad = gradient(at: xTrain) {xTrain in forward(xTrain, yTrainF, w1:w1, b1:b1, w2:w2, b2:b2)}
let w1Grad = gradient(at: w1) {w1 in forward(xTrain, yTrainF, w1:w1, b1:b1, w2:w2, b2:b2)}
let b1Grad = gradient(at: b1) {b1 in forward(xTrain, yTrainF, w1:w1, b1:b1, w2:w2, b2:b2)}
let w2Grad = gradient(at: w2) {w2 in forward(xTrain, yTrainF, w1:w1, b1:b1, w2:w2, b2:b2)}
let b2Grad = gradient(at: b2) {b2 in forward(xTrain, yTrainF, w1:w1, b1:b1, w2:w2, b2:b2)}

// Check that they agree with the manually calculated gradients.
testNearZero(xGrad - inp.grad)
testNearZero(w1Grad - w1a.grad)
testNearZero(b1Grad - b1a.grad)
testNearZero(w2Grad - w2a.grad)
testNearZero(b2Grad - b2a.grad)

You can also ask for gradients with respect to multiple things at the same time, but unfortunately, current AD bugs prevent getting more than two gradients at a time.  We can do a little bit better than the above code like so:

In [None]:
// Same gradients, requested two tensors at a time: three calls instead of five.
let (xGrad2, w1Grad2) = gradient(at: xTrain, w1) {
    xTrain, w1 in forward(xTrain, yTrainF, w1:w1, b1:b1, w2:w2, b2:b2)
}
let (b1Grad2, w2Grad2) = gradient(at: b1, w2) {
    b1, w2 in forward(xTrain, yTrainF, w1:w1, b1:b1, w2:w2, b2:b2)
}
let b2Grad2 = gradient(at: b2) {b2 in forward(xTrain, yTrainF, w1:w1, b1:b1, w2:w2, b2:b2)}

// Check that they agree with the one-at-a-time gradients.
testNearZero(xGrad-xGrad2)
testNearZero(w1Grad-w1Grad2)
testNearZero(b1Grad-b1Grad2)
testNearZero(w2Grad-w2Grad2)
testNearZero(b2Grad-b2Grad2)


This is currently pretty ugly, and even when the bugs are fixed, it still won't be very idiomatic.  A more common thing is to wrap up all your parameters into a struct, and differentiate w.r.t. all of them at the same time (which, when we refactor the code, will be our model itself).

Here is an example of that:


In [None]:
/// Bundles the input and the four model parameters into one `Differentiable` value,
/// so gradients with respect to all of them can be requested in a single call.
struct myParams: Differentiable {
    /// The input batch.
    public var x: TF
    /// Weights of the first linear layer.
    public var w1: TF
    /// Bias of the first linear layer.
    public var b1: TF
    /// Weights of the second linear layer.
    public var w2: TF
    /// Bias of the second linear layer.
    public var b2: TF
}

// Pack the training inputs and the current parameters into a single value.
let allParams = myParams(x: xTrain, w1: w1, b1: b1, w2: w2, b2: b2)

In [None]:
// We can now get all of the gradients at once with a single call, and a single forward computation.
// The closure parameter shadows the notebook-level `allParams`; the result `grads`
// exposes one gradient per stored property (x, w1, b1, w2, b2).
let grads = gradient(at: allParams) {
  allParams in
    forward(allParams.x, yTrainF,
            w1: allParams.w1, 
            b1: allParams.b1,
            w2: allParams.w2, 
            b2: allParams.b2)
}

// Check that this still calculates the same thing.
testNearZero(xGrad  - grads.x)
testNearZero(w1Grad - grads.w1)
testNearZero(b1Grad - grads.b1)
testNearZero(w2Grad - grads.w2)
testNearZero(b2Grad - grads.b2)

If you wanted the value for your loss as well as the gradients, you just have to use `valueWithGradient`.

In [None]:
// valueWithGradient returns the loss value together with the gradients.
// NOTE(review): this declares `grads` a second time; that presumably relies on notebook
// cells permitting redeclaration — confirm before moving this code into a regular file.
let (loss,grads) = valueWithGradient(at: allParams) { 
    allParams in forward(allParams.x, yTrainF, w1: allParams.w1, b1: allParams.b1, w2: allParams.w2, b2: allParams.b2)
}

// The gradients should match the ones computed one tensor at a time.
testNearZero(xGrad  - grads.x)
testNearZero(w1Grad - grads.w1)
testNearZero(b1Grad - grads.b1)
testNearZero(w2Grad - grads.w2)
testNearZero(b2Grad - grads.b2)

In terms of timing our implementation gives:

In [None]:
// Time 10 runs of the hand-written forward + backward pass.
time(repeating: 10) { forwardAndBackward(inp, yTrainF) }

average: 8.455380100000001 ms,   min: 8.242054 ms,   max: 8.946601 ms


In [None]:
// Time 10 runs of the autodiff equivalent: loss plus gradients w.r.t. every field
// of `allParams`. The result is discarded; only the timing matters here.
time(repeating: 10) {
    _ = valueWithGradient(at: allParams) { 
        allParams in forward(allParams.x, 
                             yTrainF, 
                             w1: allParams.w1, 
                             b1: allParams.b1, 
                             w2: allParams.w2, 
                             b2: allParams.b2)
    }
}

average: 10.136412199999999 ms,   min: 9.405311 ms,   max: 10.843894 ms


### Export

In [None]:
// Export this notebook, located in the current working directory, as a script.
// NOTE(review): `notebookToScript` is not defined in this notebook; presumably it
// picks up the cells marked with `// export` / `//export` — confirm.
notebookToScript(fname: Path.cwd / "02_fully_connected.ipynb")