In [None]:
%install-location $cwd/swift-install
%install '.package(path: "$cwd/FastaiNotebook_01a_fastai_layers")' FastaiNotebook_01a_fastai_layers

Installing packages:
	.package(path: "/home/clattner/fastai_docs/dev_swift/FastaiNotebook_01a_fastai_layers")
		FastaiNotebook_01a_fastai_layers
With SwiftPM flags: []
Working in: /tmp/tmp1ofhyo8f
/home/clattner/swift/usr/bin/swift-build: /home/clattner/anaconda3/envs/swift/lib/libuuid.so.1: no version information available (required by /home/clattner/swift/usr/lib/swift/linux/libFoundation.so)
Fetching https://github.com/mxcl/Path.swift
Fetching https://github.com/JustHTTP/Just
Fetching https://github.com/latenitesoft/NotebookExport
Completed resolution in 2.04s
Cloning https://github.com/mxcl/Path.swift
Resolving https://github.com/mxcl/Path.swift at 0.16.2
Cloning https://github.com/JustHTTP/Just
Resolving https://github.com/JustHTTP/Just at 0.7.1
Cloning https://github.com/latenitesoft/NotebookExport
Resolving https://github.com/latenitesoft/NotebookExport at 0.5.0
/home/clattner/swift/usr/bin/swiftc: /home/clattner/anaconda3/envs/swift/lib/libuuid.so.1: no version information avai

: 

In [None]:
//export
import Path
import TensorFlow

In [None]:
import FastaiNotebook_01a_fastai_layers

## The forward and backward passes

Typing `Tensor<Float>` all the time is tedious. The S4TF team expects to make `Float` be the default so we can just say `Tensor`.  Until that happens though, we can define our own alias.

In [None]:
// export
/// Shorthand for `Tensor<Float>`, so signatures in this notebook can say `TF`.
public typealias TF=Tensor<Float>

We will need to normalize our data.

In [None]:
// export
/// Standardizes `x` by subtracting `mean` and dividing the result by `std`.
///
/// - Parameters:
///   - x: The tensor to normalize.
///   - mean: The value subtracted from `x`.
///   - std: The value the centered tensor is divided by.
/// - Returns: `(x - mean) / std`.
public func normalize(_ x: TF, mean: TF, std: TF) -> TF {
    let centered = x - mean
    return centered / std
}

In [None]:
// Load the four MNIST tensors with `flat: true`. They are declared `var` because
// xTrain and xValid are reassigned once they have been normalized.
// NOTE(review): `loadMNIST` and `mnistPath` are not defined in this notebook;
// presumably they come from an imported module — confirm.
var (xTrain, yTrain, xValid, yValid) = loadMNIST(path: mnistPath, flat: true)

Normalize the training and validation sets with the training set statistics.

In [None]:
// Mean and standard deviation over the whole training tensor. The same two
// values are used to normalize both the training and the validation set.
let trainMean = xTrain.mean()
let trainStd  = xTrain.std()
print(trainMean, trainStd)

0.13066047 0.3081078


In [None]:
// Overwrite both inputs with their normalized versions. The validation set
// deliberately uses the *training* statistics.
xTrain = normalize(xTrain, mean: trainMean, std: trainStd)
xValid = normalize(xValid, mean: trainMean, std: trainStd)

To test everything is going well:

In [None]:
//export
/// Asserts that the magnitude of `a` is below `tolerance`.
///
/// The check is an `assert`, so it is only evaluated in builds where
/// assertions are enabled.
///
/// - Parameters:
///   - a: The tensor expected to be close to zero.
///   - tolerance: Upper bound on `abs(a)`; defaults to `1e-3`.
public func testNearZero(_ a: TF, tolerance: Float = 1e-3) {
    assert(
        abs(a) < tolerance,
        "Near zero: \(a)"
    )
}

/// Asserts that `a` and `b` have the same shape and nearly equal values.
///
/// - Parameters:
///   - a: First tensor to compare.
///   - b: Second tensor to compare.
public func testSame(_ a: TF, _ b: TF) {
    // Check shapes match so broadcasting doesn't hide shape errors.
    // The message reports both shapes so a failure is diagnosable.
    assert(a.shape == b.shape, "Shape mismatch: \(a.shape) vs \(b.shape)")
    testNearZero(a-b)
}

In [None]:
// After normalization the training set should have mean ~0 and std ~1.
testNearZero(xTrain.mean())
testNearZero(xTrain.std() - 1.0)

In [None]:
// n and m are the sizes of the first two dimensions of xTrain;
// c is the largest label value in yTrain plus one.
let (n,m) = (xTrain.shape[0],xTrain.shape[1])
let c = yTrain.max()+1
print(n, m, c)

60000 784 10


## Foundations version

### Basic architecture

In [None]:
// Number of hidden units: the column count of w1 and the row count of w2.
let nh = 50

In [None]:
// simplified kaiming init / he init
// Each weight matrix is sampled with `randomNormal` and divided by the square
// root of its input size; biases start at zero. The second layer has one output column.
let w1 = TF(randomNormal: [m, nh]) / sqrt(Float(m))
let b1 = TF(zeros: [nh])
let w2 = TF(randomNormal: [nh,1]) / sqrt(Float(nh))
let b2 = TF(zeros: [1])

In [None]:
// w1 should have mean ~0 and std ~1/sqrt(m) given the scaling used to create it.
testNearZero(w1.mean())
testNearZero(w1.std()-1/sqrt(Float(m)))

In [None]:
// This should be ~ (0,1) (mean,std)...
// (only approximately, because xValid was normalized with the training statistics)
print(xValid.mean(), xValid.std())

0.0060178353 1.0077001


Instead of `@` in Python we use `•` (or `matmul`) in Swift:

In [None]:
/// Linear layer: the matrix product of `x` and `w`, plus the bias `b`.
///
/// - Parameters:
///   - x: Input activations.
///   - w: Weight matrix.
///   - b: Bias added to the product.
/// - Returns: `matmul(x, w) + b`.
func lin(_ x: TF, _ w: TF, _ b: TF) -> TF {
    let product = matmul(x, w)
    return product + b
}

In [None]:
// Output of the first linear layer on the validation set (no activation yet).
let t = lin(xValid, w1, b1)

In [None]:
// ...so should this (mean ~0, std ~1), because we used kaiming init, which is designed to do this.
print(t.mean(), t.std())

0.008671894 0.984596


In [None]:
/// Rectified linear unit: `max(x, 0)`.
///
/// - Parameter x: Input activations.
/// - Returns: The result of `max(x, 0)`.
func myRelu(_ x: TF) -> TF {
    let clamped = max(x, 0)
    return clamped
}

In [None]:
// Same first layer as before, now followed by the ReLU.
let t = myRelu(lin(xValid, w1, b1))

In [None]:
// ...actually it really should be this (the statistics *after* the ReLU)!
print(t.mean(),t.std())

0.39344302 0.5840166


In [None]:
// kaiming init / he init for relu
// Same sampling as before, but scaled by sqrt(2/m) instead of 1/sqrt(m).
let w1 = TF(randomNormal: [m,nh]) * sqrt(2.0/Float(m))

In [None]:
// Inspect the mean and std of the re-initialized w1.
print(w1.mean(), w1.std())

-0.00028694386 0.050297257


In [None]:
// Recompute the post-ReLU activations with the new w1 and print their statistics.
let t = myRelu(lin(xValid, w1, b1))
print(t.mean(), t.std())

0.48257005 0.7595384


Here is a simple basic model:

In [None]:
/// Two-layer network: linear, ReLU, linear.
///
/// Reads the global parameters `w1`, `b1`, `w2` and `b2`.
///
/// - Parameter xb: Input batch.
/// - Returns: The output of the second linear layer.
func model(_ xb: TF) -> TF {
    let hidden = myRelu(lin(xb, w1, b1))
    return lin(hidden, w2, b2)
}

In [None]:
// Time the forward pass on the validation set; the model output is discarded.
// NOTE(review): `time(repeating:)` is not defined in this notebook; presumably
// it comes from an imported module — confirm.
time(repeating: 10) { _ = model(xValid) }

average: 0.3743267 ms,   min: 0.311244 ms,   max: 0.484276 ms


### Loss function

We begin with the mean squared error to have easier gradient computations.

In [None]:
// Predictions of the untrained model on the training set.
let preds = model(xTrain)

In [None]:
// export
/// Mean squared error between predictions and targets.
///
/// - Parameters:
///   - out: Predictions; the last axis is squeezed away before comparing.
///   - targ: Target values.
/// - Returns: The mean of the squared differences.
public func mse(_ out: TF, _ targ: TF) -> TF {
    let flattened = out.squeezingShape(at: -1)
    let errors = flattened - targ
    return errors.squared().mean()
}

One more step compared to Python: we have to make sure our labels are properly converted to floats.

In [None]:
// Convert the labels to Float tensors so they can be subtracted from the
// Float predictions in `mse`.
var yTrainF = TF(yTrain)
var yValidF = TF(yValid)

In [None]:
// Loss of the untrained model on the training set.
mse(preds, yTrainF)

37.37175


## Gradients and backward pass

Here we show how to calculate gradients for a simple model the hard way, manually.

To store the gradients a bit like in PyTorch we introduce a `TensorWithGrad` class that has two attributes: the original tensor and the gradient. We choose a class to easily replicate the Python notebook: classes are reference types (which means they are mutable) while structures are value types.

In fact, since this is the first time we're discovering Swift classes, let's jump into a [sidebar discussion about Value Semantics vs Reference Semantics](https://docs.google.com/presentation/d/1dc6o2o-uYGnJeCeyvgsgyk05dBMneArxdICW5vF75oU/edit#slide=id.g5669969ead_0_145) since it is a pretty fundamental part of the programming model and a huge step forward that Swift takes.

When we get back, we'll keep charging on, even though this is very non-idiomatic Swift code!


In [None]:
/// WARNING: This is designed to be similar to the PyTorch 02_fully_connected lesson,
/// this isn't idiomatic Swift code.
///
/// A reference-type box pairing a tensor with storage for its gradient.
class TensorWithGrad {
    /// The wrapped tensor value.
    var inner: TF
    /// The gradient slot; starts as zeros with the same shape as `inner`.
    var grad: TF

    /// Wraps `x` and creates a zero gradient of matching shape.
    init(_ x: TF) {
        self.inner = x
        self.grad = TF(zeros: x.shape)
    }
}

In [None]:
// Redefine our functions on TensorWithGrad.
/// Linear layer on boxed tensors: `matmul(x.inner, w.inner) + b.inner`,
/// wrapped in a fresh `TensorWithGrad`.
func lin(_ x: TensorWithGrad, _ w: TensorWithGrad, _ b: TensorWithGrad) -> TensorWithGrad {
    let product = matmul(x.inner, w.inner)
    let activations = product + b.inner
    return TensorWithGrad(activations)
}
/// ReLU on a boxed tensor: `max(x.inner, 0)`, wrapped in a fresh `TensorWithGrad`.
func myRelu(_ x: TensorWithGrad) -> TensorWithGrad {
    let clamped = max(x.inner, 0)
    return TensorWithGrad(clamped)
}
/// Mean squared error of a boxed prediction against `targ`.
///
/// This computes the loss value only; the gradient is produced by `mseGrad`.
func mse(_ inp: TensorWithGrad, _ targ: TF) -> TF {
    let flattened = inp.inner.squeezingShape(at: -1)
    let errors = flattened - targ
    return errors.squared().mean()
}

In [None]:
// Define our gradient functions.
/// Stores in `inp.grad` the gradient of the MSE loss with respect to `inp`.
///
/// - Parameters:
///   - inp: The boxed predictions; its `grad` is overwritten.
///   - targ: Target values.
func mseGrad(_ inp: TensorWithGrad, _ targ: TF) {
    let batchSize = Float(inp.inner.shape[0])
    let errors = inp.inner.squeezingShape(at: -1) - targ
    // Restore the trailing axis so the gradient has the same rank as `inp.inner`.
    inp.grad = 2.0 * errors.expandingShape(at: -1) / batchSize
}

/// Stores in `inp.grad` the gradient of ReLU with respect to its input.
///
/// `out.grad` is passed through where `inp.inner` is positive and replaced by
/// zero everywhere else.
func reluGrad(_ inp: TensorWithGrad, _ out: TensorWithGrad) {
    let isPositive = inp.inner .> 0
    let zeros = TF(zeros: inp.inner.shape)
    inp.grad = isPositive.selecting(out.grad, zeros)
}

This is our python version (we've renamed the python `g` to `grad` for consistency):

```python
def lin_grad(inp, out, w, b):
    inp.grad = out.grad @ w.t()
    w.grad = (inp.unsqueeze(-1) * out.grad.unsqueeze(1)).sum(0)
    b.grad = out.grad.sum(0)
```

In Swift `@` is spelled `•`, which is <kbd>option</kbd>-<kbd>8</kbd> on Mac or <kbd>compose</kbd>-<kbd>.</kbd>-<kbd>=</kbd> elsewhere. Or just use the `matmul()` function we've seen already.

In [None]:
/// Backward step of the linear layer: fills in the gradients of the input,
/// the weights and the bias from `out.grad`.
///
/// - Parameters:
///   - inp: Input of the layer; its `grad` is overwritten.
///   - out: Output of the layer; its `grad` is read.
///   - w: Weights; `grad` is overwritten.
///   - b: Bias; `grad` is overwritten.
func linGrad(_ inp:TensorWithGrad, _ out:TensorWithGrad, _ w:TensorWithGrad, _ b:TensorWithGrad){
    let upstream = out.grad
    inp.grad = matmul(upstream, w.inner.transposed())
    w.grad = matmul(inp.inner.transposed(), upstream)
    // The bias gradient is the upstream gradient summed over axis 0.
    b.grad = upstream.sum(squeezingAxes: 0)
}

In [None]:
// Box each parameter so the manual backward pass has somewhere to store its gradient.
let w1a = TensorWithGrad(w1)
let b1a = TensorWithGrad(b1)
let w2a = TensorWithGrad(w2)
let b2a = TensorWithGrad(b2)

In [None]:
/// Runs the model forward on `inp`, then fills in the `grad` of every
/// intermediate tensor and of the global boxed parameters
/// (`w1a`, `b1a`, `w2a`, `b2a`) by calling the gradient functions in reverse order.
///
/// - Parameters:
///   - inp: Boxed input batch; its `grad` is overwritten.
///   - targ: Target values for the MSE loss.
func forwardAndBackward(_ inp:TensorWithGrad, _ targ:TF){
    // forward pass:
    let l1 = lin(inp, w1a, b1a)
    let l2 = myRelu(l1)
    let out = lin(l2, w2a, b2a)
    // We don't actually need the loss in backward! It is still evaluated to
    // mirror the forward pass, but discarded explicitly to avoid an unused binding.
    _ = mse(out, targ)
    
    // backward pass:
    mseGrad(out, targ)
    linGrad(l2, out, w2a, b2a)
    reluGrad(l1, l2)
    linGrad(inp, l1, w1a, b1a)
}

In [None]:
// Box the training inputs so the gradient with respect to them can be stored too.
let inp = TensorWithGrad(xTrain)

In [None]:
// Populates inp.grad and the grads of w1a, b1a, w2a and b2a.
forwardAndBackward(inp, yTrainF)

# Automatic Differentiation: the functional way

There are a few challenges with the code above:

 * It doesn't follow the principle of value semantics, because `TensorWithGrad` is a class.  Mutating a tensor would produce incorrect results.
 * It doesn't compose very well - we need to keep track of values passed in the forward pass and also pass them in the backward pass.
 * It is fully dynamic, keeping track of gradients at runtime.  This interferes with the compiler's ability to perform fusion and other advanced optimizations.
 
Swift for TensorFlow uses a language and compiler integrated automatic differentiation system, which works in a different way.  Each differentiable function gets an associated "pullback" (described below) that defines its gradient.  When you write a function like `model` that calls a bunch of these in sequence, the compiler calls each function and collects its pullback, then stitches together the pullbacks using the chain rule from Calculus.

Let's remember the chain rule - it is written:

$$\frac{d}{dx}\left[f\left(g(x)\right)\right] = f'\left(g(x)\right)g'(x)$$

Notice how the chain rule requires mixing together expressions from both the forward pass (`g()`) and the backward pass (`f'()` and `g'()`) of a computation to get the derivative.  While it is possible to calculate all the forward versions of a computation, then recompute everything needed again on the backward pass, this would be incredibly inefficient - it makes more sense to save intermediate values from the forward pass and reuse them on the backward pass.

The Swift language provides the atoms we need to express this: we can represent math with function calls, and the pullback can be represented with closures.  This works out well because closures provide a natural way to capture interesting values from the forward pass.

## Basic expressions in MSE

To explore this, let's look at a really simple example of this, the inner computation of MSE.  The full body of MSE looks like this:

```swift
func mse(_ inp: TF, _ targ: TF) -> TF {
    //grad of loss with respect to output of previous layer
    return (inp.squeezingShape(at: -1) - targ).squared().mean()
}
```

For the purposes of our example, we're going to keep it super super simple and just focus on the `x.squared().mean()` part of the computation, which we'll write as `mseInner(x) = mean(square(x))` to align better with function composition notation.  We want a way to visualize what functions get called, so let's define a little helper that prints the name of its caller whenever it is called:

In [None]:
/// Identity function that prints the name of its caller.
///
/// Useful for seeing which functions run, and in what order.
///
/// - Parameters:
///   - v: Any value; it is returned unchanged.
///   - function: The name to print. Defaults to `#function`, which expands
///     to the name of the calling function.
/// - Returns: `v`.
func trace<T>(_ v: T, function: String = #function) -> T {
  let callerName = function
  print(callerName)
  return v
}

// Try out the trace helper function.
/// Returns `a + b`, printing this function's name via `trace`.
func foo(a: Int, b: Int) -> Int {
  let sum = a + b
  return trace(sum)
}
/// Returns `x * 42 + 17`, printing this function's name via `trace`.
func bar(x: Int) -> Int {
  let value = x * 42 + 17
  return trace(value)
}

// Each call prints its own function name; the notebook then shows the value of the last expression.
foo(a: 1, b: 2)
bar(x: 17)


foo(a:b:)
bar(x:)


731



Ok, given that, we start by writing the implementation and gradients of these functions, and we put print statements in them so we can tell when they are called.  This looks like:


In [None]:
/// Element-wise square of `x`; prints its own name when called.
func square(_ x: TF) -> TF {
    return trace(x * x)
}
/// Derivative of `square` at `x`, i.e. `2 * x`; prints its own name when called.
func ùõÅsquare(_ x: TF) -> TF {
    return trace(2 * x)
}

/// Mean over all elements of `x` (a reduction); prints its own name when called.
func mean(_ x: TF) -> TF {
    return trace(x.mean())
}
/// Gradient of `mean(x)` with respect to every element of `x`.
///
/// Each element contributes equally to the mean, so the gradient is `1 / N`
/// everywhere, where `N` is the total number of scalars in `x`.
///
/// - Parameter x: The point at which the gradient is taken.
/// - Returns: A tensor with the shape of `x`, filled with `1 / N`.
func ùõÅmean(_ x: TF) -> TF {
    // Divide by the total element count rather than `shape[0]`: the two only
    // agree for rank-1 tensors, and `shape[0]` does not exist for a scalar.
    let result = TF(1).broadcast(to: x.shape) / Float(x.scalarCount)
    return trace(result)
}

Given these definitions we can now compute the forward and derivative of the `mseInner` function that composes `square` and `mean`, using the chain rule:

$$\frac{d}{dx}\left[f\left(g(x)\right)\right] = f'\left(g(x)\right)g'(x)$$

where `f` is `mean` and `g` is `square`.  This gives us:


In [None]:
/// Forward computation `mean(square(x))`.
func mseInner(_ x: TF) -> TF {
    let squared = square(x)
    return mean(squared)
}

/// Gradient of `mseInner` by the chain rule: `f'(g(x)) * g'(x)` with
/// `f = mean` and `g = square`.
///
/// Note that `square(x)` is recomputed here.
func ùõÅmseInner(_ x: TF) -> TF {
    let squared = square(x)
    let outerGrad = ùõÅmean(squared)
    let innerGrad = ùõÅsquare(x)
    return outerGrad * innerGrad
}

This is all simple, but we have a small problem if (in the common case for deep nets) we want to calculate both the forward and the gradient computation at the same time: we end up redundantly computing `square(x)` in both the forward and backward paths!

In [None]:
/// Returns both the value and the gradient of `mseInner` at `x`.
///
/// The two are computed independently, value first.
func mseInnerAndGrad(_ x: TF) -> (TF, TF) {
  let value = mseInner(x)
  let grad = ùõÅmseInner(x)
  return (value, grad)
}

// Small sample input shared by the examples that follow.
let exampleData = TF([1, 2, 3, 4])

// Running this prints the name of every traced function in call order.
let (mseInnerResult1, mseInnerGrad1) = mseInnerAndGrad(exampleData)
print()

print("result:", mseInnerResult1)
print("gradient:", mseInnerGrad1)

// Check that our gradient matches builtin S4TF's autodiff.
let builtinGrad = gradient(at: exampleData, in: { x in (x*x).mean() })
testSame(mseInnerGrad1, builtinGrad)

square(_:)
mean(_:)
square(_:)
ùõÅmean(_:)
ùõÅsquare(_:)

result: 7.5
gradient: [0.5, 1.0, 1.5, 2.0]


Note above how `square` got called two times: once in the forward function and once in the gradient.  In more complicated cases, this can be an incredible amount of redundant computation, which would make performance unacceptably slow.

**Exercise:** take a look at what happens when you use the same techniques to implement more complex functions.


## Reducing recomputation with Pullbacks and VJPs

We can fix this by refactoring our code.  We want to preserve the linear structure of `mseInner` that calls `square` and then `mean`, but we want to make it so the ultimate *user* of the computation can choose whether they want the gradient computation (or not) and if so, we want to minimize computation.  To do this, we have to slightly generalize our derivative functions.  While it is true that the derivative of $square(x)$ is `2*x`, this is only true for a given point `x`.

If we generalize the derivative of `square` to work with an arbitrary **function**, instead of point, then we need to remember that $\frac{d}{dx}x^2 = 2x\frac{d}{dx}$, and therefore the derivative for `square` needs to get $\frac{d}{dx}$ passed in from its nested function.  This form of gradient is known as a "pullback", and can be written like this:

In [None]:
/// The pullback for the gradient of square(x).
///
/// - Parameters:
///   - x: The input that `square` was evaluated at.
///   - ddx: The gradient arriving from the function applied after `square`.
/// - Returns: `ddx * 2x`.
func ùõÅsquarePB(x: TF, ddx: TF) -> TF {
  let localGrad = trace(2 * x)
  return ddx * localGrad
}

/// The pullback for the gradient of mean(x).
///
/// - Parameters:
///   - x: The input that `mean` was evaluated at.
///   - ddx: The gradient arriving from the function applied after `mean`.
/// - Returns: `ddx` broadcast to the shape of `x` and divided by the number
///   of scalars in `x`.
func ùõÅmeanPB(x: TF, ddx: TF) -> TF {
  // Divide by the total element count rather than `shape[0]`: the two only
  // agree for rank-1 tensors, and `shape[0]` does not exist for a scalar.
  return trace(ddx.broadcast(to: x.shape) / Float(x.scalarCount))
}

Given this very general way of describing gradients, we now want to pull them together in a single bundle that we can keep track of: we do this by changing each atom of computation to return both a normal value and a "pullback" closure that produces a piece of the gradient.

This form of representation is called a "Vector Jacobian Product" (cite??) so we will name them VJPs.  They look like this:

In [None]:
/// Returns x*x and the pullback for the gradient of x*x.
///
/// The pullback closure captures `x`, so nothing is recomputed on the way back.
func squareVJP(_ x: TF) -> (TF, (TF) -> TF) {
  let value = trace(x * x)
  let pullback: (TF) -> TF = { upstream in
    ùõÅsquarePB(x: x, ddx: upstream)
  }
  return (value, pullback)
}

/// Returns the mean of x and the pullback for the mean.
///
/// The pullback closure captures `x`, whose shape it needs.
func meanVJP(_ x: TF) -> (TF, (TF) -> TF) {
  let value = trace(x.mean())
  let pullback: (TF) -> TF = { upstream in
    ùõÅmeanPB(x: x, ddx: upstream)
  }
  return (value, pullback)
}

Given this, we can now implement `mseInner` in the same "VJP" form:

In [None]:
/// Implements mean(square(x)) by calling each of the VJPs in turn.
///
/// - Parameter x: Input tensor.
/// - Returns: The forward result and a pullback that applies the two
///   component pullbacks in reverse order.
func mseInnerVJP(_ x: TF) -> (TF, (TF) -> TF) {
  let squared = squareVJP(x)
  let averaged = meanVJP(squared.0)

  // Backward runs the pullbacks in the opposite order to the forward calls.
  let pullback: (TF) -> TF = { upstream in
    let gradSquared = averaged.1(upstream)
    return squared.1(gradSquared)
  }
  return (averaged.0, pullback)
}

Now we can choose to evaluate just the forward computation, or we can choose to run both:

In [None]:
print("Calling the forward function:")
// Only the forward functions run here; the pullback is returned but not yet called.
let (mseInnerResult2, mseInnerPB2) = mseInnerVJP(exampleData)
print()

// The forward value must match the one from the non-VJP implementation.
testSame(mseInnerResult2, mseInnerResult1)


print("Calling the backward function:")
// Seed the pullback with 1 to obtain the gradient.
let mseInnerGrad2 = mseInnerPB2(TF(1))
print()

print(mseInnerGrad2)
// Check that we get the same result.
testSame(mseInnerGrad2, builtinGrad)


Calling the forward function:
squareVJP(_:)
meanVJP(_:)

Calling the backward function:
ùõÅmeanPB(x:ddx:)
ùõÅsquarePB(x:ddx:)

[0.5, 1.0, 1.5, 2.0]


Ok, great - we only ran each piece of the computation once, and we gained a single conceptual abstraction that bundles everything we need together.

Now we have all of the infrastructure and scaffolding necessary to define and compose computations and figure out their backwards versions from the chain rule.  Let's jump up a level to define Jeremy's example using the VJP form of the computation.

# Implementing Relu, MSE, and Lin with VJPs

Let's come back to our earlier examples and define pullbacks for our primary functions in the simple model function example.

In [None]:
/// Returns `max(x, 0)` together with its pullback.
///
/// - Parameter x: Input activations.
/// - Returns: The ReLU output, and a closure mapping the gradient with
///   respect to the output to the gradient with respect to `x`.
func reluVJP(_ x: TF) -> (TF, (TF) -> TF) {
    let activated = max(x, 0)
    // Pass the upstream gradient through where the input was positive, zero elsewhere.
    let pullback: (TF) -> TF = { upstream in
        let isPositive = x .> 0
        return isPositive.selecting(upstream, TF(zeros: x.shape))
    }
    return (activated, pullback)
}

In [None]:
/// Returns the linear layer output `matmul(inp, w) + b` and its pullback.
///
/// - Parameters:
///   - inp: Input activations.
///   - w: Weight matrix.
///   - b: Bias.
/// - Returns: The layer output, and a pullback yielding the gradients with
///   respect to `(inp, w, b)` — three results because the layer has three arguments.
func linVJP(_ inp: TF, _ w: TF, _ b: TF) -> (TF, (TF) -> (TF, TF, TF)) {
    let output = matmul(inp, w) + b
    let pullback: (TF) -> (TF, TF, TF) = { upstream in
        let gradInp = matmul(upstream, w.transposed())
        let gradW = matmul(inp.transposed(), upstream)
        // Reduce the upstream gradient back down to the shape of the bias.
        let gradB = upstream.unbroadcast(to: b.shape)
        return (gradInp, gradW, gradB)
    }
    return (output, pullback)
}

In [None]:
/// Returns the mean squared error of `inp` against `targ` and a pullback
/// with respect to `inp`.
///
/// No gradient is produced for `targ`.
func mseVJP(_ inp: TF, _ targ: TF) -> (TF, (TF) -> (TF)) {
    let errors = inp.squeezingShape(at: -1) - targ

    // The VJP for the square-then-mean part already exists, so reuse it.
    let (loss, innerPullback) = mseInnerVJP(errors)

    // The pullback re-adds the trailing axis that was squeezed away above.
    let pullback: (TF) -> TF = { upstream in
        innerPullback(upstream).expandingShape(at: -1)
    }
    return (loss, pullback)
}

And then our forward and backward can be refactored in:

In [None]:
/// Runs the forward pass through the VJP primitives, then replays the
/// collected pullbacks in reverse order.
///
/// Reads the global parameters `w1`, `b1`, `w2` and `b2`.
///
/// - Parameters:
///   - inp: Input batch.
///   - targ: Target values for the MSE loss.
/// - Returns: The gradients with respect to `(inp, w1, b1, w2, b2)`, in that order.
func forwardAndBackward(_ inp: TF, _ targ: TF) -> (TF, TF, TF, TF, TF){
    //forward pass:
    let (l1, l1PB)    = linVJP(inp, w1, b1)
    let (l2, l2PB)    = reluVJP(l1)
    let (out, outPB)  = linVJP(l2, w2, b2)
    // We don't need the loss value in backward, only its pullback, so the
    // value is discarded instead of being bound to an unused name.
    let (_, lossPB) = mseVJP(out, targ)
    
    //backward pass:
    // The gradient of the loss with respect to itself is 1; it seeds the chain.
    let gradLoss = TF(1)
    let gradOut = lossPB(gradLoss)
    let (gradL2, gradW2, gradB2) = outPB(gradOut)
    let gradL1 = l2PB(gradL2)
    let (gradInp, gradW1, gradB1) = l1PB(gradL1)
    return (gradInp, gradW1, gradB1, gradW2, gradB2)
}

In [None]:
// Gradients from the VJP-based implementation, in the order (input, w1, b1, w2, b2).
let (ùõÅxTrain, ùõÅw1, ùõÅb1, ùõÅw2, ùõÅb2) = forwardAndBackward(xTrain, yTrainF)
// Check this is still all correct: compare against the gradients stored by the
// TensorWithGrad version.
testSame(inp.grad, ùõÅxTrain)
testSame(w1a.grad, ùõÅw1)
testSame(b1a.grad, ùõÅb1)
testSame(w2a.grad, ùõÅw2)
testSame(b2a.grad, ùõÅb2)


squareVJP(_:)
meanVJP(_:)
ùõÅmeanPB(x:ddx:)
ùõÅsquarePB(x:ddx:)


# Old


As we said before, swift operates in a different way. If we go back to what is happening in the backward pass, we go from the end result (our loss) which allows us to compute the gradient of that loss with respect to the last activations. Then consider all the layers we went through during the forward pass in the reversed order (from the last one to the first one) and for each of them, compute the gradients of the loss with respect to the inputs (and potentially parameters) from the gradients of the loss with respect to the outputs.

For instance if we go back to the basic `relu_grad` function we had in python:
```
def relu_grad(inp, out):
    # grad of relu with respect to input activations
    inp.g = (inp>0).float() * out.g
```
we explain how we infer the gradients of the loss with respect to the inputs of the relu (`inp.g`) from the gradients of the loss with respect to the outputs of that same relu (`out.g`).

Swift implements differentiation in a more functional way than PyTorch: there is no grad attribute, instead we just provide a function that takes the gradients with respect to the outputs and returns the gradients with respect to the inputs (and potentially, parameters).

The tricky thing is that this gradient computation often requires having the inputs/outputs of the layer: if we look at `relu_grad` up there, it needs to know the value of `inp`. That's why we don't just write a function that does `𝛁Out -> 𝛁Inp`, but a function that will take the input and return that pullback:
```
(Inp) -> ((𝛁Out) -> 𝛁Inp)
```

Let's look at what it gives us for the relu:

In [None]:
/// Builds the pullback of ReLU at `inp`.
///
/// - Parameter inp: The input the ReLU was evaluated at; captured by the closure.
/// - Returns: A closure mapping the gradient with respect to the ReLU output
///   to the gradient with respect to `inp`.
func diffRelu(_ inp: TF) -> ((TF) -> TF) {
    let pullback: (TF) -> TF = { upstream in
        let isPositive = inp .> 0
        return isPositive.selecting(upstream, TF(zeros: inp.shape))
    }
    return pullback
}

When we go through our relu layer, we won't just ask for `y = relu(x)`, but will also store the pullback `pb = diffRelu(x)` for the backward pass. This will automatically capture a reference to the value of `x` that is used inside that pullback. 

Other differentiation functions look a bit the same:

In [None]:
/// Builds the pullback of the MSE loss with respect to `inp`.
///
/// No gradient is returned for `targ`.
///
/// - Parameters:
///   - inp: Predictions; the last axis is squeezed away before comparing.
///   - targ: Target values.
/// - Returns: A closure mapping the gradient with respect to the loss to the
///   gradient with respect to `inp`.
func diffMse(_ inp: TF, _ targ: TF) -> ((TF) -> TF) {
    return { upstream in
        let errors = inp.squeezingShape(at: -1) - targ
        let localGrad = 2.0 * errors.expandingShape(at: -1) / Float(inp.shape[0])
        // Scale by the incoming gradient instead of ignoring it. With the
        // usual seed of 1 the result is unchanged.
        return upstream * localGrad
    }
}

`diffMse` doesn't return gradients for targ because the function isn't differentiable with respect to that variable in general (we could of course differentiate it in this case, but we don't need those gradients).

In [None]:
/// Builds the pullback of the linear layer at `(inp, w, b)`.
///
/// - Returns: A closure mapping the gradient with respect to the layer output
///   to the gradients with respect to `(inp, w, b)`.
func diffLin(_ inp: TF, _ w: TF, _ b: TF) -> ((TF) -> (TF, TF, TF)) {
    let pullback: (TF) -> (TF, TF, TF) = { upstream in
        let gradInp = matmul(upstream, w.transposed())
        let gradW = matmul(inp.transposed(), upstream)
        // Reduce the upstream gradient back down to the shape of the bias.
        let gradB = upstream.unbroadcast(to: b.shape)
        return (gradInp, gradW, gradB)
    }
    return pullback
}

`diffLin` returns the gradients with respect to all its inputs (more like inputs and parameters).

Then the backward and forward pass is written like this:

In [None]:
/// Runs the forward pass while collecting a pullback for each layer, then
/// applies the pullbacks in reverse order.
///
/// Reads the global parameters `w1`, `b1`, `w2` and `b2`.
///
/// - Parameters:
///   - inp: Input batch.
///   - targ: Target values for the MSE loss.
/// - Returns: The gradients with respect to `(inp, w1, b1, w2, b2)`, in that order.
func forwardAndBackward(_ inp: TF, _ targ: TF) -> (TF, TF, TF, TF, TF){
    //forward pass:
    let (l1, pbL1)    = (lin(inp, w1, b1), diffLin(inp, w1, b1))
    let (l2, pbL2)    = (myRelu(l1), diffRelu(l1))
    let (out, pbOut)  = (lin(l2, w2, b2), diffLin(l2, w2, b2))
    // We don't need the loss value in backward, only its pullback, so the
    // value is discarded instead of being bound to an unused name.
    let (_, pbLoss) = (mse(out, targ), diffMse(out, targ))
    
    //backward pass:
    // The gradient of the loss with respect to itself is 1; it seeds the chain.
    let gradLoss = TF(1)
    let gradOut = pbLoss(gradLoss)
    let (gradL2, gradW2, gradB2) = pbOut(gradOut)
    let gradL1 = pbL2(gradL2)
    let (gradInp, gradW1, gradB1) = pbL1(gradL1)
    return (gradInp, gradW1, gradB1, gradW2, gradB2)
}

In [None]:
// Gradients from the pullback-based implementation, in the order (input, w1, b1, w2, b2).
let (ùõÅxTrain, ùõÅw1, ùõÅb1, ùõÅw2, ùõÅb2) = forwardAndBackward(xTrain, yTrainF)

In [None]:
// Check the gradients computed both ways are the same: the pullback version
// against the gradients stored by the TensorWithGrad version.
testNearZero(ùõÅxTrain - inp.grad)
testNearZero(ùõÅw1     - w1a.grad)
testNearZero(ùõÅb1     - b1a.grad)
testNearZero(ùõÅw2     - w2a.grad)
testNearZero(ùõÅb2     - b2a.grad)

## Using the S4TF Language Integrated Autodiff

Let's compare to the language-integrated Swift for TensorFlow autodiff now. We have to mark the function as `@differentiable`.  This informs the compiler that we want it to automatically generate its gradients, and causes it to emit errors if there is anything contributing to the result of the function that cannot be differentiated.

The `@differentiable` attribute is normally optional in a S4TF standalone environment, but is currently required in Jupyter notebooks.  The S4TF team is planning to relax this limitation over time.

In [None]:
/// Forward pass and MSE loss as a single differentiable function, with all
/// parameters passed in explicitly.
///
/// - Parameters:
///   - inp: Input batch.
///   - targ: Target values.
///   - w1: First-layer weights.
///   - b1: First-layer bias.
///   - w2: Second-layer weights.
///   - b2: Second-layer bias.
/// - Returns: The mean squared error of the model output against `targ`.
@differentiable
func forward(_ inp: TF, _ targ: TF, w1: TF, b1: TF, w2: TF, b2: TF) -> TF {
    let hidden = relu(matmul(inp, w1) + b1)
    let prediction = matmul(hidden, w2) + b2
    let errors = prediction.squeezingShape(at: -1) - targ
    return errors.squared().mean()
}

Then we can ask for the gradients w.r.t. any individual parameter like this: (𝛁₂ is for the second way of computing gradients, not gradients squared, or second order gradients)

In [None]:
// One `gradient` call per quantity. Each closure parameter shadows the global
// of the same name, so only that argument is differentiated; the others are
// the captured globals.
let ùõÅ‚ÇÇxTrain = gradient(at: xTrain) {xTrain in forward(xTrain, yTrainF, w1:w1, b1:b1, w2:w2, b2:b2)}
let ùõÅ‚ÇÇw1 = gradient(at: w1) {w1 in forward(xTrain, yTrainF, w1:w1, b1:b1, w2:w2, b2:b2)}
let ùõÅ‚ÇÇb1 = gradient(at: b1) {b1 in forward(xTrain, yTrainF, w1:w1, b1:b1, w2:w2, b2:b2)}
let ùõÅ‚ÇÇw2 = gradient(at: w2) {w2 in forward(xTrain, yTrainF, w1:w1, b1:b1, w2:w2, b2:b2)}
let ùõÅ‚ÇÇb2 = gradient(at: b2) {b2 in forward(xTrain, yTrainF, w1:w1, b1:b1, w2:w2, b2:b2)}

// Check that they agree with the manually calculated gradients.
testNearZero(ùõÅ‚ÇÇxTrain - ùõÅxTrain)
testNearZero(ùõÅ‚ÇÇw1     - ùõÅw1)
testNearZero(ùõÅ‚ÇÇb1     - ùõÅb1)
testNearZero(ùõÅ‚ÇÇw2     - ùõÅw2)
testNearZero(ùõÅ‚ÇÇb2     - ùõÅb2)

You can also ask for gradients with respect to multiple things at the same time, but unfortunately, current AD bugs prevent getting more than two gradients at a time.  We can do a little bit better than the above code like so:

In [None]:
// Differentiate with respect to two quantities per call, which takes three
// calls instead of five.
let (ùõÅ‚ÇÉxTrain, ùõÅ‚ÇÉw1) = gradient(at: xTrain, w1) {
    xTrain, w1 in forward(xTrain, yTrainF, w1:w1, b1:b1, w2:w2, b2:b2)
}
let (ùõÅ‚ÇÉb1, ùõÅ‚ÇÉw2) = gradient(at: b1, w2) {
    b1, w2 in forward(xTrain, yTrainF, w1:w1, b1:b1, w2:w2, b2:b2)
}
let ùõÅ‚ÇÉb2 = gradient(at: b2) {b2 in forward(xTrain, yTrainF, w1:w1, b1:b1, w2:w2, b2:b2)}

// Check that they agree with the one-at-a-time gradients.
testNearZero(ùõÅ‚ÇÇxTrain - ùõÅ‚ÇÉxTrain)
testNearZero(ùõÅ‚ÇÇw1     - ùõÅ‚ÇÉw1)
testNearZero(ùõÅ‚ÇÇb1     - ùõÅ‚ÇÉb1)
testNearZero(ùõÅ‚ÇÇw2     - ùõÅ‚ÇÉw2)
testNearZero(ùõÅ‚ÇÇb2     - ùõÅ‚ÇÉb2)

This is currently pretty ugly, and even when the bugs are fixed, it still won't be very idiomatic.  A more common thing is to wrap up all your parameters into a struct, and differentiate w.r.t. all of them at the same time (which, when we refactor the code, will be our model itself).

Here is an example of that:


In [None]:
/// Bundles the input and all model parameters so they can be differentiated
/// together in a single call.
struct myParams: Differentiable {
    /// Input batch.
    public var x: TF
    /// First-layer weights.
    public var w1: TF
    /// First-layer bias.
    public var b1: TF
    /// Second-layer weights.
    public var w2: TF
    /// Second-layer bias.
    public var b2: TF
}

// Package the training inputs and the current parameters into one value.
let allParams = myParams(x: xTrain, w1: w1, b1: b1, w2: w2, b2: b2)

In [None]:
// We can now get all of the gradients at once with a single call, and a single forward computation.
let grads = gradient(at: allParams) {
  allParams in
    forward(allParams.x, yTrainF,
            w1: allParams.w1, 
            b1: allParams.b1,
            w2: allParams.w2, 
            b2: allParams.b2)
}

// Check that this still calculates the same thing.
testNearZero(ùõÅ‚ÇÇxTrain  - grads.x)
testNearZero(ùõÅ‚ÇÇw1      - grads.w1)
testNearZero(ùõÅ‚ÇÇb1      - grads.b1)
testNearZero(ùõÅ‚ÇÇw2      - grads.w2)
testNearZero(ùõÅ‚ÇÇb2      - grads.b2)

If you wanted the value for your loss as well as the gradients, you just have to use `valueWithGradient`.

In [None]:
// Same computation as `gradient(at:)`, but the loss value is returned alongside the gradients.
let (loss,grads) = valueWithGradient(at: allParams) { 
    allParams in forward(allParams.x, yTrainF, w1: allParams.w1, b1: allParams.b1, w2: allParams.w2, b2: allParams.b2)
}

// The gradients must still match the ones computed one at a time.
testNearZero(ùõÅ‚ÇÇxTrain  - grads.x)
testNearZero(ùõÅ‚ÇÇw1      - grads.w1)
testNearZero(ùõÅ‚ÇÇb1      - grads.b1)
testNearZero(ùõÅ‚ÇÇw2      - grads.w2)
testNearZero(ùõÅ‚ÇÇb2      - grads.b2)

In terms of timing our implementation gives:

In [None]:
// Time the hand-written forward and backward pass; the returned gradients are discarded.
time(repeating: 10) { _ = forwardAndBackward(xTrain, yTrainF) }

In [None]:
// Time the language-integrated autodiff on the same computation, for comparison
// with the hand-written version; the result is discarded.
time(repeating: 10) {
    _ = valueWithGradient(at: allParams) { 
        allParams in forward(allParams.x, 
                             yTrainF, 
                             w1: allParams.w1, 
                             b1: allParams.b1, 
                             w2: allParams.w2, 
                             b2: allParams.b2)
    }
}

### Refactor with valueWithPullback

Now one thing you will have noticed, is that in our forward and backward, we often ask for a value and the pullback at the same time, that's why it's often implemented together in the primitives of S4TF:

### Export

In [None]:
import NotebookExport
// Export this notebook using the "FastaiNotebook_" prefix and print the result.
// NOTE(review): presumably only cells marked `//export` are written out — confirm
// against the NotebookExport documentation.
let exporter = NotebookExport(Path.cwd/"02_fully_connected.ipynb")
print(exporter.export(usingPrefix: "FastaiNotebook_"))