In [1]:
%install '.package(path: "$cwd/FastaiNotebook_02a_why_sqrt5")' FastaiNotebook_02a_why_sqrt5

Installing packages:
	.package(path: "/home/ubuntu/fastai_docs/dev_swift/FastaiNotebook_02a_why_sqrt5")
		FastaiNotebook_02a_why_sqrt5
With SwiftPM flags: []
Working in: /tmp/tmpcf250x3g/swift-install
Fetching https://github.com/mxcl/Path.swift
Fetching https://github.com/JustHTTP/Just
Completed resolution in 2.06s
Cloning https://github.com/mxcl/Path.swift
Resolving https://github.com/mxcl/Path.swift at 0.16.2
Cloning https://github.com/JustHTTP/Just
Resolving https://github.com/JustHTTP/Just at 0.7.1
Compile Swift Module 'Just' (1 sources)
Compile Swift Module 'Path' (9 sources)
Compile Swift Module 'FastaiNotebook_02a_why_sqrt5' (5 sources)
Compile Swift Module 'jupyterInstalledPackages' (1 sources)
Linking ./.build/x86_64-unknown-linux/debug/libjupyterInstalledPackages.so
Initializing Swift...
Installation complete!


In [2]:
import FastaiNotebook_02a_why_sqrt5

In [3]:
// export
import Path
import TensorFlow

In [4]:
// export
public typealias TI = Tensor<Int32>

In [5]:
let y = TI(repeating: 0, shape: [1024])

### Data

In [6]:
var (xTrain,yTrain,xValid,yValid) = loadMNIST(path: Path.home/".fastai"/"data"/"mnist_tst", flat: true)

In [7]:
let trainMean = xTrain.mean()
let trainStd  = xTrain.standardDeviation()

In [8]:
xTrain = normalize(xTrain, mean: trainMean, std: trainStd)
xValid = normalize(xValid, mean: trainMean, std: trainStd)

In [9]:
let (n,m) = (xTrain.shape[0],xTrain.shape[1])
let c = yTrain.max().scalarized()+1
print(n,m,c)

60000 784 10


In [10]:
let nHid = 50

In [11]:
public struct MyModel: Layer {
    public var layer1: FADense<Float>
    public var layer2: FADense<Float>
    
    public init(nIn: Int, nHid: Int, nOut: Int){
        layer1 = FADense(inputSize: nIn, outputSize: nHid, activation: relu)
        layer2 = FADense(inputSize: nHid, outputSize: nOut)
    }
    
    @differentiable
    public func applied(to input: Tensor<Float>) -> Tensor<Float> {
        return input.sequenced(through: layer1, layer2)
    }
}

In [12]:
var model = MyModel(nIn: m, nHid: nHid, nOut: Int(c))

In [13]:
let pred = model.applied(to: xTrain)

### Cross entropy loss

In [14]:
func logSoftmax<Scalar>(_ activations: Tensor<Scalar>) -> Tensor<Scalar> where Scalar:TensorFlowFloatingPoint{
    let exped = exp(activations) 
    return log(exped / exped.sum(alongAxes: -1))
}

In [15]:
let smPred = logSoftmax(pred)

In [16]:
yTrain[0..<3]

[5, 0, 4]


In [17]:
(smPred[0][5],smPred[1][0],smPred[2][4])

‚ñø 3 elements
  - .0 : -3.4211059
  - .1 : -3.4711692
  - .2 : -2.6047492


There is no fancy indexing yet so we have to use gather to get the indices we want out of our softmaxed predictions.

In [18]:
func nll<Scalar>(_ input: Tensor<Scalar>, _ target :TI) -> Tensor<Scalar> 
    where Scalar:TensorFlowFloatingPoint{
        let idx: TI = Raw.range(start: Tensor(0), limit: Tensor(60000), delta: Tensor(1))
        let indices = Raw.concat(concatDim: Tensor(1), [idx.expandingShape(at: 1), target.expandingShape(at: 1)])
        let losses = Raw.gatherNd(params: input, indices: indices)
        return -losses.mean()
    }

In [19]:
nll(smPred, yTrain)

3.2142696


In [20]:
time(repeating: 100){ let _ = nll(smPred, yTrain) }

0.9624212600000003 ms


Simplify `logSoftmax` with log formulas.

In [21]:
func logSoftmax<Scalar>(_ activations: Tensor<Scalar>) -> Tensor<Scalar> where Scalar:TensorFlowFloatingPoint{
    return activations - log(exp(activations).sum(alongAxes: -1))
}

In [22]:
let smPred = logSoftmax(pred)

In [23]:
nll(smPred, yTrain)

3.2142696


Use LogSumExp trick

In [24]:
smPred.max(alongAxes: -1).shape

‚ñø TensorShape
  ‚ñø dimensions : 2 elements
    - 0 : 60000
    - 1 : 1


In [25]:
func logSumExp<Scalar>(_ x: Tensor<Scalar>) -> Tensor<Scalar> where Scalar:TensorFlowFloatingPoint{
    let m = x.max(alongAxes: -1)
    return m + log(exp(x-m).sum(alongAxes: -1))
}

In [26]:
func logSoftmax<Scalar>(_ activations: Tensor<Scalar>) -> Tensor<Scalar> where Scalar:TensorFlowFloatingPoint{
    return activations - logSumExp(activations)
}

In [27]:
let smPred = logSoftmax(pred)

In [28]:
nll(smPred, yTrain)

3.2142696


In S4TF nll loss is combined with softmax in:

In [29]:
let loss = softmaxCrossEntropy(logits: pred, labels: yTrain)
loss

3.2142696


In [30]:
time(repeating: 100){ let _ = nll(logSoftmax(pred), yTrain)}

1.14722203 ms


In [31]:
time(repeating: 100){ let _ = softmaxCrossEntropy(logits: pred, labels: yTrain)}

1.1090175899999994 ms


## Basic training loop

Basically the training loop repeats over the following steps:
- get the output of the model on a batch of inputs
- compare the output to the labels we have and compute a loss
- calculate the gradients of the loss with respect to every parameter of the model
- update said parameters with those gradients to make them a little bit better

In [32]:
// export
public func accuracy(_ output: TF, _ target: TI) -> TF{
    let corrects = Tensor<Float>(output.argmax(squeezingAxis: 1) .== target)
    return corrects.mean()
}

In [33]:
print(accuracy(pred, yTrain))

0.086333334


In [34]:
let bs=64                         // batch size
let xb = xTrain[0..<bs]          // a mini-batch from x
let preds = model.applied(to: xb) //predictions
print(preds[0], preds.shape)

[  -0.478586, 0.100090146,  -0.9420901, 0.009722412, -0.17873938, -0.44098502,  0.38839993,
  -1.6612036,   1.2247918,   2.3043802] TensorShape(dimensions: [64, 10])


In [35]:
let yb = yTrain[0..<bs]
let loss = softmaxCrossEntropy(logits: preds, labels: yb)

In [36]:
print(accuracy(preds, yb))

0.0625


In [37]:
let lr:Float = 0.5   // learning rate
let epochs = 1      // how many epochs to train for

Then we can go

In [38]:
let (loss, grads) = model.valueWithGradient { model -> Tensor<Float> in
    let preds = model.applied(to: xb)
    return softmaxCrossEntropy(logits: preds, labels: yb)
}

Loop by hand

In [39]:
for epoch in 1...epochs{
    for i in 0..<((n-1)/bs){
        let startIdx = i * bs
        let endIdx = startIdx + bs
        let xb = xTrain[startIdx..<endIdx]
        let yb = yTrain[startIdx..<endIdx]
        let (loss, grads) = model.valueWithGradient { model -> TF in
            let preds = model.applied(to: xb)
            return softmaxCrossEntropy(logits: preds, labels: yb)
        }
        model.layer1.weight -= lr * grads.layer1.weight
        model.layer1.bias   -= lr * grads.layer1.bias
        model.layer2.weight -= lr * grads.layer2.weight
        model.layer2.bias   -= lr * grads.layer2.bias
    }
}

In [40]:
let preds = model.applied(to: xValid)
accuracy(preds, yValid)

0.8853


86% in one epoch, not too bad!

Naming all the parameters is a bit boring. We can use `AllDifferentiableVariables` objects to access them all.

In [41]:
for epoch in 1...epochs{
    for i in 0..<((n-1)/bs){
        let startIdx = i * bs
        let endIdx = startIdx + bs
        let xb = xTrain[startIdx..<endIdx]
        let yb = yTrain[startIdx..<endIdx]
        let (loss, grads) = model.valueWithGradient { model -> TF in
            let preds = model.applied(to: xb)
            return softmaxCrossEntropy(logits: preds, labels: yb)
        }
        var parameters = model.allDifferentiableVariables
        for kp in parameters.recursivelyAllWritableKeyPaths(to: TF.self){ 
            parameters[keyPath: kp] -= lr * grads[keyPath:kp]
        }
    }
}

Then we can use a S4TF optimizer to do the step for us.

In [42]:
let optimizer = SGD(for: model, learningRate: lr)

In [43]:
for epoch in 1...epochs{
    for i in 0..<((n-1)/bs){
        let startIdx = i * bs
        let endIdx = startIdx + bs
        let xb = xTrain[startIdx..<endIdx]
        let yb = yTrain[startIdx..<endIdx]
        let (loss, grads) = model.valueWithGradient { model -> TF in
            let preds = model.applied(to: xb)
            return softmaxCrossEntropy(logits: preds, labels: yb)
        }
        optimizer.update(&model.allDifferentiableVariables, along: grads)
    }
}

## Dataset

We can create a swift `Dataset` from our arrays. It will automatically batch things for us.

In [44]:
// export
public struct DataBatch<Inputs: Differentiable & TensorGroup, Labels: TensorGroup>: TensorGroup {
    public var xb: Inputs
    public var yb: Labels    
    
    public init(xb: Inputs, yb: Labels){ (self.xb,self.yb) = (xb,yb) }
}

In [45]:
let train_ds:Dataset<DataBatch> = Dataset(elements:DataBatch(xb:xTrain, yb:yTrain)).batched(bs)

In [46]:
for epoch in 1...epochs{
    for batch in train_ds{
        let (loss, grads) = model.valueWithGradient { model -> TF in
            let preds = model.applied(to: batch.xb)
            return softmaxCrossEntropy(logits: preds, labels: batch.yb)
        }
        optimizer.update(&model.allDifferentiableVariables, along: grads)
    }
}

This `Dataset` can also do the shuffle for us:

In [47]:
for epoch in 1...epochs{
    for batch in train_ds.shuffled(sampleCount: 60000, randomSeed: 42){
        let (loss, grads) = model.valueWithGradient { model -> TF in
            let preds = model.applied(to: batch.xb)
            return softmaxCrossEntropy(logits: preds, labels: batch.yb)
        }
        optimizer.update(&model.allDifferentiableVariables, along: grads)
    }
}

### Training loop

In [48]:
public func train<Opt: Optimizer, Labels:TensorGroup>(
    _ model: inout Opt.Model,
    on dataset: Dataset<DataBatch<Opt.Model.Input, Labels>>,
    using optimizer: inout Opt,
    lossFunc: @escaping @differentiable (Opt.Model.Output, @nondiff Labels) -> Tensor<Opt.Scalar>
) where Opt.Model: Layer,
        Opt.Model.Input: TensorGroup,
        Opt.Model.CotangentVector == Opt.Model.AllDifferentiableVariables,
        Opt.Scalar: TensorFlowFloatingPoint
{
    for batch in dataset {
        let (loss, ùõÅmodel) = model.valueWithGradient { model -> Tensor<Opt.Scalar> in 
            let pred = model.applied(to: batch.xb)                                      
            return lossFunc(pred, batch.yb)
        }
        optimizer.update(&model.allDifferentiableVariables, along: ùõÅmodel)
    }
}

In [49]:
var optimizer = SGD(for: model, learningRate: lr)

In [None]:
train(&model, on: train_ds, using: &optimizer, lossFunc: softmaxCrossEntropy)

: 

### Export

In [None]:
notebookToScript(fname: (Path.cwd / "03_minibatch_training.ipynb").string)