# XLA compilation

In [None]:
%install '.package(path: "$cwd/FastaiNotebook_07_batchnorm")' FastaiNotebook_07_batchnorm

Installing packages:
	.package(path: "/home/ubuntu/fastai_docs/dev_swift/FastaiNotebook_07_batchnorm")
		FastaiNotebook_07_batchnorm
With SwiftPM flags: []
Working in: /tmp/tmpbkswux0p/swift-install
Fetching https://github.com/mxcl/Path.swift
Fetching https://github.com/JustHTTP/Just
Completed resolution in 3.46s
Cloning https://github.com/JustHTTP/Just
Resolving https://github.com/JustHTTP/Just at 0.7.1
Cloning https://github.com/mxcl/Path.swift
Resolving https://github.com/mxcl/Path.swift at 0.16.2
Compile Swift Module 'Just' (1 sources)
Compile Swift Module 'Path' (9 sources)
Compile Swift Module 'FastaiNotebook_07_batchnorm' (11 sources)
Compile Swift Module 'jupyterInstalledPackages' (1 sources)
Linking ./.build/x86_64-unknown-linux/debug/libjupyterInstalledPackages.so
Initializing Swift...
Installation complete!


In [None]:
import FastaiNotebook_07_batchnorm
%include "EnableIPythonDisplay.swift"
IPythonDisplay.shell.enable_matplotlib("inline")

('inline', 'module://ipykernel.pylab.backend_inline')


In [None]:
// export
import Path
import TensorFlow
import Python

In [None]:
// Bind matplotlib's `pyplot` module through the Python interop layer.
let plt = Python.import("matplotlib.pyplot")

Glue code that lets XLA compilation and automatic differentiation (AD) work together:

In [None]:
/// Pairs a function input with a cotangent so that both can be passed to a traced
/// function as a single `TensorGroup` argument.
struct PullbackArgs<T, U>: TensorGroup where T: TensorGroup, U: TensorGroup {
    /// The input at which the pullback is evaluated.
    let input: T
    /// The cotangent that is fed to the pullback.
    let cotangent: U
}

/// Reference-type box holding a differentiable function from `Input` to `Output`.
class CompiledFunction<Input, Output>
    where Input: Differentiable & TensorGroup, Output: Differentiable & TensorGroup {
    /// The wrapped differentiable function.
    let f: @differentiable (Input) -> Output

    /// Stores `function` so that it can later be invoked through `f`.
    init(_ function: @escaping @differentiable (Input) -> Output) {
        f = function
    }
}

/// Wraps the differentiable function `fn` in a `CompiledFunction` whose forward pass and
/// pullback both go through `_graph(_:useXLA:)` with `useXLA: true`.
///
/// - Parameter fn: The differentiable function to wrap. Its input, output and their
///   cotangent vectors must all be `TensorGroup`s.
/// - Returns: A `CompiledFunction` whose `f` evaluates the traced forward function and whose
///   pullback evaluates the traced pullback.
func xlaCompiled<T : Differentiable & TensorGroup, U : Differentiable & TensorGroup>(
    _ fn: @escaping @differentiable (T) -> U) -> CompiledFunction<T, U>
    where T.CotangentVector : TensorGroup, U.CotangentVector : TensorGroup {
    // Forward pass: `fn` itself, passed through `_graph` with XLA enabled.
    let xlaCompiledFn: (T) -> U = _graph(fn, useXLA: true)
    // Backward pass: the traced closure receives the original input together with the
    // cotangent (bundled in `PullbackArgs`) and builds the pullback of `fn` at that input
    // inside the closure before applying it to the cotangent.
    let xlaCompiledPullback = _graph(
        { (pbArgs: PullbackArgs<T, U.CotangentVector>) in
            pullback(at: pbArgs.input, in: fn)(pbArgs.cotangent) },
        useXLA: true
    )
    // Stitch the two traced functions back together into one differentiable function: the
    // value comes from the forward trace, and the pullback captures `x` so it can hand both
    // the input and the incoming cotangent `v` to the backward trace.
    return CompiledFunction(differentiableFunction { x in
        (value: xlaCompiledFn(x), pullback: { v in
            xlaCompiledPullback(PullbackArgs(input: x, cotangent: v))})
    })
}

In [None]:
/// Batch normalization whose training-mode computation is routed through a kernel wrapped
/// by `xlaCompiled`.
///
/// In training mode the batch mean and variance are computed over axes 0, 1 and 2 of the
/// input, used to normalize it, and blended into the running statistics. In inference mode
/// the stored running statistics are used instead.
struct XLABatchNorm<Scalar: TensorFlowFloatingPoint>: LearningPhaseDependent, Norm {
    // Configuration hyperparameters
    /// Weight given to the previous running statistic when it is updated; the current batch
    /// statistic receives `1 - momentum`.
    @noDerivative let momentum: Scalar
    /// Value added to the variance before taking the reciprocal square root.
    @noDerivative let epsilon: Scalar
    // Running statistics
    /// Running mean, held in a `Reference` so that it can be updated from non-mutating methods.
    @noDerivative let runningMean: Reference<Tensor<Scalar>>
    /// Running variance, held in a `Reference` so that it can be updated from non-mutating methods.
    @noDerivative let runningVariance: Reference<Tensor<Scalar>>
    // Trainable parameters
    /// Multiplicative parameter applied after normalization.
    var scale: Tensor<Scalar>
    /// Additive parameter applied after normalization.
    var offset: Tensor<Scalar>
    // TODO: check why these aren't being synthesized
    typealias Input = Tensor<Scalar>
    typealias Output = Tensor<Scalar>
    @noDerivative public var delegate: LayerDelegate<Output> = LayerDelegate()
    
    // needed for tracing    
    /// Every value the training kernel reads, bundled into one `TensorGroup`. The scalar
    /// hyperparameters are carried as tensors.
    struct TrainingKernelInput: TensorGroup, Differentiable, AdditiveArithmetic {
        let input: Tensor<Scalar>
        let scale: Tensor<Scalar>
        let offset: Tensor<Scalar>
        let runningMean: Tensor<Scalar>
        let runningVariance: Tensor<Scalar>
        let momentum: Tensor<Scalar>
        let epsilon: Tensor<Scalar>
    }
    
    /// Everything the training kernel produces: the normalized activations and the updated
    /// running statistics.
    struct TrainingKernelOutput: TensorGroup, Differentiable, AdditiveArithmetic {
        let normalized: Tensor<Scalar>
        let newRunningMean: Tensor<Scalar>
        let newRunningVariance: Tensor<Scalar>
    }
    
    /// Training-mode computation, written as a function of a single `TensorGroup` so that it
    /// can be passed to `xlaCompiled`.
    ///
    /// - Parameter input: The activations, parameters, running statistics and hyperparameters.
    /// - Returns: The normalized activations and the new running mean and variance.
    static func trainingKernel(_ input: TrainingKernelInput) -> TrainingKernelOutput {
        // Batch statistics over axes 0, 1 and 2.
        let mean = input.input.mean(alongAxes: [0, 1, 2])
        let variance = input.input.variance(alongAxes: [0, 1, 2])
        // Exponential moving average: old * momentum + new * (1 - momentum).
        let invMomentum = Tensor<Scalar>(1) - input.momentum
        let newRunningMean = input.runningMean * input.momentum + mean * invMomentum
        let newRunningVariance = input.runningVariance * input.momentum + variance * invMomentum
        // Normalize with the *batch* statistics, then apply scale and offset.
        let normalizer = rsqrt(variance + input.epsilon) * input.scale
        let normalized = (input.input - mean) * normalizer + input.offset
        return TrainingKernelOutput(
            normalized: normalized,
            newRunningMean: newRunningMean,
            newRunningVariance: newRunningVariance
        )
    }
    
    /// `trainingKernel` wrapped by `xlaCompiled`; created once in `init`.
    @noDerivative let compiledTrainingKernel: CompiledFunction<TrainingKernelInput, TrainingKernelOutput>
    
    /// Creates a layer with `scale` set to ones and `offset` set to zeros, both of shape
    /// `[featureCount]`.
    ///
    /// - Parameters:
    ///   - featureCount: Length of the `scale` and `offset` vectors.
    ///   - momentum: Weight of the previous running statistic in each update.
    ///   - epsilon: Value added to the variance before the reciprocal square root.
    init(featureCount: Int, momentum: Scalar, epsilon: Scalar = 1e-5) {
        self.momentum = momentum
        self.epsilon = epsilon
        self.scale = Tensor(ones: [featureCount])
        self.offset = Tensor(zeros: [featureCount])
        self.runningMean = Reference(Tensor(0))
        // Start the running variance at 1 rather than 0: with 0, inference before any
        // training step would scale activations by 1/sqrt(epsilon), and the moving average
        // would start out biased towards zero.
        self.runningVariance = Reference(Tensor(1))
        // Compile the training kernel to a TensorFlow graph. TensorFlow will then
        // compile it to XLA once for each set of input shapes (hopefully!).
        self.compiledTrainingKernel = xlaCompiled(XLABatchNorm<Scalar>.trainingKernel)
    }
    
    /// Creates a layer with a momentum of 0.9.
    init(featureCount: Int, epsilon: Scalar = 1e-5) {
        self.init(featureCount: featureCount, momentum: 0.9, epsilon: epsilon)
    }
    
    /// Normalizes `input` with its own batch statistics and updates the running statistics.
    ///
    /// - Parameter input: The activations to normalize.
    /// - Returns: The normalized activations.
    @differentiable
    func forwardTraining(to input: Tensor<Scalar>) -> Tensor<Scalar> {
        let kernelInput = TrainingKernelInput(
            input: input,
            scale: scale,
            offset: offset,
            runningMean: runningMean.value,
            runningVariance: runningVariance.value,
            momentum: Tensor(momentum),
            epsilon: Tensor(epsilon)
        )

        let kernelOutput = compiledTrainingKernel.f(kernelInput)
        
        // Side effect: store the updated running statistics for later inference.
        self.runningMean.value = kernelOutput.newRunningMean
        self.runningVariance.value = kernelOutput.newRunningVariance
        
        return kernelOutput.normalized
    }
    
    /// Normalizes `input` with the stored running statistics; does not use the compiled kernel.
    ///
    /// - Parameter input: The activations to normalize.
    /// - Returns: The normalized activations.
    @differentiable
    func forwardInference(to input: Tensor<Scalar>) -> Tensor<Scalar> {
        let normalizer = rsqrt(self.runningVariance.value + epsilon) * scale
        return (input - self.runningMean.value) * normalizer + offset
    }
}

In [None]:
/// Running batch normalization whose training-mode tensor computation is routed through a
/// kernel wrapped by `xlaCompiled`.
///
/// Instead of a running mean and variance, the layer keeps running sums, running sums of
/// squares and a running count, and derives the mean and variance from those in both
/// training and inference mode.
struct XLARunningBatchNorm<Scalar: TensorFlowFloatingPoint>: LearningPhaseDependent, Norm {
    // Configuration hyperparameters
    /// Base momentum; the effective momentum used per step is derived from it and the batch size.
    @noDerivative let momentum: Scalar
    /// Value added to the variance before taking the reciprocal square root.
    @noDerivative let epsilon: Scalar
    // Running statistics
    /// Running sum of the activations.
    @noDerivative let runningSum: Reference<Tensor<Scalar>>
    /// Running sum of the squared activations.
    @noDerivative let runningSumOfSquares: Reference<Tensor<Scalar>>
    /// Running count of elements per channel; starts at 0.
    @noDerivative let runningCount: Reference<Scalar>
    /// Total of the batch sizes seen by `forwardTraining`.
    @noDerivative let samplesSeen: Reference<Scalar>
    // Trainable parameters
    /// Multiplicative parameter applied after normalization.
    var scale: Tensor<Scalar>
    /// Additive parameter applied after normalization.
    var offset: Tensor<Scalar>
    // TODO: check why these aren't being synthesized
    typealias Input = Tensor<Scalar>
    typealias Output = Tensor<Scalar>
    @noDerivative public var delegate: LayerDelegate<Output> = LayerDelegate()
    
    // needed for tracing
    /// Every value the training kernel reads, bundled into one `TensorGroup`. Scalars
    /// computed outside the kernel are carried as tensors.
    struct TrainingKernelInput: TensorGroup, Differentiable, AdditiveArithmetic { // needs AA?
        let input: Tensor<Scalar>
        let mom: Tensor<Scalar>
        let runningSum: Tensor<Scalar>
        let runningSumOfSquares: Tensor<Scalar>
        let newRunningCount: Tensor<Scalar>
        let scale: Tensor<Scalar>
        let offset: Tensor<Scalar>
        let epsilon: Tensor<Scalar>
    }
    
    /// Everything the training kernel produces: the normalized activations and the updated
    /// running sums.
    struct TrainingKernelOutput: TensorGroup, Differentiable, AdditiveArithmetic {
        let normalized: Tensor<Scalar>
        let newRunningSum: Tensor<Scalar>
        let newRunningSumOfSquares: Tensor<Scalar>
    }
    
    /// Training-mode computation, written as a function of a single `TensorGroup` so that it
    /// can be passed to `xlaCompiled`.
    ///
    /// - Parameter input: The activations, parameters, running sums and precomputed scalars.
    /// - Returns: The normalized activations and the new running sums.
    static func trainingKernel(_ input: TrainingKernelInput) -> TrainingKernelOutput {
        // Batch sums over axes 0, 1 and 2.
        let sum = input.input.sum(alongAxes: [0, 1, 2])
        let sumOfSquares = (input.input * input.input).sum(alongAxes: [0, 1, 2])
        // Exponential moving average: mom * old + (1 - mom) * new.
        let invmom = Tensor<Scalar>(1) - input.mom
        let newRunningSum = input.mom * input.runningSum + invmom * sum
        let newRunningSumOfSquares = input.mom * input.runningSumOfSquares + invmom * sumOfSquares
        // Mean and variance derived from the *updated* running statistics:
        // variance = E[x^2] - E[x]^2.
        let mean = newRunningSum / input.newRunningCount
        let variance = newRunningSumOfSquares / input.newRunningCount - mean * mean
        let normalizer = rsqrt(variance + input.epsilon) * input.scale
        let normalized = (input.input - mean) * normalizer + input.offset
        return TrainingKernelOutput(
            normalized: normalized,
            newRunningSum: newRunningSum,
            newRunningSumOfSquares: newRunningSumOfSquares
        )
    }
    
    /// `trainingKernel` wrapped by `xlaCompiled`; created once in `init`.
    @noDerivative let compiledTrainingKernel: CompiledFunction<TrainingKernelInput, TrainingKernelOutput>
    
    /// Creates a layer with `scale` set to ones and `offset` set to zeros, both of shape
    /// `[featureCount]`, and all running statistics set to zero.
    ///
    /// - Parameters:
    ///   - featureCount: Length of the `scale` and `offset` vectors.
    ///   - momentum: Base momentum for the running statistics.
    ///   - epsilon: Value added to the variance before the reciprocal square root.
    init(featureCount: Int, momentum: Scalar, epsilon: Scalar = 1e-5) {
        self.momentum = momentum
        self.epsilon = epsilon
        self.scale = Tensor(ones: [featureCount])
        self.offset = Tensor(zeros: [featureCount])
        self.runningSum = Reference(Tensor(0))
        self.runningSumOfSquares = Reference(Tensor(0))
        self.runningCount = Reference(Scalar(0))
        self.samplesSeen = Reference(Scalar(0))
        // Compile the training kernel to a TensorFlow graph. TensorFlow will then
        // compile it to XLA once for each set of input shapes (hopefully!).
        self.compiledTrainingKernel = xlaCompiled(XLARunningBatchNorm<Scalar>.trainingKernel)
    }
    
    /// Creates a layer with a base momentum of 0.9.
    init(featureCount: Int, epsilon: Scalar = 1e-5) {
        self.init(featureCount: featureCount, momentum: 0.9, epsilon: epsilon)
    }
    
    /// Updates the running statistics with `input` and normalizes it with the updated values.
    ///
    /// - Parameter input: The activations to normalize. Axis 0 is read as the batch size and
    ///   axis 3 as the channel count, so the tensor must have at least four axes.
    /// - Returns: The normalized activations.
    @differentiable
    func forwardTraining(to input: Tensor<Scalar>) -> Tensor<Scalar> {
        let (batch, channels) = (input.shape[0], Scalar(input.shape[3]))
        // it's fine to do scalar computation outside the JIT
        // Clamp the denominator to 1 so that a batch of size 1 does not divide by zero, which
        // would make `mom` infinite and `newRunningCount` NaN. Batches of two or more
        // samples are unaffected.
        let mom = momentum / sqrt(Swift.max(Scalar(batch) - 1, 1))
        // Number of elements per channel in this batch.
        let count = Scalar(input.scalarCount).withoutDerivative() / channels
        let newRunningCount = mom * runningCount.value + (1 - mom) * count
        
        let kernelInput = TrainingKernelInput(
            input: input,
            mom: Tensor(mom),
            runningSum: runningSum.value,
            runningSumOfSquares: runningSumOfSquares.value,
            newRunningCount: Tensor(newRunningCount),
            scale: scale,
            offset: offset,
            epsilon: Tensor(epsilon)
        )

        let kernelOutput = compiledTrainingKernel.f(kernelInput)
        
        // Side effect: store the updated running statistics for later calls.
        self.runningSum.value = kernelOutput.newRunningSum
        self.runningSumOfSquares.value = kernelOutput.newRunningSumOfSquares
        self.runningCount.value = newRunningCount
        self.samplesSeen.value += Scalar(batch)
        
        return kernelOutput.normalized
    }
    
    /// Normalizes `input` with the stored running statistics; does not use the compiled kernel.
    ///
    /// NOTE: `runningCount` starts at 0, so calling this before any training step divides
    /// by zero.
    ///
    /// - Parameter input: The activations to normalize.
    /// - Returns: The normalized activations.
    @differentiable
    func forwardInference(to input: Tensor<Scalar>) -> Tensor<Scalar> {
        let mean = runningSum.value / runningCount.value
        let variance = runningSumOfSquares.value / runningCount.value - mean * mean
        let normalizer = rsqrt(variance + epsilon) * scale
        return (input - mean) * normalizer + offset
    }
}

In [None]:
/// Applies `conv` (a `FANoBiasConv2D` with `relu` activation) followed by `norm`
/// (an `XLABatchNorm`).
struct ConvXBN<Scalar: TensorFlowFloatingPoint>: Layer {
    typealias Input = Tensor<Scalar>
    typealias Output = Tensor<Scalar>

    /// The convolution applied first.
    var conv: FANoBiasConv2D<Scalar>
    /// The normalization applied to the convolution output.
    var norm: XLABatchNorm<Scalar>

    /// Creates the block.
    ///
    /// - Parameters:
    ///   - cIn: First argument forwarded to `FANoBiasConv2D`.
    ///   - cOut: Second argument forwarded to `FANoBiasConv2D`; also the normalization's
    ///     feature count.
    ///   - ks: Forwarded to `FANoBiasConv2D` as `ks`.
    ///   - stride: Forwarded to `FANoBiasConv2D` as `stride`.
    public init(_ cIn: Int, _ cOut: Int, ks: Int = 3, stride: Int = 1) {
        // TODO (when control flow AD works): use Conv2D without bias
        conv = FANoBiasConv2D(cIn, cOut, ks: ks, stride: stride, activation: relu)
        norm = XLABatchNorm(featureCount: cOut, epsilon: 1e-5)
    }

    /// Runs the convolution and then the normalization.
    @differentiable
    func forward(_ input: Tensor<Scalar>) -> Tensor<Scalar> {
        let convolved = conv(input)
        return norm(convolved)
    }

    /// `Layer` entry point; delegates to `forward(_:)`.
    @differentiable
    public func call(_ input: Tensor<Scalar>) -> Tensor<Scalar> {
        return forward(input)
    }
}

In [None]:
/// Applies `conv` (a `FANoBiasConv2D` with `relu` activation) followed by `norm`
/// (an `XLARunningBatchNorm`).
struct ConvXRBN<Scalar: TensorFlowFloatingPoint>: Layer {
    typealias Input = Tensor<Scalar>
    typealias Output = Tensor<Scalar>

    /// The convolution applied first.
    var conv: FANoBiasConv2D<Scalar>
    /// The normalization applied to the convolution output.
    var norm: XLARunningBatchNorm<Scalar>

    /// Creates the block.
    ///
    /// - Parameters:
    ///   - cIn: First argument forwarded to `FANoBiasConv2D`.
    ///   - cOut: Second argument forwarded to `FANoBiasConv2D`; also the normalization's
    ///     feature count.
    ///   - ks: Forwarded to `FANoBiasConv2D` as `ks`.
    ///   - stride: Forwarded to `FANoBiasConv2D` as `stride`.
    public init(_ cIn: Int, _ cOut: Int, ks: Int = 3, stride: Int = 1) {
        // TODO (when control flow AD works): use Conv2D without bias
        conv = FANoBiasConv2D(cIn, cOut, ks: ks, stride: stride, activation: relu)
        norm = XLARunningBatchNorm(featureCount: cOut, epsilon: 1e-5)
    }

    /// Runs the convolution and then the normalization.
    @differentiable
    func forward(_ input: Tensor<Scalar>) -> Tensor<Scalar> {
        let convolved = conv(input)
        return norm(convolved)
    }

    /// `Layer` entry point; delegates to `forward(_:)`.
    @differentiable
    public func call(_ input: Tensor<Scalar>) -> Tensor<Scalar> {
        return forward(input)
    }
}

In [None]:
/// A stack of `ConvXBN` blocks followed by global average pooling, flattening and a
/// dense layer.
public struct CnnModelXBN: Layer {
    /// One `ConvXBN` per entry of `filters`.
    public var convs: [ConvXBN<Float>]
    public var pool = FAGlobalAvgPool2D<Float>()
    public var flatten = Flatten<Float>()
    /// Final dense layer producing `nOut` outputs.
    public var linear: FADense<Float>
    
    /// Creates the model.
    ///
    /// - Parameters:
    ///   - channelIn: Channel count passed to the first convolution.
    ///   - nOut: Output size of the final dense layer.
    ///   - filters: Output channel count of each convolution, in order. May be empty, in
    ///     which case no convolutions are created and the dense layer takes `channelIn`
    ///     inputs.
    public init(channelIn: Int, nOut: Int, filters: [Int]){
        // Pair each block's input channels with its output channels:
        // (channelIn, filters[0]), (filters[0], filters[1]), ...
        let allFilters = [channelIn] + filters
        convs = zip(allFilters, filters).map { cIn, cOut in ConvXBN(cIn, cOut) }
        // Fall back to `channelIn` instead of force-unwrapping, so an empty `filters`
        // does not crash.
        linear = FADense<Float>(filters.last ?? channelIn, nOut)
    }
    
    /// Runs the input through the convolutions, pooling, flattening and the dense layer.
    @differentiable
    public func call(_ input: TF) -> TF {
        return input.sequenced(through: convs, pool, flatten, linear)
    }
}

In [None]:
/// A stack of `ConvXRBN` blocks followed by global average pooling, flattening and a
/// dense layer.
public struct CnnModelXRBN: Layer {
    /// One `ConvXRBN` per entry of `filters`.
    public var convs: [ConvXRBN<Float>]
    public var pool = FAGlobalAvgPool2D<Float>()
    public var flatten = Flatten<Float>()
    /// Final dense layer producing `nOut` outputs.
    public var linear: FADense<Float>
    
    /// Creates the model.
    ///
    /// - Parameters:
    ///   - channelIn: Channel count passed to the first convolution.
    ///   - nOut: Output size of the final dense layer.
    ///   - filters: Output channel count of each convolution, in order. May be empty, in
    ///     which case no convolutions are created and the dense layer takes `channelIn`
    ///     inputs.
    public init(channelIn: Int, nOut: Int, filters: [Int]){
        // Pair each block's input channels with its output channels:
        // (channelIn, filters[0]), (filters[0], filters[1]), ...
        let allFilters = [channelIn] + filters
        convs = zip(allFilters, filters).map { cIn, cOut in ConvXRBN(cIn, cOut) }
        // Fall back to `channelIn` instead of force-unwrapping, so an empty `filters`
        // does not crash.
        linear = FADense<Float>(filters.last ?? channelIn, nOut)
    }
    
    /// Runs the input through the convolutions, pooling, flattening and the dense layer.
    @differentiable
    public func call(_ input: TF) -> TF {
        return input.sequenced(through: convs, pool, flatten, linear)
    }
}

In [None]:
// Build the MNIST data bunch with `flat: false` and `bs: 512` (presumably: keep the image
// layout rather than flattening, batch size 512 — the logged shapes start with 512).
let data = mnistDataBunch(flat: false, bs: 512)

In [None]:
// Optimizer factory: SGD with a learning rate of 0.4.
func optFunc(_ model: CnnModelXBN) -> SGD<CnnModelXBN> {
    return SGD(for: model, learningRate: 0.4)
}

// Model factory: four `ConvXBN` blocks on 1-channel input, 10 outputs.
func modelInit() -> CnnModelXBN {
    return CnnModelXBN(channelIn: 1, nOut: 10, filters: [8, 16, 32, 32])
}

let learner = Learner(
    data: data,
    lossFunc: softmaxCrossEntropy,
    optFunc: optFunc,
    modelInit: modelInit
)
let recorder = learner.makeDefaultDelegates(metrics: [accuracy])
learner.addDelegates([
    learner.makeNormalize(mean: mnistStats.mean, std: mnistStats.std),
    learner.makeAddChannel(),
])

In [None]:
// Run `learner.fit(1)` (presumably one epoch) inside the `time` helper. `try!` turns any
// thrown error into a crash.
time { try! learner.fit(1) }

Fatal error: Dimension -4 must be >= 0------------ 0.00% [0/118 00:00<00:00]
	 [[{{node Fill_376}}]]: file /swift-base/swift/stdlib/public/TensorFlow/CompilerRuntime.swift, line 278
Current stack trace:
0    libswiftCore.so                    0x00007fc49826de00 _swift_stdlib_reportFatalErrorInFile + 115
1    libswiftCore.so                    0x00007fc4981b606c <unavailable> + 3035244
2    libswiftCore.so                    0x00007fc4981b615e <unavailable> + 3035486
3    libswiftCore.so                    0x00007fc497ffda12 <unavailable> + 1231378
4    libswiftCore.so                    0x00007fc498182d42 <unavailable> + 2825538
5    libswiftCore.so                    0x00007fc497ffcef9 <unavailable> + 1228537
6    libswiftTensorFlow.so              0x00007fc4953ea022 <unavailable> + 598050
7    libswiftTensorFlow.so              0x00007fc4953e8770 checkOk(_:file:line:) + 508
8    libswiftTensorFlow.so              0x00007fc4953eba55 <unavailable> + 604757
9    libswiftTensorFlow.so   

In [None]:
// Optimizer factory: SGD with a learning rate of 0.4.
func optFunc(_ model: CnnModelXRBN) -> SGD<CnnModelXRBN> {
    return SGD(for: model, learningRate: 0.4)
}

// Model factory: four `ConvXRBN` blocks on 1-channel input, 10 outputs.
func modelInit() -> CnnModelXRBN {
    return CnnModelXRBN(channelIn: 1, nOut: 10, filters: [8, 16, 32, 32])
}

let learner = Learner(
    data: data,
    lossFunc: softmaxCrossEntropy,
    optFunc: optFunc,
    modelInit: modelInit
)
let recorder = learner.makeDefaultDelegates(metrics: [accuracy])
learner.addDelegates([
    learner.makeNormalize(mean: mnistStats.mean, std: mnistStats.std),
    learner.makeAddChannel(),
])

In [None]:
// Run `learner.fit(1)` (presumably one epoch) inside the `time` helper. `try!` turns any
// thrown error into a crash.
time { try! learner.fit(1) }

Fatal error: OOM when allocating tensor with shape[512,28,28,32] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node Mul_4}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
: file /swift-base/swift/stdlib/public/TensorFlow/CompilerRuntime.swift, line 278
Current stack trace:
0    libswiftCore.so                    0x00007fd40e5afe00 _swift_stdlib_reportFatalErrorInFile + 115
1    libswiftCore.so                    0x00007fd40e4f806c <unavailable> + 3035244
2    libswiftCore.so                    0x00007fd40e4f815e <unavailable> + 3035486
3    libswiftCore.so                    0x00007fd40e33fa12 <unavailable> + 1231378
4    libswiftCore.so                    0x00007fd40e4c4d42 <unavailable> + 2825538
5    libswiftCore.so                    0x00007fd40e33eef9 <unavailable> + 1228537
6    libswiftTensorFlow.so              0x00007fd40b72c

In [None]:
fileprivate extension Tensor where Scalar: Numeric {
    /// Replaces this tensor with an all-zeros tensor of the same shape.
    mutating func resetToZero() {
        self = Tensor<Scalar>(zeros: self.shape)
    }
}
/// Stochastic gradient descent (SGD) optimizer.
///
/// Supports momentum, Nesterov momentum, and a learning rate that decays with the number of
/// update steps taken. Only `Tensor<Float>` and `Tensor<Double>` variables are updated.
public class FASGD<Model: Layer>: Optimizer
    where Model.AllDifferentiableVariables == Model.CotangentVector {
    /// The base learning rate, before decay is applied.
    public var learningRate: Float
    /// The momentum factor: the fraction of the previous velocity carried into the next step.
    public var momentum: Float
    /// The learning rate decay factor: at step `n` the effective learning rate is
    /// `learningRate / (1 + decay * n)`.
    public var decay: Float
    /// Whether to apply the Nesterov form of the momentum update.
    public var nesterov: Bool
    /// The per-variable velocity; starts as zeros shaped like the model's variables.
    public var velocity: Model.AllDifferentiableVariables
    /// The number of update steps taken so far.
    public var step: Int = 0
    
    /// Creates an optimizer for `model`.
    ///
    /// - Parameters:
    ///   - model: The model whose differentiable variables determine the velocity's shapes.
    ///   - learningRate: The base learning rate; must be non-negative.
    ///   - momentum: The momentum factor; must be non-negative.
    ///   - decay: The learning rate decay factor; must be non-negative.
    ///   - nesterov: Whether to use Nesterov momentum.
    public init(
        for model: Model,
        learningRate: Float = 0.01,
        momentum: Float = 0,
        decay: Float = 0,
        nesterov: Bool = false
    ) {
        precondition(learningRate >= 0, "Learning rate must be non-negative")
        precondition(momentum >= 0, "Momentum must be non-negative")
        precondition(decay >= 0, "Weight decay must be non-negative")

        self.learningRate = learningRate
        self.momentum = momentum
        self.decay = decay
        self.nesterov = nesterov

        // Copy the model's variables to get matching shapes, then zero every tensor.
        velocity = model.allDifferentiableVariables
        for path in velocity.recursivelyAllWritableKeyPaths(to: Tensor<Float>.self) {
            velocity[keyPath: path].resetToZero()
        }
        for path in velocity.recursivelyAllWritableKeyPaths(to: Tensor<Double>.self) {
            velocity[keyPath: path].resetToZero()
        }
    }

    /// Takes one optimization step, updating `model` in place along `direction`.
    ///
    /// - Parameters:
    ///   - model: The variables to update.
    ///   - direction: The gradient with respect to `model`.
    public func update(_ model: inout Model.AllDifferentiableVariables,
                       along direction: Model.CotangentVector) {
        step += 1
        let decayedRate = self.learningRate / (1 + decay * Float(step))

        // Single-precision variables.
        for path in model.recursivelyAllWritableKeyPaths(to: Tensor<Float>.self) {
            let scaledGradient = decayedRate * direction[keyPath: path]
            let newVelocity = momentum * velocity[keyPath: path] - scaledGradient
            velocity[keyPath: path] = newVelocity
            if nesterov {
                // Nesterov: step along the velocity once more before applying the gradient.
                model[keyPath: path] += momentum * newVelocity - scaledGradient
            } else {
                model[keyPath: path] += newVelocity
            }
        }

        // Double-precision variables: same update with the hyperparameters widened to `Double`.
        let momentum64 = Double(momentum)
        let decayedRate64 = Double(decayedRate)
        for path in model.recursivelyAllWritableKeyPaths(to: Tensor<Double>.self) {
            let scaledGradient = decayedRate64 * direction[keyPath: path]
            let newVelocity = momentum64 * velocity[keyPath: path] - scaledGradient
            velocity[keyPath: path] = newVelocity
            if nesterov {
                model[keyPath: path] += momentum64 * newVelocity - scaledGradient
            } else {
                model[keyPath: path] += newVelocity
            }
        }
    }
}

In [None]:
// Optimizer factory: SGD with a learning rate of 0.4.
func optFunc(_ model: CnnModelBN) -> SGD<CnnModelBN> {
    return SGD(for: model, learningRate: 0.4)
}

// Model factory: `CnnModelBN` on 1-channel input, 10 outputs, four filter sizes.
func modelInit() -> CnnModelBN {
    return CnnModelBN(channelIn: 1, nOut: 10, filters: [8, 16, 32, 32])
}

let learner = Learner(
    data: data,
    lossFunc: softmaxCrossEntropy,
    optFunc: optFunc,
    modelInit: modelInit
)
let recorder = learner.makeDefaultDelegates(metrics: [accuracy])
learner.addDelegates([
    learner.makeNormalize(mean: mnistStats.mean, std: mnistStats.std),
    learner.makeAddChannel(),
])

In [None]:
// func optFunc(_ model: CnnModelXBN) -> FASGD<CnnModelXBN> { return FASGD(for: model, learningRate: 0.4) }
// func modelInit() -> CnnModelXBN { return CnnModelXBN(channelIn: 1, nOut: 10, filters: [8, 16, 32, 32]) }
// let learner = Learner(data: data, lossFunc: softmaxCrossEntropy, optFunc: optFunc, modelInit: modelInit)
// let recorder = learner.makeDefaultDelegates(metrics: [accuracy])
// learner.addDelegates([learner.makeNormalize(mean: mnistStats.mean, std: mnistStats.std),
//                       learner.makeAddChannel()])
// time { try! learner.fit(1) }

traced kernel
traced kernel
traced kernel
traced kernel
traced kernel
traced kernel
traced kernel
traced kernel
made it to here
running CnnModelXBN forward----------------------- 0.00% [0/118 00:00<00:00]
running ConvXBN forward
input  TensorShape(dimensions: [512, 28, 28, 1])
conv  TensorShape(dimensions: [512, 14, 14, 8])
running compiled kernel
ran compiled kernel
norm  TensorShape(dimensions: [512, 14, 14, 8])
ran ConvXBN forward
running ConvXBN forward
input  TensorShape(dimensions: [512, 14, 14, 8])
conv  TensorShape(dimensions: [512, 7, 7, 16])
running compiled kernel
ran compiled kernel
norm  TensorShape(dimensions: [512, 7, 7, 16])
ran ConvXBN forward
running ConvXBN forward
input  TensorShape(dimensions: [512, 7, 7, 16])
conv  TensorShape(dimensions: [512, 4, 4, 32])
running compiled kernel
ran compiled kernel
norm  TensorShape(dimensions: [512, 4, 4, 32])
ran ConvXBN forward
running ConvXBN forward
input  TensorShape(dimensions: [512, 4, 4, 32])
conv  TensorShape(dimensions:

In [None]:
// Optimizer factory: `FASGD` with a learning rate of 0.4.
func optFunc(_ model: CnnModelXRBN) -> FASGD<CnnModelXRBN> {
    return FASGD(for: model, learningRate: 0.4)
}

// Model factory: four `ConvXRBN` blocks on 1-channel input, 10 outputs.
func modelInit() -> CnnModelXRBN {
    return CnnModelXRBN(channelIn: 1, nOut: 10, filters: [8, 16, 32, 32])
}

let learner = Learner(
    data: data,
    lossFunc: softmaxCrossEntropy,
    optFunc: optFunc,
    modelInit: modelInit
)
let recorder = learner.makeDefaultDelegates(metrics: [accuracy])
learner.addDelegates([
    learner.makeNormalize(mean: mnistStats.mean, std: mnistStats.std),
    learner.makeAddChannel(),
])
// Run `learner.fit(1)` inside the `time` helper; `try!` turns any thrown error into a crash.
time { try! learner.fit(1) }

## Export

In [None]:
// Pass this notebook's path (relative to the current working directory) to
// `notebookToScript`; presumably this exports the cells marked `// export` — TODO confirm.
notebookToScript(fname: (Path.cwd / "07a_xla_compilation.ipynb").string)