# Callbacks version 2

In [None]:
%install '.package(path: "$cwd/FastaiNotebooks")' FastaiNotebooks

## Load data

In [None]:
import FastaiNotebooks

In [None]:
// export
import Path
import TensorFlow

In [None]:
// Load the four MNIST tensors: training inputs/labels and validation inputs/labels.
// (`flat: true` presumably yields one flattened vector per image — confirm in `loadMNIST`.)
var (xTrain,yTrain,xValid,yValid) = loadMNIST(path: mnistPath, flat: true)

In [None]:
// Size of the first two dimensions of the training inputs.
let n = Int(xTrain.shape[0])
let m = Int(xTrain.shape[1])
// One more than the largest training label.
let c = yTrain.max() + 1
print(n, m, c)

Those can't be used to define a model, though, because they're not `Int`s, so we hard-code the values instead.

In [None]:
// Hard-coded sizes used to build the model below.
let n = 60_000   // first dimension of the training inputs
let m = 784      // second dimension of the training inputs (model input size)
let c = 10       // model output size
let nHid = 50    // hidden layer size

In [None]:
// export
/// A small network made of two dense layers applied one after the other.
public struct BasicModel: Layer {
    /// First dense layer (`nIn` -> `nHid`), using `relu` as its activation.
    public var layer1: Dense<Float>
    /// Second dense layer (`nHid` -> `nOut`), created without an explicit activation.
    public var layer2: Dense<Float>

    /// Creates the two layers.
    ///
    /// - Parameters:
    ///   - nIn: Input size of the first layer.
    ///   - nHid: Output size of the first layer and input size of the second.
    ///   - nOut: Output size of the second layer.
    public init(nIn: Int, nHid: Int, nOut: Int) {
        self.layer1 = Dense(inputSize: nIn, outputSize: nHid, activation: relu)
        self.layer2 = Dense(inputSize: nHid, outputSize: nOut)
    }

    /// Feeds `input` through `layer1` and then `layer2`, both in the given context.
    @differentiable
    public func applied(to input: Tensor<Float>, in context: Context) -> Tensor<Float> {
        let hidden = layer1.applied(to: input, in: context)
        return layer2.applied(to: hidden, in: context)
    }
}

In [None]:
// Instantiate the model with the sizes defined above (input `m`, hidden `nHid`, output `c`).
var model = BasicModel(nIn: m, nHid: nHid, nOut: c)

In [None]:
// export
/// A pair of datasets with the same element type: one for training, one for validation.
public struct DataBunch<Element: TensorGroup> {
    /// The training dataset.
    public var train: Dataset<Element>
    /// The validation dataset.
    public var valid: Dataset<Element>

    /// Bundles an existing training dataset and validation dataset together.
    ///
    /// - Parameters:
    ///   - train: The dataset stored in `train`.
    ///   - valid: The dataset stored in `valid`.
    public init(train: Dataset<Element>, valid: Dataset<Element>) {
        (self.train, self.valid) = (train, valid)
    }
}

In [None]:
//export
/// Loads MNIST and wraps it into batched training and validation datasets.
///
/// - Parameters:
///   - path: Forwarded to `loadMNIST(path:flat:)`.
///   - flat: Forwarded to `loadMNIST(path:flat:)`.
///   - bs: Batch size applied to both datasets.
/// - Returns: A `DataBunch` whose `train` and `valid` datasets are batched by `bs`.
public func mnistDataBunch(path: Path = mnistPath, flat: Bool = false, bs: Int = 64
                          ) -> DataBunch<DataBatch<Tensor<Float>, Tensor<Int32>>>{
    let (trainInputs, trainLabels, validInputs, validLabels) = loadMNIST(path: path, flat: flat)
    let batchSize = Int64(bs)
    let trainSet = Dataset(elements: DataBatch(xb: trainInputs, yb: trainLabels)).batched(batchSize)
    let validSet = Dataset(elements: DataBatch(xb: validInputs, yb: validLabels)).batched(batchSize)
    return DataBunch(train: trainSet, valid: validSet)
}

In [None]:
// Build the MNIST data bunch with `flat: true` and the default path and batch size.
let data = mnistDataBunch(flat: true)

## Learner (Richard's version)

In [None]:
/// Control-flow signals that can be thrown while training to alter the training loop.
///
/// - `skipEpoch`: thrown to cut the current epoch short.
/// - `skipBatch`: thrown to cut the current batch short.
/// - `stop`: thrown to end training altogether.
public enum TrainerAction: Error {
    case skipEpoch, skipBatch, stop
}

Basic class

In [None]:
/// A model trainer: it holds the data, the model being trained, its optimizer and loss
/// function, plus the delegates that observe the training loop.
// NOTE: When TF-421 is fixed, make `Label` not constrained to `Differentiable`.
public final class Trainer<Label: Differentiable & TensorGroup,
                           O: TensorFlow.Optimizer & AnyObject>
    where O.Scalar: Differentiable,
          O.Model.Input: TensorGroup
{
    // MARK: - Type aliases

    public typealias Optimizer = O
    public typealias Model = Optimizer.Model
    public typealias Input = Model.Input
    public typealias Variables = Model.AllDifferentiableVariables
    public typealias Data = DataBunch<DataBatch<Input, Label>>
    public typealias Loss = Tensor<Float>
    // NOTE: When TF-421 is fixed, replace with:
    //   public typealias LossFunction = @differentiable (Model.Output, @nondiff Label) -> Loss
    public typealias LossFunction = @differentiable (Model.Output, Label) -> Loss
    public typealias EventHandler = (Trainer) throws -> Void

    // MARK: - Delegate

    /// Base class for training-loop observers.
    ///
    /// Every hook has an empty default implementation; subclasses override the ones they
    /// need. Hooks may throw (for instance a `TrainerAction`).
    open class Delegate {
        /// Hook for the start of model training.
        open func trainingWillStart(trainer: Trainer) throws {}
        /// Hook for the completion of model training.
        open func trainingDidFinish(trainer: Trainer) throws {}
        /// Hook for the start of an epoch.
        open func epochWillStart(trainer: Trainer) throws {}
        /// Hook for the completion of an epoch.
        open func epochDidFinish(trainer: Trainer) throws {}
        /// Hook for the start of model validation.
        open func validationWillStart(trainer: Trainer) throws {}
        /// Hook for the start of training on a batch.
        open func batchWillStart(trainer: Trainer) throws {}
        /// Hook for the completion of training on a batch.
        open func batchDidFinish(trainer: Trainer) throws {}
        /// Hook for when a new loss has been computed.
        open func trainerDidProduceNewLoss(trainer: Trainer) throws {}
        /// Hook for when a new gradient has been computed.
        open func trainerDidProduceNewGradient(trainer: Trainer) throws {}
        /// Hook for the completion of an optimizer update.
        open func optimizerDidUpdate(trainer: Trainer) throws {}
    }

    // MARK: - Configuration

    /// The training and validation datasets.
    public let data: Data
    /// The optimizer used for updating model parameters along gradient vectors.
    public var optimizer: Optimizer
    /// The function that computes a loss value from a prediction and a label.
    public var lossFunction: LossFunction
    /// The model being trained.
    public var model: Model
    /// The delegates observing this trainer.
    public var delegates: [Delegate] = []

    // MARK: - Training state

    /// The total number of epochs.
    public private(set) var epochCount: Int = 0
    /// The index of the current epoch.
    public private(set) var currentEpoch: Int = 0
    /// The most recently stored gradient.
    public private(set) var currentGradient: Model.CotangentVector = .zero
    /// The most recently stored loss.
    public private(set) var currentLoss: Loss = .zero

    /// The context used for layer applications (training phase).
    private let context = Context(learningPhase: .training)

    // MARK: - Initialization

    /// Creates a trainer.
    ///
    /// - Parameters:
    ///   - data: The training and validation datasets.
    ///   - lossFunction: The loss function.
    ///   - optimizer: The optimizer used for updating model parameters along
    ///     gradient vectors.
    ///   - modelInitializer: The closure that produces the model to be trained; it is
    ///     called once, during initialization.
    ///
    public init(data: Data,
                lossFunction: @escaping LossFunction,
                optimizer: Optimizer,
                initializingWith modelInitializer: () -> Model) {
        self.data = data
        self.lossFunction = lossFunction
        self.optimizer = optimizer
        model = modelInitializer()
    }
}

Then let's write the parts of the training loop:

In [None]:
extension Trainer {
    /// Runs one optimization step on a single batch.
    ///
    /// Computes the loss and its gradient with respect to the model, stores them in
    /// `currentLoss` and `currentGradient` (notifying the delegates after each), applies the
    /// optimizer update and finally notifies `optimizerDidUpdate`.
    ///
    /// - Parameter batch: The batch of input data and labels to be trained on.
    /// - Throws: Any error thrown by a delegate hook (for instance a `TrainerAction`).
    ///
    private func train(onBatch batch: DataBatch<Input, Label>) throws {
        // NOTE: When the "subset of parameters" bug is fixed, replace with:
        //   let (loss, grad) = model.valueWithGradient { model -> Loss in
        //      let y = model.applied(to: batch.xb, in: context)
        //      return lossFunction(y, batch.yb)
        //   }
        // The label is passed as a differentiation argument; its gradient is discarded.
        let (loss, (grad, _)) = model.valueWithGradient(at: batch.yb) {
            (model, label) -> Loss in
            let y = model.applied(to: batch.xb, in: context)
            print("out")
            let loss = lossFunction(y, label)
            print(loss)
            return loss
        }
        print(loss)
        // NOTE: Put this inside `valueWithGradient`'s trailing closure when differentiation
        // supports throwing functions.
        currentLoss = loss
        try delegates.forEach { try $0.trainerDidProduceNewLoss(trainer: self) }
        currentGradient = grad
        try delegates.forEach { try $0.trainerDidProduceNewGradient(trainer: self) }
        optimizer.update(&model.allDifferentiableVariables, along: grad)
        // `batchDidFinish` is fired by `train(onDataset:)`; firing it here as well made
        // delegates see every batch finish twice, while `optimizerDidUpdate` never fired.
        try delegates.forEach { try $0.optimizerDidUpdate(trainer: self) }
    }
    
    /// Performs one pass over a dataset, training on every batch.
    ///
    /// For each batch, fires `batchWillStart`, runs the training step, then fires
    /// `batchDidFinish`. A `TrainerAction.skipBatch` thrown during the training step abandons
    /// the rest of that step only; the loop then carries on with the next batch.
    /// Once the dataset is exhausted, fires `epochDidFinish` (so this hook fires once per
    /// call of this method).
    ///
    /// - Parameter ds: The batched dataset to iterate over.
    /// - Throws: Any error other than `TrainerAction.skipBatch` thrown by the training step,
    ///   and any error thrown by the hooks fired here.
    private func train(onDataset ds: Dataset<DataBatch<Input, Label>>) throws {
        for batch in ds {
            print("Begin batch")
            try delegates.forEach { try $0.batchWillStart(trainer: self) }
            do { try train(onBatch: batch) }
            // Skipping a batch must not end the whole pass, so swallow the signal and fall
            // through to `batchDidFinish` instead of breaking out of the loop.
            catch TrainerAction.skipBatch {}
            try delegates.forEach { try $0.batchDidFinish(trainer: self) }
        }
        try delegates.forEach { try $0.epochDidFinish(trainer: self) }
    }
}

And the whole fit function.

In [None]:
extension Trainer{
    /// Starts training.
    ///
    /// Fires `trainingWillStart`, then for each epoch fires `epochWillStart`, runs a pass over
    /// `data.train`, fires `validationWillStart` and runs a pass over `data.valid`; after the
    /// last epoch fires `trainingDidFinish`.
    ///
    /// Control-flow signals:
    /// - `TrainerAction.skipEpoch` thrown anywhere inside an epoch abandons the remainder of
    ///   that epoch and moves on to the next one.
    /// - `TrainerAction.stop` ends training immediately and returns normally, without firing
    ///   `trainingDidFinish`.
    ///
    /// - Parameter epochCount: The number of epochs that will be run.
    /// - Throws: Any other error thrown by a delegate hook or by a dataset pass.
    ///
    public func fit(epochCount: Int) throws {
        self.epochCount = epochCount
        do {
            try delegates.forEach { try $0.trainingWillStart(trainer: self) }
            for epoch in 0..<epochCount {
                currentEpoch = epoch
                print(epoch)
                do {
                    try delegates.forEach { try $0.epochWillStart(trainer: self) }
                    try train(onDataset: data.train)
                    try delegates.forEach { try $0.validationWillStart(trainer: self) }
                    // NOTE(review): the validation pass goes through the same
                    // `train(onDataset:)` routine as the training pass — confirm that this
                    // is intended rather than a forward-only evaluation.
                    try train(onDataset: data.valid)
                } catch TrainerAction.skipEpoch {
                    // Skip only the current epoch: the whole epoch body is guarded (not just
                    // validation) and the loop continues instead of terminating.
                    continue
                }
            }
            try delegates.forEach { try $0.trainingDidFinish(trainer: self) }
        } catch TrainerAction.stop { return }
    }
}

### Test

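Since the labels are non-differentiable integers, while `Trainer` currently requires `Label` to be `Differentiable`, we one-hot encode them into float tensors first.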

In [None]:
// SGD optimizer for `BasicModel` with a learning rate of 1e-2.
let opt = SGD<BasicModel, Float>(learningRate: 1e-2)

In [None]:
/// Builds a fresh `BasicModel` from the sizes `m`, `nHid` and `c` defined earlier.
func modelInit() -> BasicModel {
    let freshModel = BasicModel(nIn: m, nHid: nHid, nOut: c)
    return freshModel
}

In [None]:
// Load MNIST again (with `flat: true`) as immutable bindings for the test below.
let (xTrain,yTrain,xValid,yValid) = loadMNIST(path: mnistPath, flat: true)

In [None]:
// One-hot encode the training and validation labels: depth 10, on-value 1.0, off-value 0.0.
let yTrain1 = Raw.oneHot(indices: yTrain, depth: Tensor(10), onValue: Tensor(1.0), offValue: Tensor(0.0))
let yValid1 = Raw.oneHot(indices: yValid, depth: Tensor(10), onValue: Tensor(1.0), offValue: Tensor(0.0))

In [None]:
// Batched (size 64) datasets pairing the inputs with the one-hot labels converted to
// `Tensor<Float>`, so that both the inputs and the labels are float tensors.
let train: Dataset<DataBatch<Tensor<Float>, Tensor<Float>>> = Dataset(
    elements:DataBatch(xb:xTrain, yb:Tensor<Float>(yTrain1))).batched(Int64(64))
let valid: Dataset<DataBatch<Tensor<Float>, Tensor<Float>>> = Dataset(
    elements:DataBatch(xb:xValid, yb:Tensor<Float>(yValid1))).batched(Int64(64))

In [None]:
// Bundle the two float-labelled datasets into a `DataBunch`.
let data = DataBunch(train: train, valid: valid)

In [None]:
/// Mean softmax cross-entropy loss between `features` and `labels`.
///
/// Differentiation uses the custom VJP `_vjpSoftmaxCrossEntropy`.
///
/// - Parameters:
///   - features: Passed as `features` to `Raw.softmaxCrossEntropyWithLogits`.
///   - labels: Passed as `labels` to `Raw.softmaxCrossEntropyWithLogits`.
/// - Returns: The mean of the loss returned by `Raw.softmaxCrossEntropyWithLogits`.
@differentiable(vjp: _vjpSoftmaxCrossEntropy)
func softmaxCrossEntropy1<Scalar: TensorFlowFloatingPoint>(
    _ features: Tensor<Scalar>, _ labels: Tensor<Scalar>
) -> Tensor<Scalar> {
    let (unreducedLoss, _) = Raw.softmaxCrossEntropyWithLogits(features: features, labels: labels)
    return unreducedLoss.mean()
}

/// Custom VJP for `softmaxCrossEntropy1`.
///
/// - Returns: The mean loss together with a pullback. The pullback scales the gradient
///   returned by `Raw.softmaxCrossEntropyWithLogits` by the incoming value divided by the
///   first dimension of `features`, and returns a scalar zero tensor for `labels`.
@usableFromInline
func _vjpSoftmaxCrossEntropy<Scalar: TensorFlowFloatingPoint>(
    features: Tensor<Scalar>, labels: Tensor<Scalar>
) -> (Tensor<Scalar>, (Tensor<Scalar>) -> (Tensor<Scalar>, Tensor<Scalar>)) {
    let (unreducedLoss, featuresGrad) =
        Raw.softmaxCrossEntropyWithLogits(features: features, labels: labels)
    // Dividing by the first dimension accounts for the `mean()` taken over the loss.
    let batchSize = Tensor<Scalar>(features.shapeTensor[0])
    func pullback(_ v: Tensor<Scalar>) -> (Tensor<Scalar>, Tensor<Scalar>) {
        let scaled = (v / batchSize) * featuresGrad
        return (scaled, Tensor<Scalar>(0))
    }
    return (unreducedLoss.mean(), pullback)
}

In [None]:
// Assemble the trainer from the one-hot data, the custom loss, the SGD optimizer and the
// model factory.
let trainer = Trainer(data: data, lossFunction: softmaxCrossEntropy1, optimizer: opt, initializingWith: modelInit)

The `trainer.fit(epochCount: 1)` call below (commented out) will crash the kernel. It fails in `Trainer.train(onBatch:)` at the line
```
let (loss, (grad, _)) = model.valueWithGradient(at: batch.yb) {
            (model, label) -> Loss in
            let y = model.applied(to: batch.xb, in: context)
            print("out")
            let loss = lossFunction(y, label)
            print(loss)
            return loss
        }
```
when you arrive at `let loss = lossFunction(y, label)` (out is printed but not the loss).

In [None]:
//trainer.fit(epochCount: 1)

But if you request a batch and apply the pieces, it works fine.

In [None]:
// Take the first batch of the training dataset (nil if the dataset yields nothing).
var firstBatch: DataBatch<Tensor<Float>, Tensor<Float>>? = data.train.first(where: { _ in true })

In [None]:
// Reproduce by hand, on the first batch, the loss/gradient computation done in the trainer.
let xb = firstBatch!.xb
let yb = firstBatch!.yb
let (loss, (grad, _)) = model.valueWithGradient(at: yb) { (net, target) -> Tensor<Float> in
    let preds = net.applied(to: xb, in: Context(learningPhase: .training))
    return softmaxCrossEntropy1(preds, target)
}