# Custom Mini-Batch and Training loop

### Imports

In [0]:
import Python
let request = Python.import("urllib.request")
let pickle = Python.import("pickle")
let gzip = Python.import("gzip")
let np = Python.import("numpy")
let plt = Python.import("matplotlib.pyplot")

In [0]:
import TensorFlow

### MNIST

Data

In [3]:
let result = request.urlretrieve(
    "https://github.com/mnielsen/neural-networks-and-deep-learning/raw/master/data/mnist.pkl.gz",
    "mnist.pkl.gz")
let filename = result[0]; filename

mnist.pkl.gz


In [0]:
let mnist = pickle.load(gzip.open(filename), encoding:"latin-1")
// read train, validation and test datasets
let train_mnist = mnist[0]
let valid_mnist = mnist[1]
let test_mnist = mnist[2]

In [0]:
func unsequeeze(_ array: PythonObject, _ dtype: PythonObject = np.float32) -> PythonObject {
    return np.expand_dims(array, axis:-1).astype(dtype)
}

In [0]:
// read training tuple into separate variables
let pyobj_train_x = train_mnist[0]
let pyobj_train_y = train_mnist[1].astype(np.int32) // expand dimension
// read validation tuple into separate variables
let pyobj_valid_x = valid_mnist[0]
let pyobj_valid_y = valid_mnist[1].astype(np.int32) // expand dimension
// read test tuple into separate variables
let pyobj_test_x = test_mnist[0]
let pyobj_test_y = test_mnist[1].astype(np.int32) // expand dimension

In [0]:
// read tensorflow arrays into Tensors
let X_train = Tensor<Float32>(numpy: pyobj_train_x)! // ! to unwrap optionals
let y_train = Tensor<Int32>(numpy: pyobj_train_y)! // ! to unwrap optionals

In [8]:
X_train.shape

▿ TensorShape
  ▿ dimensions : 2 elements
    - 0 : 50000
    - 1 : 784


Model

In [9]:
let m : Int = Int(X_train.shape[0]) // number of samples
let n_in: Int = Int(X_train.shape[1]) // number of features
let nh: Int = 50 // number of 
let n_out: Int = 10 //number of classes

print("\(n_in) -> \(nh) -> \(n_out)")

784 -> 50 -> 10


In [0]:
struct Model: Layer {
    var layer1 = Dense<Float>(inputSize: n_in, outputSize: nh, activation: relu)
    var layer2 = Dense<Float>(inputSize: nh, outputSize: n_out)

    @differentiable
    func applied(to input: Tensor<Float>, in context: Context) -> Tensor<Float> {
        return input.sequenced(in: context, through: layer1, layer2)
    }
    
    var description: String {
        return "description here"
    }
}

var model = Model()

In [11]:
let ctx = Context(learningPhase: .training)

// Apply the model to a batch of features.
let preds = model.applied(to: X_train, in: ctx)
preds[0..<2]

[[0.5411263, 0.3102402, 0.41393316, 0.17374031, -0.2054505, 0.45775208, 0.13550594, -1.2340554, -0.13964355, 0.8585694], [0.6222204, 0.15639141, 0.8304158, 0.3665827, 0.27860576, 0.51567, -0.117491305, 0.065558776, -0.095890775, 0.1879227]]


In [0]:
// test helper functions
func test_near_zero(_ val: Float32, _ msg: String) -> Void {
    assert(val < 1e-3, msg)
}

func test_almost_eq(_ t1: Tensor<Float32>, _ t2: Tensor<Float32>, _ msg: String, _ epsilon: Float32 = 1e-3) -> Void {
    assert(t1 - t2 < epsilon, msg)
}

### Custom loss function
We need to compute the softmax of our activations, then apply a log:
$$ i = \frac{e^{x_i}}{\sum_{0 \leq  j \leq n-1} e^{x_j}} $$

In [0]:
func log_softmax(_ x: Tensor<Float>) -> Tensor<Float> {
    let softmax = exp(x) / (exp(x).sum(alongAxes: -1))
    return log(softmax)
}

with a sample check that our implementation is equal to tensorflow implementation

In [0]:
let x: Tensor<Float> = Tensor<Float>(arrayLiteral: [1, 2, 3, 4], [4, 3, 2, 1])

In [15]:
log_softmax(x)

[[-3.4401896, -2.4401896, -1.4401897, -0.44018975], [-0.44018975, -1.4401897, -2.4401896, -3.4401896]]


In [16]:
logSoftmax(x)

[[-3.4401896, -2.4401896, -1.4401897, -0.4401897], [-0.4401897, -1.4401897, -2.4401896, -3.4401896]]


In [0]:
test_almost_eq(log_softmax(x), logSoftmax(x), "Our impl should be same as Tensorflow impl")

In [0]:
let y_hat: Tensor<Float> = log_softmax(preds)

Given  $x$ and its prediction $p(x)$, the **Cross Entropy** loss is: 
$$ - \sum x \log p(x) $$

Now as the output of the NN is a 1-hot encoded array,  we can rewrite the formula for the index $i$ of a desired target as follows: 
$$-\log(p_{i})$$
Technically, if the predictions are of shape (m, 10) and target is (m, 1) then result should be `predictions[:, target]`.

In [19]:
let x1: Tensor<Float> = Tensor<Float>(arrayLiteral: [2], [3])
let x2: Tensor<Float> = log_softmax(x)

print("\(x1.shape) \(x2.shape)")

TensorShape(dimensions: [2, 1]) TensorShape(dimensions: [2, 4])


In [20]:
x2[1..<2]

[[-0.44018975, -1.4401897, -2.4401896, -3.4401896]]


In [21]:
let i: Int32 = 0
let pos: Int32 = Int32(x1[i][0].scalar!)
x2[i][pos].scalar! 

-1.4401897


Finnally a minually calculated loss looks like:

In [0]:
func nll(labels: Tensor<Int32>, logits: Tensor<Float>) -> Float {
    let size = labels.shape[0]
    var sum : Float = 0
    for i in 0..<size {
        let pos: Int32 = labels[i][0].scalar!
        sum += logits[i][pos].scalar!
    }
    return sum / Float(size)
}

In [23]:
// our way
let loss1: Float = nll(labels: y_train, logits: y_hat)

Fatal error: Dimension -1 must be >= 0: file /swift-base/swift/stdlib/public/TensorFlow/CompilerRuntime.swift, line 2094
Current stack trace:
0    libswiftCore.so                    0x00007f2d761b4f40 _swift_stdlib_reportFatalErrorInFile + 115
1    libswiftCore.so                    0x00007f2d760fd3dc <unavailable> + 3003356
2    libswiftCore.so                    0x00007f2d760fd4ce <unavailable> + 3003598
3    libswiftCore.so                    0x00007f2d75f44e12 <unavailable> + 1199634
4    libswiftCore.so                    0x00007f2d760c76b2 <unavailable> + 2782898
5    libswiftCore.so                    0x00007f2d75f44259 <unavailable> + 1196633
6    libswiftTensorFlow.so              0x00007f2d64d19ad2 <unavailable> + 441042
7    libswiftTensorFlow.so              0x00007f2d64d18230 checkOk(_:file:line:) + 491
8    libswiftTensorFlow.so              0x00007f2d64d3b270 _TFCCheckOk(_:) + 81
9    libswiftTensorFlow.so              0x00007f2d64d3b260 _swift_tfc_CheckOk + 9

: ignored

In [0]:
// tensorflow-way
let loss2: Float = softmaxCrossEntropy(logits: preds, labels: y_train).scalar!

In [0]:
test_near_zero(loss1-loss2, "Loss manually calculated should be similar to Tensorflow-way")

Accuracy function:

In [0]:
func accuracy(_ logits: Tensor<Float>, _ labels: Tensor<Int32>) -> Float {
    return Tensor<Float>(logits.argmax(squeezingAxis: -1) .== labels).mean().scalarized()
}

In [0]:
accuracy(preds, y_train)

### Basic training loop
- Grap a batch from the dataset
- Do a forward pass to get the output of the model on this batch
- compute a loss by comparint the output with the labels 
- Do a backward pass to calculate the gradients of the loss 
- update the model parameters with the gradients 

In [0]:
let bs: Int32 = 64

// grap batch
let X_batch: Tensor<Float> = X_train[0..<bs]
let y_batch: Tensor<Int32> = y_train[0..<bs]

let ctx = Context(learningPhase: .training)

let (loss, grads) = model.valueWithGradient { model -> Tensor<Float> in
    // forward pass
    let preds = model.applied(to: X_batch, in: ctx)
    // compute loss
    return softmaxCrossEntropy(logits: preds, labels: y_batch)
}
// backward pass

/**
print("Current loss: \(loss)")
print("Current accuracy: \(accuracy(preds, y_batch))")

Continue from 47:00
*/

In [0]:
for l in model {
    print(l)
}