# CNTK - Handwriting Recognition Tutorial

In [None]:
# let's make sure CNTK is available and up to date
import cntk as C
C.__version__

In [None]:
# batches are represented either as dense numpy arrays
import numpy as np

# ...or as scipy compressed sparse matrices (CSR)
from scipy.sparse import csr_matrix

# helper package for this tutorial (data loading, manual testing, etc.)
import dlt

# other useful stuff
import math

In [None]:
from cntk.device import set_default_device, gpu, cpu

# All you really need is your laptop CPU
set_default_device(cpu())

# ...but deep learning on a GPU is way more fun!
# set_default_device(gpu(0))

In [None]:
# name the experiment, to distinguish runs
xp_name = ...

## Data Loading

In [None]:
# Each sample is a vector of 256 features (the second axis of the data array)

# You can think of it as a flattened square of black and white pixels of size 16x16

# In the training dataset there are 93000 samples
# We are going to break them into batches
train = dlt.load_hdf5('/data/uji/train.hdf')
# print(train.x.shape)  # (93000, 256)
# print(train.y.shape)  # (93000,)

# print(train.x[0, :])           # [ 0.00 0.00 ... ] - a single example (256-vector of floats in [0, 1])
# print(train.y[0])              # 21 - label representation (an index into train.vocab)
# print(train.vocab[train.y[0]]) # L  - actual label

# In the validation dataset there are only 620 samples
# A single batch is fine if you've got the memory (no backprop on it anyway)
valid = dlt.load_hdf5('/data/uji/valid.hdf')
# print(valid.x.shape)  # (620, 256)
# print(valid.y.shape)  # (620,)

## Data Batching

In [None]:
# one hot little detour

# if your data is one-hot indices, like target ids, it's better to represent it as a sparse matrix.
# this is a snippet taken from the CNTK documentation that converts a list of indices to a compressed sparse matrix
# there might be better ways, if you find one let me know!
def seq_to_csr_matrix(seq, vocab_size):
    """Encode a sequence of class indices as a one-hot CSR matrix.

    Row ``i`` of the result holds a single 1.0 in column ``seq[i]``.

    Args:
        seq: sized sequence of integer column indices, one per output row.
        vocab_size: number of columns of the resulting matrix.

    Returns:
        A ``scipy.sparse.csr_matrix`` of shape ``(len(seq), vocab_size)``
        with dtype ``np.float32``.
    """
    n_rows = len(seq)
    # every row stores exactly one value, so row i owns slot i of the flat
    # arrays and the row pointer is simply 0, 1, ..., n_rows
    columns = list(seq)
    ones = [1] * n_rows
    row_starts = list(range(n_rows + 1))
    return csr_matrix((ones, columns, row_starts), shape=(n_rows, vocab_size), dtype=np.float32)

# much more efficient, however, is to incrementally create CSR arrays.
# see https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html
# for more information.

In [None]:
# data dependent hyper-parameters
# (i.e. size of inputs and outputs)

# what's the number of dimensions in input? (i.e. number of features)
input_features = ... # (hint: they are all the same and it's a dimension in the batches)
print('input_features', input_features)

# what's the number of different letters we are trying to recognise? (i.e. number of labels)
label_classes = ... # (hint: labels are indices into train.vocab, so it's the maximum label you can come across in the data, plus one)
print('label_classes', label_classes)

In [None]:
# how many examples do we show to the network before backpropagating the error?
# this number should be as big as you can make it without running out of memory,
# BUT it might hurt your convergence in the long run if it's too big.
batch_size = ...

# how many training batches are there?
# (hint: round up - the last batch is allowed to be shorter than batch_size)
n_train_batches = ...
print('n_train_batches', n_train_batches)

# a list of tuples [(examples, labels), ...], one tuple per batch
train_batches = ...

# double check you've got them all
assert n_train_batches == len(train_batches)

# all batches have the same number of samples, except the last one which might be a bit short
for full_batch in train_batches[:-1]:
    assert full_batch[0].shape[0] == batch_size
    
print('full batch shape', train_batches[0][0].shape)
print('last batch shape', train_batches[-1][0].shape)

## Model

In [None]:
# the simplest possible model is a feed forward network with a single hidden layer (aka a multi-layer perceptron)

# 1. a linear transformation projects the input features into a hidden layer
# 2. a non-linearity is applied
# 3. a linear transformation projects from hidden_size to the number of output labels
# (4.) our aim is to obtain a probability distribution over the labels, so a softmax would be applied here
#      BUT in cntk the loss function (cross entropy) is coupled with the softmax, so no need to add it at this stage

# cntk.layers has some useful stuff like 
#   - `Dense` for linear transformations
#   - `Sequential` for concatenating layers
# activations are passed in but in case you wonder they live in the main cntk package

# for more info refer to the CNTK layers library reference: https://www.cntk.ai/pythondocs/layerref.html
def basic_feed_forward_model(hidden_size, activation_fn, label_classes):
    """Exercise stub for the tutorial's basic feed-forward model.

    As written this returns the ``...`` placeholder; the reader is expected
    to replace it with a model that
      1. linearly projects the input features to ``hidden_size`` units,
      2. applies ``activation_fn`` as the non-linearity,
      3. linearly projects to ``label_classes`` outputs,
    without a final softmax (the tutorial couples it with the loss).

    Args:
        hidden_size: width of the hidden layer.
        activation_fn: non-linearity applied to the hidden layer.
        label_classes: number of output labels.
    """
    return ...

In [None]:
# input variable for the input features (the examples of a batch)
features = ...

# input variable for label data (this is sparse)
label = ...

# Instantiate the feedforward model
model = ...

# Apply the model to the input features
z = ...

# A training loss function (cross entropy coupled with softmax)
# parameters: applied network and the label input variable
ce = ...

# A test-time error function (i.e. classification error)
# parameters: applied network and the label input variable
pe = ...

## Training

In [None]:
# Set up the way the model is going to be trained.
#   - what's the learning rate (schedule)?
#   - what's the algorithm for gradient descent?
#   - what's the output you want to visualise during training?

# https://www.cntk.ai/pythondocs/cntk.learners.html

# learning rate is specified using a schedule, so that it can vary automatically during training.
# you also need to decide if it should be relative to the `minibatches` or the `samples`
lr_per_minibatch = ...

# TensorBoardProgressWriter helps visualising what's going on during training
# make sure to set log_dir='/logs/<name_of_experiment>' (the experiment name is xp_name)
# and also to specify a model
# https://www.cntk.ai/pythondocs/cntk.logging.html
logger = ...

# set up the algorithm for gradient descent (in cntk speak: learner)
learner = ...

# a trainer takes care of all your training needs (sort of)
trainer = ...

In [None]:
# how big is your model? (aka number of parameters)
...

In [None]:
# loop through the dataset many times (one pass over all training batches = one epoch)
for epoch in range(...):
    
    # show the model all the training batches and make it learn!
    for train_batch, train_labels in train_batches:

        # Specify the mapping of input variables in the model to actual minibatch data to be trained with
        # the keys identify the model's input variables
        # the values are the actual batch and label
        ...
        
    # summarize the training progress of this epoch
    # (hint: the trainer has summarize_* methods for this - check the Trainer docs)
    ...
    
    # now evaluate the model on the validation set
    valid_batch, valid_labels = ...
    
    valid_error = ...
    
    # After each epoch, print the accuracy on the validation set as a percentage
    # and log the same value against the epoch number
    valid_accuracy = ... 
    print('Epoch %d, validation accuracy: %.2f%%' % (epoch, valid_accuracy))
    logger.write_value('valid_accuracy', valid_accuracy, epoch)
    
    # summarize the evaluation (test) progress of this epoch
    ...
    
    # save model to file
    ...

## Manual Testing

In [None]:
# (Optional) demo - try your classifier 

def classify(img):
    """Exercise stub: return the predicted label for a single image.

    ``scores`` is still the ``...`` placeholder and must be replaced with
    the model's per-label scores for ``img``; the function then returns the
    entry of ``train.vocab`` at the index of the highest score.

    Args:
        img: array-like with a ``shape`` attribute holding one example.

    Returns:
        The element of ``train.vocab`` selected by ``np.argmax(scores)``.
    """
    # Hint - if you need a batch dimension, try: img.reshape(1, -1)
    # The shape is wrapped in a 1-tuple because a bare tuple on the right of
    # `%` is unpacked as the argument list: (256,) would print as "256" and
    # any shape with two or more dimensions would raise TypeError.
    print("TODO - classify img, shape %s" % (img.shape,))
    scores = ...  # replace with real scores (it's easier than it sounds!)
    return train.vocab[np.argmax(scores)]

# quick check: classifying the first validation example must return one of the known labels
assert classify(valid.x[0, :]) in train.vocab

# hand the classifier to the tutorial helper for manual testing
# NOTE(review): presumably this shows an interactive input that calls classify - confirm in dlt
dlt.CustomInput(classify)