In [261]:
from __future__ import print_function # Use a function definition from future version (say 3.x from 2.7 interpreter)
import os

# Root directory of the dataset and the programming language whose corpus is used.
data_root = "../training-data/"
data_lang = "javascript"

# File locations: the train/test CTF corpora live in a per-language folder,
# while the query and slot word lists live under utils/.
data = dict(
    train=dict(file=data_root + 'train/' + data_lang + '/0.train.ctf', location=0),
    test=dict(file=data_root + 'test/' + data_lang + '/0.test.ctf', location=0),
    query=dict(file=data_root + 'utils/' + 'query.wl', location=1),
    slots=dict(file=data_root + 'utils/' + 'slots.wl', location=1),
)

In [262]:
import math
import numpy as np

import cntk as C

In [263]:
# setting seed: fix the numpy and CNTK seeds and request deterministic
# algorithms so that reruns of the notebook are repeatable
np.random.seed(0)
C.cntk_py.set_fixed_random_seed(1)
C.cntk_py.force_deterministic_algorithms()

# Load the query vocabulary and the slot-label list, one entry per line.
# The `with` blocks close each file as soon as it has been read instead of
# leaving the handle open until garbage collection.
with open(data['query']['file']) as query_file:
    query_wl = [line.rstrip('\n') for line in query_file]
with open(data['slots']['file']) as slots_file:
    slots_wl = [line.rstrip('\n') for line in slots_file]

# number of words in vocab, slot labels, and intent labels
vocab_size = len(query_wl)
num_labels = len(slots_wl)
num_intents = 1

# model dimensions
input_dim  = vocab_size
label_dim  = num_labels
emb_dim    = 150
hidden_dim = 300

# Create the containers for input feature (x) and the label (y)
x = C.sequence.input_variable(vocab_size)
y = C.sequence.input_variable(num_labels)

def create_model():
    """Build a new, untrained sequence-labelling network.

    The layers are chained in this order:
      1. ``Embedding(emb_dim)``, named ``'embed'``
      2. a forward (``go_backwards=False``) recurrence over ``LSTM(hidden_dim)``
      3. ``Dense(num_labels)``, named ``'classify'``

    Every layer is constructed while ``initial_state=0.1`` is set as a
    default option.

    Returns:
        The ``C.layers.Sequential`` composition of the three layers.
    """
    with C.layers.default_options(initial_state=0.1):
        embedding = C.layers.Embedding(emb_dim, name='embed')
        recurrence = C.layers.Recurrence(C.layers.LSTM(hidden_dim), go_backwards=False)
        classifier = C.layers.Dense(num_labels, name='classify')
        model = C.layers.Sequential([embedding, recurrence, classifier])
    return model

In [264]:
# peek: build a model that has not been applied to any input yet and show
# the embedding weight shape followed by the classifier bias values
z = create_model()
print(z.embed.E.shape, z.classify.b.value, sep='\n')

(-1, 150)
[ 0.  0.  0.  0.  0.  0.  0.  0.]


In [265]:
# Pass an input and check the dimension
# (once the model is applied to x, the printed embedding shape shows a concrete
#  first dimension instead of the -1 printed for the unapplied model)
z = create_model()
print(z(x).embed.E.shape)

(32, 150)


In [266]:
def create_reader(path, is_training):
    """Create a minibatch source over the CTF file at ``path``.

    Three sparse streams are declared: ``query`` (field ``S0``, width
    ``vocab_size``), ``intent_unused`` (field ``S1``, width ``num_intents``)
    and ``slot_labels`` (field ``S2``, width ``num_labels``).

    Args:
        path: location of the CTF-formatted data file.
        is_training: when true the source is randomized and repeats
            indefinitely; otherwise it is not randomized and makes one sweep.

    Returns:
        The configured ``C.io.MinibatchSource``.
    """
    stream_defs = C.io.StreamDefs(
        query=C.io.StreamDef(field='S0', shape=vocab_size, is_sparse=True),
        intent_unused=C.io.StreamDef(field='S1', shape=num_intents, is_sparse=True),
        slot_labels=C.io.StreamDef(field='S2', shape=num_labels, is_sparse=True),
    )
    deserializer = C.io.CTFDeserializer(path, stream_defs)

    if is_training:
        sweeps = C.io.INFINITELY_REPEAT
    else:
        sweeps = 1

    return C.io.MinibatchSource(deserializer, randomize=is_training, max_sweeps=sweeps)

In [267]:
# Build a reader over the training file and list the names of the streams it
# exposes (the last expression is displayed as the cell output).
reader = create_reader(data['train']['file'], is_training=True)
reader.streams.keys()

dict_keys(['slot_labels', 'query', 'intent_unused'])

In [268]:
def create_criterion_function_preferred(model, labels):
    """Build the training criterion for ``model`` against ``labels``.

    Args:
        model: the model output to be scored.
        labels: the reference labels.

    Returns:
        A ``(loss, metric)`` pair: softmax cross-entropy as the loss and
        classification error as the evaluation metric.
    """
    loss = C.cross_entropy_with_softmax(model, labels)
    metric = C.classification_error(model, labels)
    return loss, metric

In [269]:
def train_test(train_reader, test_reader, model_func, max_epochs=10,
               epoch_size=18000, minibatch_size=70, test_minibatch_size=500):
    """Train ``model_func`` on ``train_reader``, then evaluate it on ``test_reader``.

    Args:
        train_reader: minibatch source with ``query`` and ``slot_labels``
            streams, used for training.
        test_reader: minibatch source with the same streams, used for
            evaluation. It is read until ``next_minibatch`` returns an empty
            result, so it must be a finite source.
        model_func: the model function; it is applied to the notebook-level
            input variable ``x``.
        max_epochs: number of training epochs to run.
        epoch_size: number of samples that make up one epoch. The default of
            18000 was noted in the original code as half the dataset size.
        minibatch_size: size requested from the training reader per step; the
            per-sample learning rates are also scaled by it.
        test_minibatch_size: size requested from the test reader per step.

    Returns:
        None. Progress and results are reported through the progress printer.
    """
    # Instantiate the model function; x is the input (feature) variable
    model = model_func(x)

    # Instantiate the loss and error function
    loss, label_error = create_criterion_function_preferred(model, y)

    # LR schedule over epochs
    # In CNTK, an epoch is how often we get out of the minibatch loop to
    # do other stuff (e.g. checkpointing, adjust learning rate, etc.)
    # (we don't run this many epochs, but if we did, these are good values)
    lr_per_sample = [0.003]*4 + [0.0015]*24 + [0.0003]
    lr_per_minibatch = [lr * minibatch_size for lr in lr_per_sample]
    lr_schedule = C.learning_rate_schedule(lr_per_minibatch, C.UnitType.minibatch, epoch_size)

    # Momentum schedule
    momentum_as_time_constant = C.momentum_as_time_constant_schedule(700)

    # We use the Adam optimizer, which is known to work well on this dataset
    # Feel free to try other optimizers from
    # https://www.cntk.ai/pythondocs/cntk.learner.html#module-cntk.learner
    learner = C.adam(parameters=model.parameters,
                     lr=lr_schedule,
                     momentum=momentum_as_time_constant,
                     gradient_clipping_threshold_per_sample=15,
                     gradient_clipping_with_truncation=True)

    # Setup the progress updater
    progress_printer = C.logging.ProgressPrinter(tag='Training', num_epochs=max_epochs)

    # Uncomment below for more detailed logging
    #progress_printer = C.logging.ProgressPrinter(freq=100, first=10, tag='Training', num_epochs=max_epochs)

    # Instantiate the trainer
    trainer = C.Trainer(model, (loss, label_error), learner, progress_printer)

    # process minibatches and perform model training
    C.logging.log_number_of_parameters(model)

    # The stream-to-variable mappings do not change between minibatches, so
    # build them once instead of on every iteration.
    train_input_map = {
        x: train_reader.streams.query,
        y: train_reader.streams.slot_labels
    }
    test_input_map = {
        x: test_reader.streams.query,
        y: test_reader.streams.slot_labels
    }

    samples_seen = 0
    for epoch in range(max_epochs):                     # loop over epochs
        epoch_end = (epoch + 1) * epoch_size
        while samples_seen < epoch_end:                 # loop over minibatches on the epoch
            minibatch = train_reader.next_minibatch(minibatch_size, input_map=train_input_map)
            trainer.train_minibatch(minibatch)          # update model with it
            samples_seen += minibatch[y].num_samples    # samples so far
        trainer.summarize_training_progress()

    # Evaluate on the test reader until it is exhausted.
    while True:
        minibatch = test_reader.next_minibatch(test_minibatch_size, input_map=test_input_map)
        if not minibatch:                               # until we hit the end
            break
        trainer.test_minibatch(minibatch)

    trainer.summarize_test_progress()

In [270]:
def do_train_test():
    """Build a fresh model, then train and evaluate it.

    The new model is stored in the notebook-level global ``z`` so that later
    cells can use it. A randomized training reader and a single-pass test
    reader are created from the configured file paths and handed to
    ``train_test`` together with the model.
    """
    global z
    z = create_model()
    train_reader, test_reader = (
        create_reader(data[split]['file'], is_training=training)
        for split, training in (('train', True), ('test', False))
    )
    train_test(train_reader, test_reader, z)

In [271]:
# Run the full train-then-test pipeline; progress is printed below.
do_train_test()

Training 548408 parameters in 6 parameter tensors.
Learning rate per minibatch: 0.21
Finished Epoch[1 of 10]: [Training] loss = 0.052017 * 18042, metric = 1.59% * 18042 3.368s (5356.9 samples/s);
Finished Epoch[2 of 10]: [Training] loss = 0.000022 * 17976, metric = 0.00% * 17976 3.547s (5067.9 samples/s);
Finished Epoch[3 of 10]: [Training] loss = 0.000012 * 18003, metric = 0.00% * 18003 3.397s (5299.7 samples/s);
Finished Epoch[4 of 10]: [Training] loss = 0.000008 * 17991, metric = 0.00% * 17991 3.290s (5468.4 samples/s);
Learning rate per minibatch: 0.105
Finished Epoch[5 of 10]: [Training] loss = 0.000006 * 18048, metric = 0.00% * 18048 3.078s (5863.5 samples/s);
Finished Epoch[6 of 10]: [Training] loss = 0.000005 * 17970, metric = 0.00% * 17970 4.009s (4482.4 samples/s);
Finished Epoch[7 of 10]: [Training] loss = 0.000005 * 17988, metric = 0.00% * 17988 3.253s (5529.7 samples/s);
Finished Epoch[8 of 10]: [Training] loss = 0.000004 * 18024, metric = 0.00% * 18024 3.130s (5758.5 samp

In [277]:
# load dictionaries: map every vocabulary word / slot label to its line index
query_dict = {word: index for index, word in enumerate(query_wl)}
slots_dict = {label: index for index, label in enumerate(slots_wl)}
print(query_dict)

# let's run a sequence through
seq = "BOS Object . entries ( unknown ) ; EOS"
tokens = seq.split()
w = [query_dict[token] for token in tokens]  # convert to word indices
print(w)

# one-hot encode: one row per token, one column per vocabulary entry
onehot = np.zeros((len(w), len(query_dict)), dtype=np.float32)
for t, word_index in enumerate(w):
    onehot[t, word_index] = 1

# evaluate model z on the single encoded sequence
pred = z(x).eval({x: [onehot]})[0]
print(pred.shape)
best = np.argmax(pred, axis=1)  # index of the highest-scoring label per token
print(best)
[(token, slots_wl[label]) for token, label in zip(tokens, best)]

{'info': 23, 'BOS': 1, '[': 7, 'keys': 29, 'warn': 15, '{': 3, '(': 5, 'values': 31, 'error': 16, 'trace': 27, 'groupEnd': 22, '"': 12, 'count': 17, 'console': 13, 'Object': 28, 'unknown': 0, 'log': 14, 'timeEnd': 26, 'group': 20, 'table': 24, "'": 11, ')': 6, ']': 8, 'time': 25, '}': 4, 'assert': 18, '.': 10, 'EOS': 2, ';': 9, 'entries': 30, 'groupCollapsed': 21, 'clear': 19}
[1, 28, 10, 30, 5, 0, 6, 9, 2]
(9, 8)
[7 0 7 1 2 7 3 6 7]


[('BOS', 'O'),
 ('Object', 'class'),
 ('.', 'O'),
 ('entries', 'method.name'),
 ('(', 'method.args_start'),
 ('unknown', 'O'),
 (')', 'method.args_end'),
 (';', 'statement_end'),
 ('EOS', 'O')]