In [1]:
from __future__ import print_function # Use a function definition from future version (say 3.x from 2.7 interpreter)
import requests
import os

def download(url, filename, chunk_size=64 * 1024):
    """Download `url` and save the response body to `filename`.

    Args:
        url: address fetched with an HTTP GET request.
        filename: local path the body is written to (opened in binary mode).
        chunk_size: number of bytes requested from the stream per iteration.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so
            an error page is never stored under the name of a data file.
    """
    response = requests.get(url, stream=True)
    try:
        # Check the status before touching the target file.
        response.raise_for_status()
        try:
            with open(filename, "wb") as handle:
                # iter_content() without arguments yields one byte at a time;
                # ask for larger chunks instead.
                for chunk in response.iter_content(chunk_size=chunk_size):
                    handle.write(chunk)
        except BaseException:
            # Do not leave a truncated file behind: callers that treat an
            # existing file as "already downloaded" would silently reuse it.
            if os.path.exists(filename):
                os.remove(filename)
            raise
    finally:
        # Release the streamed connection.
        response.close()

# Repository sub-directories that hold the data files; every entry of `data`
# points at one of them through its 'location' index.
locations = [
    'Tutorials/SLUHandsOn',
    'Examples/LanguageUnderstanding/ATIS/BrainScript',
]

# Files needed by this notebook, keyed by role.
data = dict(
    train={'file': 'atis.train.ctf', 'location': 0},
    test={'file': 'atis.test.ctf', 'location': 0},
    query={'file': 'query.wl', 'location': 1},
    slots={'file': 'slots.wl', 'location': 1},
)

# Resolve each file: first look under the parent directory, then in the
# current directory, and only download when neither copy exists.
for item in data.values():
    location = locations[item['location']]
    path = os.path.join('..', location, item['file'])

    if os.path.exists(path):
        print("Reusing locally cached:", item['file'])
        # Update path
        item['file'] = path
        continue

    if os.path.exists(item['file']):
        print("Reusing locally cached:", item['file'])
        continue

    print("Starting download:", item['file'])
    url = "https://github.com/Microsoft/CNTK/blob/v2.0/%s/%s?raw=true" % (location, item['file'])
    download(url, item['file'])
    print("Download completed")


Reusing locally cached: atis.test.ctf
Reusing locally cached: atis.train.ctf
Reusing locally cached: slots.wl
Reusing locally cached: query.wl


In [2]:
# import CNTK and other useful libraries

import math
import numpy as np
import cntk as C

In [3]:
# When this notebook is being tested, TEST_DEVICE chooses the default device:
# 'cpu' selects the CPU, any other value selects GPU 0.
if 'TEST_DEVICE' in os.environ:
    C.device.try_set_default_device(
        C.device.cpu() if os.environ['TEST_DEVICE'] == 'cpu' else C.device.gpu(0)
    )

In [4]:
# Number of words in the vocabulary, of slot labels, and of intent labels
vocab_size, num_labels, num_intents = 943, 129, 26

# Model dimensions
input_dim, label_dim = vocab_size, num_labels
emb_dim, hidden_dim = 150, 300

# Sequence-valued containers for the input features (x) and the labels (y)
x = C.sequence.input_variable(vocab_size)
y = C.sequence.input_variable(num_labels)

def create_model():
    """Build the baseline model: embedding -> forward LSTM recurrence -> dense layer.

    The embedding layer is named 'embed' and the dense layer 'classify' so
    they can be looked up on the returned model. All layers are created with
    the default option initial_state=0.1 in effect.

    Returns:
        The C.layers.Sequential composition of the three layers.
    """
    with C.layers.default_options(initial_state=0.1):
        embedding = C.layers.Embedding(emb_dim, name='embed')
        recurrence = C.layers.Recurrence(C.layers.LSTM(hidden_dim), go_backwards=False)
        classifier = C.layers.Dense(num_labels, name='classify')
        return C.layers.Sequential([embedding, recurrence, classifier])

In [5]:
# Peek at a freshly created model that has not been applied to an input yet:
# the embedding's input dimension is still unresolved (printed as -1) and the
# bias of the 'classify' layer is all zeros.
z = create_model()
print(z.embed.E.shape)
print(z.classify.b.value)

(-1, 150)
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [6]:
# Apply a fresh model to the input variable x and check the dimension again:
# the embedding matrix now reports vocab_size rows instead of -1.
z = create_model()
print(z(x).embed.E.shape)

(943, 150)


In [7]:
def create_reader(path, is_training):
    """Create a minibatch source over the CTF file at `path`.

    Three sparse streams are declared: `query` (field S0, vocab_size wide),
    `intent_unused` (field S1, num_intents wide) and `slot_labels`
    (field S2, num_labels wide).

    Args:
        path: location of the CTF-formatted data file.
        is_training: when true, the data is randomized and repeated
            indefinitely; otherwise it is read in order for a single sweep.

    Returns:
        The configured C.io.MinibatchSource.
    """
    stream_defs = C.io.StreamDefs(
        query=C.io.StreamDef(field='S0', shape=vocab_size, is_sparse=True),
        intent_unused=C.io.StreamDef(field='S1', shape=num_intents, is_sparse=True),
        slot_labels=C.io.StreamDef(field='S2', shape=num_labels, is_sparse=True),
    )
    deserializer = C.io.CTFDeserializer(path, stream_defs)

    if is_training:
        sweeps = C.io.INFINITELY_REPEAT
    else:
        sweeps = 1

    return C.io.MinibatchSource(deserializer, randomize=is_training, max_sweeps=sweeps)

In [8]:
# Peek at the stream names exposed by a reader over the training file
reader = create_reader(data['train']['file'], is_training=True)
reader.streams.keys()

dict_keys(['intent_unused', 'slot_labels', 'query'])

In [9]:
def create_criterion_function(model):
    """Combine loss and error metric for `model` into a single function.

    The labels are represented by a placeholder named 'labels', which the
    caller is expected to replace afterwards.

    Returns:
        C.combine of [cross entropy with softmax, classification error].
    """
    label_slot = C.placeholder(name='labels')
    criteria = [
        C.cross_entropy_with_softmax(model, label_slot),
        C.classification_error(model, label_slot),
    ]
    return C.combine(criteria)  # (features, labels) -> (loss, metric)

# Build the criterion for a fresh model, then substitute its first placeholder
# with a sequence input variable that is num_labels wide.
criterion = create_criterion_function(create_model())
criterion.replace_placeholders({criterion.placeholders[0]: C.sequence.input_variable(num_labels)})

Composite(Combine): Input('Input2300', [#, *], [129]), Placeholder('labels', [???], [???]) -> Output('Block2270_Output_0', [#, *], [???]), Output('Block2290_Output_0', [#, *], [???])

In [10]:
def create_criterion_function_preferred(model, labels):
    """Return the (loss, error metric) pair for `model` scored against `labels`.

    The loss is cross entropy with softmax and the metric is the
    classification error; both take the labels directly, so no placeholder
    has to be replaced by the caller.
    """
    criteria = (
        C.cross_entropy_with_softmax(model, labels),
        C.classification_error(model, labels),
    )
    return criteria  # (model, labels) -> (loss, error metric)

In [11]:
def train(reader, model_func, max_epochs=10, epoch_size=18000, minibatch_size=70):
    """Train `model_func` with Adam on minibatches drawn from `reader`.

    Args:
        reader: minibatch source exposing `query` and `slot_labels` streams.
            It is expected to keep producing data for as long as training
            runs (e.g. a reader created with is_training=True); an exhausted
            reader is not handled here.
        model_func: un-applied model function; it is applied to the input
            variable `x` inside this function.
        max_epochs: number of epochs to run.
        epoch_size: number of label samples that make up one epoch. The
            default of 18000 was chosen as half the dataset size.
        minibatch_size: size requested from the reader per minibatch; it also
            scales the per-sample learning rates into per-minibatch rates.
    """
    # Instantiate the model function; x is the input (feature) variable
    model = model_func(x)

    # Instantiate the loss and error function
    loss, label_error = create_criterion_function_preferred(model, y)

    # LR schedule over epochs
    # In CNTK, an epoch is how often we get out of the minibatch loop to
    # do other stuff (e.g. checkpointing, adjust learning rate, etc.)
    # (we don't run this many epochs, but if we did, these are good values)
    lr_per_sample = [0.003]*4+[0.0015]*24+[0.0003]
    lr_per_minibatch = [lr * minibatch_size for lr in lr_per_sample]
    lr_schedule = C.learning_rate_schedule(lr_per_minibatch, C.UnitType.minibatch, epoch_size)

    # Momentum schedule
    momentum_as_time_constant = C.momentum_as_time_constant_schedule(700)

    # We use the Adam optimizer, which is known to work well on this dataset.
    # Feel free to try other optimizers from
    # https://www.cntk.ai/pythondocs/cntk.learners.html
    learner = C.adam(parameters=model.parameters,
                     lr=lr_schedule,
                     momentum=momentum_as_time_constant,
                     gradient_clipping_threshold_per_sample=15,
                     gradient_clipping_with_truncation=True)

    # Setup the progress updater
    progress_printer = C.logging.ProgressPrinter(tag='Training', num_epochs=max_epochs)

    # Uncomment below for more detailed logging
    #progress_printer = C.logging.ProgressPrinter(freq=100, first=10, tag='Training', num_epochs=max_epochs)

    # Instantiate the trainer
    trainer = C.Trainer(model, (loss, label_error), learner, progress_printer)

    # Report the model size before training starts
    C.logging.log_number_of_parameters(model)

    # `samples_seen` counts label samples across all epochs; an epoch ends
    # once the running total crosses the next multiple of epoch_size.
    samples_seen = 0
    for epoch in range(max_epochs):         # loop over epochs
        epoch_end = (epoch+1) * epoch_size
        while samples_seen < epoch_end:     # loop over minibatches on the epoch
            # Named `minibatch` (not `data`) so the notebook-level `data`
            # dictionary is not shadowed inside this function.
            minibatch = reader.next_minibatch(minibatch_size, input_map={  # fetch minibatch
                x: reader.streams.query,
                y: reader.streams.slot_labels
            })
            trainer.train_minibatch(minibatch)           # update model with it
            samples_seen += minibatch[y].num_samples     # samples so far
        trainer.summarize_training_progress()

In [12]:
# Run the model

def do_train():
    """Create a fresh model, publish it as the global `z`, and train it.

    The model comes from whichever create_model() is currently defined and is
    trained on the file registered under data['train'].
    """
    global z
    z = create_model()
    train(create_reader(data['train']['file'], is_training=True), z)
do_train()

Training 721479 parameters in 6 parameter tensors.
Learning rate per minibatch: 0.21
Finished Epoch[1 of 10]: [Training] loss = 0.692392 * 18010, metric = 14.14% * 18010 14.127s (1274.9 samples/s);
Finished Epoch[2 of 10]: [Training] loss = 0.196757 * 18051, metric = 4.43% * 18051 9.989s (1807.1 samples/s);
Finished Epoch[3 of 10]: [Training] loss = 0.127802 * 17941, metric = 2.88% * 17941 7.583s (2366.0 samples/s);
Finished Epoch[4 of 10]: [Training] loss = 0.089395 * 18059, metric = 2.14% * 18059 7.789s (2318.5 samples/s);
Learning rate per minibatch: 0.105
Finished Epoch[5 of 10]: [Training] loss = 0.057005 * 17957, metric = 1.33% * 17957 9.109s (1971.3 samples/s);
Finished Epoch[6 of 10]: [Training] loss = 0.051911 * 18021, metric = 1.22% * 18021 8.221s (2192.1 samples/s);
Finished Epoch[7 of 10]: [Training] loss = 0.047058 * 17980, metric = 1.17% * 17980 11.428s (1573.3 samples/s);
Finished Epoch[8 of 10]: [Training] loss = 0.040754 * 18025, metric = 1.07% * 18025 11.094s (1624.8 

In [13]:
# Evaluate the model

def evaluate(reader, model_func):
    """Run one evaluation pass over `reader` and print a summary.

    Args:
        reader: minibatch source exposing `query` and `slot_labels` streams;
            it is consumed until it returns an empty minibatch.
        model_func: un-applied model function; it is applied to the input
            variable `x` inside this function.

    Raises:
        ValueError: if the reader yields no minibatch at all, so there is
            nothing to summarize.
    """
    # Instantiate the model function; x is the input (feature) variable
    model = model_func(x)

    # Create the loss and error functions
    loss, label_error = create_criterion_function_preferred(model, y)

    # process minibatches and perform evaluation
    progress_printer = C.logging.ProgressPrinter(tag='Evaluation', num_epochs=0)

    # A single evaluator serves the whole pass, so the final summary is
    # produced by the instance that saw every minibatch rather than by a
    # fresh instance that only saw the last one.
    # NOTE(review): the evaluator scores `loss`, as the code always did; if
    # the reported "metric" is meant to be the error rate, `label_error`
    # is probably the intended argument -- confirm before changing.
    evaluator = C.eval.Evaluator(loss, progress_printer)

    minibatch_size = 500
    minibatches_seen = 0
    while True:
        minibatch = reader.next_minibatch(minibatch_size, input_map={  # fetch minibatch
            x: reader.streams.query,
            y: reader.streams.slot_labels
        })
        if not minibatch:                            # until we hit the end
            break

        evaluator.test_minibatch(minibatch)
        minibatches_seen += 1

    if minibatches_seen == 0:
        raise ValueError("evaluate(): the reader produced no data to evaluate")

    evaluator.summarize_test_progress()

In [14]:
def do_test():
    """Evaluate the current global model `z` on the file under data['test']."""
    evaluate(create_reader(data['test']['file'], is_training=False), z)
do_test()
z.classify.b.value

Finished Evaluation [1]: Minibatch[1-23]: metric = 0.34% * 10984;


array([-3.44756097e-02, -9.48742926e-02, -4.88143601e-02, -5.89377098e-02,
       -1.66121200e-02, -4.74706963e-02, -4.71220650e-02, -9.88641307e-02,
       -2.75263712e-02, -5.82716130e-02, -3.17683741e-02, -3.77656259e-02,
       -5.17959744e-02, -5.13010696e-02, -8.52241889e-02, -7.09928051e-02,
       -1.22144140e-01, -4.68127318e-02,  4.03327495e-02, -1.32786557e-01,
       -7.63492659e-02, -5.49318455e-02,  5.47909038e-03, -7.66143017e-03,
       -4.85182293e-02,  1.35481404e-02, -3.55821103e-02, -3.82119953e-03,
       -1.73545275e-02, -3.66954207e-02, -3.91245857e-02, -3.24573815e-02,
       -7.47283399e-02, -1.02308849e-02, -5.27024157e-02,  9.11082923e-02,
        6.01251870e-02, -2.08287835e-02, -1.44987218e-02, -4.56239879e-02,
       -1.41239911e-01, -6.53960556e-02,  1.86713673e-02, -4.64309864e-02,
        1.97561365e-02, -8.04510191e-02,  3.41979191e-02,  2.72030365e-02,
        2.97282673e-02, -4.36839908e-02, -6.07293919e-02, -8.45976621e-02,
        2.59138849e-02, -

In [15]:
# Load the word lists (one entry per line) and build entry -> index lookups.
# The files are opened in `with` blocks so the handles are closed right away.
with open(data['query']['file']) as query_file:
    query_wl = [line.rstrip('\n') for line in query_file]
with open(data['slots']['file']) as slots_file:
    slots_wl = [line.rstrip('\n') for line in slots_file]
query_dict = {word: index for index, word in enumerate(query_wl)}
slots_dict = {label: index for index, label in enumerate(slots_wl)}

# let's run a sequence through
seq = 'BOS flights from new york to seattle EOS'
w = [query_dict[word] for word in seq.split()] # convert to word indices
print(w)

# One row per word, with a single 1 in the column of that word's index
onehot = np.zeros([len(w),len(query_dict)], np.float32)
for t in range(len(w)):
    onehot[t,w[t]] = 1

# Evaluate the model on a batch holding this one sequence and take its result
pred = z(x).eval({x:[onehot]})[0]
print(pred.shape)
best = np.argmax(pred,axis=1)   # highest-scoring label index per word
print(best)
list(zip(seq.split(),[slots_wl[s] for s in best]))

[178, 429, 444, 619, 937, 851, 752, 179]
(8, 129)
[128 128 128  48 110 128  78 128]


[('BOS', 'O'),
 ('flights', 'O'),
 ('from', 'O'),
 ('new', 'B-fromloc.city_name'),
 ('york', 'I-fromloc.city_name'),
 ('to', 'O'),
 ('seattle', 'B-toloc.city_name'),
 ('EOS', 'O')]

In [16]:
# Exercise: add batch normalization
def create_model():
    """Starting point for the batch-normalization exercise.

    As written this is embedding -> forward LSTM recurrence -> dense layer,
    created with the default option initial_state=0.1 in effect.
    """
    with C.layers.default_options(initial_state=0.1):
        embedding = C.layers.Embedding(emb_dim)
        recurrence = C.layers.Recurrence(C.layers.LSTM(hidden_dim), go_backwards=False)
        classifier = C.layers.Dense(num_labels)
        return C.layers.Sequential([embedding, recurrence, classifier])

# Enable these when done:
z = create_model()
#do_train()
#do_test()

In [17]:
# Exercise: add lookahead
def create_model():
    """Starting point for the lookahead exercise.

    As written this is embedding -> forward LSTM recurrence -> dense layer,
    created with the default option initial_state=0.1 in effect.
    """
    with C.layers.default_options(initial_state=0.1):
        embedding = C.layers.Embedding(emb_dim)
        recurrence = C.layers.Recurrence(C.layers.LSTM(hidden_dim), go_backwards=False)
        classifier = C.layers.Dense(num_labels)
        return C.layers.Sequential([embedding, recurrence, classifier])

# Enable these when done:
z = create_model()
#do_train()
#do_test()

In [18]:
# Exercise: add bidirectional recurrence
def create_model():
    """Starting point for the bidirectional-recurrence exercise.

    As written this is embedding -> forward LSTM recurrence -> dense layer,
    created with the default option initial_state=0.1 in effect.
    """
    with C.layers.default_options(initial_state=0.1):
        embedding = C.layers.Embedding(emb_dim)
        recurrence = C.layers.Recurrence(C.layers.LSTM(hidden_dim), go_backwards=False)
        classifier = C.layers.Dense(num_labels)
        return C.layers.Sequential([embedding, recurrence, classifier])

# Enable these when done:
#do_train()
#do_test()

### Solution 1: Adding Batch Normalization

In [19]:
def create_model():
    """Model for the batch-normalization solution.

    Both BatchNormalization layers are left commented out, so as written this
    builds embedding -> forward LSTM recurrence -> dense layer only. The
    commented lines mark where the normalization layers would be inserted.
    """
    with C.layers.default_options(initial_state=0.1):
        embedding = C.layers.Embedding(emb_dim)
        #embedding_norm = C.layers.BatchNormalization()
        recurrence = C.layers.Recurrence(C.layers.LSTM(hidden_dim), go_backwards=False)
        #recurrence_norm = C.layers.BatchNormalization()
        classifier = C.layers.Dense(num_labels)
        return C.layers.Sequential([
            embedding,
            #embedding_norm,
            recurrence,
            #recurrence_norm,
            classifier
        ])

do_train()
do_test()

Training 721479 parameters in 6 parameter tensors.
Learning rate per minibatch: 0.21
Finished Epoch[1 of 10]: [Training] loss = 0.774070 * 18010, metric = 15.11% * 18010 6.858s (2626.1 samples/s);
Finished Epoch[2 of 10]: [Training] loss = 0.223397 * 18051, metric = 5.12% * 18051 7.450s (2423.0 samples/s);
Finished Epoch[3 of 10]: [Training] loss = 0.153779 * 17941, metric = 3.51% * 17941 8.274s (2168.4 samples/s);
Finished Epoch[4 of 10]: [Training] loss = 0.107101 * 18059, metric = 2.62% * 18059 8.326s (2169.0 samples/s);
Learning rate per minibatch: 0.105
Finished Epoch[5 of 10]: [Training] loss = 0.068279 * 17957, metric = 1.60% * 17957 10.771s (1667.2 samples/s);
Finished Epoch[6 of 10]: [Training] loss = 0.061056 * 18021, metric = 1.43% * 18021 11.655s (1546.2 samples/s);
Finished Epoch[7 of 10]: [Training] loss = 0.052172 * 17980, metric = 1.22% * 17980 7.706s (2333.2 samples/s);
Finished Epoch[8 of 10]: [Training] loss = 0.048326 * 18025, metric = 1.23% * 18025 8.932s (2018.0 s

### Solution 2: Add a Lookahead

In [20]:
def OneWordLookahead():
    """Return a function that splices its input with C.sequence.future_value of it.

    The input is represented by a placeholder, so the result can be used as
    one stage of a layer sequence.
    """
    current = C.placeholder()
    return C.splice(current, C.sequence.future_value(current))

def create_model():
    """Model for the lookahead solution.

    Same stack as the earlier models, with OneWordLookahead() inserted
    between the embedding and the forward LSTM recurrence.
    """
    with C.layers.default_options(initial_state=0.1):
        embedding = C.layers.Embedding(emb_dim)
        lookahead = OneWordLookahead()
        recurrence = C.layers.Recurrence(C.layers.LSTM(hidden_dim), go_backwards=False)
        classifier = C.layers.Dense(num_labels)
        return C.layers.Sequential([embedding, lookahead, recurrence, classifier])

do_train()
do_test()

Training 901479 parameters in 6 parameter tensors.
Learning rate per minibatch: 0.21
Finished Epoch[1 of 10]: [Training] loss = 0.623953 * 18010, metric = 12.71% * 18010 12.903s (1395.8 samples/s);
Finished Epoch[2 of 10]: [Training] loss = 0.166708 * 18051, metric = 3.72% * 18051 10.257s (1759.9 samples/s);
Finished Epoch[3 of 10]: [Training] loss = 0.108443 * 17941, metric = 2.37% * 17941 11.541s (1554.5 samples/s);
Finished Epoch[4 of 10]: [Training] loss = 0.068857 * 18059, metric = 1.57% * 18059 7.845s (2302.0 samples/s);
Learning rate per minibatch: 0.105
Finished Epoch[5 of 10]: [Training] loss = 0.040734 * 17957, metric = 0.99% * 17957 11.045s (1625.8 samples/s);
Finished Epoch[6 of 10]: [Training] loss = 0.036186 * 18021, metric = 0.81% * 18021 13.684s (1316.9 samples/s);
Finished Epoch[7 of 10]: [Training] loss = 0.039799 * 17980, metric = 0.99% * 17980 10.951s (1641.9 samples/s);
Finished Epoch[8 of 10]: [Training] loss = 0.028615 * 18025, metric = 0.79% * 18025 11.143s (161

### Solution 3: Bidirectional Recurrent Model

In [21]:
def BiRecurrence(fwd, bwd):
    """Return a function that runs two recurrences over one input and splices them.

    Args:
        fwd: step function wrapped in a default (forward) Recurrence.
        bwd: step function wrapped in a Recurrence with go_backwards=True.

    Both recurrences are applied to the same placeholder and their outputs
    are joined with C.splice.
    """
    forward = C.layers.Recurrence(fwd)
    backward = C.layers.Recurrence(bwd, go_backwards=True)
    source = C.placeholder()
    return C.splice(forward(source), backward(source))

def create_model():
    """Model for the bidirectional-recurrence solution.

    The single forward recurrence is replaced by BiRecurrence over two LSTMs,
    each hidden_dim//2 wide.
    """
    with C.layers.default_options(initial_state=0.1):
        embedding = C.layers.Embedding(emb_dim)
        forward_cell = C.layers.LSTM(hidden_dim//2)
        backward_cell = C.layers.LSTM(hidden_dim//2)
        recurrence = BiRecurrence(forward_cell, backward_cell)
        classifier = C.layers.Dense(num_labels)
        return C.layers.Sequential([embedding, recurrence, classifier])

do_train()
do_test()

Training 541479 parameters in 9 parameter tensors.
Learning rate per minibatch: 0.21
Finished Epoch[1 of 10]: [Training] loss = 0.723112 * 18010, metric = 13.59% * 18010 9.051s (1989.8 samples/s);
Finished Epoch[2 of 10]: [Training] loss = 0.173359 * 18051, metric = 4.02% * 18051 13.087s (1379.3 samples/s);
Finished Epoch[3 of 10]: [Training] loss = 0.104300 * 17941, metric = 2.20% * 17941 9.784s (1833.7 samples/s);
Finished Epoch[4 of 10]: [Training] loss = 0.066566 * 18059, metric = 1.59% * 18059 9.216s (1959.5 samples/s);
Learning rate per minibatch: 0.105
Finished Epoch[5 of 10]: [Training] loss = 0.038120 * 17957, metric = 0.88% * 17957 12.410s (1447.0 samples/s);
Finished Epoch[6 of 10]: [Training] loss = 0.038607 * 18021, metric = 0.89% * 18021 9.877s (1824.5 samples/s);
Finished Epoch[7 of 10]: [Training] loss = 0.035566 * 17980, metric = 0.90% * 17980 11.005s (1633.8 samples/s);
Finished Epoch[8 of 10]: [Training] loss = 0.027885 * 18025, metric = 0.70% * 18025 8.636s (2087.2 