In [1]:
from __future__ import print_function # Use a function definition from future version (say 3.x from 2.7 interpreter)
import requests
import os

def download(url, filename):
    """ utility function to download a file """
    response = requests.get(url, stream=True)
    with open(filename, "wb") as handle:
        for data in response.iter_content():
            handle.write(data)

locations = ['Tutorials/SLUHandsOn', 'Examples/LanguageUnderstanding/ATIS/BrainScript']

data = {
  'train': { 'file': 'atis.train.ctf', 'location': 0 },
  'test': { 'file': 'atis.test.ctf', 'location': 0 },
  'query': { 'file': 'query.wl', 'location': 1 },
  'slots': { 'file': 'slots.wl', 'location': 1 }
}

for item in data.values():
    location = locations[item['location']]
    path = os.path.join('..', location, item['file'])
    if os.path.exists(path):
        print("Reusing locally cached:", item['file'])
        # Update path
        item['file'] = path
    elif os.path.exists(item['file']):
        print("Reusing locally cached:", item['file'])
    else:
        print("Starting download:", item['file'])
        url = "https://github.com/Microsoft/CNTK/blob/v2.0/%s/%s?raw=true"%(location, item['file'])
        download(url, item['file'])
        print("Download completed")


Reusing locally cached: atis.train.ctf
Reusing locally cached: atis.test.ctf
Reusing locally cached: slots.wl
Reusing locally cached: query.wl


In [2]:
import math
import numpy as np
import cntk as C

In [3]:
# Select the right target device when this notebook is being tested:
if 'TEST_DEVICE' in os.environ:
    if os.environ['TEST_DEVICE'] == 'cpu':
        C.device.try_set_default_device(C.device.cpu())
    else:
        C.device.try_set_default_device(C.device.gpu(0))

In [4]:
# Test for CNTK version
if not C.__version__ == "2.0":
    raise Exception("this notebook was designed to work with 2.0. Current Version: " + C.__version__) 

In [5]:
# setting seed
np.random.seed(0)
C.cntk_py.set_fixed_random_seed(1)
C.cntk_py.force_deterministic_algorithms()

# number of words in vocab, slot labels, and intent labels
vocab_size = 943 ; num_labels = 129 ; num_intents = 26    

# model dimensions
input_dim  = vocab_size
label_dim  = num_labels
emb_dim    = 150
hidden_dim = 300

# Create the containers for input feature (x) and the label (y)
x = C.sequence.input_variable(vocab_size)
y = C.sequence.input_variable(num_labels)

def create_model():
    with C.layers.default_options(initial_state=0.1):
        return C.layers.Sequential([
            C.layers.Embedding(emb_dim, name='embed'),
            C.layers.Recurrence(C.layers.LSTM(hidden_dim), go_backwards=False),
            C.layers.Dense(num_labels, name='classify')
        ])

In [6]:
# peek
z = create_model()
print(z.embed.E.shape)
print(z.classify.b.value)

(-1, 150)
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.]


In [7]:
# Pass an input and check the dimension
z = create_model()
print(z(x).embed.E.shape)

(943, 150)


In [8]:
def create_reader(path, is_training):
    return C.io.MinibatchSource(C.io.CTFDeserializer(path, C.io.StreamDefs(
         query         = C.io.StreamDef(field='S0', shape=vocab_size,  is_sparse=True),
         intent_unused = C.io.StreamDef(field='S1', shape=num_intents, is_sparse=True),  
         slot_labels   = C.io.StreamDef(field='S2', shape=num_labels,  is_sparse=True)
     )), randomize=is_training, max_sweeps = C.io.INFINITELY_REPEAT if is_training else 1)

In [9]:
# peek
reader = create_reader(data['train']['file'], is_training=True)
reader.streams.keys()

dict_keys(['intent_unused', 'slot_labels', 'query'])

In [10]:
def create_criterion_function(model):
    labels = C.placeholder(name='labels')
    ce   = C.cross_entropy_with_softmax(model, labels)
    errs = C.classification_error      (model, labels)
    return C.combine ([ce, errs]) # (features, labels) -> (loss, metric)

criterion = create_criterion_function(create_model())
criterion.replace_placeholders({criterion.placeholders[0]: C.sequence.input_variable(num_labels)})

Composite(Combine): Input('Input2300', [#, *], [129]), Placeholder('labels', [???], [???]) -> Output('Block2270_Output_0', [#, *], [1]), Output('Block2290_Output_0', [#, *], [])

In [11]:
def create_criterion_function_preferred(model, labels):
    ce   = C.cross_entropy_with_softmax(model, labels)
    errs = C.classification_error      (model, labels)
    return ce, errs # (model, labels) -> (loss, error metric)

In [12]:
def train_test(train_reader, test_reader, model_func, max_epochs=10):
    
    # Instantiate the model function; x is the input (feature) variable 
    model = model_func(x)
    
    # Instantiate the loss and error function
    loss, label_error = create_criterion_function_preferred(model, y)

    # training config
    epoch_size = 18000        # 18000 samples is half the dataset size 
    minibatch_size = 70
    
    # LR schedule over epochs 
    # In CNTK, an epoch is how often we get out of the minibatch loop to
    # do other stuff (e.g. checkpointing, adjust learning rate, etc.)
    # (we don't run this many epochs, but if we did, these are good values)
    lr_per_sample = [0.003]*4+[0.0015]*24+[0.0003]
    lr_per_minibatch = [lr * minibatch_size for lr in lr_per_sample]
    lr_schedule = C.learning_rate_schedule(lr_per_minibatch, C.UnitType.minibatch, epoch_size)
    
    # Momentum schedule
    momentum_as_time_constant = C.momentum_as_time_constant_schedule(700)
    
    # We use a the Adam optimizer which is known to work well on this dataset
    # Feel free to try other optimizers from 
    # https://www.cntk.ai/pythondocs/cntk.learner.html#module-cntk.learner
    learner = C.adam(parameters=model.parameters,
                     lr=lr_schedule,
                     momentum=momentum_as_time_constant,
                     gradient_clipping_threshold_per_sample=15, 
                     gradient_clipping_with_truncation=True)

    # Setup the progress updater
    progress_printer = C.logging.ProgressPrinter(tag='Training', num_epochs=max_epochs)
    
    # Uncomment below for more detailed logging
    #progress_printer = ProgressPrinter(freq=100, first=10, tag='Training', num_epochs=max_epochs) 

    # Instantiate the trainer
    trainer = C.Trainer(model, (loss, label_error), learner, progress_printer)

    # process minibatches and perform model training
    C.logging.log_number_of_parameters(model)

    t = 0
    for epoch in range(max_epochs):         # loop over epochs
        epoch_end = (epoch+1) * epoch_size
        while t < epoch_end:                # loop over minibatches on the epoch
            data = train_reader.next_minibatch(minibatch_size, input_map={  # fetch minibatch
                x: train_reader.streams.query,
                y: train_reader.streams.slot_labels
            })
            trainer.train_minibatch(data)               # update model with it
            t += data[y].num_samples                    # samples so far
        trainer.summarize_training_progress()
    
    while True:
        minibatch_size = 500
        data = test_reader.next_minibatch(minibatch_size, input_map={  # fetch minibatch
            x: test_reader.streams.query,
            y: test_reader.streams.slot_labels
        })
        if not data:                                 # until we hit the end
            break
        trainer.test_minibatch(data)
    
    trainer.summarize_test_progress()

In [13]:
def do_train_test():
    global z
    z = create_model()
    train_reader = create_reader(data['train']['file'], is_training=True)
    test_reader = create_reader(data['test']['file'], is_training=False)
    train_test(train_reader, test_reader, z)

In [14]:
do_train_test()

Training 721479 parameters in 6 parameter tensors.
Learning rate per minibatch: 0.21
Finished Epoch[1 of 10]: [Training] loss = 0.823758 * 18010, metric = 16.06% * 18010 2.203s (8175.2 samples/s);
Finished Epoch[2 of 10]: [Training] loss = 0.237409 * 18051, metric = 5.37% * 18051 1.604s (11253.7 samples/s);
Finished Epoch[3 of 10]: [Training] loss = 0.161362 * 17941, metric = 3.82% * 17941 1.578s (11369.5 samples/s);
Finished Epoch[4 of 10]: [Training] loss = 0.110150 * 18059, metric = 2.60% * 18059 1.639s (11018.3 samples/s);
Learning rate per minibatch: 0.105
Finished Epoch[5 of 10]: [Training] loss = 0.074098 * 17957, metric = 1.79% * 17957 1.621s (11077.7 samples/s);
Finished Epoch[6 of 10]: [Training] loss = 0.063400 * 18021, metric = 1.58% * 18021 1.570s (11478.3 samples/s);
Finished Epoch[7 of 10]: [Training] loss = 0.057787 * 17980, metric = 1.45% * 17980 1.599s (11244.5 samples/s);
Finished Epoch[8 of 10]: [Training] loss = 0.055115 * 18025, metric = 1.53% * 18025 1.605s (1123

In [15]:
z.classify.b.value

array([-0.01661626, -0.06849118, -0.02536024,  0.00650608, -0.01364892,
       -0.0009079 , -0.00154566, -0.03404587,  0.02474708, -0.02293944,
        0.09312913,  0.0431481 , -0.05600131,  0.03491592, -0.06626801,
       -0.08001208, -0.08224191, -0.03043125,  0.00466935, -0.08937697,
       -0.00029537, -0.03336236,  0.04734406,  0.0271787 ,  0.00629511,
       -0.02088273,  0.00323861, -0.07437889,  0.01641067, -0.0409265 ,
       -0.03098237,  0.05914192, -0.05276666, -0.01397363,  0.04463421,
        0.00217605, -0.00161776, -0.06331958,  0.00754404, -0.0749741 ,
       -0.08729474, -0.09689215, -0.01376782, -0.08082633, -0.06354423,
       -0.0532876 ,  0.03256474,  0.0412824 ,  0.03448279, -0.0682756 ,
       -0.04975243, -0.04335478,  0.04458596, -0.0533246 ,  0.04666631,
        0.03415982, -0.07967258, -0.01372983, -0.01122679,  0.00258926,
        0.05711882, -0.00203596,  0.04756062, -0.01168122,  0.01214732,
        0.01728878, -0.12160189, -0.02169815,  0.01856033, -0.08

In [16]:
# load dictionaries
query_wl = [line.rstrip('\n') for line in open(data['query']['file'])]
slots_wl = [line.rstrip('\n') for line in open(data['slots']['file'])]
query_dict = {query_wl[i]:i for i in range(len(query_wl))}
slots_dict = {slots_wl[i]:i for i in range(len(slots_wl))}

# let's run a sequence through
seq = 'BOS flights from new york to seattle EOS'
w = [query_dict[w] for w in seq.split()] # convert to word indices
print(w)
onehot = np.zeros([len(w),len(query_dict)], np.float32)
for t in range(len(w)):
    onehot[t,w[t]] = 1

#x = C.sequence.input_variable(vocab_size)
pred = z(x).eval({x:[onehot]})[0]
print(pred.shape)
best = np.argmax(pred,axis=1)
print(best)
list(zip(seq.split(),[slots_wl[s] for s in best]))

[178, 429, 444, 619, 937, 851, 752, 179]
(8, 129)
[128 128 128  48 110 128  78 128]


[('BOS', 'O'),
 ('flights', 'O'),
 ('from', 'O'),
 ('new', 'B-fromloc.city_name'),
 ('york', 'I-fromloc.city_name'),
 ('to', 'O'),
 ('seattle', 'B-toloc.city_name'),
 ('EOS', 'O')]

In [17]:
# Add lookahead
def create_model():
    with C.layers.default_options(initial_state=0.1):
        return C.layers.Sequential([
            C.layers.Embedding(emb_dim),
            C.layers.Recurrence(C.layers.LSTM(hidden_dim), go_backwards=False),
            C.layers.Dense(num_labels)
        ])
    
# Enable these when done:
#z = create_model()
#do_train_test()


In [18]:
# Add bidirectional recurrence
def create_model():
    with C.layers.default_options(initial_state=0.1):  
        return C.layers.Sequential([
            C.layers.Embedding(emb_dim),
            C.layers.Recurrence(C.layers.LSTM(hidden_dim), go_backwards=False),
            C.layers.Dense(num_labels)
        ])

# Enable these when done:
#z = create_model()
#do_train_test()
