# Simple LSTM model

In [None]:
import os
import cntk as C
import numpy as np
import copy
import cntk.tests.test_utils
# Build-system hook (only needed for our build system)
cntk.tests.test_utils.set_device_from_pytest_env()
# Fix a random seed for CNTK components
C.cntk_py.set_fixed_random_seed(1)

# If the Azure ML share-directory variable is present in the environment, the
# data is read from that share; otherwise paths are resolved against '../../'.
azureml_share_env = 'AZUREML_NATIVE_SHARE_DIRECTORY'
is_azure_ml = azureml_share_env in os.environ
if is_azure_ml:
    share_path = os.environ[azureml_share_env]
else:
    share_path = '../../'

# CTF files holding the train / validation / test splits
train_path, val_path, test_path = [
    os.path.join(share_path, ctf_file)
    for ctf_file in ("data/final/final.train.ctf",
                     "data/final/final.val.ctf",
                     "data/final/final.test.ctf")
]

# Last expression of the cell: displays the installed CNTK version
C.__version__

In [None]:
# Creates the reader
def create_reader(path, is_training, input_dim, label_dim):
    """Build a CNTK minibatch source over the CTF file at `path`.

    Two streams are declared: `features`, read from field 'S0' as sparse
    data of shape `input_dim`, and `labels`, read from field 'S1' as dense
    data of shape `label_dim`.

    When `is_training` is true the source is randomized and repeats
    indefinitely; otherwise it is not randomized and makes a single sweep.
    """
    stream_defs = C.io.StreamDefs(
        features=C.io.StreamDef(field='S0', shape=input_dim, is_sparse=True),
        labels=C.io.StreamDef(field='S1', shape=label_dim, is_sparse=False),
    )
    deserializer = C.io.CTFDeserializer(path, stream_defs)

    if is_training:
        sweeps = C.io.INFINITELY_REPEAT
    else:
        sweeps = 1

    return C.io.MinibatchSource(deserializer, randomize=is_training, max_sweeps=sweeps)

# Defines the LSTM model for classifying sequences
def lstm_sequence_classifier(features, num_classes, embedding_dim, LSTM_dim):
    """Build the sequence classifier and apply it to `features`.

    The layer stack is: embedding of size `embedding_dim`, a forward
    recurrence over an LSTM of size `LSTM_dim`, selection of the last step
    of the sequence, and a dense layer with `num_classes` outputs. All
    layers are constructed with `initial_state=0.1` as the default option.

    Returns the result of applying the stacked layers to `features`.
    """
    layers = C.layers
    with layers.default_options(initial_state=0.1):
        # Layers are created in the same order in which they are applied.
        embed = layers.Embedding(embedding_dim, name='embed')
        recurrence = layers.Recurrence(layers.LSTM(LSTM_dim), go_backwards=False)
        dense = layers.Dense(num_classes, name='dense')
        classifier = layers.Sequential([embed, recurrence, C.sequence.last, dense])
    return classifier(features)

#### Create the model and criterion

In [None]:
# Size of the vocabulary; also the dimension of the sparse input vectors
vocab_size = 101590

# Model dimensions
num_classes = 3
input_dim = vocab_size
embedding_dim = 1000
hidden_dim = 500

# Input variables: a sparse sequence of features and one label vector
features = C.sequence.input_variable(shape=input_dim, is_sparse=True)
labels = C.input_variable(num_classes)

# Sequence classification model applied to the feature input
model = lstm_sequence_classifier(features, num_classes, embedding_dim, hidden_dim)

# Training criterion (loss) and evaluation metric (classification error)
loss = C.cross_entropy_with_softmax(model, labels)
label_error = C.classification_error(model, labels)

#### Create the readers for the training and validation data

In [None]:
# Training source; the names of its streams are printed as a sanity check.
reader = create_reader(path=train_path, is_training=True,
                       input_dim=input_dim, label_dim=num_classes)
print(reader.streams.keys())

# Validation source. It is created with is_training=True as well, so (per
# create_reader) it is also randomized and repeats indefinitely.
reader_val = create_reader(path=val_path, is_training=True,
                           input_dim=input_dim, label_dim=num_classes)
print(reader_val.streams.keys())

#### Train the model

In [None]:
max_epochs = 10

epoch_size = 9206      # Total number of training sequences
minibatch_size = 300   # Number of samples (tokens) requested per minibatch

epoch_size_val = 1150  # Number of validation sequences evaluated per epoch

# NOTE(review): the first positional argument of ProgressPrinter is the print
# frequency (in minibatches), so progress is reported every `minibatch_size`
# minibatches -- confirm that this is intended.
progress_printer = C.logging.ProgressPrinter(minibatch_size)

# Alternative SGD learner, kept for reference:
# lr_per_sample = C.learners.learning_rate_schedule(0.0015, C.learners.UnitType.sample)
# learner = C.learners.sgd(model.parameters, lr=lr_per_sample)

lr_schedule = C.learning_parameter_schedule(1, minibatch_size=C.learners.IGNORE)
# NOTE: this momentum schedule is not passed to the AdaDelta learner below;
# the name is kept defined for backward compatibility only.
t_schedule = C.momentum_schedule(0.971, minibatch_size=C.learners.IGNORE)
# AdaDelta learner; hyper-parameters are passed by keyword for readability.
learner = adadelta = C.adadelta(model.parameters, lr_schedule, rho=0.999, epsilon=1e-6)

trainer = C.Trainer(model, (loss, label_error),
                    learner,
                    progress_printer)

# Binds the model inputs to the streams of the training reader
input_map = {
    features: reader.streams.features,
    labels: reader.streams.labels,
}

for epoch in range(max_epochs):
    # Train on one epoch
    t = 0
    while t < epoch_size:
        mb = reader.next_minibatch(minibatch_size, input_map=input_map)
        trainer.train_minibatch(mb)
        t += mb[labels].num_samples  # Current number of read sequences
    trainer.summarize_training_progress()

    print('Evaluating on the validation set')

    # Evaluate the validation set after each epoch. A fresh reader in
    # non-training mode (no shuffling, a single sweep) is created every time
    # so that each epoch is scored on the same data, starting from the
    # beginning of the file, and with the reader's own stream mapping.
    reader_val = create_reader(val_path, False, input_dim, num_classes)
    input_map_val = {
        features: reader_val.streams.features,
        labels: reader_val.streams.labels,
    }
    t = 0
    while t < epoch_size_val:
        mb = reader_val.next_minibatch(minibatch_size, input_map=input_map_val)
        if not mb:
            # The single sweep ended before epoch_size_val sequences were read.
            break
        trainer.test_minibatch(mb)
        t += mb[labels].num_samples
    trainer.summarize_test_progress()

    print('End of epoch', epoch)


#### Test the model

In [None]:
# Single-sweep, unshuffled reader over the test split
reader_test = create_reader(test_path, False, input_dim, num_classes)

# Binds the model inputs to the streams of the test reader itself
input_map_test = {
    features: reader_test.streams.features,
    labels: reader_test.streams.labels,
}

num_test_sequences = 1150
test_minibatch_size = 1000

# test_minibatch reports an average error per minibatch; weight each value by
# the number of sequences in that minibatch so the overall mean is correct
# even when minibatches differ in size.
weighted_error_sum = 0.0
read_test_sequences = 0
while read_test_sequences < num_test_sequences:
    mb = reader_test.next_minibatch(test_minibatch_size, input_map=input_map_test)
    if not mb:
        # The reader makes one sweep only; stop if it holds fewer sequences
        # than num_test_sequences instead of failing on an empty minibatch.
        break
    num_sequences = mb[labels].num_samples
    eval_error = trainer.test_minibatch(mb)
    weighted_error_sum += eval_error * num_sequences
    read_test_sequences += num_sequences

trainer.summarize_test_progress()

# Average classification error over all test sequences that were read
test_result = weighted_error_sum / read_test_sequences if read_test_sequences else 0.0
print('Test error: {:.4f} over {} sequences'.format(test_result, read_test_sequences))


#### Save the model

In [None]:
model_path = 'outputs/lstm_model.cmf'
# Make sure the target directory exists before writing the model file
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)

#### Test the saved model

In [None]:
from preprocess.normalize_sentences import SentenceNormalizer
from cntk.ops.functions import load_model

# Reload the model from disk to check that the saved file is usable
saved_model = load_model(model_path)

# One dictionary entry per line
with open('dictionary.txt', 'r', encoding='utf-8') as f:
    dictionary = f.read().strip().split('\n')

# presumably maps each sentence to a list of dictionary indices
# (to_index=True) -- verify against preprocess.normalize_sentences
sent_normalizer = SentenceNormalizer(dictionary=dictionary)
normalized = sent_normalizer.fit_transform(
    ["Mortgage payoff trick eliminates up to 15 years",
     "Mortgage payoff trick eliminates up to"], to_index=True)

print(normalized)
# One-hot encode the index sequences and score them with the reloaded model
pred_score = saved_model(C.Value.one_hot(normalized, vocab_size))
print(pred_score)

# One class name per line. A separate name is used on purpose: rebinding
# `labels` would overwrite the CNTK label input variable that the training
# and test cells rely on, breaking them when they are re-run.
with open('labels.txt', 'r', encoding='utf-8') as f:
    label_names = f.read().strip().split('\n')

# Highest-scoring class index per sentence, translated to its class name
pred_class = [label_names[p] for p in np.argmax(pred_score, axis=1)]
print(pred_class)
