In [30]:
import cntk as C
import numpy as np

#Import local modules
import os
import sys
modulesPath = "scripts"
modulesPath = os.path.abspath(os.path.join(modulesPath))
if modulesPath not in sys.path: sys.path.append(modulesPath)
from bicorpus import Bicorpus

C.cntk_py.set_fixed_random_seed(0)

#Model hyperparameters
my_dtype = np.float32
hidden_dim = 512
num_layers = 2
attention_dim = 128
use_attention = True
use_embedding = True
embedding_dim = 200

vocabSize = 30000
sourceVocabSize = vocabSize
destVocabSize = vocabSize

numSequences = 10000
training_ratio = 3 / 4

In [31]:
files = {}

sourceTraining = "corpora/europarl-v7.es-en.es"
destTraining = "corpora/europarl-v7.es-en.en"



with open(sourceTraining, "r", encoding = "utf-8") as sourceFile:
    sourceLines = sourceFile.readlines()
with open(destTraining, "r", encoding = "utf-8") as destFile:
    destLines = destFile.readlines()

trainingCorp = Bicorpus(sourceLines, destLines, vocabSize = vocabSize, numSequences = 10000)


500 sequences read.
1000 sequences read.
1500 sequences read.
2000 sequences read.
2500 sequences read.
3000 sequences read.
3500 sequences read.
4000 sequences read.
4500 sequences read.
5000 sequences read.
5500 sequences read.
6000 sequences read.
6500 sequences read.
7000 sequences read.
7500 sequences read.
8000 sequences read.
8500 sequences read.
9000 sequences read.
9500 sequences read.
10000 sequences read.


In [32]:
training_lines = trainingCorp.training_lines()
sourceW2I, destW2I = trainingCorp.getW2IDicts()
sourceI2W, destI2W = trainingCorp.getI2WDicts()
sourceVocabSize, destVocabSize = len(sourceW2I), len(destW2I)

seq_start_index = destW2I[Bicorpus.start_token()]
seq_end_index = destW2I[Bicorpus.end_token()]
seq_start = C.constant(np.asarray([i == seq_start_index for i in range(len(destW2I))], dtype = my_dtype))

In [33]:
# Source and target inputs to the model
sourceAxis = C.Axis("sourceAxis")
destAxis = C.Axis("destAxis")
sourceSequence = C.layers.SequenceOver[sourceAxis]
destSequence = C.layers.SequenceOver[destAxis]

In [45]:
#Returns a general sequence-to-sequence model
def create_model():
    print("Called create_model()")
    embed = C.layers.Embedding(embedding_dim, name = "embed") if use_embedding else identity #Where is "identity defined?
    
    with C.layers.default_options(enable_self_stabilization = True, go_backwards = not use_attention):
        LastRecurrence = C.layers.Fold if not use_attention else C.layers.Recurrence
        encode = C.layers.Sequential([
            embed,
            C.layers.Stabilizer(),
            C.layers.For(range(num_layers - 1), lambda: C.layers.Recurrence(C.layers.GRU(hidden_dim))),
            LastRecurrence(C.layers.GRU(hidden_dim), return_full_state = True),
            C.layers.Label("encoded_h")                                  
        ])
        
    print("Defined the encoder.")
    
    with C.layers.default_options(enable_self_stabilization = True):
        stab_in = C.layers.Stabilizer()
        rec_blocks = [C.layers.GRU for i in range(num_layers)]
        stab_out = C.layers.Stabilizer()
        proj_out = C.layers.Dense(destVocabSize, name = "out_proj")
        if use_attention:
            attention_model = C.layers.AttentionModel(attention_dim, name = "attention_model")
            
        @C.Function
        def decode(history, input):
            encoded_input = encode(input)
            r = history
            r = embed(r)
            r = stab_in(r)
            for i in range(num_layers):
                print("i =", i)
                rec_block = rec_blocks[i]
                if i == 0:
                    if use_attention:
                        @C.Function
                        def gru_with_attention(dh, x):
                            h_att = attention_model(encoded_input.outputs[0], dh)
                            x = C.splice(x, h_att)
                            return rec_block(dh, x)
                        r = C.layers.Recurrence(gru_with_attention)(r)
                    else:
                        r = C.layers.Recurrence(rec_block)(r)
                else:
                    r = C.layers.RecurrenceFrom(rec_block)( *(encoded_input.outputs + (r,)) )
            r = stab_out(r)
            r = proj_out(r)
            r = C.layers.Label("out_proj_out")(r)
            return r
        
        print("Defined the decoder.")
        
        return decode

In [35]:
def create_model_train(s2smodel):
    @C.Function
    def model_train(input, labels):
        past_labels = C.layers.Delay(initial_state = sequence_start)(labels)
        return s2smodel(past_labels, input)

In [36]:
#Model used in testing
def create_model_greedy(s2smodel):
    @C.Function
    @C.layers.Signature(InputSequence[C.layers.Tensor[input_vocab_dim]])
    def model_greedy(input):
        unfold = C.layers.UnfoldFrom(lambda history: s2smodel(history, input) >> C.hardmax,
                                    until_predicate = lambda w: w[..., sentence_end_index],
                                    length_increase = length_increase)
        return unfold(initial_state = sentence_start, dynamic_axes_like = input)
    return model_greedy

In [37]:
def create_criterion_function(model):
    @C.Function
    @C.layers.Signature(input=sourceSequence[C.layers.Tensor[input_vocab_dim]],
                        labels=destSequence[C.layers.Tensor[label_vocab_dim]])
    def criterion(input, labels):
        # criterion function must drop the <s> from the labels
        postprocessed_labels = C.sequence.slice(labels, 1, 0) # <s> A B C </s> --> A B C </s>
        z = model(input, labels)
        ce = C.cross_entropy_with_softmax(z, postprocessed_labels) #labels)
        errs = C.classification_error(z, postprocessed_labels) #labels)
        return (ce, errs)

    return criterion

In [38]:
def format_sequences(sequences, i2w):
    return [" ".join([i2w[np.argmax(w)] for w in s]) for s in sequences]

def debug_attention(model, input):
    q = C.combine([model, model.attention_model.attention_weights])
    #words, p = q(input) # Python 3
    words_p = q(input)
    words = words_p[0]
    p     = words_p[1]
    output_seq_len = words[0].shape[0]
    p_sq = np.squeeze(p[0][:output_seq_len,:,:]) # (batch, output_len, input_len, 1)
    opts = np.get_printoptions()
    np.set_printoptions(precision=5)
    print(p_sq)
    np.set_printoptions(**opts)

In [39]:
#The original had "vocab" as a parameter but never used it
def train(train_reader, valid_reader, sourceW2I, destW2I, s2smodel, max_epochs, epoch_size):
    model_train = create_model_train(s2smodel)
    criterion = create_criterion_function(model_train)
    model_greedy = create_model_greedy(s2smodel)
    
    minibatch_size = 72
    lr = 0.001 if use_attention else 0.005
    learner = C.fsadagrad(model_train.parameters,
                         lr = C.learning_rate_schedule([lr]*2 + [lr/2]*3 +[lr/4], C.UnitType.sample, epoch_size),
                         momentum = C.momentum_as_time_constant_schedule(1100),
                         gradient_clipping_threshold_per_sample = 2.3,
                         gradient_clipping_with_truncation = True)
    trainer = C.Trainer(None, criterion, learner)                      #
    
    total_samples = 0
    mbs = 0
    eval_freq = 100
    
    C.logging.log_number_of_parameters(model_train) ; print()
    progress_printer = C.logging.ProgressPrinter(freq = 30, tag = "Training")
    
    sparse_to_dense = create_sparse_to_dense(input_vocab_dim)
    
    for epoch in range(max_epochs):
        while total_samples < (epoch + 1) * epoch_size:
            mb_train = train_reader.next_minibatch(minibatch_size)
            trainer.train_minibatch({criterion.arguments[0]: mb_train[train_reader.streams.features],
                                     criterion.arguments[1]: mb_train[train_reader.streams.labels]})
            progress_printer.update_with_trainer(trainer, with_metric = True)
            
            if mbs % eval_freq == 0:
                mb_valid = valid_reader.next_minibatch(1)
                e = model_greedy(mb_valid[valid_reader.streams.features])
                
                #Need to i2w to my own dictionary
                #Really, I just need to add a function to my Bicorpus class
                print(format_sequence(sparse_to_dense(mb_valid[valid_reader.streams.features]), sourceI2W))
                print("-->")
                print(format_sequences(e, destI2W))
                
                if use_attention:
                    debug_attention(model_greedy, mb_valid[valid_reader.streams.features])
                    
                total_samples += mb.train[train_reader.streams.labels].num_samples
                mbs += 1
                
    progress_printer.epoch_summary(with_metric = True)

In [44]:
model = create_model()
"""
model_train = create_model_train(model)
criterion = create_criterion_function(model_train)

minibatch_size = 72
lr = 0.001 if use_attention else 0.005


learner = C.fsadagrad(model_train.parameters,
                         lr = C.learning_rate_schedule([lr]*2 + [lr/2]*3 +[lr/4], C.UnitType.sample, epoch_size),
                         momentum = C.momentum_as_time_constant_schedule(1100),
                         gradient_clipping_threshold_per_sample = 2.3,
                         gradient_clipping_with_truncation = True)

trainer = C.Trainer(None, criterion, learner)

trainer.train_minibatch({criterion.arguments[0]: ["Hola"],
                         criterion.arguments[1]: ["Hello"]
})
"""

Called create_model()
Defined the encoder.
i = 0


SystemError: <built-in function slice> returned a result with an error set