In [1]:
import mxnet as mx
import numpy as np
import logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [2]:
# evaluation function
def perplexity(label, pred, ignore_label):
    label = label.T.reshape((-1,))
    loss = 0.
    for i in range(pred.shape[0]):
        if label[i] == ignore_label:
            break
        loss += -np.log(max(1e-10, pred[i][int(label[i])]))
    return np.exp(loss / label.size)

In [204]:
# enc_train_input='data/1.en'
# dec_train_input='data/1.ru'
enc_train_input='data/train.en'
dec_train_input='data/train.ru'

num_buckets=1
num_layers=3
num_hidden=10
batch_size=1
iterations=1
expt_name='simple'
params_dir='params'
shuffle=False
reverse=True
top_words=0

In [220]:
import re
# import numpy as np

remove_chars = re.compile(r'([\(\)\'\"{}\[\]\*\-/])')
punctuation = re.compile(r'([!,.?:;@#$%&]+)')
def filter_text(text):
    text = remove_chars.sub(' ', text)
    # returns the text without the <EOS> token
    text = punctuation.sub(r' \1 ', text) # replaces the punctuation so that there is a space seperating it from the word
    text = text.lower().strip(' \t\n') # replaces big caps with small caps
    return text

white_spaces = re.compile(r'[ \n\r\t]+')
def get_vocab(file, vocab_count={}):
    with open(file, 'r', encoding='utf-8', errors='ignore') as fid:
        for line in fid:
            if len(line) == 0:
                continue
            tokens = white_spaces.split(filter_text(line))
            for token in tokens:
                if len(token) > 0:
                    if token in vocab_count:
                        vocab_count[token] += 1
                    else:
                        vocab_count[token] = 1
    return vocab_count

def text_2_indices(word2idx, text):
    # return the list of indices representing this text including the <EOS> token at the end...
    tokens = white_spaces.split(filter_text(text))
    indices = []
    unk_index = word2idx.get('<UNK>')
    indices = [ word2idx.get(token, unk_index) for token in tokens ]
    indices.append(word2idx['<EOS>'])
    return np.array(indices)

def get_unified_vocab(enc_input_file, dec_input_file, percentile=80):
    vocab_count = get_vocab(enc_input_file) # this returns a dictionary
    vocab_count = get_vocab(dec_input_file, vocab_count) # this returns a dictionary
    
    word_distribution = np.array( [ v for v in vocab_count.values() ] )
    min_count = np.percentile(word_distribution, percentile)
    vocab = []
    for k,v in vocab_count.items():
        if v >= min_count:
            vocab.append(k)
    vocab.sort()
    
    vocab.append('<UNK>') # token representing a word unseen in the training set, reserved for rare words
    vocab.append('<EOS>') # token representing the End-of-Sentence
    vocab.append('<PAD>') # token representing the padding for use in bucketing RNN of different lengths
    
    word2idx = { w:i for i,w in enumerate(vocab) }
    idx2word = [ w for w in vocab ]

    return word2idx, idx2word

def get_data_label(enc_input_file, dec_input_file, word2idx):
    enc_input = []
    with open(enc_input_file, 'r', encoding='utf-8', errors='ignore') as fid:
        for line in fid:
            indices = text_2_indices(word2idx, line)
            enc_input.append(indices)

    dec_input = []
    with open(dec_input_file, 'r', encoding='utf-8', errors='ignore') as fid:
        for line in fid:
            indices = text_2_indices(word2idx, line)
            dec_input.append(indices)
    return np.array( list(zip(enc_input, dec_input)) )

In [221]:
filter_text('asdf/ssd!!!')

'asdf ssd !!!'

In [222]:
word2idx, idx2word = get_unified_vocab(enc_train_input, dec_train_input, top_words)
train_data_label = get_data_label(enc_train_input, dec_train_input, word2idx)

In [223]:
idx2word

['!',
 '!!',
 '!!!',
 '!&',
 '!,',
 '!;',
 '!?',
 '#',
 '$',
 '$,',
 '$.',
 '$:',
 '%',
 '%!',
 '%,',
 '%.',
 '%.&#',
 '%:',
 '%;',
 '%?',
 '&',
 '&#',
 '+',
 '+1',
 '+2',
 '+20',
 '+3',
 '+39',
 ',',
 ',&',
 ',&#',
 ',,',
 ',.',
 '.',
 '.&',
 '.&#',
 '.,',
 '.,..',
 '..',
 '...',
 '...,',
 '....',
 '.....',
 '.:',
 '.;',
 '.?',
 '0',
 '00',
 '000',
 '000jihadi',
 '000th',
 '000\xa0children',
 '000\xa0kilometers',
 '000”',
 '007',
 '01',
 '010',
 '011630',
 '0116302248',
 '012',
 '018',
 '02',
 '025',
 '029',
 '03',
 '033',
 '04',
 '05',
 '057',
 '06',
 '07',
 '070',
 '0711',
 '08',
 '08»',
 '09',
 '0\xa0баллов',
 '0°с',
 '0»',
 '0”',
 '1',
 '10',
 '100',
 '1000',
 '10000',
 '100000',
 '100amp',
 '100bn',
 '100m',
 '100th',
 '100\xa0000',
 '100\xa0млрд',
 '101',
 '101000',
 '1010010010001',
 '101st',
 '102',
 '1029',
 '103',
 '10307',
 '104',
 '105',
 '10500',
 '105m',
 '105th',
 '106',
 '106000',
 '107',
 '108',
 '1084',
 '109',
 '1090',
 '10amp',
 '10over100',
 '10th',
 '10\xa0',
 '1

In [144]:
# import mxnet as mx
# import numpy as np

from collections import namedtuple

LSTMState = namedtuple("LSTMState", ["c", "h"])
LSTMParam = namedtuple("LSTMParam", ["i2h_weight", "i2h_bias", "h2h_weight", "h2h_bias"])

def lstm_cell(num_hidden, indata, prev_state, param, seqidx, layeridx, dropout=0.):
    """LSTM Cell symbol"""
    if dropout > 0.:
        indata = mx.sym.Dropout(data=indata, p=dropout)
    
    i2h = mx.sym.FullyConnected(
        data=indata,
        weight=param.i2h_weight,
        bias=param.i2h_bias,
        num_hidden=num_hidden * 4,
        name="t%d_l%d_i2h" % (seqidx, layeridx)
    )

    h2h = mx.sym.FullyConnected(
        data=prev_state.h,
        weight=param.h2h_weight,
        bias=param.h2h_bias,
        num_hidden=num_hidden * 4,
        name="t%d_l%d_h2h" % (seqidx, layeridx)
    )

    gates = i2h + h2h
    slice_gates = mx.sym.SliceChannel(gates, num_outputs=4, name="t%d_l%d_slice" % (seqidx, layeridx))

    in_gate = mx.sym.Activation(slice_gates[0], act_type="sigmoid")
    in_transform = mx.sym.Activation(slice_gates[1], act_type="tanh")
    forget_gate = mx.sym.Activation(slice_gates[2], act_type="sigmoid")
    out_gate = mx.sym.Activation(slice_gates[3], act_type="sigmoid")

    next_c = (forget_gate * prev_state.c) + (in_gate * in_transform)
    next_h = out_gate * mx.sym.Activation(next_c, act_type="tanh")

    return LSTMState(c=next_c, h=next_h)

def init_lstm(num_layer):
    param_cells = []
    last_states = []
    for i in range(num_layer):
        param_cells.append(
            LSTMParam(
                i2h_weight=mx.sym.Variable("l%d_i2h_weight" % i),
                i2h_bias=mx.sym.Variable("l%d_i2h_bias" % i),
                h2h_weight=mx.sym.Variable("l%d_h2h_weight" % i),
                h2h_bias=mx.sym.Variable("l%d_h2h_bias" % i)
            )
        )
        last_states.append(
            LSTMState(
                c=mx.sym.Variable("l%d_init_c" % i),
                h=mx.sym.Variable("l%d_init_h" % i)
            )
        )
    return param_cells, last_states

def lstm_unroll(num_layer, seqlen, num_hidden, num_labels, dropout=0.0):
    cls_weight   = mx.sym.Variable("cls_weight")
    cls_bias     = mx.sym.Variable("cls_bias")
    embed_weight = mx.sym.Variable("embed_weight")

    param_cells, last_states = init_lstm(num_layer)
    data = mx.sym.Variable('data')
    label = mx.sym.Variable('label')
    
    embed = mx.sym.Embedding(
        data=data, # the idx to the embedding
        input_dim=num_labels, # the number of rows for embed_weight
        weight=embed_weight,  # the matrix representing the idx2vec
        output_dim=num_hidden, # the number of cols for embed_weight
        name='embed'
    )
    
    wordvec = mx.sym.SliceChannel(data=embed, num_outputs=seqlen, squeeze_axis=1)

    hidden_all = []
    for seqidx in range(seqlen):
        hidden = wordvec[seqidx]
        # stack LSTM
        for i in range(num_layer):
            dp = 0.0 if i == 0 else dropout
            next_state = lstm_cell(
                num_hidden,
                indata=hidden,
                prev_state=last_states[i],
                param=param_cells[i],
                seqidx=seqidx,
                layeridx=i,
                dropout=dp
            )
            hidden = next_state.h
            last_states[i] = next_state
        # decoder
        if dropout > 0.0:
            hidden = mx.sym.Dropout(data=hidden, p=dropout)
        
        hidden_all.append(hidden)
        
    hidden_concat = mx.sym.Concat(*hidden_all, dim=0)
    pred = mx.sym.FullyConnected(
        data=hidden_concat,
        num_hidden=num_labels, # num_labels is the index of <PAD> that means this layer will predict 0, 1, ..., num_labels-1
        weight=cls_weight,
        bias=cls_bias,
        name='pred'
    )

    label = mx.sym.transpose(data=label) # e.g. if shape is (1,M) it becomes (M,1)
    label = mx.sym.Reshape(data=label, shape=(-1,)) # if shape is (M,1) it becomes (M,)
    output = mx.sym.SoftmaxOutput(
        data=pred,
        label=label,
        name='t%d_softmax' % seqidx,
        use_ignore=True,
        ignore_label=num_labels # ignore the index of <PAD>
    ) # output becomes (num_labels, M)
    return output

def get_lstm_sym_generator(num_layers, num_hidden, num_labels, dropout=0.0):
    def generate_lstm_sym(seqlen):
        return lstm_unroll(num_layers, seqlen, num_hidden, num_labels, dropout)
    return generate_lstm_sym

def get_lstm_init_states(num_layers, num_dim, batch_size=1):
    init_h = [('l%d_init_h' % i, (batch_size, num_dim)) for i in range(num_layers)]
    init_c = [('l%d_init_c' % i, (batch_size, num_dim)) for i in range(num_layers)]
    init_states = init_h + init_c
    return init_states

In [137]:
# import mxnet as mx
# import numpy as np

from collections import namedtuple
from sklearn.cluster import KMeans

class EncoderDecoderBatch(object):
    def __init__(self, all_data, all_label, init_states, bucket_key):
        self.pad = 0 # at this point i do not know what is this for...
        
        #all_data.shape is (x,y,z)
        self.batch_size = all_data.shape[0]
        
        # provide data, essential assignment
        self.data = [ mx.nd.array(all_data) ]
        
        # essential assignment
        self.provide_data = [('data', (self.batch_size, bucket_key))]
        for x in init_states:
            self.data.append(mx.nd.zeros(x[1])) # x[1] is the shape of the initial data
            self.provide_data.append(x)

        # provide label, essential assignment
        self.label = [ mx.nd.array(all_label) ]
        self.provide_label = [ ('label', (self.batch_size, bucket_key)) ]
        
        self.init_states = init_states

        # bucket_key is essential for this databatch
        self.bucket_key = bucket_key

def synchronize_batch_size(train_iter, test_iter):
    batch_size = min(train_iter.batch_size, test_iter.batch_size)
    train_iter.batch_size = batch_size
    test_iter.batch_size = batch_size
    train_iter.generate_init_states()
    test_iter.generate_init_states()

# now define the bucketing, padding and batching SequenceIterator...
class EncoderDecoderIter(mx.io.DataIter):
    def __init__(self, data_label, word2idx, idx2word, num_hidden, num_layers,
                 init_states_function, batch_size=1, num_buckets=10, shuffle=False, rev=False):

        super(EncoderDecoderIter, self).__init__() # calling DataIter.__init__()

        # data is a numpy array of 3 dimensions, #, timesteps, vector_dim
        self.data_label = data_label

        self.word2idx = word2idx
        self.idx2word = idx2word

        self.num_hidden = num_hidden
        self.num_layers = num_layers
        self.num_buckets = num_buckets

        # now we need to find the buckets based on the input data...
        self.buckets, self.buckets_count, self.assignments = self.generate_buckets()
        # buckets are a tuple of the encoder/decoder length

        self.batch_size = min(np.min(self.buckets_count), batch_size)
        self.init_states_function = init_states_function
        self.pad_label = word2idx['<PAD>']
        self.shuffle = shuffle
        self.rev = rev # reverse the encoder input
        self.reset()
        self.generate_init_states()

    def generate_init_states(self):
        self.init_states = self.init_states_function(self.num_layers, self.num_hidden, self.batch_size)

    def generate_buckets(self):
        enc_dec_data = [ len(data)+len(label)-1 for data, label in self.data_label ]
        enc_dec_data = np.reshape(np.array(enc_dec_data), (-1, 1))

        kmeans = KMeans(n_clusters=self.num_buckets, random_state=1) # use clustering to decide the buckets
        assignments = kmeans.fit_predict(enc_dec_data) # get the assignments

        # get the max of every cluster
        buckets = np.array([np.amax(enc_dec_data[assignments==i]) for i in range(self.num_buckets)])

        # get # of sequences in each bucket... then assign the batch size as the minimum(minimum(bucketsize), batchsize)
        buckets_count = np.array([enc_dec_data[assignments==i].shape[0] for i in range(self.num_buckets)])

        return buckets, buckets_count, assignments

    @property
    def default_bucket_key(self):
        return np.amax(self.buckets)

    @property
    def provide_data(self): # this is necessary when specifying custom DataIter
        # length of data variable is length of encoder + length of decoder
        bucket_key = self.default_bucket_key
        return [('data', (self.batch_size, bucket_key))] + self.init_states
    #
    @property
    def provide_label(self): # this is necessary when specifying custom DataIter
        # length of label variable is only the length of decoder
        bucket_key = self.default_bucket_key
        return [('label', (self.batch_size, bucket_key))]

    # for custom DataIter, we must implement this class as an iterable and return a DataBatch
    def __iter__(self): # this is necessary to convert this class into an iterable
        return self

    def __next__(self):
        if self.iter_next():
            # suppose to get self.cursor:self.cursor + self.batch_size
            batch = self.data_label[self.assignments == self.cur_permute_bucket]\
                [self.in_bucket_permutation[self.cursor:self.cursor+self.batch_size]]

            # get size of this bucket
            seqlen = self.buckets[self.cur_permute_bucket] # this seqlen already deducted the <EOS>

            all_data = np.full((self.batch_size, seqlen), self.pad_label, dtype=float)
            all_label = np.full((self.batch_size, seqlen), self.pad_label, dtype=float)

            for i, (data, label) in enumerate(batch):
                if self.rev:
                    # reverse the input except for the <EOS> at end of input
                    # according to Ilya Sutskever et al. Sequence to Sequence Learning with Neural Networks
                    # there is a reason for this... which you should ask freddy
                    data[:-1] = np.flipud(data[:-1])

                all_data[i, :data.shape[0]] = data
                all_data[i, data.shape[0]:data.shape[0]+label.shape[0]-1] = label[:-1]
                all_label[i, data.shape[0]-1:data.shape[0]-1+label.shape[0]] = label

            return EncoderDecoderBatch(all_data, all_label, self.init_states, seqlen)
        else:
            raise StopIteration

    def iter_next(self):
        self.cursor += self.batch_size
        if self.cursor < self.buckets_count[self.cur_permute_bucket]:
            if self.cursor + self.batch_size > self.buckets_count[self.cur_permute_bucket]:
                # it is going to overflow the bucket
                self.cursor -= self.cursor + self.batch_size - self.buckets_count[self.cur_permute_bucket]
            return True
        else:
            self.cur_bucket += 1
            if self.cur_bucket < self.num_buckets:
                self.cursor = 0
                self.cur_permute_bucket = self.bucket_permutation[self.cur_bucket]
                if self.shuffle:
                    self.in_bucket_permutation = np.random.permutation(self.buckets_count[self.cur_permute_bucket])
                else:
                    self.in_bucket_permutation = np.array(range(self.buckets_count[self.cur_permute_bucket]))
                return True
            else:
                return False

    def reset(self): # for iterable
        self.cursor = -self.batch_size
        self.cur_bucket = 0

        if self.shuffle:
            self.bucket_permutation = np.random.permutation(self.num_buckets)
        else:
            self.bucket_permutation = np.array(range(self.num_buckets))

        self.cur_permute_bucket = self.bucket_permutation[self.cur_bucket]
        if self.shuffle:
            self.in_bucket_permutation = np.random.permutation(self.buckets_count[self.cur_permute_bucket])
        else:
            self.in_bucket_permutation = np.array(range(self.buckets_count[self.cur_permute_bucket]))


In [138]:
train_iter = EncoderDecoderIter(train_data_label, word2idx, idx2word,
            num_hidden, num_layers, get_lstm_init_states, batch_size=batch_size,
            num_buckets=num_buckets, shuffle=shuffle, rev=reverse)

In [139]:
def print_iter(iter):
    iter.reset()
    print('provide_data: ', iter.provide_data)
    print('provide_label: ', iter.provide_label)
    print('buckets: ', iter.buckets)
    print('buckets count: ', iter.buckets_count)
    print('assignments: ', iter.assignments)
    print('batch_size: ', iter.batch_size)
    for i, data_batch in enumerate(iter):
        print(i, data_batch.provide_data)
        print(i, data_batch.provide_label)
        print(i, data_batch.bucket_key)
#         print(i, data_batch.data)
        for j, d in enumerate(data_batch.data):
#             print(i, j, data_batch.data[j].shape)
            if j==0:
                print(i, 'data:', data_batch.data[j].asnumpy())
#         print(i, data_batch.label)
#         print(i, data_batch.label[0].shape)
        print(i, 'label:', data_batch.label[0].asnumpy())
#         print('\n')
        break

In [140]:
print_iter(train_iter)

provide_data:  [('data', (1, 12)), ('l0_init_h', (1, 10)), ('l1_init_h', (1, 10)), ('l2_init_h', (1, 10)), ('l0_init_c', (1, 10)), ('l1_init_c', (1, 10)), ('l2_init_c', (1, 10))]
provide_label:  [('label', (1, 12))]
buckets:  [12]
buckets count:  [10]
assignments:  [0 0 0 0 0 0 0 0 0 0]
batch_size:  1
0 [('data', (1, 12)), ('l0_init_h', (1, 10)), ('l1_init_h', (1, 10)), ('l2_init_h', (1, 10)), ('l0_init_c', (1, 10)), ('l1_init_c', (1, 10)), ('l2_init_c', (1, 10))]
0 [('label', (1, 12))]
0 12
0 data: [[  6.  41.  23.  42.  42.  42.  42.  42.  42.  42.  42.  42.]]
0 label: [[ 42.  23.  41.  42.  42.  42.  42.  42.  42.  42.  42.  42.]]


In [147]:
import os

context = mx.cpu()

model_args = {}
if os.path.isfile('%s/%s-symbol.json' % (params_dir, expt_name)):
    filelist = os.listdir(params_dir) # get list of params file
    paramfilelist = []
    for f in filelist:
        if f.startswith('%s-' % expt_name) and f.endswith('.params'):
            paramfilelist.append( int(re.split(r'[-.]', f)[1]) )
    last_iteration = max(paramfilelist)
    print('loading pretrained model %s/%s at epoch %d' % (params_dir, expt_name, last_iteration))
    tmp = mx.model.FeedForward.load('%s/%s' % (params_dir, expt_name), last_iteration)
    model_args.update({
        'arg_params' : tmp.arg_params,
        'aux_params' : tmp.aux_params,
        'begin_epoch' : tmp.begin_epoch
    })

num_labels = len(word2idx)
iterations = 1000
model = mx.model.FeedForward(
    ctx           = context, # uses all the available CPU in the machine
    symbol        = get_lstm_sym_generator(num_layers, num_hidden, num_labels),
    num_epoch     = iterations,
    learning_rate = 0.1,
    momentum      = 0.0,
    wd            = 0.00001,
    initializer   = mx.init.Xavier(factor_type="in", magnitude=2.34),
    **model_args
)

if not os.path.exists(params_dir):
    os.makedirs(params_dir)

loading pretrained model params/simple at epoch 1


In [148]:
model.fit(
    X = train_iter,
    eval_metric = mx.metric.np(perplexity, use_ignore=True, ignore_label=num_labels),
    batch_end_callback = [ mx.callback.Speedometer(batch_size, frequent=10) ],
    epoch_end_callback = [ mx.callback.do_checkpoint( '%s/%s' % (params_dir, expt_name) ) ]
)

In [149]:
last_iteration = iterations
print('loading pretrained model %s/%s at epoch %d' % (params_dir, expt_name, last_iteration))
_, arg_params, __ = mx.model.load_checkpoint('%s/%s' % (params_dir, expt_name), last_iteration)

loading pretrained model params/simple at epoch 1000


In [155]:
def lstm_inference_symbol(num_layer, num_hidden, num_labels, dropout=0.0):
    param_cells, last_states = init_lstm(num_layer)
    
    data = mx.sym.Variable('data')
    embed_weight=mx.sym.Variable("embed_weight")
    
    hidden = mx.sym.Embedding(data=data, input_dim=num_labels, weight=embed_weight, output_dim=num_hidden, name='embed')
    
    # stack layers of LSTM for 1 sequence
    for i in range(num_layer):
        dp = 0.0 if i == 0 else dropout
        next_state = lstm_cell(
            num_hidden,
            indata=hidden,
            prev_state=last_states[i],
            param=param_cells[i],
            seqidx=0,
            layeridx=i,
            dropout=dp
        )
        hidden = next_state.h
        last_states[i] = next_state
    
    if dropout > 0.0:
        hidden = mx.sym.Dropout(data=hidden, p=dropout)
    
    output = []
    for state in last_states:
        # very important to be in this order!!!
        output.append(state.h)
        output.append(state.c)
    
    return mx.sym.Group(output)

class LSTMInferenceModel(object):
    def __init__(self, num_layer, num_hidden, num_labels, arg_params, ctx=mx.cpu(), dropout=0.0):
        
        self.sym = lstm_inference_symbol(num_layer, num_hidden, num_labels, dropout)
        self.num_labels = num_labels
        
        batch_size = 1
        init_states = get_lstm_init_states(num_layer, num_hidden, batch_size)
        data_shape = [("data", (batch_size, ))]

        input_shapes = dict(init_states + data_shape)
        self.executor = self.sym.simple_bind(ctx=ctx, **input_shapes)

        # copy the transition parameters over to executor
        for key in self.executor.arg_dict.keys():
            if key in arg_params:
                arg_params[key].copyto(self.executor.arg_dict[key])

        state_name = []
        for i in range(num_layer):
            # very important to be in this order!!!
            state_name.append("l%d_init_h" % i)
            state_name.append("l%d_init_c" % i)

        self.states_dict = dict(zip(state_name, self.executor.outputs)) # this transfer the output of previous state to current

        self.cls_weight = arg_params['cls_weight']
        self.cls_bias   = arg_params['cls_bias']
        self.ctx = ctx

    def predict(self, x):
        # another symbolic graph here... 
        data       = mx.sym.Variable('data')
        cls_weight = mx.sym.Variable("cls_weight")
        cls_bias   = mx.sym.Variable("cls_bias")
    
        pred = mx.sym.FullyConnected(
            data       = data,
            num_hidden = self.num_labels,
            weight     = cls_weight,
            bias       = cls_bias,
            name       = 'pred'
        )
        
        output = mx.sym.SoftmaxOutput(
            data = pred,
            name = 'softmax'
        )
        
        executor = output.bind(ctx=self.ctx, args={
            'data': x,
            'cls_weight': self.cls_weight,
            'cls_bias'  : self.cls_bias,
            'softmax_label': mx.nd.array([0]) # this is a dummy label, just meant to fulfill the requirements...
        })
        
        executor.forward()
        prob = np.squeeze(executor.outputs[0].asnumpy())
        return prob
        
    def forward(self, input_data, new_seq=False):
        # input data is of shape (seqlen, dim)
        # input data has to be of type numpy.array
        if new_seq == True:
            # this is meant to reset the initial states to 0.0
            for key in self.states_dict.keys():
                self.executor.arg_dict[key][:] = 0.0
        
        for x in input_data:
            y = mx.nd.array([x]) # put it in a [] so that the shape becomes (1, xxx)
            y.copyto(self.executor.arg_dict["data"])
            self.executor.forward() # move forward one step...
            for key in self.states_dict.keys():
                # copy the hidden and c to the init_states for the next sequence
#                 print('forwarding', key)
                self.states_dict[key].copyto(self.executor.arg_dict[key])
        
        return self.predict(self.states_dict['l2_init_h']) # change this to use last layer next time...
    
    def translate(self, text, reverse=True):
        

In [158]:
model2 = LSTMInferenceModel(num_layers, num_hidden, num_labels, arg_params)

In [171]:
# get the word...
def get_word(prob, idx2word, sample=True):
    if sample:
        cdf = np.cumsum(prob) / np.sum(prob)
        idx = np.argmax(np.random.rand(1) < cdf)
    else:
        idx = np.argmax(prob)
    return idx, idx2word[idx]

In [182]:
def translate(model2, text, idx2word, reverse=True):
    data = text_2_indices(word2idx, text)
    if reverse:
        data[:-1] = np.flipud(data[:-1])
    eos_idx = word2idx['<EOS>']
    
    words = ''
    prob = model2.forward(data, new_seq=True)
    idx, word = get_word(prob, idx2word, sample=True)
    while idx != eos_idx:
        words += word + ' '
        prob = model2.forward(np.array([idx]))
        idx, word = get_word(prob, idx2word, sample=True)
    
    return words.strip()

In [190]:
translate(model2, 'good dinner', idx2word)

'спасибо'

In [188]:
translate(model2, 'i just ate', idx2word)

'ужин хороший'

In [None]:
я только что съел мой ужин