In [1]:
import mxnet as mx
import numpy as np

In [2]:
# Source parallel corpus: the split cell reads these two files in lockstep,
# one English line together with one Russian line.
english_file = 'data/news-commentary-v9.ru-en.en'
russian_file = 'data/news-commentary-v9.ru-en.ru'

# Destination files for the training portion of the split.
train_en_file = 'data/train.en'
train_ru_file = 'data/train.ru'

# Destination files for the held-out (test) portion of the split.
test_en_file = 'data/test.en'
test_ru_file = 'data/test.ru'

In [None]:
# Probability with which a sentence pair is written to the training files;
# pairs that are not selected are written to the test files.
train_proportion = 0.9

In [3]:
import re
punctuation = re.compile(r'([!,.?:;@#$%&\'\"]+)')
def filter_text(text):
    """Normalise one line of raw text into a whitespace-separable token string.

    Every run of characters matched by ``punctuation`` is padded with a space
    on each side, the text is lower-cased, surrounding spaces/tabs/newlines
    are trimmed, and the ``<EOS>`` marker is appended after a single space.

    :param text: raw input line
    :return: the normalised line, always ending in `` <EOS>``
    """
    # this mirrors the scala function used for processing the text
    spaced = punctuation.sub(r' \1 ', text)
    normalised = spaced.lower().strip(' \t\n')
    return normalised + ' <EOS>'

In [None]:
# Split the parallel corpus into train/test files, one sentence pair per line.
# All six files are managed by a single ``with`` so they are closed even when
# an error interrupts the split; the inputs are opened first so that a missing
# corpus file does not truncate existing output files.
with open(english_file, 'r', encoding='utf-8', errors='ignore') as fid_en, \
     open(russian_file, 'r', encoding='utf-8', errors='ignore') as fid_ru, \
     open(train_en_file, 'w', encoding='utf-8') as train_en_fid, \
     open(train_ru_file, 'w', encoding='utf-8') as train_ru_fid, \
     open(test_en_file, 'w', encoding='utf-8') as test_en_fid, \
     open(test_ru_file, 'w', encoding='utf-8') as test_ru_fid:

    # zip advances both corpora together (so alignment is kept even when a
    # pair is skipped) and stops at the shorter file instead of pairing the
    # remaining English lines with empty Russian lines.
    for line_en, line_ru in zip(fid_en, fid_ru):
        line_en = line_en.strip(' \n\t')
        if len(line_en) == 0:
            # skip pairs whose English side is blank
            continue

        line_en = line_en + '\n'
        line_ru = line_ru.strip(' \n\t') + '\n'

        # random assignment: train with probability train_proportion
        if np.random.rand() < train_proportion:
            train_en_fid.write(line_en)
            train_ru_fid.write(line_ru)
        else:
            test_en_fid.write(line_en)
            test_ru_fid.write(line_ru)

In [4]:
white_spaces = re.compile(r'[ \n\r\t]+')

def get_vocab(file, vocab_count):
    """Accumulate token frequencies from a text file into ``vocab_count``.

    Each non-blank line is normalised with ``filter_text`` (which also appends
    ``<EOS>``), split on runs of whitespace, and every non-empty token is
    counted.

    :param file: path of the UTF-8 text file to read (undecodable bytes are ignored)
    :param vocab_count: dict mapping token -> count; updated in place
    :return: the same ``vocab_count`` dict
    """
    with open(file, 'r', encoding='utf-8', errors='ignore') as fid:
        for line in fid:
            # lines read from a file keep their trailing newline, so a
            # length check never detects a blank line; test the stripped text
            if not line.strip():
                continue
            for token in white_spaces.split(filter_text(line)):
                if token:
                    vocab_count[token] = vocab_count.get(token, 0) + 1
    return vocab_count

def text_2_indices(word2idx, text):
    """Convert a line of text into a numpy array of vocabulary indices.

    The text is normalised with ``filter_text`` (which appends ``<EOS>``) and
    split on whitespace. Tokens missing from ``word2idx`` are mapped to the
    index stored under ``'UNK'``.

    :param word2idx: dict mapping token -> index; expected to contain 'UNK'
    :param text: raw input line
    :return: 1-D numpy array of indices, one per token
    """
    unk_index = word2idx.get('UNK')
    tokens = white_spaces.split(filter_text(text))
    # Splitting can yield empty strings (e.g. for a blank line); drop them, as
    # get_vocab does, rather than encoding them as UNK.
    indices = [word2idx.get(token, unk_index) for token in tokens if token]
    return np.array(indices)

In [13]:
def get_unified_vocab(enc_input_file, dec_input_file, min_count_percentile=80):
    """Build one vocabulary shared by the encoder and decoder text files.

    Token counts from both files are pooled. Only tokens whose count is at
    least the ``min_count_percentile``-th percentile of all counts are kept.
    ``'UNK'`` is added, the vocabulary is sorted, and ``'<PAD>'`` is appended
    last so that it receives the highest index.

    :param enc_input_file: path of the encoder-side text file
    :param dec_input_file: path of the decoder-side text file
    :param min_count_percentile: percentile (0-100) of the count distribution
        used as the minimum count for keeping a token; defaults to 80
    :return: ``(word2idx, idx2word)`` where ``word2idx`` is a dict and
        ``idx2word`` is a list indexed by token id
    """
    vocab_count = {}
    vocab_count = get_vocab(enc_input_file, vocab_count)
    vocab_count = get_vocab(dec_input_file, vocab_count)

    vocab = []
    if vocab_count:  # np.percentile cannot handle an empty distribution
        word_distribution = np.array(list(vocab_count.values()))
        min_count = np.percentile(word_distribution, min_count_percentile)
        vocab = [word for word, count in vocab_count.items() if count >= min_count]
    vocab.append('UNK')
    vocab.sort()
    word2idx = {w: i for i, w in enumerate(vocab)}
    idx2word = list(vocab)

    # <PAD> goes last: its index equals the number of real tokens
    pad_index = len(word2idx)
    word2idx['<PAD>'] = pad_index
    idx2word.append('<PAD>')
    return word2idx, idx2word

def get_data_label(enc_input_file, dec_input_file, word2idx):
    """Index-encode two parallel text files into (encoder, decoder) pairs.

    :param enc_input_file: path of the encoder-side text file
    :param dec_input_file: path of the decoder-side text file
    :param word2idx: dict mapping token -> index (see ``text_2_indices``)
    :return: object array of shape ``(N, 2)``; ``result[i, 0]`` and
        ``result[i, 1]`` are the 1-D index arrays of line ``i`` of the encoder
        and decoder file. ``N`` is the line count of the shorter file.
    """
    def read_indices(path):
        # one index array per line of the file
        with open(path, 'r', encoding='utf-8', errors='ignore') as fid:
            return [text_2_indices(word2idx, line) for line in fid]

    enc_input = read_indices(enc_input_file)
    dec_input = read_indices(dec_input_file)

    # Sequences have different lengths, so build the object array explicitly:
    # np.array() on ragged nested sequences is rejected by recent NumPy, and
    # would silently produce a 3-D numeric array if all lengths happened to match.
    n_pairs = min(len(enc_input), len(dec_input))
    data_label = np.empty((n_pairs, 2), dtype=object)
    for i in range(n_pairs):
        data_label[i, 0] = enc_input[i]
        data_label[i, 1] = dec_input[i]
    return data_label

In [14]:
# Shared English/Russian vocabulary built from the training split.
word2idx, idx2word = get_unified_vocab(train_en_file, train_ru_file)

# Index-encoded (encoder, decoder) training pairs. Later cells read this as
# `data_label`, which was otherwise never defined on a fresh kernel.
data_label = get_data_label(train_en_file, train_ru_file, word2idx)

In [15]:
word2idx['electing']

5529

In [18]:
idx2word[-2]

'\uf0b7'

In [None]:
# 'electing': 5529

In [None]:
def get_lstm_init_states(num_layers, num_dim, batch_size=1):
    """Describe the initial LSTM state inputs as ``(name, shape)`` pairs.

    All hidden-state entries (``l<i>_init_h``) come first, followed by all
    cell-state entries (``l<i>_init_c``); every shape is
    ``(batch_size, num_dim)``.

    :param num_layers: number of stacked LSTM layers
    :param num_dim: size of each state vector
    :param batch_size: number of sequences per batch
    :return: list of ``(name, shape)`` tuples, ``2 * num_layers`` long
    """
    state_shape = (batch_size, num_dim)
    init_states = []
    for name_template in ('l%d_init_h', 'l%d_init_c'):
        for layer in range(num_layers):
            init_states.append((name_template % layer, state_shape))
    return init_states

In [None]:
from collections import namedtuple
from sklearn.cluster import KMeans

EncDecBucketKey = namedtuple('EncDecBucketKey', ['enc_len', 'dec_len'])

class EncoderDecoderBatch(object):
    """A single mini-batch of padded data/labels plus zeroed initial states.

    Exposes the attributes read from a batch elsewhere in this notebook:
    ``data``, ``label``, ``bucket_key``, ``provide_data`` and ``provide_label``.
    """

    def __init__(self, all_data, all_label, init_states, bucket_key):
        """
        :param all_data: array holding the network input; its first axis is
            the batch axis
        :param all_label: array holding the labels
        :param init_states: list of ``(name, shape)`` pairs; a zero array of
            each shape is appended to ``data``
        :param bucket_key: ``EncDecBucketKey`` this batch was padded to
        """
        # the bucket key identifies which unrolled network the batch belongs to
        self.bucket_key = bucket_key
        self.init_states = init_states
        self.batch_size = all_data.shape[0]

        # data = the input array followed by one zero array per initial state
        data_arrays = [mx.nd.array(all_data)]
        for state in init_states:
            data_arrays.append(mx.nd.zeros(state[1]))
        self.data = data_arrays

        self.label = [mx.nd.array(all_label)]

    @property
    def provide_data(self):
        """Names and shapes of the entries in ``data``."""
        key = self.bucket_key
        data_desc = ('data', (self.batch_size, key.enc_len + key.dec_len))
        return [data_desc] + self.init_states

    @property
    def provide_label(self):
        """Name and shape of the label array."""
        label_shape = (self.batch_size, self.bucket_key.dec_len)
        return [('label', label_shape)]


def synchronize_batch_size(train_iter, test_iter):
    """Make both iterators use the smaller of their two batch sizes.

    After the batch sizes are aligned, each iterator regenerates its initial
    states so that they match the new batch size.
    """
    shared_size = min(train_iter.batch_size, test_iter.batch_size)
    iterators = (train_iter, test_iter)
    for data_iter in iterators:
        data_iter.batch_size = shared_size
    for data_iter in iterators:
        data_iter.generate_init_states()


# now define the bucketing, padding and batching SequenceIterator...
class EncoderDecoderIter(mx.io.DataIter):
    """Bucketing, padding and batching iterator over (encoder, decoder) pairs.

    Sequence pairs are grouped into ``num_buckets`` buckets by k-means
    clustering of their (encoder length, decoder length). Each bucket is
    padded to the largest lengths it contains, and batches are drawn one
    bucket at a time.
    """

    def __init__(self, data_label, word2idx, idx2word, num_hidden, num_layers,
                 init_states_function, batch_size=1, num_buckets=10, shuffle=False, rev=False):
        """
        :param data_label: array of (encoder indices, decoder indices) pairs,
            indexable along its first axis with boolean and integer arrays
        :param word2idx: dict token -> index; must contain '<PAD>'
        :param idx2word: list index -> token
        :param num_hidden: size of the LSTM state vectors
        :param num_layers: number of LSTM layers
        :param init_states_function: callable
            ``(num_layers, num_hidden, batch_size) -> [(name, shape), ...]``
        :param batch_size: requested batch size; capped at the size of the
            smallest bucket
        :param num_buckets: number of k-means clusters / buckets
        :param shuffle: shuffle bucket order and the order within each bucket
        :param rev: reverse the encoder input (except its trailing <EOS>)
        """
        super(EncoderDecoderIter, self).__init__()  # calling DataIter.__init__()

        self.data_label = data_label

        self.word2idx = word2idx
        self.idx2word = idx2word

        self.num_hidden = num_hidden
        self.num_layers = num_layers
        self.num_buckets = num_buckets

        # buckets are (enc_len, dec_len) pairs derived from the input data
        self.buckets, self.buckets_count, self.assignments = self.generate_buckets()

        # never ask for more sequences than the smallest bucket holds
        self.batch_size = min(np.min(self.buckets_count), batch_size)
        self.init_states_function = init_states_function
        self.pad_label = word2idx['<PAD>']
        self.shuffle = shuffle
        self.rev = rev  # reverse the encoder input
        self.reset()
        self.generate_init_states()

    def generate_init_states(self):
        """(Re)build the initial-state descriptions for the current batch size."""
        self.init_states = self.init_states_function(self.num_layers, self.num_hidden, self.batch_size)

    def generate_buckets(self):
        """Cluster the sequence pairs by length.

        :return: ``(buckets, buckets_count, assignments)``: the per-bucket
            maximum (enc_len, dec_len), the number of pairs in each bucket,
            and the bucket id assigned to every pair
        """
        enc_dec_data = []
        # use this iterator's own data, not a notebook-level global
        for data, label in self.data_label:
            enc_len = len(data) - 1  # minus one because of the <EOS>
            dec_len = len(label)
            enc_dec_data.append((enc_len, dec_len))

        enc_dec_data = np.array(enc_dec_data)

        # use clustering to decide the buckets
        kmeans = KMeans(n_clusters=self.num_buckets, random_state=1)
        assignments = kmeans.fit_predict(enc_dec_data)

        # each bucket is as long as the longest sequences assigned to it
        buckets = np.array([np.max(enc_dec_data[assignments == i], axis=0) for i in range(self.num_buckets)])

        # number of sequences in each bucket
        buckets_count = np.array([enc_dec_data[assignments == i].shape[0] for i in range(self.num_buckets)])

        return buckets, buckets_count, assignments

    def _permutation(self, size):
        """Return a random ordering of ``range(size)`` if shuffling, else the identity ordering."""
        if self.shuffle:
            return np.random.permutation(size)
        return np.array(range(size))

    @property
    def default_bucket_key(self):
        """Bucket key built from the largest encoder and decoder lengths over all buckets."""
        enc_len, dec_len = np.max(self.buckets, axis=0)
        return EncDecBucketKey(enc_len=enc_len, dec_len=dec_len)

    @property
    def provide_data(self):  # this is necessary when specifying custom DataIter
        """Data names/shapes for the default bucket (input length = encoder + decoder length)."""
        enc_dec_bucket_key = self.default_bucket_key
        return [('data', (self.batch_size, enc_dec_bucket_key.enc_len + enc_dec_bucket_key.dec_len))] + self.init_states

    @property
    def provide_label(self):  # this is necessary when specifying custom DataIter
        """Label name/shape for the default bucket (label length = decoder length)."""
        enc_dec_bucket_key = self.default_bucket_key
        return [('label', (self.batch_size, enc_dec_bucket_key.dec_len))]

    # for custom DataIter, we must implement this class as an iterable and return a DataBatch
    def __iter__(self):
        return self

    def __next__(self):
        """Return the next padded ``EncoderDecoderBatch`` or raise ``StopIteration``."""
        if not self.iter_next():
            raise StopIteration

        # sequences of the current bucket, in this epoch's order
        in_bucket = self.data_label[self.assignments == self.cur_permute_bucket]
        batch = in_bucket[self.in_bucket_permutation[self.cursor:self.cursor + self.batch_size]]

        # enc_len already excludes the <EOS>; the unrolled network is
        # enc_len + dec_len steps long
        enc_len, dec_len = self.buckets[self.cur_permute_bucket]

        all_data = np.full((self.batch_size, enc_len + dec_len), self.pad_label, dtype=float)
        all_label = np.full((self.batch_size, dec_len), self.pad_label, dtype=float)

        for i, (data, label) in enumerate(batch):
            if self.rev:
                # reverse the input except for the <EOS> at end of input
                # according to Ilya Sutskever et al. Sequence to Sequence Learning with Neural Networks
                # Build a reversed COPY: `data` is the stored sequence itself,
                # so reversing it in place would flip it back every second epoch.
                data = np.concatenate((data[:-1][::-1], data[-1:]))

            enc_input = np.concatenate((data, label[:-1]))  # data <EOS> label
            # left-pad the encoder part so that its <EOS> lands on index enc_len
            z = enc_len - data.shape[0] + 1
            all_data[i, z:enc_len + label.shape[0]] = enc_input
            all_label[i, :label.shape[0]] = label

        return EncoderDecoderBatch(all_data, all_label, self.init_states, EncDecBucketKey(enc_len=enc_len, dec_len=dec_len))

    def iter_next(self):
        """Advance the cursor to the next batch, moving to the next bucket when needed.

        :return: True if another batch is available, False at the end of the epoch
        """
        self.cursor += self.batch_size
        if self.cursor < self.buckets_count[self.cur_permute_bucket]:
            if self.cursor + self.batch_size > self.buckets_count[self.cur_permute_bucket]:
                # the batch would run past the bucket: pull the cursor back so
                # the last batch is full (it overlaps the previous one)
                self.cursor -= self.cursor + self.batch_size - self.buckets_count[self.cur_permute_bucket]
            return True

        self.cur_bucket += 1
        if self.cur_bucket >= self.num_buckets:
            return False
        self.cursor = 0
        self.cur_permute_bucket = self.bucket_permutation[self.cur_bucket]
        self.in_bucket_permutation = self._permutation(self.buckets_count[self.cur_permute_bucket])
        return True

    def reset(self):  # for iterable
        """Rewind to the first bucket and redraw the bucket / in-bucket orderings."""
        # start one batch before zero so the first iter_next() lands on 0
        self.cursor = -self.batch_size
        self.cur_bucket = 0

        self.bucket_permutation = self._permutation(self.num_buckets)
        self.cur_permute_bucket = self.bucket_permutation[self.cur_bucket]
        self.in_bucket_permutation = self._permutation(self.buckets_count[self.cur_permute_bucket])

In [None]:
# Training / model configuration.
num_buckets = 10
num_layers  = 1
batch_size  = 10
iterations  = 1
expt_name   = 'test'
params_dir  = 'params'
seed        = 1
shuffle     = True
rev         = True
dropout     = 0.0
context     = mx.cpu()
num_hidden  = 10

# Number of classes the network predicts: every vocabulary entry except <PAD>.
# <PAD> has the highest index, so its index equals that count. (`vocab` only
# exists inside get_unified_vocab and is not available here.)
num_labels = word2idx['<PAD>']

In [None]:
def print_iter(iter):
    """Print a summary of ``iter`` followed by details of its first batch.

    The iterator is reset first. Only the first batch is pulled from it;
    nothing batch-related is printed if the iterator is empty.
    """
    iter.reset()

    # iterator-level summary: (caption, attribute name)
    summary_fields = (
        ('provide_data: ', 'provide_data'),
        ('provide_label: ', 'provide_label'),
        ('buckets: ', 'buckets'),
        ('buckets count: ', 'buckets_count'),
        ('assignments: ', 'assignments'),
        ('batch_size: ', 'batch_size'),
    )
    for caption, attr_name in summary_fields:
        print(caption, getattr(iter, attr_name))

    # inspect the first batch only
    first = next(enumerate(iter), None)
    if first is None:
        return
    i, data_batch = first

    print(i, data_batch.provide_data)
    print(i, data_batch.provide_label)
    print(i, data_batch.bucket_key)
    print(i, data_batch.data)
    for j, array in enumerate(data_batch.data):
        print(i, j, array.shape)
    print(i, data_batch.label)
    print(i, data_batch.label[0].shape)

In [None]:
# Build the training iterator for inspection with print_iter.
# NOTE(review): batch_size is passed as the literal 1 here, not the configured
# `batch_size` variable; confirm this is intended.
train_iter = EncoderDecoderIter(
    data_label, word2idx, idx2word, num_hidden, num_layers, 
    get_lstm_init_states, batch_size=1, num_buckets=num_buckets, shuffle=shuffle, rev=rev
)

In [None]:
print_iter(train_iter)

In [None]:
# LSTMState holds the cell (c) and hidden (h) state symbols of one layer.
LSTMState = namedtuple("LSTMState", ["c", "h"])
# LSTMParam holds the weight/bias symbols of one layer's two projections.
LSTMParam = namedtuple("LSTMParam", ["i2h_weight", "i2h_bias", "h2h_weight", "h2h_bias"])

def lstm_cell(num_hidden, indata, prev_state, param, seqidx, layeridx, dropout=0.):
    """Build the symbol graph of one LSTM step for one layer.

    :param num_hidden: size of the hidden/cell state; both projections output
        ``4 * num_hidden`` values (one slice per gate)
    :param indata: input symbol for this step
    :param prev_state: ``LSTMState`` from the previous step of this layer
    :param param: ``LSTMParam`` with this layer's weights and biases
    :param seqidx: time-step index, used only in operator names
    :param layeridx: layer index, used only in operator names
    :param dropout: dropout probability applied to ``indata`` when > 0
    :return: ``LSTMState`` with the new cell and hidden state symbols
    """
    if dropout > 0.:
        indata = mx.sym.Dropout(data=indata, p=dropout)
    # input-to-hidden projection
    i2h = mx.sym.FullyConnected(
        data=indata,
        weight=param.i2h_weight,
        bias=param.i2h_bias,
        num_hidden=num_hidden * 4,
        name="t%d_l%d_i2h" % (seqidx, layeridx)
    )

    # hidden-to-hidden projection of the previous hidden state
    h2h = mx.sym.FullyConnected(
        data=prev_state.h,
        weight=param.h2h_weight,
        bias=param.h2h_bias,
        num_hidden=num_hidden * 4,
        name="t%d_l%d_h2h" % (seqidx, layeridx)
    )

    # sum both projections, then split the result into the four gate pre-activations
    gates = i2h + h2h
    slice_gates = mx.sym.SliceChannel(gates, num_outputs=4, name="t%d_l%d_slice" % (seqidx, layeridx))

    in_gate = mx.sym.Activation(slice_gates[0], act_type="sigmoid")
    in_transform = mx.sym.Activation(slice_gates[1], act_type="tanh")
    forget_gate = mx.sym.Activation(slice_gates[2], act_type="sigmoid")
    out_gate = mx.sym.Activation(slice_gates[3], act_type="sigmoid")

    # new cell state: gated previous cell state plus gated candidate
    next_c = (forget_gate * prev_state.c) + (in_gate * in_transform)
    # new hidden state: output gate applied to the squashed cell state
    next_h = out_gate * mx.sym.Activation(next_c, act_type="tanh")

    return LSTMState(c=next_c, h=next_h)

def init_lstm(num_lstm_layer):
    """Create the weight and initial-state variables for every LSTM layer.

    :param num_lstm_layer: number of stacked LSTM layers
    :return: ``(param_cells, last_states)``: a list with one ``LSTMParam`` per
        layer and a list with one ``LSTMState`` (initial c/h variables) per layer
    """
    param_cells = []
    last_states = []
    for layer in range(num_lstm_layer):
        # projection weights/biases of this layer
        layer_params = LSTMParam(
            i2h_weight=mx.sym.Variable("l%d_i2h_weight" % layer),
            i2h_bias=mx.sym.Variable("l%d_i2h_bias" % layer),
            h2h_weight=mx.sym.Variable("l%d_h2h_weight" % layer),
            h2h_bias=mx.sym.Variable("l%d_h2h_bias" % layer)
        )
        # initial cell/hidden state inputs of this layer
        layer_state = LSTMState(
            c=mx.sym.Variable("l%d_init_c" % layer),
            h=mx.sym.Variable("l%d_init_h" % layer)
        )
        param_cells.append(layer_params)
        last_states.append(layer_state)
    return param_cells, last_states

In [None]:
def lstm_unroll(num_lstm_layer, enc_len, dec_len, num_hidden, num_labels, dropout=0.0):
    """Unroll a stacked LSTM over ``enc_len + dec_len`` steps and attach a softmax.

    The ``data`` input is embedded and sliced per time step. Every step runs
    through all layers, and the top-layer hidden states of steps
    ``>= enc_len`` are concatenated and classified into ``num_labels`` classes.

    :param num_lstm_layer: number of stacked LSTM layers
    :param enc_len: number of encoder steps
    :param dec_len: number of decoder steps (length of ``label``)
    :param num_hidden: embedding size and LSTM state size
    :param num_labels: number of predicted classes; also the index treated as
        ``<PAD>`` and ignored by the loss
    :param dropout: dropout probability (0 disables dropout)
    :return: the ``SoftmaxOutput`` symbol
    """
    cls_weight = mx.sym.Variable("cls_weight")
    cls_bias = mx.sym.Variable("cls_bias")
    embed_weight = mx.sym.Variable("embed_weight")

    param_cells, last_states = init_lstm(num_lstm_layer)
    data = mx.sym.Variable('data')
    label = mx.sym.Variable('label')
    # (batch, time, vec) so axis 1 is the time step

    seq_len = enc_len + dec_len

    # The input is padded with the <PAD> index, which equals num_labels, so the
    # embedding table needs num_labels + 1 rows to keep that index in range.
    # NOTE: checkpoints saved with a (num_labels, num_hidden) embed_weight do
    # not match this shape and need retraining.
    embed = mx.sym.Embedding(
        data=data, input_dim=num_labels + 1,
        weight=embed_weight, output_dim=num_hidden, name='embed'
    )
    wordvec = mx.sym.SliceChannel(data=embed, num_outputs=seq_len, squeeze_axis=1)

    hidden_all = []
    for seqidx in range(seq_len):
        hidden = wordvec[seqidx]
        # stack LSTM
        for i in range(num_lstm_layer):
            # no dropout on the input of the first layer
            dp = 0.0 if i == 0 else dropout
            next_state = lstm_cell(
                num_hidden,
                indata=hidden,
                prev_state=last_states[i],
                param=param_cells[i],
                seqidx=seqidx,
                layeridx=i,
                dropout=dp
            )
            hidden = next_state.h
            last_states[i] = next_state
        # decoder
        if dropout > 0.0:
            hidden = mx.sym.Dropout(data=hidden, p=dropout)
        if seqidx >= enc_len:
            # only decoder steps produce predictions
            hidden_all.append(hidden)
    hidden_concat = mx.sym.Concat(*hidden_all, dim=0)
    pred = mx.sym.FullyConnected(
        data=hidden_concat,
        num_hidden=num_labels,  # predicts classes 0 .. num_labels-1, i.e. never <PAD>
        weight=cls_weight,
        bias=cls_bias,
        name='pred'
    )

    # flatten the labels time-step-major so they line up with the rows of
    # hidden_concat, which is concatenated step by step
    label = mx.sym.transpose(data=label)  # e.g. if shape is (1,M) it becomes (M,1)
    label = mx.sym.Reshape(data=label, shape=(-1,))  # if shape is (M,1) it becomes (M,)
    output = mx.sym.SoftmaxOutput(
        data=pred,
        label=label,
        # same name as before (index of the last step), computed explicitly
        # instead of relying on the loop variable surviving the loop
        name='t%d_softmax' % (seq_len - 1),
        use_ignore=True,
        ignore_label=num_labels  # ignore the index of <PAD>
    )
    return output

def get_lstm_sym_generator(num_layers, dim, num_labels, dropout=0.0):
    """Return a function that builds the unrolled LSTM symbol for a bucket key.

    The returned callable takes an object with ``enc_len`` and ``dec_len``
    attributes and forwards them, with the settings captured here, to
    ``lstm_unroll``.
    """
    def generate_lstm_sym(bucketkey):
        enc_len = bucketkey.enc_len
        dec_len = bucketkey.dec_len
        return lstm_unroll(num_layers, enc_len, dec_len, dim, num_labels, dropout)

    return generate_lstm_sym

def perplexity(label, pred, ignore_label):
    """Perplexity of ``pred`` against ``label``, skipping ignored positions.

    :param label: 2-D array ``(batch, steps)`` of class indices; it is
        flattened step-major (``label.T``) to line up with the rows of ``pred``
    :param pred: 2-D array whose row ``i`` holds the class probabilities for
        flattened label ``i``
    :param ignore_label: label value (padding) excluded from the metric
    :return: ``exp(mean negative log-likelihood)`` over the non-ignored
        positions; 1.0 when every position is ignored
    """
    flat_label = label.T.reshape((-1,))[:pred.shape[0]]

    # In the step-major layout a padded position of one sequence can precede
    # real tokens of another, so ignored labels are skipped individually
    # instead of stopping at the first one.
    keep = flat_label != ignore_label
    num_tokens = int(keep.sum())
    if num_tokens == 0:
        return np.float64(1.0)

    rows = np.flatnonzero(keep)
    cols = flat_label[keep].astype(int)
    # clamp probabilities to avoid log(0)
    probs = np.maximum(1e-10, pred[rows, cols])
    loss = -np.log(probs).sum()
    # normalise by the number of scored tokens, not by the padded size
    return np.exp(loss / num_tokens)

In [None]:
import os, logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

train_iter = EncoderDecoderIter(
    data_label, word2idx, idx2word, num_hidden, num_layers,
    get_lstm_init_states, batch_size=1, num_buckets=num_buckets, shuffle=shuffle, rev=rev
)

# the iterator may have capped the batch size at the smallest bucket
batch_size = train_iter.batch_size
print('batch_size:', batch_size)

# Resume from the most recent checkpoint of this experiment, if there is one.
model_args = {}
if os.path.isfile('%s/%s-symbol.json' % (params_dir, expt_name)):
    ckpt_prefix = '%s-' % expt_name
    ckpt_suffix = '.params'
    saved_epochs = []
    for f in os.listdir(params_dir):
        if f.startswith(ckpt_prefix) and f.endswith(ckpt_suffix):
            # Take the text between "<expt_name>-" and ".params" so that '-'
            # or '.' inside the experiment name cannot break the parsing, and
            # skip files of other experiments that merely share the prefix.
            epoch_text = f[len(ckpt_prefix):-len(ckpt_suffix)]
            if epoch_text.isdigit():
                saved_epochs.append(int(epoch_text))
    if saved_epochs:  # a symbol file without any .params file means nothing to resume
        last_iteration = max(saved_epochs)
        print('loading pretrained model %s/%s at epoch %d' % (params_dir, expt_name, last_iteration))
        tmp = mx.model.FeedForward.load('%s/%s' % (params_dir, expt_name), last_iteration)
        model_args.update({
            'arg_params' : tmp.arg_params,
            'aux_params' : tmp.aux_params,
            'begin_epoch' : tmp.begin_epoch
        })

model = mx.model.FeedForward(
    ctx           = context,
    symbol        = get_lstm_sym_generator(num_layers, num_hidden, num_labels, dropout),
    num_epoch     = iterations,
    learning_rate = 0.01,
    momentum      = 0.0,
    wd            = 0.00001,
    initializer   = mx.init.Xavier(factor_type="in", magnitude=2.34),
    **model_args
)

# checkpoints are written to params_dir at the end of every epoch
if not os.path.exists(params_dir):
    os.makedirs(params_dir)

In [None]:
# Train the model, logging throughput every 100 batches and checkpointing to
# '<params_dir>/<expt_name>' at the end of each epoch.
# NOTE(review): `perplexity` takes (label, pred, ignore_label); confirm that the
# installed mx.metric.np accepts use_ignore/ignore_label and forwards ignore_label.
model.fit(
    X = train_iter,
    eval_metric = mx.metric.np(perplexity, use_ignore=True, ignore_label=num_labels),
    batch_end_callback = [ mx.callback.Speedometer(batch_size, frequent=100) ],
    epoch_end_callback = [ mx.callback.do_checkpoint( '%s/%s' % (params_dir, expt_name) ) ]
)

In [None]:
len(data_label)

In [None]:
# Scratch inspection: load the 'july24' checkpoint (number 8) from 'params'.
# NOTE(review): hard-coded experiment name and epoch; this also rebinds `tmp`.
tmp = mx.model.FeedForward.load('%s/%s' % ('params', 'july24'), 8)

In [None]:
tmp.arg_params['embed_weight'].shape

In [None]:
word2idx['<PAD>']