In [1]:
import numpy as np
import mxnet as mx, math
import argparse, math
from mxnet import gluon
from mxnet.gluon import nn, rnn
import mxnet.ndarray as F
from mxnet import gluon, autograd
import os
import time

import pickle

In [2]:
# Prefer the first GPU, but fall back to the CPU so the notebook still runs
# on machines without a usable CUDA device.
try:
    context = mx.gpu(0)
    # Creating a tiny array forces MXNet to touch the device; this raises
    # MXNetError when no GPU is available.
    mx.nd.array([0], ctx=context)
except mx.base.MXNetError:
    context = mx.cpu(0)
print (context)

gpu(0)


In [3]:
# Hyperparameters and run settings read by the cells below.
emsize = 512    # size of word embeddings
nhid = 512      # number of hidden units per layer
nlayers = 1     # number of layers
lr = 0.1          # initial learning rate
clip = 0.25     # gradient clipping threshold (the clipping call in train() is commented out)
epochs = 10     # upper epoch limit
batch_size = 128    # batch size
bptt = 20           # sequence length
dropout = 0.0      # dropout applied to layers (0 = no dropout)
                   # NOTE(review): the model cell passes dropout=0.25 explicitly instead of this value
#tied = 'store_true'      # (disabled) tie the word embedding and softmax weights
log_interval = 200    # report interval, in batches
save = 'model.params'      # path to save the final model (not referenced in the visible cells)
gctype = 'none'        # type of gradient compression to use; takes `2bit` or `none` for now (not referenced in the visible cells)
gcthreshold = 0.5       # threshold for 2bit gradient compression (not referenced in the visible cells)
#mode = 'lstm'

# Load the data

In [4]:
# Read the pre-batched training set from disk.
# NOTE: pickle.load executes arbitrary code from the file; only load trusted data.
with open('text8.train.pkl', 'rb') as f:
    data = pickle.load(f)

# Convert the token inputs and their labels to NDArrays in one step each.
input = mx.nd.array(data['input'])
label = mx.nd.array(data['label'])

# Vocabulary size is the number of entries in the word dictionary.
vocab = len(data['worddic'])

In [5]:
# The model cell refers to the vocabulary size as `ntokens`.
ntokens = vocab
print(ntokens)

44369


# Define the model

In [12]:
class Adaptivesoftmax(gluon.Block):
    """Adaptive softmax output layer that also computes the training loss.

    ``cutoff`` splits the token ids into a head and ``len(cutoff) - 1`` tail
    clusters. The head scores ids below ``cutoff[0]`` plus one extra logit per
    tail cluster; tail cluster ``i`` scores ids in ``[cutoff[i], cutoff[i + 1])``.

    Parameters
    ----------
    input_size : int
        Width of the incoming features (the hidden size of the LSTM/RNN).
    cutoff : list of int
        Increasing cluster boundaries; the last entry bounds the last cluster.
    reduce_factor : int
        When 1, each tail is a single Dense layer. Otherwise tail ``i`` first
        projects to ``input_size // reduce_factor ** i`` units.
    """

    def __init__(self, input_size, cutoff, reduce_factor=4):
        # input_size refers to the hidden_size from LSTM/RNN
        super(Adaptivesoftmax, self).__init__()

        self.input_size = input_size
        self.cutoff = cutoff
        # Head logits: the frequent ids plus one logit per tail cluster.
        self.output_size = cutoff[0] + len(cutoff) - 1

        self.head = nn.Dense(units=self.output_size, in_units=input_size, flatten=False)
        self.tail = nn.Sequential()

        for i in range(len(cutoff) - 1):
            cluster_size = cutoff[i + 1] - cutoff[i]
            if reduce_factor == 1:
                seq = nn.Dense(units=cluster_size, in_units=input_size, flatten=False)
            else:
                projected = input_size // reduce_factor ** i
                seq = nn.Sequential()
                seq.add(nn.Dense(units=projected,
                                 in_units=input_size, flatten=False))
                seq.add(nn.Dense(units=cluster_size,
                                 in_units=projected, flatten=False))

            self.tail.add(seq)

    def set_target(self, target):
        """Record which rows of ``target`` belong to each cluster.

        After the call ``self.id`` holds one entry per tail cluster followed by
        a final entry for the head. Each entry is a 1-D boolean numpy mask over
        the flattened targets, or ``None`` when no target falls in that cluster.
        """
        self.id = []
        flat_target = target.asnumpy().reshape(-1)

        for i in range(len(self.cutoff)):
            if i < (len(self.cutoff) - 1):
                # Half-open range: an id equal to cutoff[i + 1] belongs to the
                # next cluster only, never to both.
                mask = (flat_target >= self.cutoff[i]) & (flat_target < self.cutoff[i + 1])
            else:
                mask = (flat_target < self.cutoff[0])

            self.id.append(mask if mask.any() else None)

    def _rows(self, mask, like):
        """Turn a boolean row mask into an int32 index NDArray on ``like``'s context."""
        return mx.nd.array(np.flatnonzero(mask), ctx=like.context, dtype='int32')

    def forward(self, input, target):
        """Return the mean negative log-likelihood of ``target`` given ``input``.

        ``input`` is indexed by row and ``target`` supplies one token id per row
        (presumably shapes (bptt*batch_size, hidden_size) and (bptt*batch_size, 1)
        as produced by the caller -- TODO confirm). When ``target`` is None the
        cluster assignment from the previous ``set_target`` call is reused.
        """
        output_head = self.head(input)
        nnloss = 0
        self.target = target

        if self.target is not None:
            self.set_target(self.target)

        # Tail scores are already log-probabilities; head scores are raw logits.
        tail_loss = gluon.loss.SoftmaxCrossEntropyLoss(from_logits=True)
        head_loss = gluon.loss.SoftmaxCrossEntropyLoss()

        for i in range(len(self.id) - 1):
            if self.id[i] is None:
                continue
            rows = self._rows(self.id[i], input)
            # log p(cluster i), taken from the head's extra logit for this cluster.
            log_head = F.log_softmax(F.take(output_head, rows))
            split = F.slice_axis(log_head, axis=1,
                                 begin=self.cutoff[0] + i, end=self.cutoff[0] + i + 1)
            # The rows are selected once, before the tail is applied.
            output_tail = self.tail[i](F.take(input, rows))
            prob_tail = F.log_softmax(output_tail) + split
            # Tail logits are indexed relative to the start of the cluster.
            local_target = F.take(target, rows) - self.cutoff[i]
            nnloss = nnloss + mx.nd.sum(tail_loss(prob_tail, local_target))

        if self.id[-1] is not None:
            rows = self._rows(self.id[-1], input)
            nnloss = nnloss + mx.nd.sum(
                head_loss(F.take(output_head, rows), F.take(target, rows)))

        nnloss = nnloss / (len(target))
        return nnloss

    def log_prob(self, input):
        """Return log-probabilities for every row of ``input`` (used at test time).

        The result has one row per input row and ``self.cutoff[-1]`` columns.
        Columns below ``cutoff[0]`` come from the head; columns of tail cluster
        ``i`` are the cluster's log-probability plus the within-cluster
        log-softmax.
        """
        head_out = self.head(input)

        # One output row per input row, allocated on the same device as the input.
        num_rows = head_out.shape[0]
        prob = mx.nd.zeros((num_rows, self.cutoff[-1]),
                           ctx=head_out.context, dtype=head_out.dtype)

        lsm_head = mx.nd.log_softmax(head_out, axis=1)
        prob[:, : self.cutoff[0]] = lsm_head[:, : self.cutoff[0]]

        for i in range(len(self.tail)):
            # (num_rows, 1) column so it broadcasts over the cluster's columns.
            split = mx.nd.slice_axis(lsm_head, axis=1,
                                     begin=self.cutoff[0] + i, end=self.cutoff[0] + i + 1)
            tail_out = self.tail[i](input)
            lsm_tail = mx.nd.log_softmax(tail_out, axis=1) + split
            prob[:, self.cutoff[i] : self.cutoff[i + 1]] = lsm_tail

        return prob

In [13]:
class LanguageModel(gluon.Block):
    """Embedding -> LSTM -> output layer language model.

    Parameters
    ----------
    vocab_size : int
        Number of distinct token ids.
    num_embed : int
        Embedding width.
    num_hidden : int
        LSTM hidden size.
    num_layers : int
        Number of stacked LSTM layers.
    dropout : float
        Dropout rate applied to the embeddings, inside the LSTM and to its output.
    adaptive_softmax : bool
        Use ``Adaptivesoftmax`` as the output layer; otherwise a single Dense layer.
    cutoff : list of int
        Cluster boundaries for the adaptive softmax; ``vocab_size + 1`` is
        appended as the final boundary. The default list is never mutated.
    """

    def __init__(self, vocab_size, num_embed, num_hidden, num_layers, dropout=0.0,
            adaptive_softmax=True, cutoff=[2000], **kwargs):
        super(LanguageModel, self).__init__(**kwargs)

        with self.name_scope():
            self.drop = nn.Dropout(dropout)
            self.encoder = nn.Embedding(vocab_size, num_embed,
                                        weight_initializer=mx.init.Uniform(0.1))

            self.rnn = rnn.LSTM(num_hidden, num_layers, dropout=dropout,
                                    input_size=num_embed)

        if adaptive_softmax:
            self.linear = Adaptivesoftmax(num_hidden, [*cutoff, vocab_size + 1])
        else:
            self.linear = nn.Dense(units=vocab_size, in_units=num_hidden, flatten=False)

        self.adaptive_softmax = adaptive_softmax

        self.num_layers = num_layers
        self.num_hidden = num_hidden

    def forward(self, input, hidden, target=None, training=True):
        """Run one training step's forward pass and return ``(loss, hidden)``.

        ``target`` is required; it is matched row-for-row against the flattened
        LSTM output. ``training`` is accepted for interface compatibility and is
        not used.

        Raises
        ------
        ValueError
            If ``target`` is None.
        """
        if target is None:
            raise ValueError("target is required to compute the training loss")

        embed = self.drop(self.encoder(input))

        output, hidden = self.rnn(embed, hidden)
        output = self.drop(output)

        # Merge the first two axes: (d0 * d1, hidden_size).
        flat_output = output.reshape((-1, output.shape[2]))

        if self.adaptive_softmax:
            # Adaptivesoftmax.forward assigns targets to clusters itself, so no
            # separate set_target call (and extra device->host copy) is needed.
            nnloss = self.linear(flat_output, target)
        else:
            # Plain softmax over the full vocabulary, averaged like the adaptive path.
            ce_loss = gluon.loss.SoftmaxCrossEntropyLoss()
            nnloss = mx.nd.sum(ce_loss(self.linear(flat_output), target)) / len(target)

        return nnloss, hidden

    def log_prob(self, input, hidden):
        """Return ``(log_probabilities, hidden)`` for evaluation; no dropout is applied."""
        embed = self.encoder(input)
        output, hidden = self.rnn(embed, hidden)
        flat_output = output.reshape((-1, output.shape[2]))

        if self.adaptive_softmax:
            prob = self.linear.log_prob(flat_output)
        else:
            prob = mx.nd.log_softmax(self.linear(flat_output), axis=1)

        return prob, hidden

    def begin_state(self, *args, **kwargs):
        """Forward to the LSTM's ``begin_state`` to build the initial hidden state."""
        return self.rnn.begin_state(*args, **kwargs)
        

In [14]:
# Build the language model with two adaptive-softmax cutoffs and initialise
# its parameters on the selected device.
model = LanguageModel(
    vocab_size=ntokens,
    num_embed=emsize,
    num_hidden=nhid,
    num_layers=nlayers,
    dropout=0.25,
    cutoff=[2000, 10000],
)
model.initialize(mx.init.Xavier(), ctx=context)

# SGD without momentum, with a small weight decay.
trainer = gluon.Trainer(
    model.collect_params(),
    'sgd',
    {'learning_rate': lr, 'momentum': 0, 'wd': 1e-5},
)

# Evaluation loss: expects log-probabilities as input (from_logits=True).
loss = gluon.loss.SoftmaxCrossEntropyLoss(from_logits=True)

In [15]:
# Display the model's parameters (names, shapes, dtypes) as the cell output.
model.collect_params()

languagemodel1_ (
  Parameter languagemodel1_embedding0_weight (shape=(44369, 512), dtype=float32)
  Parameter languagemodel1_lstm0_l0_i2h_weight (shape=(2048, 512), dtype=<class 'numpy.float32'>)
  Parameter languagemodel1_lstm0_l0_h2h_weight (shape=(2048, 512), dtype=<class 'numpy.float32'>)
  Parameter languagemodel1_lstm0_l0_i2h_bias (shape=(2048,), dtype=<class 'numpy.float32'>)
  Parameter languagemodel1_lstm0_l0_h2h_bias (shape=(2048,), dtype=<class 'numpy.float32'>)
  Parameter dense5_weight (shape=(2002, 512), dtype=float32)
  Parameter dense5_bias (shape=(2002,), dtype=float32)
  Parameter dense6_weight (shape=(512, 512), dtype=float32)
  Parameter dense6_bias (shape=(512,), dtype=float32)
  Parameter dense7_weight (shape=(8000, 512), dtype=float32)
  Parameter dense7_bias (shape=(8000,), dtype=float32)
  Parameter dense8_weight (shape=(128, 512), dtype=float32)
  Parameter dense8_bias (shape=(128,), dtype=float32)
  Parameter dense9_weight (shape=(34370, 128), dtype=float32)

# Define the training process

In [16]:
def detach(hidden):
    """Return ``hidden`` with ``.detach()`` applied.

    A tuple or list of states is detached element-wise and returned as a list;
    any other object is detached directly.
    """
    if not isinstance(hidden, (tuple, list)):
        return hidden.detach()
    return [state.detach() for state in hidden]

def eval(data_source, target_source):
    """Return the average per-token loss of the global ``model`` on a data set.

    ``data_source`` and ``target_source`` are iterated in lockstep. Each pair is
    moved to ``context`` and transposed, and the target is reshaped to a column.
    The hidden state is carried from one batch to the next.
    """
    state = model.begin_state(func=mx.nd.zeros, batch_size=batch_size, ctx=context)
    loss_sum = 0.0
    token_count = 0
    for batch, batch_target in zip(data_source, target_source):
        tokens = batch.as_in_context(context).T
        labels = batch_target.as_in_context(context).T.reshape((-1, 1))
        log_probs, state = model.log_prob(tokens, state)
        batch_loss = loss(log_probs, labels)
        loss_sum += mx.nd.sum(batch_loss).asscalar()
        token_count += batch_loss.size
    return loss_sum / token_count

def train():
    """Train the global ``model`` for ``epochs`` passes over ``input``/``label``.

    Every ``log_interval`` batches the running mean loss and its perplexity are
    printed and the running total is reset. Gradient clipping is currently
    disabled, so ``clip`` is not used.
    """
    for epoch in range(epochs):
        total_L = 0.0
        hidden = model.begin_state(func=mx.nd.zeros, batch_size=batch_size, ctx=context)
        for i, (data, target) in enumerate(zip(input, label), start=1):
            data = data.as_in_context(context).T
            target = target.as_in_context(context).T.reshape((-1, 1))
            # Cut the graph so gradients do not flow into earlier batches.
            hidden = detach(hidden)
            with autograd.record():
                L, hidden = model(data, hidden, target)
            L.backward()

            # The loss is already averaged, so step with batch size 1.
            # ignore_stale_grad: parameters of clusters that received no targets
            # in this batch have no fresh gradient.
            trainer.step(1, ignore_stale_grad=True)
            total_L += L.asscalar()

            if i % log_interval == 0:
                cur_L = total_L / log_interval
                print('[Epoch %d Batch %d] loss %.2f, ppl %.2f'%(
                    epoch, i, cur_L, math.exp(cur_L)))
                total_L = 0.0


In [None]:
# Run the training loop defined above; progress is printed every `log_interval` batches.
train()