In [1]:
from mxnet.gluon import nn
from mxnet import ndarray as nd
from data_helper.mr_loader import *
from mxnet import gluon, autograd, io
import mxnet as mx

import time
from datetime import timedelta

In [2]:
base_dir = 'data/mr'
pos_file = os.path.join(base_dir, 'rt-polarity.pos.txt')
neg_file = os.path.join(base_dir, 'rt-polarity.neg.txt')
vocab_file = os.path.join(base_dir, 'rt-polarity.vocab.txt')


save_path = 'save_models'  # model save path
if not os.path.exists(save_path):
    os.mkdir(save_path)
model_file = os.path.join(save_path, 'mr_cnn.pt')


def get_time_dif(start_time):
    """
    Return the time used since start_time.
    """
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))

In [3]:
class Conv_Max_Pooling(nn.Block):
    def __init__(self, channels, kernel_size, **kwargs):
        super(Conv_Max_Pooling, self).__init__(**kwargs)

        with self.name_scope():
            self.conv = nn.Conv1D(channels, kernel_size)
            self.pooling = nn.GlobalMaxPool1D()
        
    def forward(self, x):
        output = self.pooling(self.conv(x))
        return nd.relu(output).flatten()

In [4]:
class Config(object):
    """
    CNN parameters
    """
    embedding_dim = 128  # embedding vector size
    seq_length = 50  # maximum length of sequence
    vocab_size = 8000  # most common words

    num_filters = 100  # number of the convolution filters (feature maps)
    kernel_sizes = [3, 4, 5]   # three kind of kernels (windows)

    hidden_dim = 128  # hidden size of fully connected layer

    dropout_prob = 0.5  # how much probability to be dropped
    learning_rate = 1e-3  # learning rate
    batch_size = 50  # batch size for training
    num_epochs = 20  # total number of epochs

    print_per_batch = 50 # print out the intermediate status every n batches

    num_classes = 2  # number of classes

    dev_split = 0.1  # percentage of dev data

In [5]:
class CNN(nn.Block):
    def __init__(self, config, **kwargs):
        super(CNN, self).__init__(**kwargs)
        
        V = config.vocab_size
        E = config.embedding_dim
        Nf = config.num_filters
        Ks = config.kernel_sizes
        C = config.num_classes
        dropout = config.dropout_prob
        
        with self.name_scope():
            self.embedding = nn.Embedding(V, E)
            self.conv1 = Conv_Max_Pooling(Nf, Ks[0])
            self.conv2 = Conv_Max_Pooling(Nf, Ks[1])
            self.conv3 = Conv_Max_Pooling(Nf, Ks[2])
            self.fc1 = nn.Dense(C)
            
    def forward(self, x):
        x = self.embedding(x).transpose((0, 2, 1))
        o1, o2, o3 = self.conv1(x), self.conv2(x), self.conv3(x)
        outputs = self.fc1(nd.concat(o1, o2, o3))
        
        return outputs

In [6]:
class MRDataset(gluon.data.Dataset):
    def __init__(self, x, y):
        super(MRDataset, self).__init__()
        self.x = x
        self.y = y
    
    def __getitem__(self, index):
        return self.x[index].astype(np.float32), self.y[index].astype(np.float32)
    
    def __len__(self):
        return len(self.x)

In [7]:
def evaluate_accuracy(data_iterator, net):
    acc = mx.metric.Accuracy()
    for i, (data, label) in enumerate(data_iterator):
        output = net(data)
        predictions = nd.argmax(output, axis=1)
        acc.update(preds=predictions, labels=label)
    return acc.get()[1]

In [8]:
print('Loading data...')
start_time = time.time()
config = Config()
corpus = Corpus(pos_file, neg_file, vocab_file, config.dev_split, config.seq_length, config.vocab_size)
print(corpus)
config.vocab_size = len(corpus.words)

print('Configuring CNN model...')
model = CNN(config)
model.collect_params().initialize()

loss = gluon.loss.SoftmaxCrossEntropyLoss()
trainer = gluon.Trainer(model.collect_params(), 'adam', {'learning_rate': 0.001})

train_loader = gluon.data.DataLoader(MRDataset(corpus.x_train, corpus.y_train), batch_size=128, shuffle=False)
test_loader = gluon.data.DataLoader(MRDataset(corpus.x_test, corpus.y_test), batch_size=128, shuffle=False)

total_batch = 0
total_loss = 0.0
best_acc_val = 0.0

for epoch in range(config.num_epochs):
    cumulative_loss = 0.0
    print('Epoch:', epoch + 1)
    # load the training data in batch
    
    for data, label in train_loader:
        batch_len = len(data)
        # print('cur_batch:', batch_len)
        with autograd.record():
            output = model(data)
            losses = loss(output, label)
        losses.backward()
        trainer.step(batch_len)
        cumulative_loss += nd.sum(losses).asscalar()
    
    test_accuracy = evaluate_accuracy(test_loader, model)
    train_accuracy = evaluate_accuracy(train_loader, model)
    print("Epoch %s. Loss: %s, Train_acc %s, Test_acc %s" % (epoch, cumulative_loss/len(corpus.x_train), train_accuracy, test_accuracy))  

Loading data...
Training: 9595, Testing: 1067, Vocabulary: 8000
Configuring CNN model...
Epoch: 1
Epoch 0. Loss: 0.659942268344, Train_acc 0.767170401251, Test_acc 0.696344892221
Epoch: 2
Epoch 1. Loss: 0.42460086737, Train_acc 0.922876498176, Test_acc 0.765698219306
Epoch: 3
Epoch 2. Loss: 0.228351220195, Train_acc 0.968108389786, Test_acc 0.746016869728
Epoch: 4
Epoch 3. Loss: 0.111777372296, Train_acc 0.985617509119, Test_acc 0.741330834114
Epoch: 5
Epoch 4. Loss: 0.0505577494416, Train_acc 0.990724335591, Test_acc 0.749765698219
Epoch: 6
Epoch 5. Loss: 0.0258927302537, Train_acc 0.998332464825, Test_acc 0.759137769447
Epoch: 7
Epoch 6. Loss: 0.0225548458571, Train_acc 0.994267847837, Test_acc 0.761012183693
Epoch: 8
Epoch 7. Loss: 0.0257302293372, Train_acc 0.998228243877, Test_acc 0.746954076851
Epoch: 9
Epoch 8. Loss: 0.0203728131853, Train_acc 0.997498697238, Test_acc 0.733833177132
Epoch: 10
Epoch 9. Loss: 0.0109604583976, Train_acc 0.999687337155, Test_acc 0.751640112465
Epoch

In [None]:
def train():
    """
    Train and evaluate the model with training and validation data.
    """
    print('Loading data...')
    start_time = time.time()
    config = Config()
    corpus = Corpus(pos_file, neg_file, vocab_file, config.dev_split, config.seq_length, config.vocab_size)
    print(corpus)
    config.vocab_size = len(corpus.words)

    print('Configuring CNN model...')
    model = CNN(config)

    loss = gluon.loss.SoftmaxCrossEntropyLoss()
    trainer = gluon.Trainer(model.collect_params(), 'sgd', {'learning_rate': 0.01})
    
    total_batch = 0
    total_loss = 0.0
    best_acc_val = 0.0
    for epoch in range(config.num_epochs):
        print('Epoch:', epoch + 1)
        # load the training data in batch
        train_loader = io.NDArrayIter(data={'data': nd.array(corpus.x_train)}, 
                                      label={'label': nd.array(corpus.y_train)}, 
                                      batch_size=config.batch_size)
        for batch in train_loader:
            cur_batch = config.batch_size - batch.pad
            with autograd.record():
                output = model(batch.data[0])
                loss = softmax_cross_entropy(output, batch.label[:cur_batch])
            loss.backward()
            trainer.step(cur_batch)
            print(loss)

            optimizer.zero_grad()
            outputs = model(inputs)  # forward computation
            loss = criterion(outputs, targets)
            total_loss += loss.data[0]
            total_batch += 1

            if total_batch % config.print_per_batch == 0:
                # print out intermediate status
                avg_loss = total_loss / config.print_per_batch
                total_loss = 0.0

                _, pred_train = torch.max(outputs.data, 1)
                corrects = (pred_train == targets.data).sum()
                acc_train = corrects / len(x_batch)
                loss_val, acc_val, _ = evaluate(model, val_data)  # evaluate on val data

                if acc_val > best_acc_val:
                    # store the best validation result
                    best_acc_val = acc_val
                    improved_str = '*'
                    torch.save(model.state_dict(), model_file)
                else:
                    improved_str = ''

                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(msg.format(total_batch, avg_loss, acc_train, loss_val, acc_val, time_dif, improved_str))

            # back propagation
            loss.backward()
            optimizer.step()

    test(model, val_data)