In [None]:
# Load IMDB data using Keras 

import numpy
import mxnet as mx
import pandas as pd
import keras
from keras.datasets import imdb
from matplotlib import pyplot

In [None]:
# Fetch the IMDB reviews as separate train/test splits (no vocabulary limit here).
(X_train, y_train), (X_test, y_test) = imdb.load_data()

# Stack the two splits so the exploratory cells can inspect every review at once.
X = numpy.concatenate([X_train, X_test], axis=0)
y = numpy.concatenate([y_train, y_test], axis=0)

# Show the shape of each split: train data/labels first, then test data/labels.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

In [None]:
# Sanity check on the labels: the distinct values should be just 0 and 1.
label_values = numpy.unique(y)
print(label_values)

In [None]:
# Count the distinct word ids used across all reviews:
# flatten every review into one long array, then count its unique entries.
all_word_ids = numpy.hstack(X)
print(len(numpy.unique(all_word_ids)))

In [None]:
# Summarize how long the reviews are, measured in words per review.
print("Review length: ")
result = list(map(len, X))
mean_length = numpy.mean(result)
std_length = numpy.std(result)
# The value in parentheses is the standard deviation.
print("Mean %.2f words (%f)" % (mean_length, std_length))
# Box plot of the review-length distribution.
pyplot.boxplot(result)
pyplot.show()

In [None]:
from keras.preprocessing.sequence import pad_sequences

# The number of unique words is too large. For simplicity, reload the data
# keeping only the top 5000 words as distinct ids.
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=5000)

# Bring every review to a length of exactly 500 (the original note says short
# reviews are padded with 0's), since the majority of reviews are 500 words or less.
X_train = pad_sequences(X_train, maxlen=500)
X_test = pad_sequences(X_test, maxlen=500)

In [None]:
# Create MXNet NDArray iterators from the numpy data and labels.
# NDArrayIter needs a rectangular array, which is why every review was padded
# to the same length beforehand.
# 128 is the batch size.
# Training batches are shuffled so the optimizer does not see the samples in a fixed order.
trainIter = mx.io.NDArrayIter(X_train, y_train, 128, shuffle=True)
# The evaluation iterator is NOT shuffled: predict() returns its outputs in
# iterator order, so shuffling here would leave the predictions misaligned
# with y_test.
testIter = mx.io.NDArrayIter(X_test, y_test, 128, shuffle=False)

# Debugging aid (disabled): dump each batch next to the original arrays.
# Kept as comments rather than a bare string literal, which the notebook would
# otherwise echo as this cell's output.
#
# for batch in trainIter:
#     print ('Original Data')
#     print (X_train[0:128])
#     print ('Original Lable')
#     print (y_train[0:128])
#     print ('DATA')
#     print (batch.data[0].asnumpy().astype(int))
#     print ('LABEL')
#     print (batch.label[0].asnumpy().astype(int))
#     #print(batch.data, batch.label, batch.pad)
#     print ('EOL')

In [None]:

# Create the MLP network using MXNet's symbolic API:
# embedding -> flatten -> dense(250, relu) -> dense(2) -> softmax.

import logging

# Root logger at DEBUG so MXNet's training progress messages are shown.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

inputdata = mx.sym.Variable('data')
input_y = mx.sym.Variable('softmax_label')  # placeholder for the 0/1 labels

# Optional dropout on the input; can be used for regularization.
#inputdatadp = mx.sym.Dropout(inputdata, p=0.2)

vocabsize = 5000

# input_dim is the size of the vocabulary; output_dim is the length of the
# embedded vector produced for each word id.
Embeddata = mx.sym.Embedding(data=inputdata, input_dim=vocabsize, output_dim=32, name='embed')

# Collapse the embedded sequence into one feature vector per review.
data1 = mx.sym.Flatten(data=Embeddata, name='flatten')

fc1 = mx.sym.FullyConnected(data=data1, num_hidden=250)
act1 = mx.sym.Activation(data=fc1, act_type="relu")

# Two output units (one per class) followed by SoftmaxOutput.
# The previous head was a single unit fed to LogisticRegressionOutput, but the
# 'acc' metric picks the predicted class with an argmax over the class axis.
# With only one column that argmax is always 0, so the reported accuracy was
# merely the fraction of label-0 samples. A 2-way softmax expresses the same
# binary classifier and makes the accuracy metric meaningful.
# (The separate sigmoid activation that used to follow fc2 was never wired
# into the network and has been dropped.)
fc2 = mx.sym.FullyConnected(data=act1, num_hidden=2)

mlp = mx.sym.SoftmaxOutput(data=fc2, label=input_y, name='softmax')

# Render the network graph as this cell's output.
mx.viz.plot_network(mlp)

#print (mlp.list_arguments())
#print (mlp.list_outputs())
#print (mlp.debug_str())

In [None]:
# Train the model


def norm_stat(d):
    """Statistic reported by the training monitor for each watched array.

    Returns the L2 norm of `d` divided by the square root of its element
    count. Swap in any other statistic you would rather track.
    """
    scale = numpy.sqrt(d.size)
    return mx.nd.norm(d) / scale


# Build a trainable Module for the network, placed on the GPU.
batch_size = 128

mlp_model = mx.mod.Module(symbol=mlp, context=mx.gpu())

# fit() binds the module itself, so an explicit bind is redundant; the call is
# kept here (disabled) only because enabling it shows the resulting warning.
#mlp_model.bind(data_shapes=trainIter.provide_data,label_shapes=trainIter.provide_label)

# init_params is likewise optional when training through fit().
#mlp_model.init_params()

# Monitor: every 100 batches, apply norm_stat to each array whose name matches
# the regular expression '.*weight', with the output sorted by name.
mon = mx.mon.Monitor(100, stat_func=norm_stat, pattern='.*weight', sort=True)

# Progress logging every 100 batches.
speedometer = mx.callback.Speedometer(batch_size, frequent=100)

# SGD with a fixed learning rate; momentum applies to sgd only.
sgd_params = {'learning_rate': 0.01, 'momentum': 0.9}
# Alternative optimizer setup (disabled):
#   optimizer="adam", optimizer_params={'learning_rate':0.01}

mlp_model.fit(
    trainIter,                        # training data
    #eval_data=testIter,              # validation data (disabled)
    optimizer='sgd',
    optimizer_params=sgd_params,
    eval_metric='acc',                # report accuracy during training
    batch_end_callback=speedometer,
    num_epoch=10,                     # train at most 10 data passes
    monitor=mon)


In [None]:
# Same network trained through the deprecated FeedForward interface,
# included only to try the older syntax out.

model = mx.model.FeedForward(
    symbol=mlp,          # the network defined earlier
    ctx=mx.gpu(0),       # run on GPU 0
    num_epoch=10,        # number of training epochs
    learning_rate=0.1,
    momentum=0.9,        # momentum for SGD
    wd=0.00001,          # weight decay for regularization
)

# Progress logging every 200 batches.
progress_logger = mx.callback.Speedometer(batch_size, 200)

model.fit(
    X=trainIter,                          # training data set
    eval_data=testIter,                   # test set, scored by MXNet every epoch
    batch_end_callback=progress_logger,
)

In [None]:
# Run the trained module over the test iterator and inspect the raw outputs.
prob = mlp_model.predict(testIter)
print(prob.shape)
print(mlp_model)

# Score the module on the test data with an accuracy metric.
# NOTE(review): Accuracy picks the predicted class via argmax over the class
# axis - confirm the network emits one output column per class.
acc = mx.metric.Accuracy()
mlp_model.score(testIter, eval_metric=acc)
print(acc)
# get() returns a (name, value) pair; show just the value.
metric_name, metric_value = acc.get()
print(metric_value)

In [None]:

# MLP for the IMDB problem
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# fix random seed for reproducibility
# This seeds numpy's global random generator only.
# NOTE(review): the Keras backend may draw from its own RNG - confirm whether
# it needs to be seeded separately for fully repeatable runs.
seed = 7
numpy.random.seed(seed)

In [None]:

# Reload the dataset, keeping only the `top_words` most frequent words as
# distinct ids.
top_words = 5000
train_split, test_split = imdb.load_data(nb_words=top_words)
(X_train, y_train) = train_split
(X_test, y_test) = test_split

# Shapes of the training data and labels.
for arr in (X_train, y_train):
    print(arr.shape)

In [None]:

# Bring every review in both splits to a length of exactly `max_words`.
max_words = 500
X_train, X_test = (
    sequence.pad_sequences(reviews, maxlen=max_words)
    for reviews in (X_train, X_test)
)

In [None]:
# Shapes of the padded train and test arrays.
for padded in (X_train, X_test):
    print(padded.shape)

In [None]:

# Create the Keras model: embedding -> flatten -> dense(250, relu) -> dense(1, sigmoid).
model = Sequential()
# Each of the `top_words` word ids is mapped to a 32-dimensional vector.
model.add(Embedding(top_words, 32, input_length=max_words))
# Collapse the embedded sequence into a single feature vector per review.
model.add(Flatten())
model.add(Dense(250, activation='relu'))
# Single sigmoid unit for the binary label.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

In [None]:

# Train the model, validating against the test split as training proceeds.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=128,
    verbose=2,
)

# Final evaluation on the test split. The model was compiled with a single
# metric ('accuracy'), which is why index 1 of the returned scores is used.
scores = model.evaluate(X_test, y_test, verbose=0)
accuracy_pct = scores[1] * 100
print("Accuracy: %.2f%%" % accuracy_pct)

In [None]:
from keras.utils import data_utils
import json

def get_word_index(path='imdb_word_index.json'):
    """Fetch the IMDB word index JSON via Keras' file cache and parse it.

    # Arguments
        path: name under which `data_utils.get_file` stores the downloaded
            file (presumably relative to the Keras dataset cache directory -
            confirm against the installed Keras version).

    # Returns
        The dictionary parsed from the JSON file. It is keyed by word string;
        the values are presumably the integer ids used in the encoded reviews.
    """
    path = data_utils.get_file(
        path,
        origin='https://s3.amazonaws.com/text-datasets/imdb_word_index.json')
    # The context manager closes the file even when json.load raises, which the
    # earlier manual open()/close() pair did not guarantee.
    with open(path) as f:
        return json.load(f)

In [None]:

# Load the word index dictionary.
data = get_word_index()

# Entries are looked up directly with the word string as the key. This is the
# lookup to use when encoding a sentence into a numpy array.
timey_entry = data['timey']
print(timey_entry)