In [1]:
#1 -- pretrain a matrix of the embedding of each term, out_dims = n_topics
#    1.a.  Cost function is sum(abs(h(x) - (sigmoid(x * W2) * W2t)))
#2 -- Create W1, which is (n_docs * n_topics).  W1[doc,:] = normalize(sum(embeddings of words in doc))
#3 -- Train
#    3.a.  The training examples are all term-document combinations. 
#    3.b   For each training example (term, doc), select a random document that does not contain the term
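#    3.c   Minimize a margin loss on ls(term, random doc) - ls(term, doc), so the true document scores higher than the random one (0.5 separation, see the fine-tuning cells)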
# Note -- can pass a regularizer function, zero_grad(g, p): return 0., to a layer to prevent it from being trained. 

In [15]:
import os
import sys
import timeit
import numpy
from keras.models import *
from keras.layers.core import *
from keras.layers.embeddings import *
from keras.optimizers import SGD
from keras.regularizers import l2, l1l2
from keras.constraints import unitnorm,nonneg
from keras.optimizers import Adadelta
from keras.layers.advanced_activations import ThresholdedReLU
from keras import backend as K
from scipy.io import loadmat
from scipy.io import savemat
from keras.models import model_from_json
from IPython.display import SVG
from keras.utils.visualize_util import to_graph
from keras.callbacks import ModelCheckpoint,RemoteMonitor
import theano
# Directory prefix prepended to most data and weight file names loaded/saved below.
to_path = "./"

In [3]:
# Read the "target" variable from the term-matrix .mat file as float32; the trailing
# expression displays its shape.
term_matrix = (
    loadmat(to_path + "t1_termatrix.mat", variable_names="target")
    .get("target")
    .astype("float32")
)
term_matrix.shape

(28956, 300)

In [None]:
# Pretrain W2 (the distribution of topics over grams) with a single-hidden-layer autoencoder.
n_topics = 128

# Encoder: 300-d term vector -> n_topics sigmoid units, L1+L2 penalties on weights and bias.
encoder = Sequential([
    Dense(n_topics,
          input_dim=300,
          activation="sigmoid",
          W_regularizer=l1l2(l1=0.0001, l2=.001),
          b_regularizer=l1l2(l1=0.0001, l2=0.001)),
])
# Decoder: maps the n_topics code back to 300 dimensions (no activation specified).
decoder = Sequential([Dense(300, input_dim=n_topics)])

autoencoder = Sequential()
autoencoder.add(
    AutoEncoder(encoder=encoder,
                decoder=decoder,
                output_reconstruction=False)
)

autoencoder.compile(optimizer=Adadelta(), loss="mse")

In [None]:
# Train the autoencoder with the term matrix as both input and target,
# one sample per update, for 50 epochs.
history = autoencoder.fit(
    term_matrix,
    term_matrix,
    nb_epoch=50,
    batch_size=1,
)

In [None]:
# Save the model: architecture as JSON, weights as HDF5.
json_string = autoencoder.to_json()
# Use a context manager so the file handle is flushed and closed deterministically
# instead of relying on garbage collection of an anonymous file object.
with open('autoencoder.json', 'w') as json_file:
    json_file.write(json_string)
autoencoder.save_weights('autoencoder.h5', overwrite=True)
# Save the model's predictions on the term matrix for use in pretraining.
# NOTE(review): these are stored as "activations"; presumably predict() returns the
# hidden-layer output because output_reconstruction=False -- confirm against Keras docs.
output = autoencoder.predict(term_matrix)
savemat("./t1_ntm_pretrain.mat", { 'activations' : output})
output.shape

In [None]:
#load model
#autoencoder = model_from_json(open('autoencoder.json').read())
#autoencoder.load_weights('autoencoder.h5')
#pretrained_W2 = autoencoder.layers[0].encoder.get_weights()
#numpy.save(file = "t1_w2_pretrain.npy", arr = pretrained_W2)

In [4]:
# Get initial weights for W2 from the autoencoder. The file name matches the one the
# commented-out "load model" cell writes from the encoder's get_weights().
pretrained_W2 = numpy.load(to_path + "t1_w2_pretrain.npy")

In [5]:
# The initial W1 weights (pretrained in R based on the autoencoder activations) and the
# training examples are stored in the same .mat file, so parse it once and pull both
# variables out instead of reading the file twice.
_ntm_pretrain = loadmat(to_path + "t1_ntm_pret.mat", variable_names = ["w1", "examples"])
pretrained_W1 = _ntm_pretrain.get("w1").astype("float32")
# get training examples
examples = _ntm_pretrain.get("examples")
# Drop the temporary dict so it does not linger in the notebook namespace.
del _ntm_pretrain

In [6]:
# Problem dimensions, derived from the loaded arrays.
n_docs = pretrained_W1.shape[0]
n_topics = pretrained_W1.shape[1]
n_terms = term_matrix.shape[0]
# Every column of `examples` except two counts as an epoch.
n_epochs = examples.shape[1] - 2
(n_docs, n_topics, n_terms, n_epochs)

(954905, 128, 28956, 4)

In [9]:
# Build the actual training model
def build_ntm():
    """Assemble the NTM graph up to the two score nodes and return it.

    Reads the module-level globals n_docs, n_topics, n_terms, pretrained_W1,
    pretrained_W2 and term_matrix. No outputs are added and the graph is not
    compiled; callers attach outputs to "ls_pos" / "ls_neg" themselves.

    Nodes created (by name):
      d_pos, d_neg, g  -- integer inputs of shape (1,).
      topicmatrix      -- one Embedding (n_docs x n_topics, initialised from
                          pretrained_W1, l2(0.05)) shared by d_pos and d_neg,
                          producing wd_pos and wd_neg.
      le / le_         -- non-trainable Embedding (n_terms x 300, initialised
                          from term_matrix) applied to g, then flattened.
      lt               -- sigmoid Dense with n_topics units on le_, initialised
                          from pretrained_W2, l2(0.05).
      ld_pos, ld_neg   -- softmax over the flattened wd_pos / wd_neg.
      ls_pos, ls_neg   -- 'dot' merge (dot_axes=-1) of lt with ld_pos / ld_neg.

    Returns:
        The constructed Graph instance.
    """
    ntm = Graph()
    # Document indices: the positive document and the sampled negative document.
    ntm.add_input(name = "d_pos", input_shape = (1,), dtype = "int")
    ntm.add_input(name = "d_neg", input_shape = (1,), dtype = "int")
    # A single embedding table is shared by both document inputs.
    ntm.add_shared_node(Embedding(input_dim = n_docs, 
                                  output_dim = n_topics, 
                                  weights = [pretrained_W1], 
    #                              W_regularizer = l2(0.001),
                                  W_regularizer = l2(0.05), 
                                  input_length = 1),
                        name = "topicmatrix",
                        inputs =  ["d_pos", "d_neg"], 
                        outputs = ["wd_pos", "wd_neg"],
                        merge_mode = None)
    # Term index -> fixed (trainable = False) embedding initialised from term_matrix.
    ntm.add_input(name = "g", input_shape = (1,), dtype = "int")
    ntm.add_node(Embedding(input_dim = n_terms, 
                          output_dim = 300,
                          weights = [term_matrix], 
                           trainable = False,
                           input_length = 1), 
                 name = "le", input = "g")
    ntm.add_node(Flatten(), input = "le", name = "le_")
    # Term-topic layer, initialised from the pretrained autoencoder weights.
    ntm.add_node(Dense(n_topics, activation = "sigmoid", 
                       weights = pretrained_W2, 
                 #      W_regularizer = l2(0.001)),
                         W_regularizer = l2(0.05)),
                 name = "lt", input = "le_")
    # Document-topic vectors: flatten each embedding lookup, then softmax.
    ntm.add_node(Flatten(), name = "wd_pos_", input = "wd_pos")
    ntm.add_node(Flatten(), name = "wd_neg_", input = "wd_neg")
    ntm.add_node(Activation("softmax"), name = "ld_pos", input = "wd_pos_")
    #ntm.add_node(Reshape((1,128)), input = "ld_pos", name = "ld_pos_")
    ntm.add_node(Activation("softmax"), name = "ld_neg", input = "wd_neg_")
    #ntm.add_node(Reshape((1,128)), input = "ld_neg", name = "ld_neg_")
    # Scores: dot product of the term-topic vector with each document-topic vector.
    ls_pos = Layer()
    ntm.add_node(ls_pos,
                       name = "ls_pos", 
                       inputs = ["lt", "ld_pos"], 
                       merge_mode = 'dot', dot_axes = -1)
    ls_neg = Layer()
    ntm.add_node(ls_neg, 
                       name = "ls_neg", 
                       inputs = ["lt", "ld_neg"], 
                        merge_mode = 'dot', dot_axes = -1)
    return ntm

#SVG(to_graph(ntm).create(prog='dot', format='svg'))

In [12]:
# Pre-training graph: expose both score nodes as outputs and train each one
# with binary cross-entropy.
ntm = build_ntm()
ntm.add_output("pos", input="ls_pos")
ntm.add_output("neg", input="ls_neg")

ntm.compile(
    optimizer="Adadelta",
    loss=dict.fromkeys(("pos", "neg"), "binary_crossentropy"),
)

In [None]:
# Pre-training pass: push the positive score toward 1 and the negative score toward 0.
# This keeps the topics from all spreading out evenly, which would make the losses
# underflow to zero.

trainer = examples#[0:100000,:]
train_shape = (trainer.shape[0], 1)

ntm.fit(
    data = {
        # column 1 feeds the term input, column 0 the positive document, and the
        # second-to-last column the negative document
        "g" : numpy.reshape(trainer[:, 1], train_shape),
        "d_pos" : numpy.reshape(trainer[:, 0], train_shape),
        "d_neg" : numpy.reshape(trainer[:, trainer.shape[1] - 2], train_shape),
        # constant targets: 1 for the positive score, 0 for the negative score
        "pos" : numpy.ones(train_shape, dtype = theano.config.floatX),
        "neg" : numpy.zeros(train_shape, dtype = theano.config.floatX),
    },
    batch_size = 5000,
    nb_epoch = 1)

Epoch 1/1
  105000/11682144 [..............................] - ETA: 3688s - loss: 1.3863

In [None]:
# Fine-tuning
# Build a second graph and copy the current weights of `ntm` into it.
# NOTE(review): `ntm2` is not referenced again in this cell -- every node, input and
# output below is added to `ntm`, not `ntm2`. Confirm which graph was intended.
ntm2 = build_ntm()
ntm2.set_weights(ntm.get_weights())

def output_shape(input_shape):
    """Return the fixed output shape (None, 1), ignoring `input_shape`."""
    return (None, 1)

def sumLam(x):
    """Return x[1] - x[0] for a two-element sequence `x`.

    Presumably `x` follows the order of the merged layers (ls_pos, ls_neg), making
    this ls_neg - ls_pos -- TODO confirm.
    """
    return (x[1] - x[0])
summer = LambdaMerge(layers = [ntm.nodes["ls_pos"], 
                               ntm.nodes["ls_neg"]], 
                     function = sumLam,
                    output_shape = output_shape)
ntm.add_node(summer, inputs = ["ls_pos", "ls_neg"], name = "summed")

# Extra float input multiplied element-wise ("mul" merge) with the score difference.
ntm.add_input(name = "gradient_enhancer", input_shape = (1,), dtype = "float")
ntm.add_node(Layer(), name = "enhanced", 
             inputs = ["gradient_enhancer", "summed"],
            merge_mode = "mul")

# Thresholded ReLU with parameter 0.5 on the scaled difference; its output is the
# only output added here and is trained with the "rmse" loss.
ntm.add_node(ThresholdedReLU(0.5), input = "enhanced", name = "threshold")
ntm.add_output(name = "loss_out",  input= "threshold")

ntm.compile(loss = {'loss_out' : "rmse"},
           optimizer = "Adadelta")

# Checkpoint callback writing to ./checkpointweights.hdf5 (save_best_only=True).
checkpointer = ModelCheckpoint(filepath="./checkpointweights.hdf5", verbose = 1, save_best_only=True)
from keras.callbacks import Callback
class CheckFit(Callback):
    """Debug callback that prints the names of the values logged at the end of each epoch."""
    def on_epoch_end(self, epoch, logs = None):
        # Keras invokes this hook as on_epoch_end(epoch, logs). The previous signature
        # omitted `epoch`, so registering the callback would raise a TypeError.
        # `logs` defaults to None rather than a shared mutable dict.
        print(list((logs or {}).keys()))

checker = CheckFit()

# Remote monitor configured with a local root URL on port 9000.
remote = RemoteMonitor(root="http://localhost:9000")

# Validation split: 20000 random rows of `examples` (drawn without replacement), paired
# with the negative document stored in the last column.
validation_length = 20000
validation_column = examples.shape[1] - 1
validation_shape = (validation_length, 1)
validation_indices = numpy.random.choice(examples.shape[0],
                                         replace = False,
                                         size = validation_length)

trainer = examples#[0:100000,:]
train_shape = (trainer.shape[0], 1)

# One fit() call per epoch index; the negative document comes from column epoch + 2 and
# the gradient_enhancer input shrinks by a factor of 10 per epoch, floored at 1.
# NOTE(review): if `examples` has 6 columns (as n_epochs == 4 computed earlier implies),
# xrange(3, 3) is empty and this loop never runs -- confirm the intended start index.
for epoch in xrange(3, trainer.shape[1] - 3):
    ntm.fit(
        data = {
            "g" : numpy.reshape(trainer[:, 1], train_shape),
            "d_pos" : numpy.reshape(trainer[:, 0], train_shape),
            "d_neg" : numpy.reshape(trainer[:, epoch + 2], train_shape),
            # all-zero target for the thresholded output
            "loss_out" : numpy.zeros(train_shape, dtype = theano.config.floatX),
            "gradient_enhancer" : numpy.reshape(
                numpy.repeat(max(1., n_docs / (10 ** epoch)), trainer.shape[0], 0),
                train_shape),
        },
        validation_data = {
            "g" : numpy.reshape(examples[validation_indices, 1], validation_shape),
            "d_pos" : numpy.reshape(examples[validation_indices, 0], validation_shape),
            "d_neg" : numpy.reshape(examples[validation_indices, validation_column],
                                    validation_shape),
            "loss_out" : numpy.ones(validation_shape, dtype = theano.config.floatX),
            "gradient_enhancer" : numpy.reshape(
                numpy.repeat(n_docs, validation_length, 0),
                validation_shape),
        },
        callbacks = [checkpointer, remote],
        batch_size = 5000,
        nb_epoch = 1)

In [None]:
# Fine-tune
# Checkpoint callback writing to ./checkpointweights.hdf5 (save_best_only=True).
checkpointer = ModelCheckpoint(filepath="./checkpointweights.hdf5", verbose = 1, save_best_only=True)
from keras.callbacks import Callback
import theano
class CheckFit(Callback):
    """Debug callback that prints the names of the values logged at the end of each epoch."""
    def on_epoch_end(self, epoch, logs = None):
        # Keras invokes this hook as on_epoch_end(epoch, logs). The previous signature
        # omitted `epoch`, so registering the callback would raise a TypeError.
        # `logs` defaults to None rather than a shared mutable dict.
        print(list((logs or {}).keys()))

checker = CheckFit()

# Remote monitor configured with a local root URL on port 9000.
remote = RemoteMonitor(root="http://localhost:9000")

# Validation split: 20000 random rows of `examples` (drawn without replacement), paired
# with the negative document stored in the last column.
validation_length = 20000
validation_column = examples.shape[1] - 1
validation_shape = (validation_length, 1)
validation_indices = numpy.random.choice(examples.shape[0],
                                         replace = False,
                                         size = validation_length)

trainer = examples#[0:100000,:]
train_shape = (trainer.shape[0], 1)

# One fit() call per epoch index, starting at 0; the negative document comes from column
# epoch + 2 and the gradient_enhancer input shrinks by a factor of 10 per epoch,
# floored at 1.
for epoch in xrange(trainer.shape[1] - 3):
    ntm.fit(
        data = {
            "g" : numpy.reshape(trainer[:, 1], train_shape),
            "d_pos" : numpy.reshape(trainer[:, 0], train_shape),
            "d_neg" : numpy.reshape(trainer[:, epoch + 2], train_shape),
            # all-ones target for the thresholded output
            "loss_out" : numpy.ones(train_shape, dtype = theano.config.floatX),
            "gradient_enhancer" : numpy.reshape(
                numpy.repeat(max(1., n_docs / (10 ** epoch)), trainer.shape[0], 0),
                train_shape),
        },
        validation_data = {
            "g" : numpy.reshape(examples[validation_indices, 1], validation_shape),
            "d_pos" : numpy.reshape(examples[validation_indices, 0], validation_shape),
            "d_neg" : numpy.reshape(examples[validation_indices, validation_column],
                                    validation_shape),
            "loss_out" : numpy.ones(validation_shape, dtype = theano.config.floatX),
            "gradient_enhancer" : numpy.reshape(
                numpy.repeat(n_docs, validation_length, 0),
                validation_shape),
        },
        callbacks = [checkpointer, remote],
        batch_size = 5000,
        nb_epoch = 1)

In [None]:
# List every node of the graph with its layer class and output shape.
[(name, type(node), node.output_shape) for name, node in ntm.nodes.items()]

In [None]:
# debug
# Concatenate intermediate node outputs into two monitor nodes so they can be inspected
# with a compiled Theano function.
# NOTE(review): "ls_pos_neg" is not created by build_ntm() or by any cell visible in this
# notebook -- confirm where that node is defined, otherwise this add_node call cannot
# resolve its inputs.
ntm.add_node(Layer(), name = "out_monitor", 
            inputs = ["ls_pos", "ls_pos_neg", "ls_neg", "summed", "enhanced", "threshold"],
            merge_mode="concat")
ntm.add_node(Layer(), name = "mult_monitor", inputs = ["lt", "ld_neg", "ld_pos"], 
            merge_mode="concat", concat_axis = 0)
ntm.add_output(name = "multmonitor", input = "mult_monitor")
ntm.add_output(name = "monitor", input = "out_monitor")


# Also expose the raw positive / negative scores as outputs.
ntm.add_output(name = "pos_out", input = "ls_pos")
ntm.add_output(name = "neg_out", input = "ls_neg")
import theano
# Compile a function from all graph inputs (in ntm.input_order) to the two monitor
# outputs, evaluated with train=False.
get_monitor = theano.function([ntm.inputs[i].input for i in ntm.input_order],
                                        [ntm.outputs['monitor'].get_output(train=False),
                                         ntm.outputs['multmonitor'].get_output(train=False)],
                                        on_unused_input='ignore',
                                        allow_input_downcast=True)
# Evaluate on one hand-picked example; the four positional arguments follow
# ntm.input_order (presumably d_pos, d_neg, g, gradient_enhancer -- TODO confirm).
tod = get_monitor([[125418]], 
            [[29141]], 
            [[0]], 
            [[10000000]])
(tod[0], tod[1])

In [None]:
# SGD optimizer with momentum and learning-rate decay; in this notebook it is only
# referenced by a commented-out compile() call.
sgd = SGD(lr = 0.01, decay = 1e-6, momentum = 0.95)
def threshold(y_true, y_pred):
    """Loss: element-wise max(y_true * y_pred, 0).

    `y_true` acts as a per-sample multiplier on the prediction; products below zero
    are clipped to zero. No reduction over any axis is applied here.
    """
    scaled_prediction = y_true * y_pred
    return K.maximum(scaled_prediction, 0.)

# Recompile the graph so that 'loss_out' is trained with the custom `threshold` loss.
# (A 'monitor' : 'mse' entry was tried here as well and is disabled.)
ntm.compile(optimizer = "Adadelta",
            loss = {'loss_out' : threshold})
# Disabled alternative: hinge-style losses on the raw scores with the SGD optimizer.
#ntm.compile(loss = {'pos_out' : "hinge",
#                   'neg_out' : neg_hinge},
#            optimizer = sgd)#) "Adadelta")

In [None]:
# A large batch size does not have a detrimental effect on training W1
# because with 1 million rows, it's unlikely two will be active in a single batch anyway. 
# So the pattern will be to train an epoch with a batch size of 100, for W2, 
# with the loss functions set so that ls_pos chases 1 and ls_neg chases 0.  
# Then, to fine-tune W1, we'll take the same weights, use the 0.5 separation recommended in
# the paper, and train several epochs with as large a batch size as the GPU can hold.

# Checkpoint callback writing to ./checkpointweights.hdf5 (save_best_only=True).
checkpointer = ModelCheckpoint(filepath="./checkpointweights.hdf5", verbose = 1, save_best_only=True)
from keras.callbacks import Callback
import theano
class CheckFit(Callback):
    """Debug callback that prints the names of the values logged at the end of each epoch."""
    def on_epoch_end(self, epoch, logs = None):
        # Keras invokes this hook as on_epoch_end(epoch, logs). The previous signature
        # omitted `epoch`, so registering the callback would raise a TypeError.
        # `logs` defaults to None rather than a shared mutable dict.
        print(list((logs or {}).keys()))

checker = CheckFit()

# Remote monitor configured with a local root URL on port 9000.
remote = RemoteMonitor(root="http://localhost:9000")

# Validation split: 20000 random rows of `examples` (drawn without replacement), paired
# with the negative document stored in the last column.
validation_length = 20000
validation_column = examples.shape[1] - 1
validation_shape = (validation_length, 1)
validation_indices = numpy.random.choice(examples.shape[0],
                                         replace = False,
                                         size = validation_length)

trainer = examples#[0:100000,:]
train_shape = (trainer.shape[0], 1)

# One fit() call per epoch index; the negative document comes from column epoch + 2 and
# the gradient_enhancer input shrinks by a factor of 10 per epoch, floored at 1.
# NOTE(review): if `examples` has 6 columns (as n_epochs == 4 computed earlier implies),
# xrange(3, 3) is empty and this loop never runs -- confirm the intended start index.
for epoch in xrange(3, trainer.shape[1] - 3):
    ntm.fit(
        data = {
            "g" : numpy.reshape(trainer[:, 1], train_shape),
            "d_pos" : numpy.reshape(trainer[:, 0], train_shape),
            "d_neg" : numpy.reshape(trainer[:, epoch + 2], train_shape),
            # all-ones target for the thresholded output
            "loss_out" : numpy.ones(train_shape, dtype = theano.config.floatX),
            "gradient_enhancer" : numpy.reshape(
                numpy.repeat(max(1., n_docs / (10 ** epoch)), trainer.shape[0], 0),
                train_shape),
        },
        validation_data = {
            "g" : numpy.reshape(examples[validation_indices, 1], validation_shape),
            "d_pos" : numpy.reshape(examples[validation_indices, 0], validation_shape),
            "d_neg" : numpy.reshape(examples[validation_indices, validation_column],
                                    validation_shape),
            "loss_out" : numpy.ones(validation_shape, dtype = theano.config.floatX),
            "gradient_enhancer" : numpy.reshape(
                numpy.repeat(n_docs, validation_length, 0),
                validation_shape),
        },
        callbacks = [checkpointer, remote],
        batch_size = 5000,
        nb_epoch = 1)

In [None]:
# Restore saved weights and score one slice of the examples as a single batch.
ntm.load_weights('ntm2.h5')
start, end = 6000, 10000
length = end - start
loss = ntm.test_on_batch(
    data = {
        "g" : numpy.reshape(examples[start:end, 1], (length, 1)),
        "d_pos" : numpy.reshape(examples[start:end, 0], (length, 1)),
        "d_neg" : numpy.reshape(examples[start:end, 2], (length, 1)),
        # targets: 1 for the positive score, 0 for the negative score
        "pos_out" : numpy.ones((length, 1)),
        "neg_out" : numpy.zeros((length, 1)),
    })
loss

In [None]:
#json_string = ntm.to_json()
#ntm.save_weights(to_path + 'ntm2' + str(epoch) + '.h5', overwrite=True)
#open('ntm.json', 'w').write(json_string)
# Save the current graph weights; the epoch number in the file name is hard-coded to 1.
ntm.save_weights(to_path + 'ntm4adadelta_' + str(1) + 'epoch.h5', overwrite=True)

In [None]:
#ntm.fit(data = {
#        "g" : numpy.reshape(examples[:,1], (examples.shape[0],1)), 
#        "d_pos" : numpy.reshape(examples[:,0], (examples.shape[0],1)), 
#        "d_neg" : numpy.reshape(examples[:,2], (examples.shape[0],1)),
#        "loss_out" : numpy.reshape(numpy.ones(examples.shape[0]), (examples.shape[0],1))
#    }, 
#        nb_epoch = 4, 
#        batch_size = 2000)

In [17]:
# Grab every weight array of the graph and display the shapes of the first four.
weights = ntm.get_weights()
tuple(weights[position].shape for position in range(4))

((954905, 128), (28956, 300), (300, 128), (128,))

In [18]:
# Sum of row 0 of the first weight array (shape (n_docs, n_topics) per the shapes
# displayed above). NOTE(review): the recorded value is exactly 0.0 -- presumably row 0
# is never indexed by the training examples; confirm.
numpy.sum(weights[0][0,:])

0.0

In [20]:
# Manual softmax over row 100000 of the first weight array.
softies = weights[0][100000, :]
exp_softies = numpy.exp(softies)
exp_softies / numpy.sum(exp_softies)

array([ -5.05579978e-09,  -2.03678163e-09,  -4.53540050e-09,
        -6.96222413e-10,  -1.03804121e-09,   1.40375767e-09,
         2.78581869e-09,   4.54158655e-10,   2.89217295e-09,
         1.46871804e-09,   1.61418123e-09,   8.70731209e-10,
        -3.20436677e-09,   6.64569622e-10,  -4.49658533e-09,
         3.06334647e-09,   7.66106956e-10,   3.56550900e-09,
        -3.38904149e-09,   2.63906030e-09,   1.32899647e-09,
        -2.94247093e-09,  -3.19365578e-09,  -4.35888756e-11,
         7.52533924e-10,  -6.76807221e-10,   2.06828354e-09,
        -9.62975588e-10,   1.37461112e-10,  -5.16154763e-09,
         2.07746664e-09,   8.62301008e-10,  -5.29927879e-09,
        -7.00579372e-10,  -2.78500689e-09,   1.58791402e-09,
         9.98281235e-10,   2.54655741e-09,  -3.02575032e-09,
         1.83062887e-09,   1.87147653e-09,   1.40025525e-09,
        -3.48459661e-09,  -3.28445204e-09,   6.48316345e-10,
         1.35633560e-09,   1.36060108e-09,  -1.29507927e-09,
         8.37023006e-10,

In [None]:
# Display row 0 of the first weight array.
weights[0][0,:]

In [None]:
# Output shape of the "wd_pos" node (the shared embedding's output for d_pos).
ntm.nodes.get("wd_pos").output_shape

In [None]:
# sNTM: add a supervised label head (3 sigmoid units) on top of the positive document's
# topic distribution, then recompile with one loss per output.
n_categories = 3
ntm.add_node(Dense(n_categories, activation = "sigmoid"),
             name = "ll",
             input = "ld_pos")
ntm.add_output(input = "ll", name = "label")
ntm.compile(optimizer = "Adadelta",
            loss = {'label' : 'categorical_crossentropy',
                    'loss_out' : threshold})