The purpose of this tutorial is to review logistic regression, autoencoders, and the Multilayer Perceptron (MLP)

- create a one-layer autoencoder to encode word windows into 100-dim vectors



In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import codecs
import re
import json
import random
import math
import os
import gzip
import cPickle
import sys
from collections import Counter, defaultdict

import numpy as np
import matplotlib.pyplot as plt
import pylab
import pandas as pd
from scipy.stats import norm
import nltk

pylab.rcParams['figure.figsize'] = (10.0, 8.0)

In [2]:
from fuel.datasets import H5PYDataset

DATASET_LOCATION = 'datasets/'

# the pos dataset consists of windows around words
POS_DATASET_NAME = 'brown_pos_dataset.hdf5'
POS_DATASET_PATH = os.path.join(DATASET_LOCATION, POS_DATASET_NAME)

CORPUS_INDICES = 'brown_pos_dataset.indices'
WORD_BY_WORD_MATRIX = 'brown.word-by-word.normalized.npy'

# Indexes for mapping words <--> ints
with open(os.path.join(DATASET_LOCATION, CORPUS_INDICES)) as indices_file:
    corpus_indices = cPickle.load(indices_file)

train_X, train_y = H5PYDataset(
    POS_DATASET_PATH, which_sets=('train',),
    sources=['instances', 'targets'], load_in_memory=True).data_sources

dev_X, dev_y = H5PYDataset(
    POS_DATASET_PATH, which_sets=('dev',),
    sources=['instances', 'targets'], load_in_memory=True).data_sources

test_X, test_y = H5PYDataset(
    POS_DATASET_PATH, which_sets=('test',),
    sources=['instances', 'targets'], load_in_memory=True).data_sources

# make sure the dl4mt module is on the path
sys.path.append('../..')
# TODO: user needs to set this to the place where they cloned the repo
# TODO: user also needs to run the ipython notebook from inside the directory for that day, 
# otherwise the paths will be messed up
sys.path.append('/home/chris/projects/dl4mt_labs/dcu_deep_learning/')

In [3]:
%%writefile ../../dl4mt/autoencoder_layer.py
# Uncomment to save this cell to file in order to import it in later notebooks

"""
 Most of this class was taken from the lisa-lab deeplearning tutorials 
 -- https://github.com/lisa-lab/DeepLearningTutorials

"""

import os
import sys
import timeit

import numpy

import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

class dA(object):
    """Denoising Auto-Encoder class (dA)

    A denoising autoencoders tries to reconstruct the input from a corrupted
    version of it by projecting it first in a latent space and reprojecting
    it afterwards back in the input space. Please refer to Vincent et al.,2008
    for more details. If x is the input then equation (1) computes a partially
    destroyed version of x by means of a stochastic mapping q_D. Equation (2)
    computes the projection of the input into the latent space. Equation (3)
    computes the reconstruction of the input, while equation (4) computes the
    reconstruction error.

    .. math::

        \tilde{x} ~ q_D(\tilde{x}|x)                                     (1)

        y = s(W \tilde{x} + b)                                           (2)

        x = s(W' y  + b')                                                (3)

        L(x,z) = -sum_{k=1}^d [x_k \log z_k + (1-x_k) \log( 1-z_k)]      (4)

    """

    def __init__(
        self,
        numpy_rng,
        theano_rng=None,
        input=None,
        n_visible=784,
        n_hidden=500,
        W=None,
        W_prime=None,
        bhid=None,
        bvis=None
    ):
        """
        Initialize the dA class by specifying the number of visible units (the
        dimension d of the input ), the number of hidden units ( the dimension
        d' of the latent or hidden space ) and the corruption level. The
        constructor also receives symbolic variables for the input, weights and
        bias. Such a symbolic variables are useful when, for example the input
        is the result of some computations, or when weights are shared between
        the dA and an MLP layer. When dealing with SdAs this always happens,
        the dA on layer 2 gets as input the output of the dA on layer 1,
        and the weights of the dA are used in the second stage of training
        to construct an MLP.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: number random generator used to generate weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                     generated based on a seed drawn from `rng`

        :type input: theano.tensor.TensorType
        :param input: a symbolic description of the input or None for
                      standalone dA

        :type n_visible: int
        :param n_visible: number of visible units

        :type n_hidden: int
        :param n_hidden:  number of hidden units

        :type W: theano.tensor.TensorType
        :param W: Theano variable pointing to a set of weights that should be
                  shared belong the dA and another architecture; if dA should
                  be standalone set this to None
                  
        :type W_prime: theano.tensor.TensorType
        :param W_prime: Theano variable pointing to a set of weights that should be
                  shared belong the dA and another architecture; if dA should
                  be standalone set this to None

        :type bhid: theano.tensor.TensorType
        :param bhid: Theano variable pointing to a set of biases values (for
                     hidden units) that should be shared belong dA and another
                     architecture; if dA should be standalone set this to None

        :type bvis: theano.tensor.TensorType
        :param bvis: Theano variable pointing to a set of biases values (for
                     visible units) that should be shared belong dA and another
                     architecture; if dA should be standalone set this to None


        """
        self.n_visible = n_visible
        self.n_hidden = n_hidden

        # create a Theano random generator that gives symbolic random values
        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        # note : W' was written as `W_prime` and b' as `b_prime`
        if not W:
            # W is initialized with `initial_W` which is uniformly sampled
            # from -4*sqrt(6./(n_visible+n_hidden)) and
            # 4*sqrt(6./(n_hidden+n_visible))the output of uniform if
            # converted using asarray to dtype
            # theano.config.floatX so that the code is runable on GPU
            initial_W = numpy.asarray(
                numpy_rng.uniform(
                    low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)),
                    high=4 * numpy.sqrt(6. / (n_hidden + n_visible)),
                    size=(n_visible, n_hidden)
                ),
                dtype=theano.config.floatX
            )
            W = theano.shared(value=initial_W, name='W', borrow=True)

        # Note that this is not the "tied-weights" autoencoder
        if not W_prime:
            # W is initialized with `initial_W` which is uniformly sampled
            # from -4*sqrt(6./(n_visible+n_hidden)) and
            # 4*sqrt(6./(n_hidden+n_visible))the output of uniform if
            # converted using asarray to dtype
            # theano.config.floatX so that the code is runable on GPU
            initial_W_prime = numpy.asarray(
                numpy_rng.uniform(
                    low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)),
                    high=4 * numpy.sqrt(6. / (n_hidden + n_visible)),
                    size=(n_hidden, n_visible)
                ),
                dtype=theano.config.floatX
            )
            W_prime = theano.shared(value=initial_W_prime, name='W', borrow=True)

            
        if not bvis:
            bvis = theano.shared(
                value=numpy.zeros(
                    n_visible,
                    dtype=theano.config.floatX
                ),
                borrow=True
            )

        if not bhid:
            bhid = theano.shared(
                value=numpy.zeros(
                    n_hidden,
                    dtype=theano.config.floatX
                ),
                name='b',
                borrow=True
            )

        self.W = W
        # b corresponds to the bias of the hidden
        self.b = bhid
        # b_prime corresponds to the bias of the visible
        self.b_prime = bvis
        
        # TODO -- try untying the weights
        # Autoencoder with tied weights, therefore W_prime is W transpose
#         self.W_prime = self.W.T
        self.W_prime = W_prime
        
        self.theano_rng = theano_rng
        # if no input is given, generate a variable representing the input
        if input is None:
            # we use a matrix because we expect a minibatch of several
            # examples, each example being a row
            self.x = T.dmatrix(name='input')
        else:
            self.x = input

        self.params = [self.W, self.b, self.b_prime]

    # TODO: user should implement this function -- to start, just return identity
    def get_corrupted_input(self, input, corruption_level):
        """This function keeps ``1-corruption_level`` entries of the inputs the
        same and zero-out randomly selected subset of size ``coruption_level``
        Note : first argument of theano.rng.binomial is the shape(size) of
               random numbers that it should produce
               second argument is the number of trials
               third argument is the probability of success of any trial

                this will produce an array of 0s and 1s where 1 has a
                probability of 1 - ``corruption_level`` and 0 with
                ``corruption_level``

                The binomial function return int64 data type by
                default.  int64 multiplicated by the input
                type(floatX) always return float64.  To keep all data
                in floatX when floatX is float32, we set the dtype of
                the binomial to floatX. As in our case the value of
                the binomial is always 0 or 1, this don't change the
                result. This is needed to allow the gpu to work
                correctly as it only support float32 for now.

        """
        return self.theano_rng.binomial(size=input.shape, n=1,
                                        p=1 - corruption_level,
                                        dtype=theano.config.floatX) * input
 
    def get_hidden_values(self, input):
        """ Computes the values of the hidden layer """
        return T.nnet.sigmoid(T.dot(input, self.W) + self.b)
    
    def predict(self, input):
        """ Alias to get_hidden_values for consistent API """
        return self.get_hidden_values(input)

    def get_reconstructed_input(self, hidden):
        """Computes the reconstructed input given the values of the
        hidden layer

        """
        return T.nnet.sigmoid(T.dot(hidden, self.W_prime) + self.b_prime)

    def get_cost_updates(self, corruption_level, learning_rate):
        """ This function computes the cost and the updates for one trainng
        step of the dA """

        tilde_x = self.get_corrupted_input(self.x, corruption_level)
        y = self.get_hidden_values(tilde_x)
        z = self.get_reconstructed_input(y)
        
        # sum over the columns of each training instance
        L = - T.sum(self.x * T.log(z) + (1 - self.x) * T.log(1 - z), axis=1)
        
        # note : L is now a vector, where each element is the
        #        cross-entropy cost of the reconstruction of the
        #        corresponding example of the minibatch. We need to
        #        compute the average of all these to get the cost of
        #        the minibatch
        cost = T.mean(L)

        # compute the gradients of the cost of the `dA` with respect
        # to its parameters
        gparams = T.grad(cost, self.params)
        # generate the list of updates
        updates = [
            (param, param - learning_rate * gparam)
            for param, gparam in zip(self.params, gparams)
        ]

        return (cost, updates)


Overwriting ../../dl4mt/autoencoder_layer.py


In [8]:
# TODO: we need a consistent interface to the functionality of the network for every model
# trained models need to be able to predict and ideally persist their params for loading later
# the interface to persistence and loading trained models should also be consistent
import numpy
import theano
import theano.tensor as T
import timeit
from theano.tensor.shared_randomstreams import RandomStreams

from dl4mt.autoencoder_layer import dA

# TODO: remove training_epochs param
def initialize_dA(train_dataset, learning_rate=0.1, corruption_level=0.0, batch_size=10):

    """
    :type learning_rate: float
    :param learning_rate: learning rate used for training the Autoencoder

    :type training_epochs: int
    :param training_epochs: number of epochs used for training

    :type dataset: string
    :param dataset: path to the picked dataset

    """
    
    train_set_x, train_set_y = train_dataset
        
    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()    # index to a [mini]batch
    x = T.matrix('x')  # the data will be concatenated word vector windows

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2 ** 30))

    # TODO: move "n_hidden" out to a more obvious spot so that the user can play with different vector sizes
    da = dA(
        numpy_rng=rng,
        theano_rng=theano_rng,
        input=x,
        n_visible=train_set_x.get_value().shape[1],
        n_hidden=2
    )

    cost, updates = da.get_cost_updates(
        corruption_level=corruption_level,
        learning_rate=learning_rate
    )

    train_model_func = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size]
        }
    )
    
#     classifier, train_model_func, validate_model_func, n_train_batches, n_valid_batches
    return (da, train_model_func, None, n_train_batches, 0)

In [9]:

#     start_time = timeit.default_timer()
    
    # TODO: the epoch code below is common to all training loops
    # TODO: there should be a function called "train" which takes in the training function and any other args
    # then it transparently runs the loop below -- or one of the more complex ones from later notebooks
    # the advantage here is that we can modify the training loop in one place and it will work anywhere

    ############
    # TRAINING #
    ############

#     # go through training epochs applying dropout each time
#     for epoch in xrange(training_epochs):
#         # go through training set
#         c = []
#         for batch_index in xrange(n_train_batches):
#             c.append(train_da(batch_index))

#         print 'Training epoch %d, cost ' % epoch, numpy.mean(c)

#     end_time = timeit.default_timer()

#     training_time = (end_time - start_time)
# % ((training_time) / 60.)
#     print('The training code for file ran for %.2fm' % ((training_time) / 60.))

#     return da


In [15]:
from dl4mt.datasets import prep_dataset
from dl4mt.training import train_model

# load the training data
VECTOR_INDEX_PATH = os.path.join(DATASET_LOCATION, WORD_BY_WORD_MATRIX)
CUTOFF = 1000
  
train_dataset = prep_dataset(POS_DATASET_PATH, VECTOR_INDEX_PATH, which_sets=['train'], cutoff=CUTOFF)

# initialize the autoencoder model
initialization_data = initialize_dA(train_dataset, learning_rate=0.1, corruption_level=0.3, batch_size=50)

classifier, train_model_func, validate_model_func, n_train_batches, n_valid_batches = initialization_data
    
# load the training function and train the LR model 
# TODO: -- train_model should return the validation error as a list, then we can plot it
# in general, train_model should return any useful information about the training process
train_model(train_model_func,  n_train_batches, validate_model=validate_model_func,
            n_valid_batches=n_valid_batches, training_epochs=10)

epoch 1, average training cost 403.82
epoch 2, average training cost 367.49
epoch 3, average training cost 352.36
epoch 4, average training cost 344.99
epoch 5, average training cost 340.99
epoch 6, average training cost 338.65
epoch 7, average training cost 337.11
epoch 8, average training cost 336.09
epoch 9, average training cost 335.33
epoch 10, average training cost 334.79
Optimization complete!


In [11]:
# TODO: make the dev and test data loadable in the same way as the training data below

# make a theano function to get predictions from a trained model
training_data = theano.tensor.matrix('training_X')
predictions = classifier.predict(training_data)
get_predictions = theano.function([training_data], predictions)

In [13]:
# get predictions and evaluate
p = get_predictions(train_dataset[0].get_value())

In [14]:
%matplotlib qt

# import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# TODO: Set this at the top of the notebook -- this is the visualization cutoff, not the training cutoff
CUTOFF_BEGIN=0
CUTOFF_END=1000

# TODO: this method of visualizing the labels loses the class tags -- we want to know which color is which
y_vals = np.array([y[0] for y in train_y[CUTOFF_BEGIN:CUTOFF_END]])
norm_y_vals = y_vals / float(np.amax(y_vals))

jitter1 = np.random.normal(loc=0.0, scale=0.05, size=CUTOFF_END-CUTOFF_BEGIN)
jitter2 = np.random.normal(loc=0.0, scale=0.05, size=CUTOFF_END-CUTOFF_BEGIN)
x1 = p[CUTOFF_BEGIN:CUTOFF_END,0] + jitter1
x2 = p[CUTOFF_BEGIN:CUTOFF_END,1] + jitter2


plt.scatter(x1, x2, c=norm_y_vals, s=20)
plt.show()

In [20]:
y_counts = Counter(y_vals)
y_counts

Counter({0: 17,
         1: 319,
         2: 118,
         3: 19,
         4: 133,
         5: 125,
         6: 12,
         7: 152,
         9: 18,
         10: 31,
         11: 56})

In [None]:
# WORKING: how to visualize the filters in an autoencoder for text?
# Idea: pass the training data through and store the top N instances which activate this node
# at the end, output the top 5 instances for each node (convert window instance back to text)

# WORKING: visualize 2d autoencoder output

# WORKING: vizualize 2d stacked vs 2d "tied weights" autoencoders

# WORKING: Documents don't really give us any information about POS -- we need a better way of representing words
# idea: replicate most common 1000 words 4x
# encode each word with a 4000-dimensional vector indicating words which occur in POS-2, -1, +1, +2

# A stacked autoencoder can be used as part of an MLP, where the layers are "fine-tuned" for the task

# SIMPLIFICATION IDEA -- ONLY SELECT A SUBSET OF CLASSES (I.e. just noun vs. adj), THIS SHOULD MAKE COMPARISON EASIER

In [2]:
# Load fuel dataset (POS windows) DONE
# Load one-hot Word X Document numpy matrix DONE

# train autoencoder, stop when reconstruction performance doesn't improve any more, or after a fixed number of epochs

# CHALLENGE: implement denoising/dropout -- i.e. implement the corruption

# save the parameters of the first part of the network (the encoder)
# use the encoder parameters to build the prediction transformation

# evaluate the performance of the autoencoder features vs. hstacked LSI and W2V features 

In [23]:
len(dataset_X.get_value()[1].nonzero()[0])
len(dataset_X.get_value()[1])
dataset_X.get_value().shape
[corpus_indices['idx2word'][w] for w in train_X[63624]]
corpus_indices['idx2tag'][train_y[63624][0]]

u'DET'

In [9]:
# Working Notes
# In a separate notebook, compare 2d autoencoder vs 2d LSI -- visualize with scatter plots
# cite Hinton etc for this visualization methodology 