In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
import gensim
import numpy as np
from scipy.spatial.distance import cdist

  from ._conv import register_converters as _register_converters


In [2]:
# from tf.keras.models import Sequential  # This does not work!
from keras import backend as K
from keras.callbacks import TensorBoard
from keras.layers import Dense, GRU, Embedding
from keras.models import Sequential
from keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

Using TensorFlow backend.


## Tokenization

Get the average number of tokens

In [3]:
# Load the stored labels and one-hot encode them with to_categorical.
x_train_labels = to_categorical(np.array(np.load('tokenized-label.npy')))

In [4]:
# Load the tokenized texts, then record how many tokens each one contains.
x_train_tokens = np.load('tokenized.npy')
num_tokens = np.array([len(tokens) for tokens in x_train_tokens])

In [5]:
# Sequence-length cap: mean token count plus two standard deviations,
# truncated to an integer.
max_tokens = int(np.mean(num_tokens) + 2 * np.std(num_tokens))
max_tokens

388

In [6]:
# Number of tokenized texts that were loaded.
print(len(x_train_tokens))

25000


With `max_tokens = 388` (the mean plus two standard deviations of the token counts), about 95% of the texts fit without being truncated. We can pad or truncate every sequence to this length, so that all inputs have the same length.

In [7]:
# Fraction of texts whose token count is strictly below max_tokens.
np.mean(num_tokens < max_tokens)

0.94572

When padding or truncating the sequences that have a different length, we need to determine if we want to do this padding or truncating 'pre' or 'post'. If a sequence is truncated, it means that a part of the sequence is simply thrown away. If a sequence is padded, it means that zeros are added to the sequence.

So the choice of 'pre' or 'post' can be important because it determines whether we throw away the first or last part of a sequence when truncating, and it determines whether we add zeros to the beginning or end of the sequence when padding. This may confuse the Recurrent Neural Network.

In [8]:
# Side on which sequences are padded and truncated; this one value is
# passed as both `padding` and `truncating` to pad_sequences.
pad = 'pre'

In [9]:
# Bring every token sequence to exactly max_tokens entries, padding and
# truncating on the side selected by `pad`.
x_train_pad = pad_sequences(x_train_tokens, maxlen=max_tokens,
                            padding=pad, truncating=pad)

In [10]:
# Sanity check: one row per text, max_tokens columns.
x_train_pad.shape

(25000, 388)

In [11]:
# Inspect one padded sequence; with 'pre' padding the zeros come first.
x_train_pad[1]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

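The hyper-parameter optimization below uses scikit-optimize, and Keras needs h5py to save models to disk. You should be able to run the following command in a terminal to install them both: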

pip install h5py scikit-optimize

In [12]:
import skopt
from skopt import gp_minimize, forest_minimize
from skopt.space import Real, Categorical, Integer
from skopt.plots import plot_convergence
from skopt.plots import plot_objective, plot_evaluations
from skopt.utils import use_named_args

## Hyper-Parameters

Hyperparameters that we want to learn
- Number of rnn units in layer
- Number of layers
- Maximum iterations

This is the search-dimension for the number of nodes in each RNN layer. It is an integer, and we want at least 50 and at most 512 nodes in each RNN layer of the neural network.

In [13]:
# Search range for the 'num_rnn_nodes' hyper-parameter: 50 to 512 units.
# NOTE: despite "dense" in the variable name, this dimension is bound to
# the RNN-node count via its `name` argument.
dim_num_dense_nodes = Integer(low=50, high=512, name='num_rnn_nodes')

This is the search-dimension for the number of RNN layers in the neural network. It is an integer, and we want at least 2 and at most 3 RNN layers in the neural network.

In [14]:
# Search range for the 'num_rnn_layers' hyper-parameter: 2 to 3 layers.
# NOTE: despite "dense" in the variable name, this dimension is bound to
# the RNN-layer count via its `name` argument.
dim_num_dense_layers = Integer(low=2, high=3, name='num_rnn_layers')

In [15]:
# Search range for the 'iterations' hyper-parameter (1 to 4), which
# fitness() passes to model.fit as the number of epochs.
iterations = Integer(low=1, high=4, name='iterations')

We then combine all these search-dimensions into a list.

In [16]:
# Search space used by use_named_args on fitness(); the order here fixes the
# order of values in every parameter list (e.g. default_parameters).
dimensions = [dim_num_dense_nodes, dim_num_dense_layers, iterations]

The default parameters are the initial point that the optimization starts from, given in the same order as `dimensions`: number of RNN nodes, number of RNN layers, iterations.

In [17]:
# [num_rnn_nodes, num_rnn_layers, iterations], in the order of `dimensions`.
default_parameters = [50, 2, 2]

Build the TensorBoard log-directory name from the hyper-parameters, so that each combination is logged to its own directory.

In [18]:
def log_dir_name(num_rnn_nodes, num_rnn_layers,
                 iterations):
    """
    Return the TensorBoard log-dir path for one hyper-parameter combination.

    The three values are embedded in the directory name in the order
    nodes, layers, iterations.
    """
    return "./logs/nodes_{0}_layer_{1}_iterations_{2}/".format(
        num_rnn_nodes, num_rnn_layers, iterations)

In [19]:
# Known parameters of the word embedding that is loaded below:
# vocabulary size and vector dimension.
vocab, dim = 35268, 600

# Total number of scalar values in a (vocab, dim) embedding matrix.
vectorsFlat = vocab * dim

# (vocab, dim) as a shape tuple.
# NOTE(review): vectorsFlat, img_shape and img_shape_full appear to be
# referenced only by commented-out code in this notebook - confirm before
# removing them.
img_shape = (vocab, dim)

# The same shape with a trailing depth axis of size 1.
img_shape_full = img_shape + (1,)

# Number of classes, positive or negative sentiment
num_classes = 2

In [20]:
# Name of the trained word2vec model file inside ./wordToVec/.
selectedModel = 'model-sg-1-size-600-window-10'

# Load the gensim Word2Vec model from disk.
modelLoad = gensim.models.Word2Vec.load('./wordToVec/{0}'.format(selectedModel))

Adjust the input length of the embedding layer to `max_tokens`, the padded sequence length computed above. By default the input_length parameter is set to **None**. The embedding layer is pretrained, so it is not trained when the model learns: the parameter train_embeddings keeps its default value of **False**.

In [21]:
# Commented-out manual construction of the embedding matrix (not executed):
#embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
#for word, i in word_index.items():
#    if word in word2vec.vocab:
#        embedding_matrix[i] = word2vec.word_vec(word)
#print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

# Build a Keras Embedding layer from the loaded word2vec vectors and set
# its input length to the padded sequence length.
embeddedLayer = modelLoad.wv.get_keras_embedding()
embeddedLayer.input_length=max_tokens
# Display the layer object as the cell output.
embeddedLayer

<keras.layers.embeddings.Embedding at 0x11cd531d0>

## Create Model

In [22]:
def create_model(num_rnn_nodes, num_rnn_layers):
    """
    Build and compile a stacked-GRU classifier on top of `embeddedLayer`.

    Hyper-parameters:
    num_rnn_nodes:   Number of units in each GRU layer.
    num_rnn_layers:  Number of stacked GRU layers.

    Returns the compiled Keras Sequential model. The model summary is
    printed as a side effect.
    """

    # Start construction of a Keras Sequential model.
    model = Sequential()

    # The first layer is the module-level embedding layer.
    # NOTE(review): the same layer object is reused by every model built
    # here; confirm it is still usable after fitness() has cleared the
    # Keras session, or build a fresh embedding layer per model.
    model.add(embeddedLayer)

    # Add the recurrent layers.
    # The number of layers is a hyper-parameter we want to optimize.
    for i in range(num_rnn_layers):
        name = 'layer_rnn_{0}'.format(i+1)

        # Every GRU except the last must return the full sequence so the
        # next GRU receives one vector per time-step; the last one returns
        # only its final state for the classifier.
        is_last_layer = (i == num_rnn_layers - 1)
        model.add(GRU(units=num_rnn_nodes,
                      return_sequences=not is_last_layer,
                      name=name))

    # Two mutually exclusive classes with one-hot labels: softmax yields a
    # probability distribution, which is what categorical cross-entropy
    # expects (independent sigmoids do not sum to one).
    model.add(Dense(2, activation='softmax'))
    optimizer = Adam(lr=1e-3)

    # In Keras we need to compile the model so it can be trained.
    model.compile(optimizer=optimizer,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.summary()
    return model

Place to store best model

In [23]:
# File that fitness() saves the best-scoring model to.
path_best_model = 'nlp_best_model.keras'

This is the classification accuracy for the model saved to disk. It is a global variable which will be updated during optimization of the hyper-parameters.

In [24]:
# Best validation accuracy seen so far; updated by fitness() via `global`.
best_accuracy = 0.0

In [25]:
@use_named_args(dimensions=dimensions)
def fitness(num_rnn_nodes, num_rnn_layers, iterations
           ):
    """
    Train one model with the given hyper-parameters and return its negated
    validation accuracy, so that scikit-optimize can minimize it.

    Hyper-parameters:
    num_rnn_layers:  Number of rnn layers.
    num_rnn_nodes:   Number of nodes in each rnn layer.
    iterations:      Number of epochs to train for.

    Side effects: writes TensorBoard logs, and saves the model to
    path_best_model (updating the global best_accuracy) whenever the
    validation accuracy improves on the best value seen so far.
    """

    # We use the global keyword so we update the variable outside
    # of this function.
    global best_accuracy

    # Print the hyper-parameters.
    print('num_rnn_layers:', num_rnn_layers)
    print('num_rnn_nodes:', num_rnn_nodes)
    print('iteration:', iterations)
    print()

    # Create the neural network with these hyper-parameters.
    model = create_model(num_rnn_nodes=num_rnn_nodes,
                         num_rnn_layers=num_rnn_layers)

    try:
        # Dir-name for the TensorBoard log-files.
        log_dir = log_dir_name(num_rnn_nodes, num_rnn_layers,
                               iterations)

        # Create a callback-function for Keras which will be
        # run after each epoch has ended during training.
        # This saves the log-files for TensorBoard.
        # Note that there are complications when histogram_freq=1.
        # It might give strange errors and it also does not properly
        # support Keras data-generators for the validation-set.
        callback_log = TensorBoard(
            log_dir=log_dir,
            histogram_freq=0,
            batch_size=32,
            write_graph=True,
            write_grads=False,
            write_images=False)

        # Use Keras to train the model.
        history = model.fit(x=x_train_pad,
                            y=x_train_labels,
                            epochs=iterations,
                            batch_size=64,
                            validation_split=0.05,
                            callbacks=[callback_log])

        # Get the classification accuracy on the validation-set
        # after the last training-epoch.
        accuracy = history.history['val_acc'][-1]

        # Print the classification accuracy.
        print()
        print("Accuracy: {0:.2%}".format(accuracy))
        print()

        # If the classification accuracy of the saved model is improved ...
        if accuracy > best_accuracy:
            # Save the new model to harddisk.
            model.save(path_best_model)

            # Update the classification accuracy.
            best_accuracy = accuracy
    finally:
        # Run the clean-up even when training fails or is interrupted,
        # so an aborted evaluation does not leave its model behind.

        # Delete the Keras model with these hyper-parameters from memory.
        del model

        # Clear the Keras session, otherwise it will keep adding new
        # models to the same TensorFlow graph each time we create
        # a model with a different set of hyper-parameters.
        # `K` is keras.backend, imported with the other Keras imports.
        K.clear_session()

    # NOTE: Scikit-optimize does minimization so it tries to
    # find a set of hyper-parameters with the LOWEST fitness-value.
    # Because we are interested in the HIGHEST classification
    # accuracy, we need to negate this number so it can be minimized.
    return -accuracy

## Test Run
Before we run the hyper-parameter optimization, let us first check that the various functions above actually work, when we pass the default hyper-parameters.

In [26]:
# The use_named_args wrapper takes the parameters as a single list `x`
# and passes them to fitness() as named arguments.
fitness(x=default_parameters)

num_rnn_layers: 2
num_rnn_nodes: 50
iteration: 2

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 388, 600)          21160800  
_________________________________________________________________
gru_1 (GRU)                  (None, 388, 50)           97650     
_________________________________________________________________
gru_2 (GRU)                  (None, 50)                15150     
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 102       
Total params: 21,273,702
Trainable params: 112,902
Non-trainable params: 21,160,800
_________________________________________________________________
Train on 23750 samples, validate on 1250 samples
Epoch 1/2
   64/23750 [..............................] - ETA: 7:33 - loss: 0.6956 - acc: 0.4688

KeyboardInterrupt: 