In [1]:
import tensorflow as tf
import numpy as np
import nltk
import re
import dictionary
import os

%load_ext autoreload
%autoreload 2

  from ._conv import register_converters as _register_converters


In [2]:
'''
    Text processing functions
'''
def preprocess_sentence(sentence):
    # Transform some punctuation to space
    line = re.sub(r"[,.;@#?!]+\ *", " ", sentence)
    
    # Convert to lower case
    line = line.lower()

    # Tokenize words
    default_wt = nltk.word_tokenize
    line = default_wt(line)
    
    return line


def max_length_sentence(dataset):
    return max([len(line) for line in dataset])


def pad_sentence(tokenized_sentence, max_length_sentence, padding_value=0):
    
    pad_length = max_length_sentence - len(tokenized_sentence)
    sentence = list(tokenized_sentence)
    
    if pad_length > 0:
        return np.pad(tokenized_sentence, (0, pad_length), mode='constant', constant_values=int(padding_value))
    else:
        return sentence[:max_length_sentence]


# Dataset format: "sentence \t score \n"
# Score is either 1 (for positive) or 0 (for negative)
def get_data(directory):
    X, Y = [], []

    # Iterate over fils names in the directory
    for filename in os.listdir(directory):

        if not filename.startswith('.'):            
            with open(str(directory + '/' + filename)) as file:
                
                for line in file:
                    splitted = line.split("\t")
                    X.append(preprocess_sentence(splitted[0]))
                    Y.append(int(splitted[1].split("\n")[0]))
        
    return np.array(X), np.array(Y)



'''
    Neural Network functions
'''
def new_weights(shape, name=None):
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1), name=name)


def new_biases(length, name=None):
    return tf.Variable(tf.constant(0.1, shape=[length]), name=name)


def embedding_layer(input_x, vocabulary_size, embedding_size):
    init_embeds = tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0)
    embeddings = tf.Variable(init_embeds)
    layer = tf.nn.embedding_lookup(embeddings, input_x)
    
    return layer

In [3]:
# Load all data
X, Y = get_data("dataset")
assert(X.shape == Y.shape)

# DEBUG PURPOSES
X = X[:1000]
Y = Y[:1000]



# Set seed for randomness and shuffle dataset
np.random.seed(42)
index_shuf = list(range(len(X)))
np.random.shuffle(index_shuf)

X = X[index_shuf]
Y = Y[index_shuf]

# Calculate indeces to split
split_train = 0.75
train_size = int(split_train * len(X))
val_size = int((round(1 - split_train, 3) / 2) * len(X))

# Split dataset
X_train, Y_train = X[:train_size], Y[:train_size]
X_val, Y_val = X[train_size:(train_size + val_size)], Y[train_size:(train_size + val_size)]
X_test, Y_test = X[(train_size + val_size):], Y[(train_size + val_size):]

print("Train: " + str(X_train.shape[0]) + ", Validation: " + str(X_val.shape[0]) + ", Test: " + str(X_test.shape[0]))
print(" ".join(X_train[0]))
print(Y_train[0])

Train: 750, Validation: 125, Test: 125
if you have n't gone here go now
1


In [4]:
max_length = max_length_sentence(X_train)

# Build dictionary
vocab = dictionary.LanguageDictionary(X_train, max_length)

# Transform word to indices
X_train_indices = np.array([pad_sentence(vocab.text_to_indices(tmp), max_length, padding_value=0) for tmp in X_train])
X_val_indices = np.array([pad_sentence(vocab.text_to_indices(tmp), max_length, padding_value=0) for tmp in X_val])
X_test_indices = np.array([pad_sentence(vocab.text_to_indices(tmp), max_length, padding_value=0) for tmp in X_test])


# Shapes
print(X_train_indices.shape)
print(X_val_indices.shape)
print(X_test_indices.shape)

(750, 32)
(125, 32)
(125, 32)


In [5]:
# Useful variables
timesteps = X_train_indices.shape[1]
vocabulary_size = len(vocab.index_to_word)

lr = 0.001
epochs = 50
batch_size = 64
embedding_size = 50
hidden_units = 64

In [6]:
tf.reset_default_graph()

# Placeholders
inputs = tf.placeholder(tf.int32, (None, timesteps), 'inputs')
labels = tf.placeholder(tf.int32, (None), 'output')
input_keep_prob = tf.placeholder(tf.float32, (None), 'dropout_input')
output_keep_prob = tf.placeholder(tf.float32, (None), 'dropout_output')

# Embedding layer => Output shape is [batch_size, timesteps, embedding_size]
embedding = embedding_layer(inputs, vocabulary_size, embedding_size)

'''
    Bidirectional LSTM
'''
# Forward direction cell
lstm_fw_cell = tf.contrib.rnn.LSTMCell(hidden_units, forget_bias=1.0)

# Backward direction cell
lstm_bw_cell = tf.contrib.rnn.LSTMCell(hidden_units, forget_bias=1.0)

# Input shape of any RNN should be [batch_size, embedding_size]
outputs, last_states = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell, lstm_bw_cell, embedding, dtype=tf.float32)

# Unpack forward and backward outputs
outputs_fw, outputs_bw = outputs[0], outputs[1]

# This is a MANY-to-ONE model (sequence classification) => I only take output from last timestamp
outputs_fw = tf.transpose(outputs_fw, [1, 0, 2])
last_output_fw = tf.gather(outputs_fw, int(outputs_fw.get_shape()[0]) - 1)

# Get last output of backward LSTM
outputs_bw = tf.transpose(outputs_bw, [1, 0, 2])
last_output_bw = tf.gather(outputs_bw, int(outputs_bw.get_shape()[0]) - 1)

# Concat outputs
outputs_concat = tf.concat([last_output_fw, last_output_bw], 1) 
logits = tf.layers.dense(inputs=outputs_concat, units=2, activation=None)


''' If I wanted to use only "Unidirectional LSTM"

cell = tf.contrib.rnn.LSTMCell(num_units=hidden_units, state_is_tuple=True)
outputs, last_states = tf.nn.dynamic_rnn(cell=cell, dtype=tf.float32, inputs=embedding)
outputs = tf.transpose(outputs, [1, 0, 2])
last_output = tf.gather(outputs, int(outputs.get_shape()[0]) - 1)'''

' If I wanted to use only "Unidirectional LSTM"\n\ncell = tf.contrib.rnn.LSTMCell(num_units=hidden_units, state_is_tuple=True)\noutputs, last_states = tf.nn.dynamic_rnn(cell=cell, dtype=tf.float32, inputs=embedding)\noutputs = tf.transpose(outputs, [1, 0, 2])\nlast_output = tf.gather(outputs, int(outputs.get_shape()[0]) - 1)'

In [7]:
print(inputs)
print(embedding)
print(outputs_fw)
print(outputs_bw)
print(last_output_fw)
print(outputs_concat)
print(logits)

Tensor("inputs:0", shape=(?, 32), dtype=int32)
Tensor("embedding_lookup/Identity:0", shape=(?, 32, 50), dtype=float32)
Tensor("transpose:0", shape=(32, ?, 64), dtype=float32)
Tensor("transpose_1:0", shape=(32, ?, 64), dtype=float32)
Tensor("GatherV2:0", shape=(?, 64), dtype=float32)
Tensor("concat:0", shape=(?, 128), dtype=float32)
Tensor("dense/BiasAdd:0", shape=(?, 2), dtype=float32)


In [None]:
# Cross entropy loss after softmax of logits
ce = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)
loss = tf.reduce_mean(ce)

# Optimizer for gradients
optimizer = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)

# Useful tensors
scores = tf.nn.softmax(logits)
predictions = tf.to_int32(tf.argmax(scores, axis=1))
correct_mask = tf.to_float(tf.equal(predictions, labels))
accuracy = tf.reduce_mean(tf.cast(correct_mask, tf.float32))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [None]:
num_iterations_training = max(len(X_train_indices) // batch_size, 1)
print("Num iterations training " + str(num_iterations_training))

# Initializer for variables in the graph
init = tf.global_variables_initializer()
saver = tf.train.Saver()

# Before each epoch, shuffle training dataset
indices = list(range(len(X_train_indices)))

# Validation data variables
max_val_acc = 0
val_batch_size = 64
num_iterations_validation = max(len(X_val_indices) // val_batch_size, 1)


with tf.Session() as sess:
    sess.run(init)
    
    for i in range(epochs):  
        
        # Shuffle indices with a random seed
        np.random.seed(42)
        np.random.shuffle(indices)
        
        X_train_indices = X_train_indices[indices]
        Y_train = Y_train[indices]

        for j in range(num_iterations_training):
            start_index = j * batch_size
            end_index = (j + 1) * batch_size

            # Forward and backpropagation on training data
            _, train_loss, train_acc = sess.run([optimizer, loss, accuracy], feed_dict={
                                                            inputs : X_train_indices[start_index:end_index],
                                                            labels : Y_train[start_index:end_index]})
            
            # Print training loss and accuracy
            if j % 30 == 0:
                print("Accuracy: " + str(train_acc) + ", Loss: " + str(train_loss))
                
                
            # Check accuracy on validation 
            if j % 30 == 0:
                
                # Accumulate loss and accuracy
                val_loss_arr, val_acc_arr = [], []
                
                # Iterate over validation mini-batches
                for k in range(num_iterations_validation):
                    start_index_val = k * val_batch_size
                    end_index_val = (k + 1) * val_batch_size
                    
                    val_loss, val_acc = sess.run([loss, accuracy], feed_dict={
                                                            inputs : X_val_indices[start_index:end_index],
                                                            labels : Y_val[start_index:end_index]})
                    val_loss_arr.append(val_loss)
                    val_acc_arr.append(val_acc)

                val_acc = np.mean(val_acc_arr)

                # Save model if validation accuracy better
                if val_acc > max_val_acc:
                    max_val_acc = val_acc
                    print("VALIDATION loss: " + str(np.mean(val_loss_arr)) + ", accuracy: " + str(val_acc))
                    save_path = saver.save(sess, "./checkpoints/model.ckpt")

Num iterations training 11
Accuracy: 0.515625, Loss: 0.69495094
VALIDATION loss: 0.69567144, accuracy: 0.515625
Accuracy: 0.59375, Loss: 0.6922639
Accuracy: 0.5, Loss: 0.6878749
Accuracy: 0.453125, Loss: 0.69198775
Accuracy: 0.578125, Loss: 0.6850026
Accuracy: 0.640625, Loss: 0.6478636
Accuracy: 0.59375, Loss: 0.6514138
VALIDATION loss: 0.6974224, accuracy: 0.53125
Accuracy: 0.75, Loss: 0.5396397
VALIDATION loss: 0.65720737, accuracy: 0.640625
