# Data preprocessing and building the vocabulary

In [17]:
import numpy as np
from tensorflow.contrib import learn
import tensorflow as tf


In [2]:
x_text = ['This is a cat','This must be boy', 'This is a a dog']
max_document_length = max([len(x.split(" ")) for x in x_text])

In [3]:
max_document_length

5

In [4]:
## Create the vocabularyprocessor object, setting the max lengh of the documents.
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)

In [5]:
vocab_processor

<tensorflow.contrib.learn.python.learn.preprocessing.text.VocabularyProcessor at 0x7efeb6a2d2b0>

In [6]:
## Transform the documents using the vocabulary.
x = np.array(list(vocab_processor.fit_transform(x_text)))

In [8]:
x

array([[1, 2, 3, 4, 0],
       [1, 5, 6, 7, 0],
       [1, 2, 3, 3, 8]])

In [9]:
## Extract word:id mapping from the object.
vocab_dict = vocab_processor.vocabulary_._mapping

In [10]:
vocab_dict

{'<UNK>': 0,
 'This': 1,
 'a': 3,
 'be': 6,
 'boy': 7,
 'cat': 4,
 'dog': 8,
 'is': 2,
 'must': 5}

In [11]:
## Sort the vocabulary dictionary on the basis of values(id).
## Both statements perform same task.
#sorted_vocab = sorted(vocab_dict.items(), key=operator.itemgetter(1))
sorted_vocab = sorted(vocab_dict.items(), key = lambda x : x[1])

In [12]:
sorted_vocab

[('<UNK>', 0),
 ('This', 1),
 ('is', 2),
 ('a', 3),
 ('cat', 4),
 ('must', 5),
 ('be', 6),
 ('boy', 7),
 ('dog', 8)]

In [13]:
## Treat the id's as index into list and create a list of words in the ascending order of id's
## word with id i goes at index i of the list.
vocabulary = list(list(zip(*sorted_vocab))[0])

In [14]:
vocabulary

['<UNK>', 'This', 'is', 'a', 'cat', 'must', 'be', 'boy', 'dog']

In [15]:
print(vocabulary)
print(x)

['<UNK>', 'This', 'is', 'a', 'cat', 'must', 'be', 'boy', 'dog']
[[1 2 3 4 0]
 [1 5 6 7 0]
 [1 2 3 3 8]]


------------

# Embedding Layer

In [25]:
sequence_length = 6 # max length of the sentences
num_classes = 2 # positive and negative
vocab_size = 9
embedding_size = 128

input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")

# using None allows the network to handle arbitrarily sized batches.

dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

In [28]:
with tf.device('/cpu:0'), tf.name_scope("embedding"):
# the embedding implementation doesn’t currently have GPU support and throws an error if placed on the GPU.
# you get a nice hierarchy when visualizing your network in TensorBoard.

    W = tf.Variable(                                                     # Embedding matrix
        tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
        name="W")
        
    embedded_chars = tf.nn.embedding_lookup(W, input_x) 
    embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)

# Convolution Layer

In [40]:
filter_sizes = [3, 4, 5]
num_filters = 128

pooled_outputs = []
for i, filter_size in enumerate(filter_sizes):
    with tf.name_scope("conv-maxpool-%s" % filter_size):
        # Convolution Layer
        filter_shape = [filter_size, embedding_size, 1, num_filters]
        W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
        b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
        conv = tf.nn.conv2d(
            embedded_chars_expanded,
            W,
            strides=[1, 1, 1, 1],
            padding="VALID",
            name="conv")
        # Apply nonlinearity
        h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
        # Max-pooling over the outputs
        pooled = tf.nn.max_pool(
            h,
            ksize=[1, sequence_length - filter_size + 1, 1, 1],
            strides=[1, 1, 1, 1],
            padding='VALID',
            name="pool")
        pooled_outputs.append(pooled)
        
num_filters_total = num_filters * len(filter_sizes)
h_pool = tf.concat( pooled_outputs, 3)
h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])



In [41]:
print(pooled_outputs)
print(h_pool)
print(h_pool_flat)

[<tf.Tensor 'conv-maxpool-3_8/pool:0' shape=(?, 1, 1, 128) dtype=float32>, <tf.Tensor 'conv-maxpool-4_7/pool:0' shape=(?, 1, 1, 128) dtype=float32>, <tf.Tensor 'conv-maxpool-5_7/pool:0' shape=(?, 1, 1, 128) dtype=float32>]
Tensor("concat_3:0", shape=(?, 1, 1, 384), dtype=float32)
Tensor("Reshape:0", shape=(?, 384), dtype=float32)


# Dropout Layer

In [42]:
with tf.name_scope("dropout"):
    h_drop = tf.nn.dropout(h_pool_flat, dropout_keep_prob)

# Output Layer

In [43]:
with tf.name_scope("output"):
    W = tf.Variable(tf.truncated_normal([num_filters_total, num_classes], stddev=0.1), name="W")
    b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
    scores = tf.nn.xw_plus_b(h_drop, W, b, name="scores")
    predictions = tf.argmax(scores, 1, name="predictions")

In [44]:
# Calculate mean cross-entropy loss
with tf.name_scope("loss"):
    losses = tf.nn.softmax_cross_entropy_with_logits(scores, input_y)
    loss = tf.reduce_mean(losses)
    
# Calculate Accuracy
with tf.name_scope("accuracy"):
    correct_predictions = tf.equal(predictions, tf.argmax(input_y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")

ValueError: Only call `softmax_cross_entropy_with_logits` with named arguments (labels=..., logits=..., ...)