In [2]:
import tensorflow as tf
import numpy as np
import os
import time
import datetime
import itertools
import gc
import gensim
from collections import Counter
from gensim.models.word2vec import Word2Vec

In [3]:
# Load data from files
# One review per line; `with` closes each file handle as soon as its lines are read
with open("./data/rt-polaritydata/rt-polarity.pos") as pos_file:
    positive_examples = [s.strip() for s in pos_file]
with open("./data/rt-polaritydata/rt-polarity.neg") as neg_file:
    negative_examples = [s.strip() for s in neg_file]
# Generate labels (one-hot: [negative, positive])
positive_labels = [[0, 1] for _ in positive_examples]
negative_labels = [[1, 0] for _ in negative_examples]
# Concatenate positive and negative examples (same order for texts and labels)
x_text = positive_examples + negative_examples
y = positive_labels + negative_labels

In [4]:
# Sanity check: total number of labels (positive + negative examples)
len(y)

10662

In [5]:
# Padding our sentences to the same length
PADDING_WORD = "<PAD/>"
# Tokenize on single spaces
x_text_split = [s.split(" ") for s in x_text]
# We pad all sentences to the maximum sentence length in the dataset
SEQUENCE_LENGTH = max(map(len, x_text_split))
# Shorter sentences get padding tokens prepended until they reach SEQUENCE_LENGTH
padded_sentences = [
    [PADDING_WORD] * (SEQUENCE_LENGTH - len(tokens)) + tokens
    for tokens in x_text_split
]

In [6]:
# Length (in tokens) of the longest sentence; every sentence is padded to this
SEQUENCE_LENGTH

59

In [7]:
# Build vocabulary
# Count every token (padding tokens included) across all padded sentences
word_counts = Counter(itertools.chain.from_iterable(padded_sentences))
# Mapping from index to word, most frequent word first
vocabulary_inv = [word for word, _ in word_counts.most_common()]
# Mapping from word to index (the inverse of the list above)
vocabulary = {word: index for index, word in enumerate(vocabulary_inv)}

In [8]:
# Number of distinct tokens in the padded corpus (this is the embedding table's row count)
len(vocabulary)

21426

In [205]:
# # Use word2vec for initial embeddings (optional)

# Start with zero embeddings
# initial_embeddings = np.zeros((len(vocabulary), 300))

# Load word2vec and get embeddings
# w2v_model = Word2Vec.load_word2vec_format("/Users/dennybritz/Downloads/GoogleNews-vectors-negative300.bin", binary=True)
# for idx, word in enumerate(vocabulary_inv):
#     if word in w2v_model:
#         initial_embeddings[idx,:] = w2v_model[word]

# # Save initial embeddings to avoid loading word2vec every time
# np.save("./data/rt-polaritydata/initial_embeddings", initial_embeddings)

# # Clear word2vec
# w2v_model = None

TypeError: argument of type 'NoneType' is not iterable

In [9]:
# Optional: Load initial embeddings
# The .npy file only exists after the (commented-out) word2vec cell has been run
# once, so fall back to None instead of crashing a fresh "Run All".
INITIAL_EMBEDDINGS_PATH = "./data/rt-polaritydata/initial_embeddings.npy"
if os.path.exists(INITIAL_EMBEDDINGS_PATH):
    initial_embeddings = np.load(INITIAL_EMBEDDINGS_PATH)
else:
    initial_embeddings = None
    print("No saved embeddings at {}; using random initialization".format(INITIAL_EMBEDDINGS_PATH))

In [10]:
# Our training data
# Both arrays are rebuilt from their un-shuffled sources. Re-running this cell
# therefore never shuffles already-shuffled labels against freshly built
# inputs, which would silently misalign x and y.
x_all = np.array([[vocabulary[word] for word in sentence] for sentence in padded_sentences])
y_all = np.array(positive_labels + negative_labels)
# Randomly shuffle data. A dedicated, seeded generator keeps the permutation
# (and therefore the train/dev/test split) identical across runs without
# touching NumPy's global random state.
SHUFFLE_SEED = 10
shuffle_indices = np.random.RandomState(SHUFFLE_SEED).permutation(len(y_all))
x = x_all[shuffle_indices]
y = y_all[shuffle_indices]
print("Training examples: {:d}".format(len(x)))

Training examples: 10662


In [11]:
# TODO: Do this properly?
# Hold out the last 2000 shuffled examples: the first 1000 of those for dev,
# the final 1000 for test; everything before them is training data.
x_train, y_train = x[:-2000], y[:-2000]
x_dev, y_dev = x[-2000:-1000], y[-2000:-1000]
x_test, y_test = x[-1000:], y[-1000:]

In [12]:
# Helper function to batch data
def batch_iter(data, batch_size, num_epochs):
    """
    Generates consecutive batches of `data`, repeated `num_epochs` times.

    Args:
        data: any sliceable sequence with a length (list, numpy array, ...).
        batch_size: maximum number of items per batch; must be >= 1.
        num_epochs: number of full passes over `data`.

    Yields:
        Slices of `data` in order. Every batch has `batch_size` items except
        possibly the last one of each epoch, which holds the remainder.
        No empty batch is ever produced, even when `len(data)` is an exact
        multiple of `batch_size` or when `data` is empty.

    Raises:
        ValueError: if `batch_size` is smaller than 1 (raised on first iteration).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1, got {}".format(batch_size))
    data_size = len(data)
    for _ in range(num_epochs):
        # Stepping by batch_size stops exactly at the end of the data, so no
        # trailing empty slice is generated.
        for start_index in range(0, data_size, batch_size):
            yield data[start_index:start_index + batch_size]

In [13]:
# Test data!?


In [54]:
# CNN Implementation
class CharCNN(object):
    """
    A CNN for text classification.
    Embedding -> Convolutional Layer -> Max-pooling -> Affine Layer -> Softmax Prediction

    The constructor builds the complete graph in the current default graph and
    exposes the following attributes:
        input_x, input_y: placeholders for the integer ids that are looked up
            in the embedding matrix and for the one-hot labels.
        W_embeddings: the embedding matrix variable.
        y: softmax class probabilities.
        predictions: argmax of `y` for each example.
        loss, accuracy: scalar tensors, each also registered as a scalar summary.

    NOTE(review): `dropout_keep_prob` is a plain Python value baked into the
    graph, so dropout is applied every time `y`, `loss` or `accuracy` is run,
    evaluation passes included.
    """
    def __init__(
        self, vocabulary_size, sequence_length, num_classes=2, embedding_size=128,
            filter_sizes=(3, 4, 5), num_filters=100, affine_dim=256, dropout_keep_prob=0.5):
        """
        Args:
            vocabulary_size: number of rows of the embedding matrix.
            sequence_length: fixed number of ids per input example.
            num_classes: width of the one-hot label / softmax output.
            embedding_size: number of columns of the embedding matrix.
            filter_sizes: heights of the convolution filters; one
                convolution + max-pool branch is built per entry.
            num_filters: number of filters per filter size.
            affine_dim: output width of the fully-connected layer.
            dropout_keep_prob: keep probability handed to tf.nn.dropout.
        """
        # Immutable default above; accept any iterable from callers
        filter_sizes = list(filter_sizes)

        # Placeholders for our input and output
        self.input_x = tf.placeholder(tf.int32, [None, sequence_length])
        self.input_y = tf.placeholder(tf.float32, [None, num_classes])

        # Embedding layer
        self.embedded_chars = self._build_embedding([vocabulary_size, embedding_size], self.input_x)
        # Add another dimension, expected by the convolutional layer
        self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)

        # Create a convolution + maxpool layer for each filter
        pooled_outputs = []
        for filter_size in filter_sizes:
            with tf.variable_scope("filter-%s" % filter_size):
                # Define the shape of the filter
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                # We pool over the complete output of the convolution
                pool_ksize = [1, sequence_length - filter_size + 1, 1, 1]
                # Build the layer
                pooled = self._build_conv_maxpool(filter_shape, pool_ksize, self.embedded_chars_expanded)
                # Keep track of the layer
                pooled_outputs.append(pooled)

        # Combine all the pooled features
        num_filters_total = num_filters * len(filter_sizes)
        self.h_pool = tf.concat(3, pooled_outputs)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

        # Affine Layer with dropout
        self.h_affine = self._build_affine([num_filters_total, affine_dim], self.h_pool_flat)
        self.h_affine_drop = tf.nn.dropout(self.h_affine, dropout_keep_prob)

        # Softmax Layer (Final output)
        self.y = self._build_softmax([affine_dim, num_classes], self.h_affine_drop)
        self.predictions = tf.argmax(self.y, 1)

        # Loss
        self.loss = self._build_mean_ce_loss(self.y, self.input_y)
        # Accuracy
        self.accuracy = self._build_accuracy(self.y, self.input_y)

        # Summaries (kept on the instance so they can also be run individually)
        self.loss_summary = tf.scalar_summary("loss", self.loss)
        self.accuracy_summary = tf.scalar_summary("accuracy", self.accuracy)

    def _build_embedding(self, shape, input_tensor):
        """
        Builds an embedding layer. Returns the embedded tensor.

        The embedding matrix itself is stored as `self.W_embeddings` so callers
        can assign pre-trained values to the variable.
        """
        # We force this on the CPU because the op isn't implemented for the GPU
        with tf.device('/cpu:0'):
            W_initializer = tf.random_uniform_initializer(-1.0, 1.0)
            self.W_embeddings = tf.get_variable("W", shape, initializer=W_initializer)
            return tf.nn.embedding_lookup(self.W_embeddings, input_tensor)

    def _build_affine(self, shape, input_tensor, activation_func=tf.nn.relu):
        """
        Builds an affine (fully-connected) layer.

        `shape` is [input_dim, output_dim]; returns activation_func(x * W + b).
        """
        with tf.variable_scope("affine"):
            # Created through get_variable like every other layer in this class
            W_initializer = tf.truncated_normal_initializer(stddev=0.1)
            b_initializer = tf.constant_initializer(0.1)
            W = tf.get_variable("W", shape, initializer=W_initializer)
            b = tf.get_variable("b", shape[-1:], initializer=b_initializer)
            h = activation_func(tf.matmul(input_tensor, W) + b, name="h")
        return h

    def _build_softmax(self, shape, input_tensor):
        """
        Builds a softmax layer.

        `shape` is [input_dim, num_classes]; returns the class probabilities.
        """
        with tf.variable_scope("softmax"):
            W_initializer = tf.truncated_normal_initializer(stddev=0.1)
            b_initializer = tf.constant_initializer(0.1)
            W = tf.get_variable("W", shape, initializer=W_initializer)
            b = tf.get_variable("b", shape[-1:], initializer=b_initializer)
            return tf.nn.softmax(tf.matmul(input_tensor, W) + b, name="y")

    def _build_mean_ce_loss(self, predictions, labels):
        """
        Calculates the mean cross-entropy loss.

        The mean is taken over every element of `labels * log(predictions)`,
        i.e. over examples and classes alike.
        """
        with tf.variable_scope("mean-ce-loss"):
            # Keep probabilities away from 0: log(0) is -inf and 0 * -inf is NaN,
            # which would poison the loss and every gradient.
            clipped_predictions = tf.clip_by_value(predictions, 1e-10, 1.0)
            return -tf.reduce_mean(labels * tf.log(clipped_predictions), name="mean_ce_loss")

    def _build_accuracy(self, predictions, labels):
        """
        Returns accuracy tensor: the fraction of examples whose arg-max
        prediction equals the arg-max of the label row.
        """
        with tf.variable_scope("accuracy"):
            correct_predictions = tf.equal(tf.argmax(predictions, 1), tf.argmax(labels, 1))
            return tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")

    def _build_conv_maxpool(self, filter_shape, pool_shape, input_tensor):
        """
        Builds a convolutional layer with ReLU activation followed by a max-pooling layer.

        `filter_shape` is the 4-element convolution kernel shape, whose last
        entry is the number of filters; `pool_shape` is the `ksize` handed to
        the max-pool op.
        """
        with tf.variable_scope("conv-maxpool"):
            # Convolution Filter
            W = tf.get_variable("W", filter_shape, initializer=tf.truncated_normal_initializer(stddev=0.1))
            # Performs the convolution
            conv = tf.nn.conv2d(input_tensor, W, strides=[1, 1, 1, 1], padding="VALID")
            # Bias term: one value per filter. The shape is passed as a
            # one-element list, matching the other layers.
            b = tf.get_variable("b", filter_shape[-1:], initializer=tf.constant_initializer(0.1))
            # Nonlinearity
            h = tf.nn.relu(conv + b, name="conv")
            # Maxpooling
            return tf.nn.max_pool(h, ksize=pool_shape, strides=[1, 1, 1, 1], padding='VALID', name="pool")

In [38]:
# Build the model in a fresh graph, train it with Adam and write training
# summaries to ./runs/<unix timestamp>/summaries/train.
# NOTE: `sess` and `cnn` stay bound after this cell finishes (the session is
# never closed here) and later cells inspect them.
with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False)      
    sess = tf.Session(config=session_conf)  
    with sess.as_default():
        cnn = CharCNN(len(vocabulary), SEQUENCE_LENGTH, 2, embedding_size=300)
        
        # Define Training procedure
        # A variable to keep track of the global step. Tensorflow uses this automatically.
        global_step = tf.Variable(0, name="global_step")
        # Our optimizer
        optimizer = tf.train.AdamOptimizer(1e-4)
        # We want to minimize the loss
        train_op = optimizer.minimize(cnn.loss, global_step=global_step)
        
        # Summary Writer
        summary_op = tf.merge_all_summaries()
        out_dir = os.path.join(os.path.curdir, "runs", str(int(time.time())))        
        train_summary_dir = os.path.abspath(os.path.join(out_dir, "summaries", "train"))
        train_summary_writer = tf.train.SummaryWriter(train_summary_dir, sess.graph_def)
        
        # Checkpointing
        # NOTE: the saver is only created here; nothing in this cell calls
        # saver.save(), so no checkpoint is written yet (see the TODO below).
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        # Tensorflow assumes this directory already exists so we need to create it
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.all_variables())
        
        # Initialize all variables
        sess.run(tf.initialize_all_variables())
        
        # Optional: Initialize embeddings
        # W_tensor = sess.graph.get_tensor_by_name("embedding/W:0")
        # sess.run(cnn.W_embeddings.assign(initial_embeddings))
        
        # Only referenced from a commented-out line in train_step
        def print_summaries(summaries):
            """
            Prints Event summary protocol buffers
            """
            summary_obj = tf.Summary.FromString(summaries)
            # Don't include summaries about queues
            filtered_summaries = [v for v in summary_obj.value if "queue/" not in v.tag]
            summary_str = "\n".join(["{}: {:f}".format(v.tag, v.simple_value) for v in filtered_summaries])
            print("\n{}\n".format(summary_str))
        
        def eval_loss(x_batch, y_batch):
            """
            Runs loss and accuracy on the given data in a single feed (no
            training op) and prints them.

            NOTE(review): CharCNN bakes its dropout keep probability into the
            graph, so these numbers are computed with dropout still active.
            """
            feed_dict = {cnn.input_x: x_batch, cnn.input_y: y_batch }
            step, loss, accuracy = sess.run([global_step, cnn.loss, cnn.accuracy], feed_dict)
            print("loss {:g}, acc: {:g}".format(loss, accuracy))
        
        # A single training step
        def train_step(x_batch, y_batch):
            """
            Runs one optimizer update on the batch, prints the step number and
            writes the merged summaries for that step.
            """
            feed_dict = {
                cnn.input_x: x_batch,
                cnn.input_y: y_batch
            }
            # Run the graph
            _, step, summaries = sess.run([train_op, global_step, summary_op], feed_dict)
            # Print Step
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}".format(time_str, step))
            # print_summaries(summaries)
            # Write summaries
            train_summary_writer.add_summary(summaries, step)
            # Maybe checkpoint the model: TODO
            return 
        
        # Two generators with identical batch size and epoch count walk x_train
        # and y_train in the same order, so zipping them keeps inputs and labels
        # aligned. The data is not reshuffled between epochs.
        x_batches = batch_iter(x_train, 128, 20)
        y_batches = batch_iter(y_train, 128, 20)
        for x_batch, y_batch in zip(x_batches, y_batches):
            train_step(x_batch, y_batch)
            # Every 50 steps, report loss/accuracy on the full dev set and on
            # the full training set (each fed as one big batch)
            if tf.train.global_step(sess, global_step) % 50 == 0:
                eval_loss(x_dev, y_dev)
                eval_loss(x_train, y_train)
        
        

AttributeError: 'Tensor' object has no attribute 'assign'

In [45]:
# Debugging aid: fetch the variables collection from the training session's graph.
# `t` is not used by any other cell visible in this notebook.
t = sess.graph.get_collection(tf.GraphKeys.VARIABLES)

In [51]:
# Debugging aid: this queries the *default* graph. The model was built inside
# `with tf.Graph().as_default():`, which is presumably why the result is empty
# here -- use `sess.graph` to reach the model's variables instead.
tf.trainable_variables()

[]

In [53]:
# Shape check of the embedding lookup output: (batch, SEQUENCE_LENGTH, embedding_size)
cnn.embedded_chars.get_shape()

TensorShape([Dimension(None), Dimension(59), Dimension(300)])