In [87]:
import tensorflow as tf
import numpy as np
import os
import time
import datetime
import itertools
import gc
import gensim
from collections import Counter
from gensim.models.word2vec import Word2Vec

In [88]:
np.random.seed(10)

In [101]:
# Load data from files
positive_examples = list(open("./data/rt-polaritydata/rt-polarity.pos").readlines())
positive_examples = [s.strip() for s in positive_examples]
negative_examples = list(open("./data/rt-polaritydata/rt-polarity.neg").readlines())
negative_examples = [s.strip() for s in negative_examples]

# Generate labels
positive_labels = [[0, 1] for _ in positive_examples]
negative_labels = [[1, 0] for _ in negative_examples]
# Concatenate positive and negative examples
x_text = positive_examples + negative_examples
y = np.concatenate([positive_labels, negative_labels], 0)

In [102]:
print x_text[len(positive_examples) + 242]
print y[len(positive_examples) + 242]

at 90 minutes this movie is short , but it feels much longer .
[1 0]


In [103]:
# Padding our sentences to the same length
x_text_split = [s.split(" ") for s in x_text]
padded_sentences = [] 
# We pad all sentences to the maximum sentence length in the dataset
SEQUENCE_LENGTH = max(len(x) for x in x_text_split)
PADDING_WORD = "<PAD/>"
for i in range(len(x_text_split)):
    sentence = x_text_split[i]
    num_padding = SEQUENCE_LENGTH - len(sentence)
    new_sentence = sentence + [PADDING_WORD] * num_padding
    padded_sentences.append(new_sentence)

In [104]:
SEQUENCE_LENGTH

59

In [105]:
# Build vocabulary
word_counts = Counter(itertools.chain(*padded_sentences))
# Mapping from index to word
vocabulary_inv = [x[0] for x in word_counts.most_common()]
# Mapping from word to index
vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}

In [106]:
word_counts.most_common(500)

[('<PAD/>', 404991),
 ('.', 14010),
 ('the', 10096),
 (',', 10037),
 ('a', 7281),
 ('and', 6195),
 ('of', 6061),
 ('to', 4233),
 ('is', 3367),
 ('in', 2628),
 ('that', 2470),
 ('it', 2281),
 ('as', 1801),
 ('but', 1637),
 ('with', 1560),
 ('film', 1445),
 ('this', 1440),
 ('for', 1436),
 ('its', 1335),
 ('an', 1321),
 ('movie', 1268),
 ("it's", 1119),
 ('be', 939),
 ('on', 895),
 ('you', 892),
 ('not', 803),
 ('by', 795),
 ('about', 733),
 ('one', 727),
 ('more', 727),
 ('like', 720),
 ('has', 709),
 ('are', 708),
 ('at', 705),
 ('from', 673),
 ('than', 664),
 ('"', 655),
 ('all', 641),
 ('--', 629),
 ('his', 628),
 ('have', 623),
 ('so', 555),
 ('if', 537),
 ('or', 519),
 ('story', 476),
 ('i', 466),
 ('too', 459),
 ('just', 438),
 ('who', 432),
 ('into', 417),
 ('what', 413),
 ('most', 402),
 ('out', 398),
 ('no', 387),
 ('much', 386),
 ('even', 382),
 ('good', 377),
 ('up', 376),
 ('will', 374),
 ('comedy', 353),
 ('time', 339),
 ('can', 337),
 ('some', 334),
 ('characters', 313),
 

In [111]:
# Use word2vec for initial embeddings (optional)

# Start with zero embeddings
# initial_embeddings = np.random.randn(len(vocabulary), 300)

# # Load word2vec and get embeddings
# w2v_model = Word2Vec.load_word2vec_format("/Users/dennybritz/Downloads/GoogleNews-vectors-negative300.bin", binary=True)
# for idx, word in enumerate(vocabulary_inv):
#     if word in w2v_model:
#         initial_embeddings[idx,:] = w2v_model[word]

# # Save initial embeddings to avoid loading word2vec every time
# np.save("./data/rt-polaritydata/initial_embeddings", initial_embeddings)

# # Clear word2vec
# w2v_model = None

  return word in self.vocab


In [112]:
# Optional: Load initial embeddings
initial_embeddings = np.load("./data/rt-polaritydata/initial_embeddings.npy")

In [113]:
np.random.seed(10)

# Our training data
x = np.array([[vocabulary[word] for word in sentence] for sentence in padded_sentences])
y = np.array(y)

# Randomly shuffle data
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

print("Training examples: {:d}".format(len(x)))

Training examples: 10662


In [114]:
def print_example(x, y):
    text = " ".join([vocabulary_inv[i] for i in x])
    label = "POS" if y[1] == 1 else "NEG"
    print("{}: {}".format(y, text))

In [115]:
# TODO: Do this properly?
x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]

In [116]:
print_example(x_train[2], y_train[2])

[1 0]: this is surely one of the most frantic , virulent and foul-natured christmas season pics ever delivered by a hollywood studio . <PAD/> <PAD/> <PAD/> <PAD/> <PAD/> <PAD/> <PAD/> <PAD/> <PAD/> <PAD/> <PAD/> <PAD/> <PAD/> <PAD/> <PAD/> <PAD/> <PAD/> <PAD/> <PAD/> <PAD/> <PAD/> <PAD/> <PAD/> <PAD/> <PAD/> <PAD/> <PAD/> <PAD/> <PAD/> <PAD/> <PAD/> <PAD/> <PAD/> <PAD/> <PAD/> <PAD/> <PAD/>


In [117]:
# Helper function to batch data
def batch_iter(data, batch_size, num_epochs):
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int(len(data)/batch_size) + 1
    for epoch in range(num_epochs):
        shuffle_indices = np.random.permutation(np.arange(data_size))
        shuffled_data = data[shuffle_indices]      
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]

In [118]:
# Test data!?
with tf.Graph().as_default():
    sess = tf.Session()
    with sess.as_default():
        x = tf.convert_to_tensor([[1,1,1,0,0,0,0],[1,1,1,0,0,0,0]])
        mask = tf.argmin(x, 1)
        print mask.eval()

[3 3]


In [119]:
class TextCNN(object):
    """
    A CNN for text classifications
    """
    def __init__(
        self, vocabulary_size, sequence_length, num_classes=2, embedding_size=300,
        filter_sizes=[3, 4, 5], num_filters=100, affine_dim=256):
        
        # Placeholders for our input and output
        self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
        self.dropout_keep_prob = tf.placeholder(tf.float32)
        
        # Embedding layer
        self.embedded_chars = self._build_embedding(
            [vocabulary_size, embedding_size], self.input_x)
        # Add another dimension, expected by the convolutional layer
        self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)

        # Create a convolution + maxpool layer for each filter size
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.variable_scope("filter-%s" % filter_size):
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                pool_ksize = [1, sequence_length - filter_size + 1, 1, 1]
                pooled = self._build_conv_maxpool(filter_shape, pool_ksize, self.embedded_chars_expanded)
                pooled_outputs.append(pooled)

        # Combine all the pooled features
        num_filters_total = num_filters * len(filter_sizes)
        self.h_pool = tf.concat(3, pooled_outputs)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

        # Affine Layer with dropout
        # self.h_affine = self._build_affine([num_filters_total, affine_dim], self.h_pool_flat)
        # self.h_drop = tf.nn.dropout(self.h_affine, dropout_keep_prob)
        
        # Add dropout
        self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)
        
        # Softmax Layer (Final output)
        self.y = self._build_softmax([num_filters_total, num_classes], self.h_drop)
        self.predictions = tf.argmax(self.y, 1, name="predictions")

        # Our loss expression
        self.loss = self._build_mean_ce_loss(self.y, self.input_y)
        # Expression for the accuracy
        self.accuracy = self._build_accuracy(self.y, self.input_y)

        # Summaries
        total_loss_summary = tf.scalar_summary("loss", self.loss)
        accuracy_summmary = tf.scalar_summary("accuracy", self.accuracy)

    def _build_embedding(self, shape, input_tensor):
        """
        Builds an embedding layer. Returns the embedded tensor.
        """
        # We force this on the CPU because the op isn't implemented for the GPU
        with tf.device('/cpu:0'), tf.variable_scope("embedding"):
            W_intializer = tf.random_uniform_initializer(-1.0, 1.0)
            W_embeddings = tf.get_variable("W", shape, initializer=W_intializer)
            return tf.nn.embedding_lookup(W_embeddings, input_tensor)

    def _build_affine(self, shape, input_tensor, activation_func=tf.nn.relu):
        """
        Builds an affine (fully-connected) layer
        """
        with tf.variable_scope("affine"):
            W = tf.get_variable("W", shape, initializer=tf.truncated_normal_initializer(stddev=0.1))
            b = tf.get_variable("b", shape[-1], initializer=tf.constant_initializer(0.1))
            h = activation_func(tf.matmul(input_tensor, W) + b, name="h")
        return h

    def _build_softmax(self, shape, input_tensor):
        """
        Builds a softmax layer
        """
        with tf.variable_scope("softmax"):
            W_initializer = tf.truncated_normal_initializer(stddev=0.1)
            b_initializer = tf.constant_initializer(0.1)
            W = tf.get_variable("W", shape, initializer=W_initializer)
            b = tf.get_variable("b", shape[-1:], initializer=b_initializer)
            return tf.nn.softmax(tf.nn.bias_add(tf.matmul(input_tensor, W), b), name="y")
    
    def _build_mean_ce_loss(self, predictions, labels):
        """
        Calculates the mean cross-entropy loss
        """
        with tf.variable_scope("mean-ce-loss"):
            return -tf.reduce_mean(labels * tf.log(predictions), name="mean_ce_loss")
        
    def _build_accuracy(self, predictions, labels):
        """
        Returns the accuracy
        """
        with tf.variable_scope("accuracy"):
            correct_predictions = tf.equal(tf.argmax(predictions, 1), tf.argmax(labels, 1))
            return tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
    
    def _build_conv_maxpool(self, filter_shape, pool_shape, input_tensor):
        """
        Builds a convolutional layer with ReLU activation followed by a  max-pooling layer.
        """
        with tf.variable_scope("conv-maxpool"):
            W = tf.get_variable("W", filter_shape, initializer=tf.truncated_normal_initializer(stddev=0.1))
            conv = tf.nn.conv2d(input_tensor, W, strides=[1, 1, 1, 1], padding="VALID")
            begin = tf.to_int32(tf.zeros([4]))
            b = tf.get_variable("b", filter_shape[-1], initializer=tf.constant_initializer(0.1))
            h = tf.nn.relu(tf.nn.bias_add(conv, b), name="conv")
            return tf.nn.max_pool(h, ksize=pool_shape, strides=[1, 1, 1, 1], padding='VALID', name="pool")

In [123]:
BATCH_SIZE = 128
NUM_EPOCHS = 500
EVALUATE_EVERY = CHECKPOINT_EVERY = 100

with tf.Graph().as_default():
    session_conf = tf.ConfigProto(allow_soft_placement=True)      
    sess = tf.Session(config=session_conf)  
    with sess.as_default():
        cnn = TextCNN(
            vocabulary_size=len(vocabulary),
            sequence_length=SEQUENCE_LENGTH,
            num_classes=2,
            embedding_size=300,
            filter_sizes=[3, 4, 5],
            num_filters=100)
        
        # Define Training procedure
        global_step = tf.Variable(0, name="global_step")
        optimizer = tf.train.AdamOptimizer(1e-4)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        # Keep track of gradient values and sparsity
        for g, v in grads_and_vars:
            if g is not None:
                tf.histogram_summary("{}/grad".format(v.name), g)
                tf.scalar_summary("{}/grad-sparsity".format(v.name), tf.nn.zero_fraction(g))
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
        
        
        # Summary Writers
        summary_op = tf.merge_all_summaries()
        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
        train_summary_dir = os.path.join(out_dir, "summaries", "train")
        train_summary_writer = tf.train.SummaryWriter(train_summary_dir, sess.graph_def)
        dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
        dev_summary_writer = tf.train.SummaryWriter(dev_summary_dir, sess.graph_def)
        print("Writing to {}\n".format(out_dir))
        
        # Checkpointing
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        # Tensorflow assumes this directory already exists so we need to create it
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.all_variables())
        
        # Initialize all variables
        sess.run(tf.initialize_all_variables())
        
        # Optional: Initialize embeddings
#         with tf.variable_scope("embedding", reuse=True):
#             embedding_W = tf.get_variable("W")
#         sess.run(embedding_W.assign(initial_embeddings))
    
        def eval_loss(x_batch, y_batch, writer=None):
            feed_dict = {cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob: 1.0 }
            step, summaries, loss, accuracy = sess.run(
                [global_step, summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            if writer:
                writer.add_summary(summaries, step)
        
        # A single training step
        def train_step(x_batch, y_batch):
            feed_dict = { cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob: 0.25 }
            _, step, summaries, loss, accuracy = sess.run(
                [train_op, global_step, summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            # Print and write metrics
            time_str = datetime.datetime.now().isoformat()
            # print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            train_summary_writer.add_summary(summaries, step)
        
        batches = batch_iter(zip(x_train, y_train), BATCH_SIZE, NUM_EPOCHS)
        for batch in batches:
            x_batch, y_batch = zip(*batch)
            train_step(x_batch, y_batch)
            current_step = tf.train.global_step(sess, global_step)
            if current_step % EVALUATE_EVERY == 0:
                print("\nDev Set:")
                eval_loss(x_dev, y_dev, writer=dev_summary_writer)
                eval_loss(x_train[:1000], y_train[:1000], writer=None)
                print("")
            if current_step % CHECKPOINT_EVERY == 0:
                path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                print("Saved model checkpoint to {}\n".format(path))                
        
        

Writing to /Users/dennybritz/projects/wildml/cnn-text-classification-tf/runs/1449189839


Dev Set:
2015-12-04T09:45:37.519971: step 100, loss 0.48426, acc 0.506
2015-12-04T09:45:43.162219: step 100, loss 0.44472, acc 0.541

Saved model checkpoint to /Users/dennybritz/projects/wildml/cnn-text-classification-tf/runs/1449189839/checkpoints/model-100


Dev Set:
2015-12-04T09:47:22.212004: step 200, loss 0.467162, acc 0.531
2015-12-04T09:47:27.342086: step 200, loss 0.403516, acc 0.578

Saved model checkpoint to /Users/dennybritz/projects/wildml/cnn-text-classification-tf/runs/1449189839/checkpoints/model-200


Dev Set:
2015-12-04T09:49:05.575714: step 300, loss 0.460155, acc 0.557
2015-12-04T09:49:10.936702: step 300, loss 0.378775, acc 0.591

Saved model checkpoint to /Users/dennybritz/projects/wildml/cnn-text-classification-tf/runs/1449189839/checkpoints/model-300


Dev Set:
2015-12-04T09:50:48.187866: step 400, loss 0.446946, acc 0.555
2015-12-04T09:50:54.736985: step 400, loss 0.3541, 

KeyboardInterrupt: 