# Text classification using a TensorFlow CNN in DeepWater on SparklingWater 

This notebook is based on:

* [Convolutional Neural Networks for Sentence Classification paper](https://arxiv.org/abs/1408.5882)
* http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/

It deviates from these sources in a few places, which are pointed out along the way.

This notebook builds a Convolutional Neural Network in TensorFlow for sentence-level sentiment analysis, using Rotten Tomatoes
movie review sentences for training (negative reviews are labeled with a score of 0, positive reviews with a score of 1, and neutral ones with 0.5).

## Data preparation

In [7]:
from urllib.request import urlopen
import re

def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Adapted from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Every character that is not a letter, a digit or one of ( ) , ! ? ' `
    is replaced by a space. The contraction suffixes 's, 've, n't, 're, 'd
    and 'll as well as the marks , ! ( ) ? are then split off into
    space-separated tokens, runs of whitespace are collapsed, and the result
    is stripped and lower-cased.

    The original wrote the bracket and question-mark replacements as
    non-raw strings containing a backslash. That is an invalid escape
    sequence (a warning on current Python versions) and it also put a
    literal backslash in front of each of those tokens. Here the marks are
    emitted as plain tokens.

    :param string: raw sentence
    :return: cleaned, lower-cased sentence whose tokens are separated by single spaces
    """
    # Anything outside the allowed alphabet becomes a space
    string = re.sub(r"[^A-Za-z0-9(),!?'`]", " ", string)
    # Detach contraction suffixes, e.g. "isn't" -> "is n't"; the order matches the original
    for suffix in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        string = string.replace(suffix, " " + suffix)
    # Surround punctuation with spaces so each mark becomes a token of its own
    for mark in (",", "!", "(", ")", "?"):
        string = string.replace(mark, " " + mark + " ")
    # Collapse the whitespace runs introduced by the steps above
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

def load_data_and_labels():
    """
    Loads MR polarity data from the web, splits the data into words and generates labels.

    Relies on a module-level ``sc`` to distribute the downloaded lines
    (presumably the SparkContext of the running session - confirm against
    the notebook setup).

    Returns the result of the Spark transformations. Each element is a
    ``(label, words)`` pair where label is 1 for a positive and 0 for a
    negative sentence, and words is the cleaned sentence split on single spaces.
    """
    # Pull sentences with positive sentiment; the context manager closes the
    # HTTP response as soon as all lines have been read
    with urlopen('https://raw.githubusercontent.com/yoonkim/CNN_sentence/master/rt-polarity.pos') as pos_file:
        positive_lines = pos_file.readlines()
    # Pull sentences with negative sentiment
    with urlopen('https://raw.githubusercontent.com/yoonkim/CNN_sentence/master/rt-polarity.neg') as neg_file:
        negative_lines = neg_file.readlines()

    # Distribute the raw byte lines, decode them and add labels
    positive_examples = sc.parallelize(positive_lines).map(lambda s: (1, s.decode('latin-1').strip()))
    negative_examples = sc.parallelize(negative_lines).map(lambda s: (0, s.decode('latin-1').strip()))

    # Split by words
    labeled = positive_examples.union(negative_examples).map(lambda ls: (ls[0], clean_str(ls[1]).split(" ")))
    return labeled
    
def pad_sentences(sentences, padding_word="</s>"):
    """
    Pads all sentences to the same length. The length is defined by the longest sentence.

    ``sentences`` is expected to offer ``values()``, ``map()`` and ``max()``
    (presumably a Spark pair RDD of ``(label, words)`` - confirm against the
    caller). ``map`` does not modify the collection it is called on, so the
    padded collection has to be returned; the original discarded it and
    handed back the unpadded input.

    :param sentences: collection of (label, list of words) pairs
    :param padding_word: token appended to sentences shorter than the longest one
    :return: new collection of (label, padded list of words) pairs
    """
    # Length of the longest sentence in the whole collection
    sequence_length = sentences.values().map(len).max()
    return sentences.map(
        lambda ls: (ls[0], ls[1] + [padding_word] * (sequence_length - len(ls[1]))))
    
# TODO rewrite to Spark
def build_input_data(sentences, vocab_size):
    """
    Turns every (label, words) pair into a (label, vector) pair.

    The vector is whatever ``HashingTF(vocab_size).transform`` yields for the
    word list. ``HashingTF`` is not imported in the visible cells
    (presumably pyspark.mllib.feature.HashingTF - confirm).
    """
    term_hasher = HashingTF(vocab_size)

    def vectorize(labeled):
        # Keep the label untouched, replace the word list by its vector
        return (labeled[0], term_hasher.transform(labeled[1]))

    return sentences.map(vectorize)

# Size handed to build_input_data, which passes it on to HashingTF
vocab_size = 65536

# (label, words) pairs built from the positive and the negative review file
labeled_sentences = load_data_and_labels()
# pad_sentences is intended to bring every word list to the length of the longest one
padded_senteces = pad_sentences(labeled_sentences)

# (label, vector) pairs; the vectors come from HashingTF.transform
vectorized_sentences = build_input_data(padded_senteces, vocab_size)

## Architecting the network

In [1]:
import tensorflow as tf

# NOTE(review): sequence_length is not assigned at module level in the cells
# shown above (pad_sentences only computes it locally), so it has to be defined
# before this cell runs. vocab_size comes from the data preparation cell.

# Placeholder for input data: one row of sequence_length integer ids per sentence
input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
# Placeholder for output label: one column per class (2 classes)
input_y = tf.placeholder(tf.float32, [None, 2], name="input_y")

# Fed at run time so that dropout can be configured per session run
dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

# Embedding layer - learns how to represent words in lower dimension subspace

# Dimensions to embed words into
num_embed = 300

# Change to /gpu:0 for GPU computation
with tf.device('/cpu:0'), tf.name_scope("embedding"):
    # One trainable num_embed-dimensional row per id, initialised uniformly in [-1, 1)
    W = tf.Variable(
        tf.random_uniform([vocab_size, num_embed], -1.0, 1.0),
        name="W")
    embedded_chars = tf.nn.embedding_lookup(W, input_x)
    # Append a trailing axis of size 1 (the original line carried stray
    # backticks here, which made the cell a syntax error)
    embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)

## Bootstrapping SparklingWater

## Learning using DeepWater 

In [None]:
import json

# Parameters
# ==================================================

# Model Hyperparameters
# Each DEFINE_* call registers a flag as (name, default value, help text);
# the values are read back through FLAGS.<name> below.
tf.flags.DEFINE_integer("embedding_dim", 128, "Dimensionality of character embedding (default: 128)")
tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')")
tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)")
tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularization lambda (default: 0.0)")

FLAGS = tf.flags.FLAGS
# NOTE(review): _parse_flags is a private TensorFlow helper; presumably it forces
# the flags to be parsed right away - confirm it exists in the TensorFlow version in use.
FLAGS._parse_flags()

# Creating the network
# ==================================================
# Builds the text CNN (embedding -> parallel convolution + max-pool branches ->
# dropout -> linear output layer), attaches loss, accuracy and an Adam training
# op, and exports the graph definition together with a JSON "meta" description
# of the relevant tensor names to /tmp/cnn_text_tensorflow.meta.

with tf.Graph().as_default():
    sequence_length = 65536
    num_classes = 2
    vocab_size = 18758
    embedding_size = FLAGS.embedding_dim
    filter_sizes = list(map(int, FLAGS.filter_sizes.split(",")))
    num_filters = FLAGS.num_filters
    l2_reg_lambda = FLAGS.l2_reg_lambda
    # ========================================================================================

    # Placeholders for input and output
    input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
    input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
    # Dropout is baked into the graph as a constant instead of a placeholder.
    # The value now comes from the dropout_keep_prob flag (default 0.5, the
    # value that used to be hard-coded here), so the flag is no longer ignored.
    dropout_keep_prob = tf.constant(FLAGS.dropout_keep_prob)
    # Keeping track of l2 regularization loss (optional)
    l2_loss = tf.constant(0.0)

    # Embedding layer
    with tf.device('/cpu:0'), tf.name_scope("embedding"):
        W = tf.Variable(
            tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
            name="W")
        embedded_chars = tf.nn.embedding_lookup(W, input_x)
        # Append a trailing axis of size 1 so the result can be fed to conv2d
        embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)

    # Create a convolution + maxpool layer for each filter size
    pooled_outputs = []
    for filter_size in filter_sizes:
        with tf.name_scope("conv-maxpool-%s" % filter_size):
            # Convolution Layer: every filter spans filter_size positions and
            # the full embedding width
            filter_shape = [filter_size, embedding_size, 1, num_filters]
            W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
            b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
            conv = tf.nn.conv2d(
                embedded_chars_expanded,
                W,
                strides=[1, 1, 1, 1],
                padding="VALID",
                name="conv")
            # Apply nonlinearity
            h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
            # Maxpooling over the outputs: the window covers all
            # sequence_length - filter_size + 1 convolution positions
            pooled = tf.nn.max_pool(
                h,
                ksize=[1, sequence_length - filter_size + 1, 1, 1],
                strides=[1, 1, 1, 1],
                padding='VALID',
                name="pool")
            pooled_outputs.append(pooled)

    # Combine all the pooled features
    num_filters_total = num_filters * len(filter_sizes)
    # NOTE(review): (axis, values) is the argument order of pre-1.0 TensorFlow;
    # later releases expect tf.concat(values, axis) - confirm the target version.
    h_pool = tf.concat(3, pooled_outputs)
    h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])

    # Add dropout
    with tf.name_scope("dropout"):
        h_drop = tf.nn.dropout(h_pool_flat, dropout_keep_prob)

    # Final (unnormalized) scores and predictions
    with tf.name_scope("output"):
        W = tf.get_variable(
            "W",
            shape=[num_filters_total, num_classes],
            initializer=tf.contrib.layers.xavier_initializer())
        b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
        l2_loss += tf.nn.l2_loss(W)
        l2_loss += tf.nn.l2_loss(b)
        scores = tf.nn.xw_plus_b(h_drop, W, b, name="scores")
        predictions = tf.argmax(scores, 1, name="predictions")

    # Accuracy: fraction of rows whose arg-max score matches the arg-max label
    with tf.name_scope("accuracy"):
        correct_predictions = tf.equal(predictions, tf.argmax(input_y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")

    # Calculate mean cross-entropy loss
    with tf.name_scope("loss"):
        # Keyword arguments keep logits and labels unambiguous; newer
        # TensorFlow releases reject the positional form used before
        losses = tf.nn.softmax_cross_entropy_with_logits(logits=scores, labels=input_y)
        loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

    # ========================================================================================

    # Define Training procedure
    optimizer = tf.train.AdamOptimizer(1e-3)
    global_step = tf.Variable(0, name="global_step", trainable=False)
    grads_and_vars = optimizer.compute_gradients(loss)
    train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
    tf.add_to_collection("train", train_op)

    init = tf.initialize_all_variables()
    tf.add_to_collection("init", init)
    tf.add_to_collection("logits", scores)
    saver = tf.train.Saver()
    # Names of the tensors a consumer of the exported graph needs, stored as a
    # JSON string in the "meta" collection (presumably read by DeepWater - confirm)
    meta = json.dumps({
        "inputs": {"batch_image_input": input_x.name, "categorical_labels": input_y.name},
        "outputs": {"categorical_logits": scores.name},
        "metrics": {"accuracy": accuracy.name, "total_loss": loss.name},
        "parameters": {"global_step": global_step.name},
    })
    tf.add_to_collection("meta", meta)
    filename = "/tmp/cnn_text_tensorflow.meta"
    tf.train.export_meta_graph(filename, saver_def=saver.as_saver_def())