# Text classification using a TensorFlow CNN in DeepWater on SparklingWater 

This notebook is based on:

* [Convolutional Neural Networks for Sentence Classification paper](https://arxiv.org/abs/1408.5882)
* http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/

It includes a few tweaks, which I will point out along the way.

This notebook builds a Convolutional Neural Network in TensorFlow for sentence-level sentiment analysis, using Rotten Tomatoes
movie review sentences for training (negative reviews are labeled with a score of 0, positive reviews with a score of 1, and neutral reviews with a score of 0.5; note that the data-loading code below only produces the labels 0 and 1).

## Data preparation

In [7]:
from urllib.request import urlopen
import re

def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps, applied in order:
      1. every character other than letters, digits, parentheses, comma,
         exclamation mark, question mark, apostrophe and backtick becomes a space;
      2. the contractions 's, 've, n't, 're, 'd and 'll are split off with a
         leading space;
      3. comma and exclamation mark are surrounded by spaces; parentheses and
         question marks are surrounded by spaces AND prefixed with a backslash
         (this mirrors the reference implementation's output);
      4. runs of whitespace are collapsed, the result is stripped and lower-cased.

    :param string: raw sentence text
    :return: the cleaned, lower-cased string with tokens separated by single spaces
    """
    # Anything outside the kept alphabet is replaced by a space.
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)

    # Literal (search, replacement) pairs; order matches the reference code.
    # The backslashes in the last three replacements are written explicitly
    # ("\\") instead of relying on invalid escape sequences such as "\(",
    # which newer Python versions warn about.
    replacements = (
        ("'s", " 's"),
        ("'ve", " 've"),
        ("n't", " n't"),
        ("'re", " 're"),
        ("'d", " 'd"),
        ("'ll", " 'll"),
        (",", " , "),
        ("!", " ! "),
        ("(", " \\( "),
        (")", " \\) "),
        ("?", " \\? "),
    )
    for needle, replacement in replacements:
        string = string.replace(needle, replacement)

    # Collapse the extra spaces introduced above.
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

def load_data_and_labels():
    """
    Loads MR polarity data from files, splits the data into words and generates labels.

    Downloads the positive and negative review files, labels every positive
    line with 1 and every negative line with 0, cleans each line with
    clean_str and splits it on single spaces.

    Relies on a global `sc` (presumably the notebook's SparkContext - confirm)
    to distribute the downloaded lines.

    :return: the union of both labeled sets, as (label, list_of_words) pairs
    """
    pos_url = 'https://raw.githubusercontent.com/yoonkim/CNN_sentence/master/rt-polarity.pos'
    neg_url = 'https://raw.githubusercontent.com/yoonkim/CNN_sentence/master/rt-polarity.neg'

    # Pull sentences with positive sentiment; the `with` block closes the
    # HTTP response once the lines have been read.
    with urlopen(pos_url) as pos_file:
        pos_lines = pos_file.readlines()
    # Pull sentences with negative sentiment
    with urlopen(neg_url) as neg_file:
        neg_lines = neg_file.readlines()

    # Lines arrive as bytes: decode them as latin-1, strip surrounding
    # whitespace and attach the label.
    positive_examples = sc.parallelize(pos_lines).map(lambda s: (1, s.decode('latin-1').strip()))
    negative_examples = sc.parallelize(neg_lines).map(lambda s: (0, s.decode('latin-1').strip()))

    # Split by words
    labeled = positive_examples.union(negative_examples).map(lambda ls: (ls[0], clean_str(ls[1]).split(" ")))
    return labeled
    
def pad_sentences(sentences, padding_word="</s>"):
    """
    Pads all sentences to the same length. The length is defined by the longest sentence.

    :param sentences: collection of (label, list_of_words) pairs exposing
        `values()` and `map()` (presumably a Spark pair RDD - confirm)
    :param padding_word: token appended to sentences shorter than the longest one
    :return: a new collection of (label, padded_list_of_words) pairs
    """
    sequence_length = sentences.values().map(lambda a: len(a)).max()
    # `map` does not modify `sentences` in place, so its result has to be
    # returned; previously it was discarded and the unpadded input came back.
    return sentences.map(lambda ls: (ls[0], ls[1] + [padding_word] * (sequence_length - len(ls[1]))))
    
# TODO rewrite to Spark
def build_input_data(sentences, vocab_size):
    """
    Maps sentences to vector representations.

    Every (label, words) pair in `sentences` becomes (label, vector), where
    the vector is whatever `HashingTF(vocab_size).transform(words)` returns.
    `HashingTF` is not imported in the visible cells (presumably Spark MLlib's
    feature hasher - confirm).

    :param sentences: collection of (label, list_of_words) pairs exposing `map()`
    :param vocab_size: value passed to the HashingTF constructor
    :return: the mapped collection of (label, vector) pairs
    """
    hasher = HashingTF(vocab_size)

    def vectorize(pair):
        # Keep the label untouched, hash only the word list.
        label = pair[0]
        words = pair[1]
        return (label, hasher.transform(words))

    return sentences.map(vectorize)

# Size handed to build_input_data for hashing (65536 == 2**16); the network
# cell also uses it as the first dimension of the embedding matrix.
vocab_size = 65536

# Download and tokenize the reviews, then run them through pad_sentences.
labeled_sentences = load_data_and_labels()
padded_senteces = pad_sentences(labeled_sentences)

# Convert every (label, words) pair into a (label, vector) pair.
vectorized_sentences = build_input_data(padded_senteces, vocab_size)

## Architecting the network

In [1]:
import tensorflow as tf

# Placeholder for input data
# NOTE(review): `sequence_length` is not assigned at module level in the cells
# shown here (pad_sentences only computes it as a local variable), so it has
# to be defined before this cell is run.
input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
# Placeholder for output label
input_y = tf.placeholder(tf.float32, [None, 2], name="input_y")

# Fed at run time; presumably the keep probability for a dropout layer
# defined in a later cell - confirm.
dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

# Embedding layer - learns how to represent words in lower dimension subspace

# Dimensions to embed words into
num_embed = 300

# Change to /gpu:0 for GPU computation
with tf.device('/cpu:0'), tf.name_scope("embedding"):
    # Embedding matrix with one row per vocabulary slot, initialised
    # uniformly in [-1, 1).
    W = tf.Variable(
        tf.random_uniform([vocab_size, num_embed], -1.0, 1.0),
        name="W")
    # Look up the embedding row for every id in input_x.
    embedded_chars = tf.nn.embedding_lookup(W, input_x)
    # Append a trailing axis of size 1 (presumably the channel axis expected
    # by a following 2-D convolution - confirm). The stray backticks that
    # preceded this call were a syntax error and have been removed.
    embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)

## Bootstrapping SparklingWater

## Learning using DeepWater 