# Text classification using a TensorFlow CNN in DeepWater on SparklingWater 

This notebook is based on:

* [Convolutional Neural Networks for Sentence Classification paper](https://arxiv.org/abs/1408.5882)
* http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/

It includes a few tweaks, which I will point out along the way.

This notebook builds a Convolutional Neural Network in TensorFlow for sentence-level sentiment analysis, using Rotten Tomatoes
movie review sentences for training. Negative reviews are labeled with a score of 0, positive reviews with a score of 1, and neutral reviews with a score of 0.5.

## Data preparation

In [7]:
import itertools
import re
from collections import Counter
from urllib.request import urlopen

import numpy as np

def clean_str(string):
    """
    Normalizes a raw sentence into lower-cased, space-separated tokens.
    Intended for all datasets except SST.
    Adapted from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Characters outside letters, digits and ( ) , ! ? ' ` become spaces,
    English contractions and punctuation are split off as separate tokens,
    and runs of whitespace are collapsed to a single space.
    """
    # Ordered (pattern, replacement) rules: each rule sees the output of the
    # previous one, so the order must not change.
    substitutions = (
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"\'s", " 's"),
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
        (r",", " , "),
        (r"!", " ! "),
        # The replacements below keep a literal backslash in front of the
        # character, exactly as the upstream implementation emits it.
        (r"\(", " \\( "),
        (r"\)", " \\) "),
        (r"\?", " \\? "),
        (r"\s{2,}", " "),
    )
    cleaned = string
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    trimmed = cleaned.strip()
    return trimmed.lower()

def load_data_and_labels():
    """
    Loads MR polarity data from the CNN_sentence GitHub repository, splits the
    sentences into words and attaches labels.

    Returns an RDD of (label, words) pairs, where label is 1 for lines of
    rt-polarity.pos and 0 for lines of rt-polarity.neg, and words is the
    cleaned sentence split on single spaces.
    Relies on the SparkContext being available as the global `sc`.
    """
    def read_lines(url):
        # Close the HTTP response as soon as the raw lines are in memory
        # instead of leaving the connection open.
        with urlopen(url) as response:
            return response.readlines()

    # Pull sentences with positive sentiment
    pos_lines = read_lines('https://raw.githubusercontent.com/yoonkim/CNN_sentence/master/rt-polarity.pos')
    # Pull sentences with negative sentiment
    neg_lines = read_lines('https://raw.githubusercontent.com/yoonkim/CNN_sentence/master/rt-polarity.neg')

    # Distribute the raw byte lines and add labels; each line is decoded as latin-1
    positive_examples = sc.parallelize(pos_lines).map(lambda s: (1, s.decode('latin-1').strip()))
    negative_examples = sc.parallelize(neg_lines).map(lambda s: (0, s.decode('latin-1').strip()))

    # Split by words
    labeled = positive_examples.union(negative_examples).map(lambda ls: (ls[0], clean_str(ls[1]).split(" ")))
    return labeled
    
def pad_sentences(sentences, padding_word="</s>"):
    """
    Pads all sentences to the same length. The length is defined by the longest sentence.

    sentences is an RDD of (label, tokens) pairs and padding_word is the token
    appended to the end of every sentence shorter than the longest one.
    Returns a new RDD of (label, padded tokens) pairs; the input RDD is not modified.
    """
    # One pass over the data to find the longest token list.
    sequence_length = sentences.values().map(len).max()

    def pad(record):
        label, tokens = record
        return (label, tokens + [padding_word] * (sequence_length - len(tokens)))

    # map() builds a new RDD rather than changing `sentences`, so the mapped
    # RDD itself has to be returned; returning `sentences` would hand back
    # the unpadded data.
    return sentences.map(pad)
    
# TODO rewrite to Spark
def build_vocab(sentences):
    """
    Derives a word <-> index vocabulary from an iterable of token lists.

    Words are ranked by how often they occur across all sentences, most
    frequent first; a word's index is its position in that ranking.
    Returns [vocabulary, vocabulary_inv], where vocabulary maps word -> index
    and vocabulary_inv is the list mapping index -> word.
    """
    # Count every token of every sentence in a single flat stream
    all_words = itertools.chain.from_iterable(sentences)
    ranked = Counter(all_words).most_common()
    # Index -> word
    vocabulary_inv = [word for word, _count in ranked]
    # Word -> index
    vocabulary = dict(zip(vocabulary_inv, range(len(vocabulary_inv))))
    return [vocabulary, vocabulary_inv]
    
# TODO rewrite to Spark
def build_input_data(sentences, labels, vocabulary):
    """
    Converts sentences and labels into numpy arrays using a vocabulary.

    Every word of every sentence is replaced by vocabulary[word]; a word that
    is missing from the vocabulary raises KeyError.
    Returns [x, y], where x is the array of word indices (one row per
    sentence) and y is the array built from labels.
    """
    index_rows = []
    for sentence in sentences:
        index_rows.append([vocabulary[word] for word in sentence])
    features = np.array(index_rows)
    targets = np.array(labels)
    return [features, targets]

# Load the labeled sentences into Spark and pad them to a common length.
labeled_sentences = load_data_and_labels()
padded_labeled_sentences = pad_sentences(labeled_sentences)

# TODO rewrite to Spark
# The vocabulary and the index matrix are still built locally, so collect the
# (label, tokens) records and split them into two parallel lists.
labeled_rows = padded_labeled_sentences.collect()
labels = [label for label, _ in labeled_rows]
sentences_padded = [tokens for _, tokens in labeled_rows]

# These calls need the lists above, so they must come after the load/pad step.
vocabulary, vocabulary_inv = build_vocab(sentences_padded)
x, y = build_input_data(sentences_padded, labels, vocabulary)

vocab_size = len(vocabulary)

# randomly shuffle data (fixed seed keeps the train/dev split reproducible)
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# split train/dev set
# there are a total of 10662 labeled examples to train on;
# the last 1000 shuffled examples are held out as the dev set
x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]

# Number of word positions per (padded) sentence
sentence_size = x_train.shape[1]

print('Train/Dev split: %d/%d' % (len(y_train), len(y_dev)))
print('train shape:', x_train.shape)
print('dev shape:', x_dev.shape)
print('vocab_size', vocab_size)
print('sentence max words', sentence_size)

## Architecting the network

## Bootstrapping SparklingWater

## Learning using DeepWater 