In [34]:
from keras.layers.embeddings import Embedding
from keras.layers import Input, Dense
from keras.models import Model
import numpy as np

# to load data
import re, io

# --- Text-CNN architecture ---
num_filters = 128          # feature maps learned for each filter size
filter_sizes = [3, 4, 5]   # convolution window heights, in tokens
embedding_dim = 128        # width of every word vector

# --- Training settings ---
# NOTE(review): neither value below is referenced by the cells visible in this
# notebook (no Dropout layer is built and model.fit hard-codes its epochs).
num_epochs = 200
dropout_keep_prob = 0.5

def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    The steps run in a fixed order:
      1. every character that is not a letter, a digit or one of ``( ) , ! ? ' ```
         is replaced by a space;
      2. the contraction suffixes 's, 've, n't, 're, 'd and 'll are split off
         with a leading space;
      3. "," and "!" are surrounded by spaces;
      4. "(", ")" and "?" are surrounded by spaces and prefixed with a literal
         backslash (behaviour inherited from the original script and kept so
         the produced tokens do not change);
      5. runs of whitespace collapse to one space, the result is stripped and
         lower-cased.

    Args:
        string (str): raw sentence.

    Returns:
        str: the cleaned, lower-cased sentence with tokens separated by single spaces.
    """
    # Literal (search, replacement) pairs, applied in this exact order.
    # The backslashes are written as "\\" on purpose: the original relied on
    # invalid escape sequences such as "\(" inside non-raw literals, which
    # newer Python versions warn about.
    literal_rewrites = (
        ("'s", " 's"),      # that's  -> that 's
        ("'ve", " 've"),    # you've  -> you 've
        ("n't", " n't"),    # can't   -> ca n't
        ("'re", " 're"),    # you're  -> you 're
        ("'d", " 'd"),      # you'd   -> you 'd
        ("'ll", " 'll"),    # you'll  -> you 'll
        (",", " , "),       # you, me -> you , me
        ("!", " ! "),       # not!    -> not !
        ("(", " \\( "),     # (and    -> \( and
        (")", " \\) "),     # and)    -> and \)
        ("?", " \\? "),     # and?    -> and \?
    )

    # Anything outside the allowed character set becomes a blank.
    string = re.sub(r"[^A-Za-z0-9(),!?'`]", " ", string)
    for needle, replacement in literal_rewrites:
        string = string.replace(needle, replacement)
    # Collapse the extra blanks introduced by the rewrites above.
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


def load_data_and_labels(positive_data_file, negative_data_file):
    """
    Read the two MR polarity files and build cleaned sentences plus labels.

    Both files are opened as latin-1 text with one example per line. Every
    line is stripped and passed through ``clean_str``.

    Returns a two-element list ``[x_text, y]``:
      * ``x_text`` - list of cleaned sentences, positives first, then negatives;
      * ``y`` - the label rows concatenated along axis 0, ``[0, 1]`` for each
        positive example and ``[1, 0]`` for each negative one, in the same order.
    """
    def _read_examples(path):
        # One whitespace-stripped example per line of the file.
        with io.open(path, encoding='latin-1') as handle:
            return [str(line).strip() for line in handle.readlines()]

    positive_examples = _read_examples(positive_data_file)
    negative_examples = _read_examples(negative_data_file)

    # Clean every sentence, keeping the positives-then-negatives order.
    x_text = list(map(clean_str, positive_examples + negative_examples))

    # One label row per example, in the same order as x_text.
    positive_labels = [[0, 1]] * len(positive_examples)
    negative_labels = [[1, 0]] * len(negative_examples)
    y = np.concatenate([positive_labels, negative_labels], 0)

    return [x_text, y]


# Read and clean both halves of the polarity dataset.
print("Loading data...")
x_text, y = load_data_and_labels(
    positive_data_file="../datasets/rt-polarity.pos",
    negative_data_file="../datasets/rt-polarity.neg",
)

Loading data...


In [35]:
# Taken from http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/

from tensorflow.contrib import learn


# Build vocabulary
# The token count of the longest cleaned sentence is the maximum document
# length handed to the VocabularyProcessor.
max_document_length = max(len(sentence.split(" ")) for sentence in x_text)
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
# One row of word ids per sentence (the printed sample shows shorter rows
# filled with trailing zeros).
x = np.array(list(vocab_processor.fit_transform(x_text)))

print(x[:5])

# Randomly shuffle data (fixed seed so the split is the same on every run)
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
dev_sample_index = -1 * int(len(y) * 0.1)    # Uses 10% as test (dev)
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

# Free only the intermediates created in this cell. `y` is produced by the
# data-loading cell and is deliberately kept: deleting it here made this cell
# fail with NameError (at len(y)) when re-run on its own.
del x, x_shuffled, y_shuffled

print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))


[[ 1  2  3  4  5  6  1  7  8  9 10 11 12 13 14  9 15  5 16 17 18 19 20 21
  22 23 24 25 26 27 28 29 30  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0]
 [ 1 31 32 33 34  1 35 34  1 36 37  3 38 39 13 17 40 34 41 42 43 44 45 46
  47 48 49  9 50 51 34 52 53 53 54  9 55 56  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0]
 [57 58 59 60 61  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0]
 [62 63 64 65  5 66  5  1 67  5 68 69 70  3 17 71 72  5 73  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0]
 [74 75 76 77 78 79 80 13  9 38 81 12 82 83 13 84 85 86 87 65 88  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0]]
Vocabulary Size: 18758
Train/Dev split: 9596/1066


In [3]:
from keras.layers.embeddings import Embedding
from keras.layers.core import Reshape, Dense
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.layers.merge import Concatenate
from keras.models import Model

# Model input: one row of max_document_length word ids per sentence.
input_sentence = Input(shape=(max_document_length,), name="input_sentence")

# Word-id -> vector lookup table with one entry per vocabulary item.
embedding = Embedding(
    input_dim=len(vocab_processor.vocabulary_),
    output_dim=embedding_dim,
    input_length=max_document_length,
    name="embedding",
)

# (batch_size, max_doc_length, embedding_dim)
sentence_vector = embedding(input_sentence)
# Conv2D needs a 4-D tensor, so add a single leading channel:
# (batch_size, 1, max_doc_length, embedding_dim)
sentence_vector = Reshape((1, max_document_length, embedding_dim))(sentence_vector)

# One convolution + max-pooling branch per filter length. Every branch reads
# the same sentence_vector and its kernel spans the full embedding width, so
# the convolution only slides along the sentence.
pool_outputs = []

for filter_len in filter_sizes:
    conv = Conv2D(
        filters=num_filters,
        kernel_size=(filter_len, embedding_dim),
        strides=(1, 1),
        padding='valid',
        data_format='channels_first',
        activation='relu',
    )
    # expected: (batch_size, num_filters, max_doc_length - filter_len + 1, 1)
    conv_output = conv(sentence_vector)

    # The pooling window covers every remaining row, leaving one value per filter.
    pooling = MaxPooling2D(
        pool_size=(max_document_length - filter_len + 1, 1),
        data_format='channels_first',
    )
    # expected: (batch_size, num_filters, 1, 1)
    pool_output = pooling(conv_output)
    pool_outputs.append(pool_output)

# Stack the branches along the filter axis:
# (batch_size, num_filters * len(filter_sizes), 1, 1)
concatenated = Concatenate(axis=1)(pool_outputs)

# Drop the two singleton axes: (batch_size, num_filters * len(filter_sizes))
feature_vector = Reshape((num_filters * len(filter_sizes),))(concatenated)

# Two-way softmax (negative / positive): (batch_size, 2)
final_output = Dense(2, activation='softmax')(feature_vector)

model = Model(inputs=input_sentence, outputs=final_output)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Training is left disabled in this cell; to train, call
# model.fit(x_train, y_train, batch_size=64, epochs=10, verbose=2)

NameError: name 'vocab_processor' is not defined

In [4]:

from keras.layers.embeddings import Embedding
from keras.layers.core import Reshape, Dense
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.layers.merge import Concatenate
from keras.models import Model
from keras.layers import Input
import numpy as np

# to load data
import re, io

# --- Text-CNN architecture ---
num_filters = 128          # feature maps learned for each filter size
filter_sizes = [3, 4, 5]   # convolution window heights, in tokens
embedding_dim = 128        # width of every word vector

# --- Training settings ---
# NOTE(review): neither value below is referenced by the code visible in this
# cell (no Dropout layer is built and model.fit hard-codes epochs=10).
num_epochs = 200
dropout_keep_prob = 0.5

def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    The steps run in a fixed order:
      1. every character that is not a letter, a digit or one of ``( ) , ! ? ' ```
         is replaced by a space;
      2. the contraction suffixes 's, 've, n't, 're, 'd and 'll are split off
         with a leading space;
      3. "," and "!" are surrounded by spaces;
      4. "(", ")" and "?" are surrounded by spaces and prefixed with a literal
         backslash (behaviour inherited from the original script and kept so
         the produced tokens do not change);
      5. runs of whitespace collapse to one space, the result is stripped and
         lower-cased.

    Args:
        string (str): raw sentence.

    Returns:
        str: the cleaned, lower-cased sentence with tokens separated by single spaces.
    """
    # Literal (search, replacement) pairs, applied in this exact order.
    # The backslashes are written as "\\" on purpose: the original relied on
    # invalid escape sequences such as "\(" inside non-raw literals, which
    # newer Python versions warn about.
    literal_rewrites = (
        ("'s", " 's"),      # that's  -> that 's
        ("'ve", " 've"),    # you've  -> you 've
        ("n't", " n't"),    # can't   -> ca n't
        ("'re", " 're"),    # you're  -> you 're
        ("'d", " 'd"),      # you'd   -> you 'd
        ("'ll", " 'll"),    # you'll  -> you 'll
        (",", " , "),       # you, me -> you , me
        ("!", " ! "),       # not!    -> not !
        ("(", " \\( "),     # (and    -> \( and
        (")", " \\) "),     # and)    -> and \)
        ("?", " \\? "),     # and?    -> and \?
    )

    # Anything outside the allowed character set becomes a blank.
    string = re.sub(r"[^A-Za-z0-9(),!?'`]", " ", string)
    for needle, replacement in literal_rewrites:
        string = string.replace(needle, replacement)
    # Collapse the extra blanks introduced by the rewrites above.
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


def load_data_and_labels(positive_data_file, negative_data_file):
    """
    Read the two MR polarity files and build cleaned sentences plus labels.

    Both files are opened as latin-1 text with one example per line. Every
    line is stripped and passed through ``clean_str``.

    Returns a two-element list ``[x_text, y]``:
      * ``x_text`` - list of cleaned sentences, positives first, then negatives;
      * ``y`` - the label rows concatenated along axis 0, ``[0, 1]`` for each
        positive example and ``[1, 0]`` for each negative one, in the same order.
    """
    def _read_examples(path):
        # One whitespace-stripped example per line of the file.
        with io.open(path, encoding='latin-1') as handle:
            return [str(line).strip() for line in handle.readlines()]

    positive_examples = _read_examples(positive_data_file)
    negative_examples = _read_examples(negative_data_file)

    # Clean every sentence, keeping the positives-then-negatives order.
    x_text = list(map(clean_str, positive_examples + negative_examples))

    # One label row per example, in the same order as x_text.
    positive_labels = [[0, 1]] * len(positive_examples)
    negative_labels = [[1, 0]] * len(negative_examples)
    y = np.concatenate([positive_labels, negative_labels], 0)

    return [x_text, y]


# Read and clean both halves of the polarity dataset.
print("Loading data...")
x_text, y = load_data_and_labels(
    positive_data_file="../datasets/rt-polarity.pos",
    negative_data_file="../datasets/rt-polarity.neg",
)




# Taken from http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/
from tensorflow.contrib import learn


# Vocabulary: the token count of the longest cleaned sentence is the maximum
# document length handed to the VocabularyProcessor.
max_document_length = max(len(sentence.split(" ")) for sentence in x_text)
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# Peek at the first five encoded sentences.
print(x[:5])

# Shuffle sentences and labels with the same seeded permutation.
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled, y_shuffled = x[shuffle_indices], y[shuffle_indices]

# Hold out the last 10% of the shuffled rows as the dev (test) set.
dev_sample_index = -int(len(y) * 0.1)
x_train = x_shuffled[:dev_sample_index]
x_dev = x_shuffled[dev_sample_index:]
y_train = y_shuffled[:dev_sample_index]
y_dev = y_shuffled[dev_sample_index:]

# The unsplit arrays are not needed past this point.
del x, y, x_shuffled, y_shuffled

print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))









# Model input: one row of max_document_length word ids per sentence.
input_sentence = Input(shape=(max_document_length,), name="input_sentence")

# Word-id -> vector lookup table with one entry per vocabulary item.
embedding = Embedding(
    input_dim=len(vocab_processor.vocabulary_),
    output_dim=embedding_dim,
    input_length=max_document_length,
    name="embedding",
)

# (batch_size, max_doc_length, embedding_dim)
sentence_vector = embedding(input_sentence)
# Conv2D needs a 4-D tensor, so add a single leading channel:
# (batch_size, 1, max_doc_length, embedding_dim)
sentence_vector = Reshape((1, max_document_length, embedding_dim))(sentence_vector)

# One convolution + max-pooling branch per filter length. Every branch reads
# the same sentence_vector and its kernel spans the full embedding width, so
# the convolution only slides along the sentence.
pool_outputs = []

for filter_len in filter_sizes:
    conv = Conv2D(
        filters=num_filters,
        kernel_size=(filter_len, embedding_dim),
        strides=(1, 1),
        padding='valid',
        data_format='channels_first',
        activation='relu',
    )
    # expected: (batch_size, num_filters, max_doc_length - filter_len + 1, 1)
    conv_output = conv(sentence_vector)

    # The pooling window covers every remaining row, leaving one value per filter.
    pooling = MaxPooling2D(
        pool_size=(max_document_length - filter_len + 1, 1),
        data_format='channels_first',
    )
    # expected: (batch_size, num_filters, 1, 1)
    pool_output = pooling(conv_output)
    pool_outputs.append(pool_output)

# Stack the branches along the filter axis:
# (batch_size, num_filters * len(filter_sizes), 1, 1)
concatenated = Concatenate(axis=1)(pool_outputs)

# Drop the two singleton axes: (batch_size, num_filters * len(filter_sizes))
feature_vector = Reshape((num_filters * len(filter_sizes),))(concatenated)

# Two-way softmax (negative / positive): (batch_size, 2)
final_output = Dense(2, activation='softmax')(feature_vector)

model = Model(inputs=input_sentence, outputs=final_output)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train on the 90% split; verbose=2 prints one line per epoch.
model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=10,
    verbose=2,
)


Loading data...
Instructions for updating:
Use the retry module or similar alternatives.
Instructions for updating:
Please use tensorflow/transform or tf.data.
Instructions for updating:
Please use tensorflow/transform or tf.data.
Instructions for updating:
Please use tensorflow/transform or tf.data.
[[ 1  2  3  4  5  6  1  7  8  9 10 11 12 13 14  9 15  5 16 17 18 19 20 21
  22 23 24 25 26 27 28 29 30  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0]
 [ 1 31 32 33 34  1 35 34  1 36 37  3 38 39 13 17 40 34 41 42 43 44 45 46
  47 48 49  9 50 51 34 52 53 53 54  9 55 56  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0]
 [57 58 59 60 61  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0]
 [62 63 64 65  5 66  5  1 67  5 68 69 70  3 17 71 72  5 73  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0

<keras.callbacks.History at 0x7f3af2f3b4a8>

In [5]:
# Score the held-out split. The model was compiled with metrics=['accuracy'],
# so the first two entries of `score` are the loss and the accuracy.
score = model.evaluate(x_dev, y_dev, verbose=1)
test_loss, test_accuracy = score[0], score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

Test loss: 1.1847093440801968
Test accuracy: 0.7448405253842445
