In [1]:
import keras
import numpy as np
import re
import datetime

Using TensorFlow backend.


In [3]:
from google.colab import files

uploaded = files.upload()

Saving rt-polarity.neg to rt-polarity.neg
Saving rt-polarity.pos to rt-polarity.pos


In [0]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps, in order:
      1. every character other than letters, digits and ( ) , ! ? ' ` becomes a space;
      2. the clitics 's 've n't 're 'd 'll are split off the preceding word;
      3. , ! ( ) ? are surrounded by spaces so they become separate tokens;
      4. runs of whitespace are collapsed, the result is stripped and lower-cased.

    Unlike the original script, the punctuation step emits the bare character:
    the original replacement strings " \\( ", " \\) " and " \\? " were not regex
    patterns, so they inserted a literal backslash before each of those marks.

    :param string: raw sentence.
    :return: cleaned, lower-cased sentence with space-separated tokens.
    """
    # 1. Blank out everything outside the allowed character set.
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)

    # 2. Detach clitics. The order matches the original sequence of substitutions.
    for clitic in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        string = string.replace(clitic, " " + clitic)

    # 3. Isolate punctuation marks as stand-alone tokens (no backslashes added).
    for mark in (",", "!", "(", ")", "?"):
        string = string.replace(mark, " " + mark + " ")

    # 4. Normalise whitespace and case.
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


def load_data_and_labels(positive_data_file, negative_data_file):
    """
    Loads MR polarity data from files, cleans each sentence and generates labels.

    Each file is read as windows-1252 text with one example per line. Lines are
    stripped and passed through ``clean_str``; they are NOT split into word lists.

    :param positive_data_file: path of the file holding positive examples.
    :param negative_data_file: path of the file holding negative examples.
    :return: ``[x_text, y]`` where ``x_text`` is the list of cleaned sentences
        (positives first, then negatives) and ``y`` is an integer array of shape
        ``(len(x_text), 2)`` whose rows are ``[0, 1]`` for positive and
        ``[1, 0]`` for negative examples.
    """
    def _read_examples(path):
        # The context manager closes the handle even if decoding fails part-way.
        with open(path, "r", encoding='windows-1252') as handle:
            return [line.strip() for line in handle]

    # Load data from files
    positive_examples = _read_examples(positive_data_file)
    negative_examples = _read_examples(negative_data_file)

    # Clean every sentence (positives first so rows line up with the labels)
    x_text = [clean_str(sent) for sent in positive_examples + negative_examples]

    # Generate labels. np.tile always yields an (n, 2) array, even for n == 0,
    # so concatenation still works when one of the two files is empty.
    positive_labels = np.tile([0, 1], (len(positive_examples), 1))
    negative_labels = np.tile([1, 0], (len(negative_examples), 1))
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]


def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.

    :param data: sequence of samples; it is converted with ``np.array`` so
        batches are yielded as array slices.
    :param batch_size: maximum number of samples per batch (the last batch of
        an epoch may be shorter).
    :param num_epochs: number of full passes over ``data``.
    :param shuffle: when true, a fresh random permutation is drawn from the
        global NumPy RNG at the start of every epoch.
    :return: generator yielding one array per batch; nothing is yielded for
        empty ``data``.
    """
    data = np.array(data)
    data_size = len(data)
    # Integer ceiling division. The previous float formula evaluated to 1 for
    # empty data, which produced one empty batch per epoch.
    num_batches_per_epoch = (data_size + batch_size - 1) // batch_size
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            # Clamp so the final batch stops at the end of the data.
            end_index = min(start_index + batch_size, data_size)
            yield shuffled_data[start_index:end_index]

In [1]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

def preprocess():
  """
  Load the rt-polarity corpus, turn it into padded index sequences and split it.

  Reads "rt-polarity.pos" / "rt-polarity.neg" from the working directory via
  load_data_and_labels, fits a Keras Tokenizer on the cleaned sentences, pads
  every sequence to the length of the longest one, shuffles with a fixed seed
  and holds out the last 10% of the shuffled samples as the dev set.

  Side effects: prints progress information and reseeds the global NumPy RNG
  with 10.

  Returns:
    (x_train, y_train, t, x_dev, y_dev) where t is the fitted Tokenizer.
  """
  print("Loading data...")
  x_text, y = load_data_and_labels("rt-polarity.pos", "rt-polarity.neg")
  t = Tokenizer()
  t.fit_on_texts(x_text)
  x = t.texts_to_sequences(x_text)
  max_doc_length = max(len(seq) for seq in x)
  print(max_doc_length)

  x = pad_sequences(x, maxlen=max_doc_length)
  print(x.shape)

  # Randomly shuffle data (fixed seed so the split is reproducible)
  np.random.seed(10)
  shuffle_indices = np.random.permutation(np.arange(len(y)))
  x_shuffled = x[shuffle_indices]
  y_shuffled = y[shuffle_indices]

  # Split train/test set
  # TODO: This is very crude, should use cross-validation
  dev_sample_percentage = .1 #Percentage of the training data to use for validation
  num_dev_samples = int(dev_sample_percentage * float(len(y)))
  # Use a positive split point: a negative index of "-0" would be 0 and would
  # send every sample to the dev set when num_dev_samples rounds down to zero.
  split_index = len(y) - num_dev_samples
  x_train, x_dev = x_shuffled[:split_index], x_shuffled[split_index:]
  y_train, y_dev = y_shuffled[:split_index], y_shuffled[split_index:]

  print("Vocabulary Size: {:d}".format(len(t.word_counts)))
  print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
  return x_train, y_train, t, x_dev, y_dev

ModuleNotFoundError: No module named 'keras'

In [7]:
x_train, y_train, tokenizer, x_dev, y_dev = preprocess()

Loading data...
53
(10662, 53)
Vocabulary Size: 18757
Train/Dev split: 9596/1066


In [0]:
from keras.layers import Input, Embedding, Reshape, Concatenate, Dropout, Dense, Conv2D, MaxPooling2D
from keras.models import Model

def cnn_model(sequence_length, vocab_size, embedding_dim, num_filters, filter_sizes, dropout_rate):
  """
  Build and compile a multi-width convolutional sentence classifier.

  Token ids are embedded, viewed as a one-channel image of shape
  (sequence_length, embedding_dim, 1) and fed to one Conv2D branch per entry
  of filter_sizes. Every branch is max-pooled over its whole output length,
  the branches are concatenated, passed through dropout and a 2-way softmax.

  Args:
    sequence_length: number of token ids per input sample.
    vocab_size: number of rows in the embedding table.
    embedding_dim: width of each embedding vector.
    num_filters: number of convolution filters per branch.
    filter_sizes: iterable of filter heights (in tokens), one branch each.
    dropout_rate: rate passed to the Dropout layer.

  Returns:
    The compiled Keras Model (categorical cross-entropy, Adam, 'acc' metric).
    Intermediate tensors and the model summary are printed as a side effect.
  """
  # Input length required if going to connect flatten and dense layers
  embedder = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    input_length=sequence_length,
  )
  token_ids = Input(shape=(sequence_length,), dtype='int32')
  embedded_tokens = embedder(token_ids)

  # Conv2d expects input in format (samples, rows, cols, channels), so add a
  # trailing channel axis of size 1.
  embedded_image = Reshape((sequence_length, embedding_dim, 1))(embedded_tokens)
  print(embedded_image)

  branch_outputs = []
  for height in filter_sizes:
    # The kernel spans the full embedding width, so it only slides along the
    # token axis and produces sequence_length - height + 1 positions.
    feature_map = Conv2D(
      filters=num_filters,
      kernel_size=(height, embedding_dim),
      strides=1,
      padding='valid',
      activation="relu",
    )(embedded_image)
    # Pool over every position so each filter contributes a single value.
    pooled_map = MaxPooling2D(
      pool_size=(sequence_length - height + 1, 1),
      strides=1,
      padding="valid",
    )(feature_map)
    branch_outputs.append(pooled_map)

  merged = Concatenate(axis=3)(branch_outputs)
  print(merged)

  total_filters = num_filters * len(filter_sizes)
  flattened = Reshape((total_filters,))(merged)
  print(flattened)

  regularised = Dropout(rate=dropout_rate)(flattened)
  print(regularised)

  class_probs = Dense(2, activation='softmax')(regularised)
  print(class_probs)

  model = Model(token_ids, class_probs)
  # Labels are one-hot vectors ([1, 0] / [0, 1]), hence categorical_crossentropy
  # rather than a binary loss on 0/1 labels.
  model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['acc'],
  )
  model.summary()
  return model

In [0]:
# Values derived from the preprocessed data.
sequence_length = x_train.shape[1]  # padded length of every input sequence
vocab_size = len(tokenizer.word_counts)  # number of distinct words seen by the tokenizer

# Model hyperparameters.
embedding_dim = 128
num_filters = 128
filter_sizes = [3, 4, 5]
dropout_rate = 0.5

In [27]:
# Build the classifier from the hyperparameters defined in the config cell.
# vocab_size + 1: the Keras Tokenizer numbers words starting at 1, so the
# embedding table needs one extra row for index 0 (used by padding).
model = cnn_model(
  sequence_length=sequence_length,
  vocab_size=vocab_size + 1,
  # Use the configured value rather than a second hard-coded 128, so changing
  # embedding_dim in the config cell actually takes effect.
  embedding_dim=embedding_dim,
  num_filters=num_filters,
  filter_sizes=filter_sizes,
  dropout_rate=dropout_rate
)

Tensor("reshape_7/Reshape:0", shape=(?, 53, 128, 1), dtype=float32)
Tensor("concatenate_4/concat:0", shape=(?, 1, 1, 384), dtype=float32)
Tensor("reshape_8/Reshape:0", shape=(?, 384), dtype=float32)
Tensor("dropout_4/cond/Merge:0", shape=(?, 384), dtype=float32)
Tensor("dense_3/Softmax:0", shape=(?, 2), dtype=float32)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 53)           0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 53, 128)      2401024     input_4[0][0]                    
__________________________________________________________________________________________________
reshape_7 (Reshape)             (None, 53, 128, 1)   0           embedding_4[0][0]    

In [0]:
# Training configuration.
# Number of samples per gradient update passed to model.fit.
batch_size = 64
# Number of passes over the training set.
num_epochs = 100
# NOTE(review): not referenced by any code visible in this notebook — presumably
# a leftover from a manual training loop; confirm before removing.
evaluate_every = 100 

In [28]:
# Train the model, using the held-out dev split as validation data.
# NOTE(review): the recorded output of this cell reports 200 epochs while
# num_epochs is 100, so the output predates the current setting — re-run to refresh.
model.fit(x=x_train, y=y_train, batch_size=batch_size, epochs=num_epochs, validation_data=(x_dev, y_dev))

Train on 9596 samples, validate on 1066 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200

KeyboardInterrupt: ignored

In [20]:
x_train.shape

(9596, 53)

In [21]:
x_dev.shape

(1066, 53)

In [22]:
y_train.shape

(9596, 2)

In [23]:
y_dev.shape

(1066, 2)

2