In [13]:
"""
    References:  https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
                 https://github.com/keras-team/keras/blob/master/examples
                 http://www.aaai.org/ocs/index.php/AAAI/AAAI15/paper/view/9745
                 https://github.com/airalcorn2/Recurrent-Convolutional-Neural-Network-Text-Classifier/blob/master/recurrent_convolutional_keras.py
"""

from __future__ import print_function

import os
import re
import nltk
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.initializers import Constant
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
import string
from keras import backend
from keras.layers import Conv1D, Dense, Input, Lambda, CuDNNLSTM, CuDNNGRU
from keras.layers import Bidirectional, GlobalMaxPooling1D, MaxPooling1D, Dropout, SpatialDropout1D
from keras.layers.merge import concatenate
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model

In [14]:
stop_words = set(stopwords.words('english') + list(string.punctuation))

def tokenize(text):
    """
    :param text: a doc with multiple sentences, type: str
    return a word list, type: list
    https://textminingonline.com/dive-into-nltk-part-ii-sentence-tokenize-and-word-tokenize
    e.g.
    Input: 'It is a nice day. I am happy.'
    Output: ['it', 'is', 'a', 'nice', 'day', 'i', 'am', 'happy']
    """
    tokens = []
    for word in nltk.word_tokenize(text):
        word = word.lower()
        if word not in stop_words and not word.isnumeric():
            tokens.append(word)
    return tokens

def clean_str(strings):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    new_strings = []
    for string in strings:
        string = re.sub(r"\\", "", string)
        string = re.sub(r"\'", "", string)
        string = re.sub(r"\"", "", string)
        string = " ".join(tokenize(string))
        new_strings.append(string)
    return new_strings

In [20]:
count = 0
for i in texts:
    if count == 10:
        break
    if not "a-z" in i:
        print(i)
        count = count + 1

student back neck constant strain dr serrick manages set straight every time highly recommend thumper treatment really great muscle pain relief overall would say dr serrick knowledgeable empathetic thorough highly recommended
stayed football game university phoenix stadium 10mins drive rooms average fine overnight stay great service good breakfast ... bacon biscuits gravy yum
good salads generous portions either get mexican asian cant break away two
experience company growing awesome least memories fallen hard years recently ordered pizza different toppings ordered cheese dont gluten free crust dairy allergy friendly one according allergy website use thin crust- yuck ordered spinach ham pepperoni pineapple sounds like would lot top right yeah..no wasnt even enough spinach cover slice pizza pizza looked like barren wasteland crust sauce wont ordering unless im desperate even ill still go somewhere else
easiest furniture purchase great price delivery quick support local business awesome


In [17]:
BASE_DIR = 'data'
GLOVE_DIR = os.path.join(BASE_DIR)
TEXT_DATA_DIR = os.path.join(BASE_DIR)
MAX_SEQUENCE_LENGTH = 250
MAX_NUM_WORDS = 30000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.0909090909090909090909

MAX_TOKENS = 30000
hidden_dim_1 = 200
hidden_dim_2 = 100
NUM_CLASSES = 6

# first, build index mapping words in the embeddings set
# to their embedding vector
print('Indexing word vectors.')

embeddings_index = {}
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='UTF-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

# second, prepare text samples and their labels
print('Processing text dataset')

texts = []  # list of text samples
labels = []  # list of labels

files = ['train.csv', 'valid.csv']
for file_name in files:
    file = pd.read_csv(os.path.join(TEXT_DATA_DIR, file_name))
    for line in file['text']:
        texts.append(line)
    for label in file['stars']:
        labels.append(label)

print('Found %s texts.' % len(texts))

# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
texts = clean_str(texts)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
left = list()
right = list()
for token_list in sequences:
    # We shift the document to the right to obtain the left-side contexts.
    left.append([MAX_TOKENS] + token_list[:-1])
    # We shift the document to the left to obtain the right-side contexts.
    right.append(token_list[1:] + [MAX_TOKENS])

left_context_as_array = pad_sequences(left, maxlen=MAX_SEQUENCE_LENGTH)
right_context_as_array = pad_sequences(right, maxlen=MAX_SEQUENCE_LENGTH)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train_left = left_context_as_array[:-num_validation_samples]
x_train_right = right_context_as_array[:-num_validation_samples]
x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]

x_val_left = right_context_as_array[-num_validation_samples:]
x_val_right = right_context_as_array[-num_validation_samples:]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]

print('Preparing embedding matrix.')

# prepare embedding matrix
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
document = Input(shape=(None,), dtype="int32")
left_context = Input(shape=(None,), dtype="int32")
right_context = Input(shape=(None,), dtype="int32")

embedder = Embedding(num_words,
                     EMBEDDING_DIM,
                     embeddings_initializer=Constant(embedding_matrix),
                     input_length=MAX_SEQUENCE_LENGTH,
                     trainable=False)
doc_embedding = embedder(document)
l_embedding = embedder(left_context)
r_embedding = embedder(right_context)

Indexing word vectors.
Found 400000 word vectors.
Processing text dataset
Found 110000 texts.
Found 86127 unique tokens.
Shape of data tensor: (110000, 250)
Shape of label tensor: (110000, 6)
Preparing embedding matrix.


In [10]:
drop_out_r = SpatialDropout1D(0.25)(r_embedding)
drop_out_l = SpatialDropout1D(0.25)(l_embedding)
forward = Bidirectional(CuDNNGRU(hidden_dim_1, return_sequences=True))(drop_out_l)  # See equation (1).
backward = Bidirectional(CuDNNGRU(hidden_dim_1, return_sequences=True, go_backwards=True))(drop_out_r)  # See equation (2).
# Keras returns the output sequences in reverse order.
backward = Lambda(lambda x: backend.reverse(x, axes=1))(backward)
together = concatenate([forward, doc_embedding, backward], axis=2)  # See equation (3).

drop_out_c = Dropout(0.25)(together)
semantic = Conv1D(hidden_dim_2, kernel_size=5, activation="tanh")(drop_out_c)  # See equation (4).

# Keras provides its own max-pooling layers, but they cannot handle variable length input
# (as far as I can tell). As a result, I define my own max-pooling layer here.
# pool_rnn = Lambda(lambda x: backend.max(x, axis=1), output_shape=(hidden_dim_2,))(semantic)  # See equation (5).
pool_rnn = GlobalMaxPooling1D()(semantic)

output = Dense(NUM_CLASSES, input_dim=hidden_dim_2, activation="softmax")(pool_rnn)  # See equations (6) and (7).

model = Model(inputs=[document, left_context, right_context], outputs=output)

optimizer = 
model.compile(optimizer="adadelta", loss="categorical_crossentropy", metrics=["accuracy"])

model.fit([x_train, x_train_left, x_train_right], y_train, epochs=10, verbose=1, batch_size=128, initial_epoch=0,
          validation_data=([x_val, x_val_left, x_val_right], y_val))

Train on 100000 samples, validate on 10000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2337ce90eb8>

In [12]:
model.fit([x_train, x_train_left, x_train_right], y_train, epochs=20, verbose=1, batch_size=128, initial_epoch=10,
          validation_data=([x_val, x_val_left, x_val_right], y_val))

Train on 100000 samples, validate on 10000 samples
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x23201fa44a8>