# Sentiment Classifier with Keras

## Reading the dataset and building matrices

In [129]:
import glob
import os
import re

import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot


class DocumentReader(object):
    """Iterable over the text of every ``*.txt`` file directly inside a directory.

    ``len(reader)`` is the number of matching files and iterating yields the
    full content of each file as one string. The directory is re-scanned on
    every call, so the reader can be iterated more than once.
    """

    def __init__(self, path):
        """Remember the directory to scan; no file is touched here."""
        self.path_ = path

    def _paths(self):
        """Return the ``*.txt`` paths under ``path_`` in sorted order."""
        # glob returns files in arbitrary (filesystem) order; sorting keeps
        # the document order stable from one run to the next.
        return sorted(glob.glob(os.path.join(self.path_, '*.txt')))

    def __len__(self):
        """Number of ``*.txt`` files currently in the directory."""
        return len(self._paths())

    def __iter__(self):
        """Yield the content of each ``*.txt`` file, one string per file."""
        for filename in self._paths():
            # Read inside a context manager so the handle is closed before
            # the next file is opened, instead of leaking one per document.
            with open(filename, 'r') as handle:
                content = handle.read()
            yield content

def parse_text(text):
    """Normalise a raw document before it is encoded.

    The text is lower-cased, surrounding whitespace is stripped, and every
    run of ASCII digits is replaced by the literal token ``num``.

    Args:
        text: the raw document as a string.

    Returns:
        The normalised string.
    """
    text = text.lower().strip()
    # '+' rather than '*': '[0-9]*' also matches the empty string, which
    # makes re.sub insert 'num' between every character of the document.
    return re.sub('[0-9]+', 'num', text)


def documents_to_matrix(document_reader,
                        voc_size=50000,
                        seq_max_len=1000):
    """Encode every document of a reader as one row of an integer matrix.

    Each document is normalised with ``parse_text``, converted to a list of
    word indices with Keras ``one_hot`` (using ``voc_size`` as its size
    argument) and the resulting lists are handed to ``pad_sequences``.

    Args:
        document_reader: iterable yielding one string per document.
        voc_size: size argument forwarded to ``one_hot``.
        seq_max_len: ``maxlen`` forwarded to ``pad_sequences``.

    Returns:
        The array produced by ``pad_sequences``: one row per document,
        ``seq_max_len`` columns, shorter documents padded at the end.
    """
    sequences = [one_hot(parse_text(text), voc_size)
                 for text in document_reader]

    # Pass the plain list of lists: the documents have different lengths, and
    # wrapping such a ragged list in np.asarray is an error on current NumPy.
    # pad_sequences accepts the list as-is.
    return pad_sequences(sequences,
                         maxlen=seq_max_len,
                         padding='post')


def make_X_and_y(pos_reader, neg_reader):
    """Build a shuffled feature matrix and label column from two readers.

    Documents from ``pos_reader`` are labelled 1 and documents from
    ``neg_reader`` are labelled 0. Rows of ``X`` and ``y`` are shuffled with
    the same random permutation, so they stay aligned.

    Args:
        pos_reader: reader over the positive documents.
        neg_reader: reader over the negative documents.

    Returns:
        ``(X, y)`` where ``X`` stacks the two ``documents_to_matrix`` results
        and ``y`` is a float column vector with one label per row of ``X``.
    """
    X_pos = documents_to_matrix(pos_reader)
    X_neg = documents_to_matrix(neg_reader)
    X = np.vstack([X_pos, X_neg])

    # Size the labels from the matrices built for *these* readers. The
    # previous code used the global train readers here, which is only right
    # when every split happens to have the same number of documents.
    y = np.vstack([np.ones((X_pos.shape[0], 1)),
                   np.zeros((X_neg.shape[0], 1))])

    rand_indices = np.random.permutation(X.shape[0])

    return X[rand_indices], y[rand_indices]


# One reader per label directory of the aclImdb train and test splits.
train_pos_reader = DocumentReader('aclImdb/train/pos/')
train_neg_reader = DocumentReader('aclImdb/train/neg/')
test_pos_reader = DocumentReader('aclImdb/test/pos/')
test_neg_reader = DocumentReader('aclImdb/test/neg/')

X_train, y_train = make_X_and_y(train_pos_reader, train_neg_reader)
X_test, y_test = make_X_and_y(test_pos_reader, test_neg_reader)

# Print one pre-formatted string per call: this gives the same
# "(rows, cols) (rows, 1)" output whether print is a statement (Python 2)
# or a function (Python 3).
print('%s %s' % (X_train.shape, y_train.shape))
print('%s %s' % (X_test.shape, y_test.shape))

(25000, 1000) (25000, 1)
(25000, 1000) (25000, 1)


## Neural Network Models

In [131]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, Flatten
from keras.utils import to_categorical
from keras.callbacks import History
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences

### Multi-Layer Perceptron with an Embedding Layer

In [132]:
# Embedding -> Flatten -> three identical Dense(128, relu) + Dropout(0.2)
# stages -> a single sigmoid output unit.
model = Sequential([
    Embedding(input_dim=50000, input_length=1000, output_dim=64),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Explicit History callback instance passed to fit below.
history = History()

# The test matrices double as validation data for this single epoch.
model.fit(X_train,
          y_train,
          validation_data=(X_test, y_test),
          epochs=1,
          callbacks=[history])

Train on 25000 samples, validate on 25000 samples
Epoch 1/1


<keras.callbacks.History at 0x7f6c28c0df50>