In [152]:
import os
import numpy as np

np.random.seed(42)

from keras.datasets import imdb
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Flatten

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.metrics import classification_report

### Datasets

In [None]:
# MR: Movie reviews with one sentence per review.
#
# https://www.cs.cornell.edu/people/pabo/movie-review-data/


# Stanford Sentiment Treebank — an extension of MR but with train/dev/test splits provided and
# fine-grained labels (very positive, positive, neutral, negative, very negative),
# re-labeled by Socher et al.
#
# https://nlp.stanford.edu/sentiment/

# CR
# Customer Reviews
# http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html

In [153]:
# Read the raw IMDB training reviews. Each review is a .txt file and its
# label comes from the sub-directory it lives in: 'neg' -> 0, 'pos' -> 1.
#
# The dataset location can be overridden with the IMDB_DIR environment
# variable; the original hard-coded path remains the default.
imdb_dir = os.environ.get(
    'IMDB_DIR',
    '/Users/dsbatista/PycharmProjects/other/keras-tutorial/aclImdb')
train_dir = os.path.join(imdb_dir, 'train')

labels = []
texts = []

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # order of `texts`/`labels` (and so the later seeded shuffle) reproducible.
    for fname in sorted(os.listdir(dir_name)):
        if not fname.endswith('.txt'):
            continue
        # The context manager closes the file even if read() raises, and the
        # explicit encoding avoids depending on the platform default.
        with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
            texts.append(f.read())
        labels.append(0 if label_type == 'neg' else 1)

In [154]:
# Sanity check: number of review texts read from the training directory.
len(texts)

25000

In [155]:
# Sanity check: one label is appended per review read, so this should equal len(texts).
len(labels)

25000

### load GloVe embeddings

In [156]:
# Parse the 100-dimensional GloVe file into a dict mapping each word to its
# float32 vector. Each line is "<word> <v1> <v2> ...", separated by whitespace.
#
# The location can be overridden with the GLOVE_DIR environment variable;
# the original hard-coded path remains the default.
glove_dir = os.environ.get('GLOVE_DIR', '/Users/dsbatista/resources/glove.6B')

embeddings_index = {}
# The context manager guarantees the file is closed, and the explicit encoding
# avoids depending on the platform default.
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


## Tokenizing the text of the raw IMDB data

In [157]:
max_words = 10000   # vocabulary cap handed to the Tokenizer (num_words)
max_len = 500       # sequence length handed to pad_sequences (maxlen)

training_samples = 5000
test_samples = 1000

# Turn every review into a sequence of integer word indices.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=max_len)

# Keep the array and shuffled forms under their own names instead of rebinding
# `labels`/`data`. `texts` is never shuffled, so rebinding `labels` to its
# shuffled version made a second run of this cell pair freshly tokenized
# (original-order) `data` with already-shuffled labels, silently misaligning them.
label_array = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', label_array.shape)

# The head and tail slices below must not overlap.
assert training_samples + test_samples <= data.shape[0], \
    'not enough samples for disjoint train and test slices'

# Shuffle reviews and labels with the same permutation so they stay aligned.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
shuffled_data = data[indices]
shuffled_labels = label_array[indices]

# Train on the first `training_samples` rows and hold out the last
# `test_samples` rows. NOTE(review): both slices come from the same pool of
# `texts`, not from a separate test directory.
x_train = shuffled_data[:training_samples]
y_train = shuffled_labels[:training_samples]
x_test = shuffled_data[-test_samples:]
y_test = shuffled_labels[-test_samples:]

Found 88582 unique tokens.
Shape of data tensor: (25000, 500)
Shape of label tensor: (25000,)


## Create Embeddings matrix

In [158]:
embedding_dim = 100
max_words = 10000

# Row i of the matrix holds the GloVe vector of the word whose tokenizer index
# is i. Indices at or beyond max_words are ignored, and words without a GloVe
# entry keep their all-zero row.
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i >= max_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

In [133]:
# Expected: (training_samples, max_len).
x_train.shape

(5000, 500)

In [134]:
# Expected: (training_samples,) — one label per training review.
y_train.shape

(5000,)

In [135]:
# Expected: (test_samples, max_len).
x_test.shape

(1000, 500)

In [136]:
# Expected: (test_samples,) — one label per held-out review.
y_test.shape

(1000,)

In [137]:
# Expected: (max_words, embedding_dim) — one row per vocabulary index.
embedding_matrix.shape

(10000, 100)

In [138]:
# Sequence length used both for padding and for the model input shape.
max_len

500

In [172]:
# Frozen embedding layer initialised from the pre-built embedding matrix:
# the matrix supplies the weights and trainable=False keeps them fixed.
vocab_size, vector_size = embedding_matrix.shape
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=vector_size,
    input_length=max_len,
    weights=[embedding_matrix],
    trainable=False,
    name='embedding_layer',
)

In [176]:
# Baseline classifier: embed the token indices, flatten the embedded sequence
# and feed it to a single sigmoid unit.
main_input = Input(shape=(max_len,), dtype='int32', name='main_input')
flat_embeddings = Flatten()(embedding_layer(main_input))
output = Dense(1, activation='sigmoid', name='output')(flat_embeddings)
model = Model(inputs=main_input, outputs=output)

In [177]:
# Attach binary cross-entropy to the layer named 'output' and optimise with Adam.
model.compile(loss={'output': 'binary_crossentropy'}, optimizer='adam')
# Print the layer-by-layer architecture and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
main_input (InputLayer)      (None, 500)               0         
_________________________________________________________________
embedding_layer (Embedding)  (None, 500, 100)          1000000   
_________________________________________________________________
flatten_16 (Flatten)         (None, 50000)             0         
_________________________________________________________________
output (Dense)               (None, 1)                 50001     
Total params: 1,050,001
Trainable params: 50,001
Non-trainable params: 1,000,000
_________________________________________________________________


In [178]:
# Train for 10 epochs with mini-batches of 32. validation_split=0.3 holds out
# part of the training arrays for validation (3500 train / 1500 validation in
# the log below); shuffle=False asks Keras not to shuffle the training samples.
history = model.fit(x_train, y_train, epochs=10, batch_size=32, validation_split=0.3, shuffle=False)

Train on 3500 samples, validate on 1500 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
threshold = 0.5
# Binarise the model outputs: strictly above the threshold -> 1, otherwise 0.
predictions = []
for score in model.predict(x_test):
    predictions.append(1 if score > threshold else 0)

In [None]:
# Per-class precision/recall/F1 of the thresholded predictions on the held-out set.
print(classification_report(y_test, predictions))

In [None]:
# NOTE(review): repeats the report of the cell directly above; probably a leftover duplicate.
print(classification_report(y_test, predictions))

In [164]:
from keras.layers import Conv1D
from keras.layers import MaxPooling1D

In [165]:
input_layer = Input(shape=(max_len,), dtype='int32', name='main_input')
x = embedding_layer(input_layer)
# Single convolution with rectified linear units over windows of 5 tokens.
# The number of feature maps is taken from embedding_dim (100 here).
# activation='relu' is set explicitly: Conv1D applies no activation by
# default, which contradicted the "rectified linear units" description.
x = Conv1D(filters=embedding_dim, kernel_size=5, activation='relu', name='Conv_5')(x)
# Halve the temporal dimension, then flatten for the dense classifier.
x = MaxPooling1D(pool_size=2, strides=None, padding='valid')(x)
x = Flatten()(x)
o = Dense(1, activation='sigmoid', name='output')(x)

In [143]:
# Wrap the graph between `input_layer` and `o` into a model, attach binary
# cross-entropy to the layer named 'output', and optimise with Adam.
model = Model(inputs=input_layer, outputs=o)
model.compile(loss={'output': 'binary_crossentropy'}, optimizer='adam')
# Print the layer-by-layer architecture and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
main_input (InputLayer)      (None, 500)               0         
_________________________________________________________________
embedding_layer (Embedding)  (None, 500, 100)          1000000   
_________________________________________________________________
Conv_5 (Conv1D)              (None, 496, 100)          50100     
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 248, 100)          0         
_________________________________________________________________
flatten_10 (Flatten)         (None, 24800)             0         
_________________________________________________________________
output (Dense)               (None, 1)                 24801     
Total params: 1,074,901
Trainable params: 74,901
Non-trainable params: 1,000,000
_____________________________________________________________

In [None]:
%matplotlib inline
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

from keras.utils import plot_model
# Write a diagram of the current model to model.png.
plot_model(model, to_file='model.png')
# Render the model graph inline as SVG via the 'dot' program
# (presumably requires Graphviz to be installed — confirm).
SVG(model_to_dot(model).create(prog='dot', format='svg'))

In [None]:
# Train for 10 epochs with mini-batches of 32. validation_split=0.3 holds out
# part of the training arrays for validation; shuffle=False asks Keras not to
# shuffle the training samples.
history = model.fit(x_train, y_train, epochs=10, batch_size=32, validation_split=0.3, shuffle=False)

In [None]:
threshold = 0.5
# Binarise the model outputs: strictly above the threshold -> 1, otherwise 0.
predictions = []
for score in model.predict(x_test):
    predictions.append(1 if score > threshold else 0)

In [None]:
# Per-class precision/recall/F1 on the held-out set, printed with 3 decimal digits.
print(classification_report(y_test, predictions, digits=3))

In [None]:
# NOTE(review): repeats the report of the cell directly above; probably a leftover duplicate.
print(classification_report(y_test, predictions, digits=3))

### with more filters

In [None]:
from keras.layers import Concatenate
from keras import layers

In [None]:
from keras.layers import Dropout

### with more filters

In [None]:
features_maps = 100   # number of feature maps per convolution branch
n_grams = [3,4,5]     # filter window sizes, one convolution branch each

input_layer = Input(shape=(max_len,), dtype='int32', name='main_input')
x = embedding_layer(input_layer)

# Rectified linear units, filter windows (h) of 3, 4, 5 with 100 feature maps each.
# Every branch convolves the same embedded sequence with its own window size.
# Previously kernel_size was hard-coded to 3, so all three branches were
# identical trigram convolutions, `features_maps` was unused, and no
# activation was applied.
branches = []
for n in n_grams:
    branch = Conv1D(filters=features_maps, kernel_size=n, activation='relu', name='Conv_'+str(n))(x)
    branch = MaxPooling1D(pool_size=2, strides=None, padding='valid', name='MaxPooling_'+str(n))(branch)
    # Flattening gives each branch a 1-D feature vector, so branches of
    # different temporal length can still be concatenated below.
    branch = Flatten(name='Flatten_'+str(n))(branch)
    branches.append(branch)

# Join the per-window features and classify with a single sigmoid unit.
z = layers.concatenate(branches, axis=-1)
o = Dense(1, activation='sigmoid', name='output')(z)

In [None]:
# Build and compile the multi-branch model once. The cell previously did this
# twice by copy-paste, and its first SVG(...) call was never displayed because
# only a cell's last expression is rendered.
model = Model(inputs=input_layer, outputs=o)
model.compile(loss={'output': 'binary_crossentropy'}, optimizer='adam')
# Print the layer-by-layer architecture and parameter counts.
model.summary()
# Last expression of the cell: render the model graph inline as SVG.
SVG(model_to_dot(model).create(prog='dot', format='svg'))

In [None]:
# Train for 10 epochs with mini-batches of 32. validation_split=0.3 holds out
# part of the training arrays for validation; shuffle=False asks Keras not to
# shuffle the training samples.
history = model.fit(x_train, y_train, epochs=10, batch_size=32, validation_split=0.3, shuffle=False)

In [None]:
threshold = 0.5
# Binarise the model outputs: strictly above the threshold -> 1, otherwise 0.
predictions = []
for score in model.predict(x_test):
    predictions.append(1 if score > threshold else 0)
# Per-class precision/recall/F1 on the held-out set, printed with 3 decimal digits.
print(classification_report(y_test, predictions, digits=3))

### experiments with two input data channels – static and non-static word vectors. We use only one channel.