This notebook implements the model defined in "A Sensitivity Analysis of (and Practitioners’ Guide to) Convolutional
Neural Networks for Sentence Classification", which can be found at https://arxiv.org/abs/1510.03820 .

It is tested on Kaggle's IMDb dataset.

```
@InProceedings{I17-1026,
  author = "Zhang, Ye and Wallace, Byron",
  title = "A Sensitivity Analysis of (and Practitioners' Guide to)
           Convolutional Neural Networks for Sentence Classification",
  booktitle ="Proceedings of the Eighth International Joint 
  Conference on Natural Language Processing (Volume 1: Long Papers)",
  year = "2017",
  publisher = "Asian Federation of Natural Language Processing",
  pages = "253--263",
  location = "Taipei, Taiwan",
  url = "http://aclweb.org/anthology/I17-1026"
}
```

## Imports and Constants Definitions

In [None]:
! wget http://nlp.stanford.edu/data/glove.6B.zip

In [None]:
! mkdir glove

In [None]:
! unzip glove.6B.zip

In [112]:
import numpy as np

from keras.datasets import imdb
from keras.models import Sequential, Model
from keras.layers import Conv2D, GlobalMaxPool2D, concatenate, Input, Dense, Embedding, Reshape
from keras.preprocessing.sequence import pad_sequences

In [80]:
VOCAB_SIZE = 40000
MAX_LEN = 610
EMBEDDING_DIM = 300

## Data Import

In [81]:
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=VOCAB_SIZE)

## Data Exploration

In [82]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(25000,)
(25000,)
(25000,)
(25000,)


In [86]:
print('Amount of words: ', np.max(np.max(x_train)))

sentence_lengths = np.vectorize(len)(x_train)
print('Maximum length of sentences: %i' % np.max(sentence_lengths))
print('Average length of sentences: %i' % np.mean(sentence_lengths))
print('Standard variance of length of sentences: %i' % np.std(sentence_lengths))
for percentile in [75, 95, 99]:
    print(f'0.{percentile} percentile of length of sentences: %i ' % np.percentile(sentence_lengths, percentile))

## Model Definition

In [None]:
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if i < max_words:
        if embedding_vector is not None:
            # Words not found in embedding index or out of vocab will be all-zeros.
            embedding_matrix[i] = embedding_vector

In [120]:
inp = Input(shape=(MAX_LEN,))

# emb = Embedding(VOCAB_SIZE, EMBEDDING_DIM, weights=[embedding_matrix])(inp)
emb = Embedding(VOCAB_SIZE, EMBEDDING_DIM)(inp)
emb = Reshape((emb.shape[1], emb.shape[2], 1))(emb)

max_pools = []
filter_sizes = [2, 3, 4]
for filter_size in filter_sizes:
    for i in range(2):
        # test elu activation function
        x = Conv2D(1, kernel_size=(filter_size, EMBEDDING_DIM), activation='relu')(emb)
        max_pools.append(GlobalMaxPool2D()(x))
conc = concatenate(max_pools)
outp = Dense(1, activation="sigmoid")(conc)

model = Model(inputs=inp, outputs=outp)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [122]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_36 (InputLayer)           (None, 610)          0                                            
__________________________________________________________________________________________________
embedding_14 (Embedding)        (None, 610, 300)     12000000    input_36[0][0]                   
__________________________________________________________________________________________________
reshape_4 (Reshape)             (None, Dimension(610 0           embedding_14[0][0]               
__________________________________________________________________________________________________
conv2d_52 (Conv2D)              (None, Dimension(609 601         reshape_4[0][0]                  
__________________________________________________________________________________________________
conv2d_53 

In [118]:
data = pad_sequences(x_train, maxlen=MAX_LEN)

In [119]:
model.fit(x=data, y=y_train, validation_split=0.1, batch_size=32, epochs=10)

Train on 22500 samples, validate on 2500 samples
Epoch 1/10


KeyboardInterrupt: 