In [40]:
from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.layers import LSTM
from keras.datasets import imdb

In [10]:
max_features = 5000
print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

Loading data...
25000 train sequences
25000 test sequences


In [26]:
def max_words(arrrrrr):
    l = 0 
    for i in range(25000):
        temp = len(arrrrrr[i])
        if temp>l:
            l = temp
        else:
            continue
    return l

In [None]:
max_words(x_train)

In [None]:
from numpy import savetxt
savetxt('../datasets/imdb_xtrain.csv', x_train, delimiter=',')
savetxt('../datasets/imdb_xtest.csv', x_test, delimiter=',')
savetxt('../datasets/imdb_ytrain.csv', y_train, delimiter=',')
savetxt('../datasets/imdb_ytest.csv', y_test, delimiter=',')

In [None]:
xtrainsample = pd.read_csv('../datasets/imdb_xtrain.csv', header=None)


## 400-Word Run CNN (4 Epochs)

In [32]:
# Default Parameters 
max_features = 5000
maxlen = 400
batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 4

x_train_400 = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test_400 = sequence.pad_sequences(x_test, maxlen=maxlen)

model_1 = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model_1.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))
model_1.add(Dropout(0.2))

# we add a Convolution1D, which will learn filters
# word group filters of size filter_length:
model_1.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
# we use max pooling:
model_1.add(GlobalMaxPooling1D())

# We add a vanilla hidden layer:
model_1.add(Dense(hidden_dims))
model_1.add(Dropout(0.2))
model_1.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model_1.add(Dense(1))
model_1.add(Activation('sigmoid'))

model_1.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model_1.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 400, 50)           250000    
_________________________________________________________________
dropout_1 (Dropout)          (None, 400, 50)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 398, 250)          37750     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 250)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 250)               62750     
_________________________________________________________________
dropout_2 (Dropout)          (None, 250)               0         
_________________________________________________________________
activation_1 (Activation)    (None, 250)              

In [34]:
model_1.fit(x_train_400, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test_400, y_test))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 25000 samples, validate on 25000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.callbacks.History at 0x24f03c902e8>

## 1000-Word Run CNN (4 Epochs, and then another 4, and then another 4)

Results weren't much better than the 400 word run

In [35]:
# Increasing maximum length of text to 1000 words, 2 epochs
max_features = 5000
maxlen = 1000
batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 4


x_train_1000 = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test_1000 = sequence.pad_sequences(x_test, maxlen=maxlen)

print('x_train',x_train_1000.shape)
print('x_test',x_test_1000.shape)

model_2 = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model_2.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))
model_2.add(Dropout(0.2))

# we add a Convolution1D, which will learn filters
# word group filters of size filter_length:
model_2.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
# we use max pooling:
model_2.add(GlobalMaxPooling1D())

# We add a vanilla hidden layer:
model_2.add(Dense(hidden_dims))
model_2.add(Dropout(0.2))
model_2.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model_2.add(Dense(1))
model_2.add(Activation('sigmoid'))

model_2.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model_2.summary()

x_train (25000, 1000)
x_test (25000, 1000)
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1000, 50)          250000    
_________________________________________________________________
dropout_3 (Dropout)          (None, 1000, 50)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 998, 250)          37750     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 250)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 250)               62750     
_________________________________________________________________
dropout_4 (Dropout)          (None, 250)               0         
_________________________________________________________________
activation_

In [36]:
model_2.fit(x_train_1000, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test_1000, y_test))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 25000 samples, validate on 25000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.callbacks.History at 0x24f04498128>

In [37]:
model_2.fit(x_train_1000, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test_1000, y_test))

Train on 25000 samples, validate on 25000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.callbacks.History at 0x24f044c2fd0>

In [38]:
model_2.fit(x_train_1000, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test_1000, y_test))

Train on 25000 samples, validate on 25000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.callbacks.History at 0x24f044fe780>

## 400-Word Run CNN - Double Kernel Size
Faster than 1000 word run. Achieves over .89 val_accuracy by 3rd epoch, which is slightly higher than default kernel size of 3. 

In [41]:
maxlen = 400
batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 6
hidden_dims = 250
epochs = 4

model_3 = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model_3.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))
model_3.add(Dropout(0.2))

# we add a Convolution1D, which will learn filters
# word group filters of size filter_length:
model_3.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
# we use max pooling:
model_3.add(GlobalMaxPooling1D())

# We add a vanilla hidden layer:
model_3.add(Dense(hidden_dims))
model_3.add(Dropout(0.2))
model_3.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model_3.add(Dense(1))
model_3.add(Activation('sigmoid'))

model_3.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model_3.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 400, 50)           250000    
_________________________________________________________________
dropout_5 (Dropout)          (None, 400, 50)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 395, 250)          75250     
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 250)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 250)               62750     
_________________________________________________________________
dropout_6 (Dropout)          (None, 250)               0         
_________________________________________________________________
activation_5 (Activation)    (None, 250)              

In [43]:
model_3.fit(x_train_400, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test_400, y_test))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 25000 samples, validate on 25000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.callbacks.History at 0x24f0486fc88>

In [44]:
model_3.fit(x_train_400, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test_400, y_test))

Train on 25000 samples, validate on 25000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.callbacks.History at 0x24f080da588>

## 80-word LSTM 
Max `val_accuracy` of ~.82, peaking between epoch 3-5.

In [None]:
maxlen=80
x_train_80 = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test_80 = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

model_4 = Sequential()
model_4.add(Embedding(max_features, 128))
model_4.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model_4.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model_4.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model_4.summary()

In [46]:
model_4.fit(x_train_80, y_train,
          batch_size=batch_size,
          epochs=15,
          validation_data=(x_test_80, y_test))
score, acc = model_4.evaluate(x_test_80, y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Train on 25000 samples, validate on 25000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test score: 0.8972654736280441
Test accuracy: 0.8151599764823914


## 80-Word LSTM (Increase dropout to .3, `relu` activation)
Worse results than default of .2 dropout and sigmoid activation. Probably should not have tested both at the same time. 

In [48]:
maxlen=80
x_train_80 = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test_80 = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

model_5 = Sequential()
model_5.add(Embedding(max_features, 128))
model_5.add(LSTM(128, dropout=0.3, recurrent_dropout=0.2))
model_5.add(Dense(1, activation='relu'))

# try using different optimizers and different optimizer configs
model_5.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model_5.summary()

x_train shape: (25000,)
x_test shape: (25000,)
Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, None, 128)         640000    
_________________________________________________________________
lstm_3 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 129       
Total params: 771,713
Trainable params: 771,713
Non-trainable params: 0
_________________________________________________________________


In [49]:
model_5.fit(x_train_80, y_train,
          batch_size=batch_size,
          epochs=15,
          validation_data=(x_test_80, y_test))
score, acc = model_5.evaluate(x_test_80, y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 25000 samples, validate on 25000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test score: 1.056603301486969
Test accuracy: 0.781719982624054


## 120-Word LSTM
Increase in `val_accuracy` from ~.82 to ~.86. Reaches max by epoch 4.

In [53]:
maxlen=120
x_train_120 = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test_120 = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train_120.shape)
print('x_test shape:', x_test_120.shape)

model_6 = Sequential()
model_6.add(Embedding(max_features, 128))
model_6.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model_6.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model_6.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model_6.summary()

model_6.fit(x_train_120, y_train,
          batch_size=batch_size,
          epochs=15,
          validation_data=(x_test_120, y_test))
score, acc = model_6.evaluate(x_test_120, y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

x_train shape: (25000, 120)
x_test shape: (25000, 120)
Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, None, 128)         640000    
_________________________________________________________________
lstm_5 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 129       
Total params: 771,713
Trainable params: 771,713
Non-trainable params: 0
_________________________________________________________________


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 25000 samples, validate on 25000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test score: 0.7769462085628509
Test accuracy: 0.8424800038337708


## 240-Word LSTM
Reaches max `val_accuracy` by epoch 10 with score .8723. Slight improvement on the 120-word.

In [54]:
maxlen=240
x_train_240 = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test_240 = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train_240.shape)
print('x_test shape:', x_test_240.shape)

model_7 = Sequential()
model_7.add(Embedding(max_features, 128))
model_7.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model_7.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model_7.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model_7.fit(x_train_240, y_train,
          batch_size=batch_size,
          epochs=15,
          validation_data=(x_test_240, y_test))
score, acc = model_7.evaluate(x_test_240, y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

x_train shape: (25000, 240)
x_test shape: (25000, 240)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 25000 samples, validate on 25000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test score: 0.516984115600586
Test accuracy: 0.8563600182533264


## 400-Word LSTM 
Starts off with lower `val_accuracy` than previous runs. Hits ~.86 accuracy by epoch 3. 

In [55]:
maxlen=400
x_train_400 = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test_400 = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train_400.shape)
print('x_test shape:', x_test_400.shape)

model_8 = Sequential()
model_8.add(Embedding(max_features, 128))
model_8.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model_8.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model_8.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model_8.fit(x_train_400, y_train,
          batch_size=batch_size,
          epochs=10,
          validation_data=(x_test_400, y_test))
score, acc = model_8.evaluate(x_test_400, y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

x_train shape: (25000, 400)
x_test shape: (25000, 400)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test score: 0.35736553396701815
Test accuracy: 0.8708800077438354


In [None]:
model_8.fit(x_train_400, y_train,
          batch_size=batch_size,
          epochs=10,
          validation_data=(x_test_400, y_test))
score, acc = model_8.evaluate(x_test_400, y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)