In [1]:
import requests
from IPython.display import HTML

# Download the CS109 course stylesheet and render it so it applies to this notebook.
# The timeout stops the cell from hanging indefinitely when the host is unreachable.
styles = requests.get(
    "https://raw.githubusercontent.com/Harvard-IACS/2019-CS109B/master/content/styles/cs109.css",
    timeout=10,
).text
HTML(styles)

In [2]:
import numpy as np
# import tensorflow
# from keras.datasets import imdb
# from keras.models import Sequential
# from keras.layers import Dense, LSTM, SimpleRNN, Flatten
# from keras.preprocessing import sequence
# from keras.layers.convolutional import Conv1D, MaxPooling1D
# from keras.layers.embeddings import Embedding

from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM, SimpleRNN
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Flatten
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
import tensorflow as tf

# Seed NumPy and TensorFlow. tf.keras initialises layer weights with
# TensorFlow random ops, which np.random.seed alone does not control, so both
# generators are seeded to make reruns more repeatable.
np.random.seed(1)
tf.random.set_seed(1)

In [3]:
# Use a finite vocabulary so that the word matrices cannot grow arbitrarily large.
vocabulary_size = 10000

# Likewise cap the review length so that very long reviews are not processed in full.
max_review_length = 500

In [4]:
# Load the IMDB reviews, passing vocabulary_size as the num_words cap.
train_split, test_split = imdb.load_data(num_words=vocabulary_size)
x_train, y_train = train_split
x_test, y_test = test_split

# Peek at the raw data: each review is a list of integer word indices and
# reviews differ in length before padding.
first_review, fifth_review = x_train[0], x_train[4]
print('Number of reviews', len(x_train))
print('Length of first and fifth review before padding', len(first_review), len(fifth_review))
print('First review', first_review)
print('First label', y_train[0])

Number of reviews 25000
Length of first and fifth review before padding 218 147
First review [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103

In [5]:
# Batched training needs inputs of identical dimensions, so every review in
# both splits is brought to max_review_length entries.
x_train, x_test = (
    sequence.pad_sequences(reviews, maxlen=max_review_length)
    for reviews in (x_train, x_test)
)

first_padded, fifth_padded = x_train[0], x_train[4]
print('length of 1st and 5th review after padding:', len(first_padded), len(fifth_padded))

length of 1st and 5th review after padding: 500 500


## Model 1A : FFN w/o Embeddings

In [6]:
# Baseline: a fully connected network fed the padded word indices directly
# (no embedding layer), ending in one sigmoid unit for the binary label.
model = Sequential([
    Dense(250, activation='relu', input_dim=max_review_length),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# The test split is also used as the validation data during training.
model.fit(
    x_train, y_train,
    validation_data=(x_test, y_test),
    epochs=2,
    batch_size=128,
    verbose=2,
)

# With the metrics compiled above, index 1 of the result is the accuracy.
scores = model.evaluate(x_test, y_test, verbose=1)
test_accuracy = scores[1]
print("Accuracy: %.2f%%" % (test_accuracy * 100))

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 250)               125250    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 251       
Total params: 125,501
Trainable params: 125,501
Non-trainable params: 0
_________________________________________________________________
Train on 25000 samples, validate on 25000 samples
Epoch 1/2
25000/25000 - 3s - loss: 186.7220 - accuracy: 0.5030 - val_loss: 96.7029 - val_accuracy: 0.5055
Epoch 2/2
25000/25000 - 1s - loss: 48.8586 - accuracy: 0.5882 - val_loss: 48.8360 - val_accuracy: 0.5018


Accuracy: 50.18%


## Model 1B: FFN with Embeddings



In [7]:
# Length of the vector each word index is mapped to by the Embedding layers below.
embedding_dim = 100

In [8]:
# The Embedding layer converts each batch from
#   (batch_size, sentence_length)
# to
#   (batch_size, sentence_length, embedding_dim);
# Flatten then collapses that into one vector per review for the dense layers.
model = Sequential([
    Embedding(vocabulary_size, embedding_dim, input_length=max_review_length),
    Flatten(),
    Dense(250, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 100)          1000000   
_________________________________________________________________
flatten (Flatten)            (None, 50000)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 250)               12500250  
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 251       
Total params: 13,500,501
Trainable params: 13,500,501
Non-trainable params: 0
_________________________________________________________________


In [9]:
# Train for two epochs, using the test split as validation data.
model.fit(
    x_train, y_train,
    validation_data=(x_test, y_test),
    epochs=2,
    batch_size=128,
    verbose=1,
)

# Evaluate on the test split; index 1 of the result is the accuracy metric.
scores = model.evaluate(x_test, y_test, verbose=0)
test_accuracy = scores[1]
print('Accuracy: %.2f%%' % (test_accuracy * 100))


Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2
Accuracy: 87.05%


## Model 2 : CNN

* Text can be thought of as a 1D sequence
* Apply 1D convolutions over a set of word embeddings

In [10]:
# Embedding -> 1D convolution over the word vectors -> max pooling, then the
# same flatten + dense classifier head used for Model 1B.
model = Sequential([
    Embedding(vocabulary_size, embedding_dim, input_length=max_review_length),
    Conv1D(filters=200, kernel_size=3, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(250, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()


Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 100)          1000000   
_________________________________________________________________
conv1d (Conv1D)              (None, 500, 200)          60200     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 250, 200)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 50000)             0         
_________________________________________________________________
dense_4 (Dense)              (None, 250)               12500250  
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 251       
Total params: 13,560,701
Trainable params: 13,560,701
Non-trainable params: 0
__________________________________________

In [11]:
# Train the CNN for two epochs (no validation data is passed here).
model.fit(
    x_train, y_train,
    epochs=2,
    batch_size=128,
)

# Evaluate the CNN on the test split; index 1 of the result is the accuracy.
scores = model.evaluate(x_test, y_test, verbose=0)
test_accuracy = scores[1]
print("Accuracy: %.2f%%" % (test_accuracy * 100))


Train on 25000 samples
Epoch 1/2
Epoch 2/2
Accuracy: 88.66%


## Model 3 : Simple RNN

In [12]:
# Embedding -> SimpleRNN with 100 units -> single sigmoid unit for the binary label.
model = Sequential()

model.add(Embedding(vocabulary_size, embedding_dim, input_length = max_review_length))
model.add(SimpleRNN(100))

model.add(Dense(1, activation = 'sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

# summary() prints the table itself and returns None, so it is called directly;
# wrapping it in print() would emit an extra "None" line.
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 100)          1000000   
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 100)               20100     
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 101       
Total params: 1,020,201
Trainable params: 1,020,201
Non-trainable params: 0
_________________________________________________________________
None


In [13]:
# Train the SimpleRNN model for three epochs in mini-batches of 64 reviews.
model.fit(
    x_train, y_train,
    epochs=3,
    batch_size=64,
)

Train on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f5009fd4150>

In [14]:
# Score the SimpleRNN model on the test split; index 1 of the result is the accuracy.
scores = model.evaluate(x_test, y_test, verbose=0)
test_accuracy = scores[1]

print("Accuracy: %.2f%%" % (test_accuracy * 100))

Accuracy: 65.12%


## Model 4 : LSTM

In [17]:
# Embedding -> LSTM with 100 units -> single sigmoid unit for the binary label.
model = Sequential()

model.add(Embedding(vocabulary_size, embedding_dim, input_length = max_review_length))
model.add(LSTM(100))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

# summary() prints the table itself and returns None, so it is called directly;
# wrapping it in print() would emit an extra "None" line.
model.summary()


Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 500, 100)          1000000   
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 101       
Total params: 1,080,501
Trainable params: 1,080,501
Non-trainable params: 0
_________________________________________________________________
None


In [18]:
# Train the LSTM model for three epochs in mini-batches of 128 reviews.
model.fit(
    x_train, y_train,
    epochs=3,
    batch_size=128,
)

Train on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f5000ad5850>

In [20]:
# Score the LSTM model on the test split; index 1 of the result is the accuracy.
scores = model.evaluate(x_test, y_test, verbose=2)
test_accuracy = scores[1]
print('Accuracy: %.2f%%' % (test_accuracy * 100))

25000/1 - 25s - loss: 0.2823 - accuracy: 0.8688
Accuracy: 86.88%


## Model 5 : CNN + LSTM

* CNNs are good at learning spatial features
* Sentences can be thought of as 1D spatial vectors
* Apply LSTM over features learned by a CNN (after maxpooling layer)
* This combines CNN and LSTM layers
* The CNN can pick up invariant features across the 1-D spatial structure that characterize good and bad sentiment
* These learned spatial features can then be modeled as a sequence by the LSTM
* Final classification is done by a feed-forward connection to a single node.

In [25]:
# Embedding -> 1D convolution + max pooling -> LSTM over the pooled feature
# sequence -> single sigmoid unit for the binary label.
model = Sequential()
model.add(Embedding(vocabulary_size, embedding_dim, input_length = max_review_length))
model.add(Conv1D(filters = 32, kernel_size = 3, padding = 'same', activation = 'relu'))
model.add(MaxPooling1D(pool_size = 2))
model.add(LSTM(100))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

# summary() prints the table itself and returns None, so it is called directly;
# wrapping it in print() would emit an extra "None" line.
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 500, 100)          1000000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 500, 32)           9632      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 250, 32)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 101       
Total params: 1,062,933
Trainable params: 1,062,933
Non-trainable params: 0
_________________________________________________________________
None


In [27]:
# Train the CNN + LSTM model for three epochs in mini-batches of 64 reviews.
model.fit(
    x_train, y_train,
    epochs=3,
    batch_size=64,
)

# Final evaluation of the model; index 1 of the result is the accuracy.
scores = model.evaluate(x_test, y_test, verbose=0)
test_accuracy = scores[1]
print("Accuracy: %.2f%%" % (test_accuracy * 100))

Train on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 87.90%
