In [1]:
import numpy as np
np.random.seed(0)

from keras.models import Model
from keras.layers import Input, Dense, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.initializers import glorot_uniform

from sklearn.model_selection import train_test_split
import tensorflow as tf
# Configure keras backend to run on CPU
tf.config.set_visible_devices([], 'GPU')

import argparse

from utils import *
from parameters import FLAGS
from preprocessing import review_to_indices, embedding_layer_glove

from models.uni_2_LSTM import Uni_2_LSTM
from models.bi_2_LSTM import Bi_2_LSTM
from models.cnn import CNN1


Parameters:
	dev_sample_percentage: 0.05
	data_file: /home/ubuntu/cs230_project/data/amazon_reviews_us_Electronics_v1_00.tsv
	word2vec_file: /home/ubuntu/cs230_project/data/glove.6B.100d.txt
	min_total_votes: 10
	max_review_word_count: 200
	keep_start_of_longer_reviews: True
	batch_size: 64
	num_epochs: 20
	debug_mode: False


In [2]:
# Load the review data set; the vote threshold and review-length handling
# are all taken from FLAGS.
X, Y = load_data(
    FLAGS.data_file,
    FLAGS.min_total_votes,
    FLAGS.max_review_word_count,
    FLAGS.keep_start_of_longer_reviews,
)

# Hold out a dev split of size FLAGS.dev_sample_percentage; the fixed
# random_state keeps the split identical across runs.
X_train, X_dev, Y_train, Y_dev = train_test_split(
    X,
    Y,
    test_size=FLAGS.dev_sample_percentage,
    random_state=42,
)


Loading data ...
Chunk loaded. Found 136128 data points with >= 10 total votes.


In [3]:
# print(f'Examples in X_train: {X_train.shape[0]}')
# print(f'Examples in X_dev:   {X_dev.shape[0]}')

In [4]:
# for idx in range(3):
#     print(X_train[idx], '\n', Y_train[idx])
#     print('\n\n')

# print('#'*50, '\n\n')

# for idx in range(3):
#     print(X_dev[idx], '\n', Y_dev[idx])
#     print('\n\n')

In [3]:
# Read the pretrained word vectors from FLAGS.word2vec_file. The three results
# are bound to the word->index lookup, the index->word lookup and the
# word->vector map that the model constructors below receive.
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs(FLAGS.word2vec_file)


Creating word embeddings matrix ...
Done


In [6]:
# X1 = np.array(["funny lol", "lets play baseball", "food is ready for you"])
# X1_indices = review_to_indices(X1, word_to_index, max_review_word_count=5)
# print("X1 =", X1)
# print("X1_indices =\n", X1_indices)

In [4]:
# Instantiate the Bi_2_LSTM model for reviews of FLAGS.max_review_word_count
# tokens. lstm_units is presumably the hidden size of each LSTM layer --
# verify against models/bi_2_LSTM.py.
model = Bi_2_LSTM(
    (FLAGS.max_review_word_count, ),
    word_to_vec_map,
    word_to_index,
    lstm_units=[32, 64],
)
model.summary()

Model: "uni_2_LSTM"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 256)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 256, 100)          40000100  
_________________________________________________________________
bidirectional (Bidirectional (None, 256, 64)           34048     
_________________________________________________________________
dropout (Dropout)            (None, 256, 64)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               66048     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 1

In [14]:
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Activation, Conv2D, ReLU, MaxPooling2D, Reshape, Flatten

from preprocessing import embedding_layer_glove


def CNN1(input_shape, word_to_vec_map, word_to_index, FLAGS, verbose=True):
    """
    Implements a three-stage 2-D convolutional network on top of pretrained
    GloVe embeddings as a Keras model object.

    The embedded review is given a trailing channel dimension of 1 and then
    passed through three Conv2D -> ReLU -> MaxPooling2D stages. The result is
    flattened and fed to a single Dense unit with no activation.

    Inputs:
        input_shape:     shape of input review, put it as (FLAGS.max_review_word_count, )
        word_to_vec_map: dictionary mapping from word to 100-dim vector embedding
        word_to_index:   dictionary mapping from word to index in vocabulary
        FLAGS:           hyperparameter settings (not read by this function)
        verbose:         if True (default), print the tensor shape after every
                         layer while the graph is being built

    Outputs:
        model: A keras model object named 'CNN1'
    """
    # b = batch_size
    # e = embedding_dim = 100
    #
    # Shapes for input_shape == (200, ):
    #   stage 1: conv (b, 198, 98,  8) -> pool (b, 99, 33,  8)
    #   stage 2: conv (b,  97, 31, 16) -> pool (b, 49, 11, 16)
    #   stage 3: conv (b,  45,  7, 32) -> pool (b, 23,  3, 32)
    #   flatten: (b, 2208), output: (b, 1)

    # (number of filters, square kernel size) of each convolutional stage
    conv_stages = [(8, 3), (16, 3), (32, 5)]

    def log_shape(label, tensor, *extra):
        # Shape tracing is optional so the builder can be used silently.
        if verbose:
            print(label, tensor.shape, *extra)

    # Input layer
    word_indices = Input(shape=input_shape, dtype='int32')                            # (b, 200)

    # Embedding layer (pretrained with GloVe-100)
    embeddings = embedding_layer_glove(word_to_vec_map, word_to_index)(word_indices)  # (b, 200, 100)
    # Add 'channel' dimension of 1 so Conv2D can consume the embeddings
    features = Reshape((embeddings.shape[1], embeddings.shape[2], 1))(embeddings)     # (b, 200, 100, 1)

    ########### CNN part (start) #######################
    for stage, (filters, kernel) in enumerate(conv_stages, start=1):
        features = Conv2D(filters=filters, kernel_size=(kernel, kernel),
                          strides=1, padding='valid')(features)
        log_shape(f'Z{stage}.shape', features)

        features = ReLU()(features)
        log_shape(f'A{stage}.shape', features)

        features = MaxPooling2D(pool_size=(2, 3), strides=(2, 3), padding='same')(features)
        log_shape(f'P{stage}.shape', features, '\n')
    ########### CNN part (end) #######################

    flattened = Flatten()(features)
    log_shape('F.shape', flattened)

    # Single linear unit: the model emits one unbounded scalar per review
    out = Dense(units=1)(flattened)
    log_shape('out.shape', out)

    # Finally, create the model object
    model = Model(inputs=word_indices, outputs=out, name='CNN1')

    return model

In [16]:
# Build the CNN1 model for reviews of FLAGS.max_review_word_count tokens and
# show its layer table. This rebinds `model`.
model = CNN1(
    (FLAGS.max_review_word_count, ),
    word_to_vec_map,
    word_to_index,
    FLAGS,
)
model.summary()

Z1.shape (None, 198, 98, 8)
A1.shape (None, 198, 98, 8)
P1.shape (None, 99, 33, 8) 

Z2.shape (None, 97, 31, 16)
A2.shape (None, 97, 31, 16)
P2.shape (None, 49, 11, 16) 

Z3.shape (None, 45, 7, 32)
A3.shape (None, 45, 7, 32)
P3.shape (None, 23, 3, 32) 

F.shape (None, 2208)
out.shape (None, 1)
Model: "CNN1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         [(None, 200)]             0         
_________________________________________________________________
embedding_6 (Embedding)      (None, 200, 100)          40000100  
_________________________________________________________________
reshape_6 (Reshape)          (None, 200, 100, 1)       0         
_________________________________________________________________
conv2d_12 (Conv2D)           (None, 198, 98, 8)        80        
_________________________________________________________________
re_lu_11 (ReLU)              (N

In [8]:
# import matplotlib.pyplot as plt
# plt.hist([len(X_train[i].split(' ')) for i in range(X_train.shape[0])], bins=40)
# plt.xlim([0, 1000])
# plt.show()

# plt.hist(Y_train, bins=100)
# plt.xlim([0, 1])
# plt.show()

In [5]:
# Configure training: mean squared error loss optimised with Adam; 'mse' is
# additionally reported as a metric.
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mse'])

In [6]:
# Encode both splits with review_to_indices, using the GloVe vocabulary and
# the configured maximum review length.
X_train_indices = review_to_indices(
    X_train, word_to_index, FLAGS.max_review_word_count
)
X_dev_indices = review_to_indices(
    X_dev, word_to_index, FLAGS.max_review_word_count
)

# Train with shuffling enabled; the dev split is passed as validation data.
model.fit(
    X_train_indices,
    Y_train,
    validation_data=(X_dev_indices, Y_dev),
    epochs=FLAGS.num_epochs,
    batch_size=FLAGS.batch_size,
    shuffle=True,
)

Epoch 1/20

KeyboardInterrupt: 