To generate subword vectors, we use Google's sentencepiece (https://github.com/google/sentencepiece) with pretrained byte-pair encoding language models trained on wikipedia (https://github.com/bheinzerling/bpemb).

The cleaned sentences are lowercase only, unified all digits to 0, rid of URL, and with some additional manual anomaly removal.

To generate the subword sequences, we run this following line in Terminal (e.g. merge op = 3000)
spm_encode --model en.wiki.bpe.op3000.model < amazon_sentences.txt.clean > amazon_sentences.bpe3000

In [1]:
# Sentence Embedding with Conv1d
# trained/tested with Amazon review dataset

# Load the necessary libraries
import numpy as np
import pandas as pd
import tensorflow as tf
import gc

from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Embedding
from keras.layers import Input, LSTM, RepeatVector, TimeDistributed, Masking
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D, Flatten
from keras.callbacks import ModelCheckpoint

# The dimensionality of all sentence vectors
vector_dim = 25

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [6]:
from gensim.models import KeyedVectors
import codecs

# Load the pre-trained subwrod language model and subword sequences
# Training data and test data are preemptively concatenated in one single training file for convenience
# First 40k for training, last 10k for testing

model = KeyedVectors.load_word2vec_format("../input/en.wiki.bpe.op3000.d" + str(vector_dim) + ".w2v.bin", binary=True)
# f_train = codecs.open("../input/movie_phrases.bpe3000", "r", "utf-8")
f_train = codecs.open("../input/amazon_train_sentences.bpe3000", "r", "utf-8")
amazon_train = [line for line in f_train.readlines() if line.strip()]
f_train.close()

# Same goes for loading the labels
sentiment_train = np.genfromtxt('../input/amazon_train_labels.txt', delimiter=',')
# sentiment_train = np.genfromtxt('../input/movie_labels.txt', delimiter=',')
num_labels = len(np.unique(sentiment_train))

batch_size = 256  # Batch size for training
epochs = 10  # Number of epochs to train for
num_samples = 50000 # Total number of samples used
num_split = 40000

amazon_train = amazon_train[:num_samples] 
sentiment_train = sentiment_train[:num_samples]
amazon_train_labels = np_utils.to_categorical(sentiment_train, num_labels)

# Due to memory limitation, we only take in the first 800 subwords of each sentence
# for zero padding
max_encoder_seq_length = 800

gc.collect()

7

In [3]:
encoder_input_data = np.zeros(
    (num_samples, max_encoder_seq_length, vector_dim),
    dtype='float32')

# Feed in the subword vectors
for i in range(len(amazon_train)):
    try:
        input_vec = model[amazon_train[i].split()]
    except KeyError:
        # In case of non-existing vocabulary, we simply replace it with the previous subword
        # Very rare
        print(i)
        input_vec = model[amazon_train[i-1].split()]
    
    for j in range(len(input_vec)):
        encoder_input_data[i][j] = + input_vec[j][:]     

In [4]:
# 3-layer 1D-CNN with maxpooling + 2-layer fully-connected MLP for sentiment classification

sequence_input = Input(shape=(max_encoder_seq_length,vector_dim,), dtype='float32')
l_cov1= Conv1D(vector_dim, 25, activation='relu')(sequence_input)
l_pool1 = MaxPooling1D(3)(l_cov1)
l_cov2 = Conv1D(vector_dim, 15, activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(3)(l_cov2)
l_cov3 = Conv1D(vector_dim, 5, activation='relu')(l_pool2)
l_pool3 = MaxPooling1D(40)(l_cov3)  # global max pooling
l_flat = Flatten(name="sent_embedding")(l_pool3)
l_dense = Dense(vector_dim*2, activation='relu')(l_flat)
l_dense_2 = Dense(vector_dim, activation='relu')(l_dense)
preds = Dense(num_labels, activation='softmax')(l_dense)

model_k = Model([sequence_input], preds)
model_k.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 800, 25)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 776, 25)           15650     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 258, 25)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 244, 25)           9400      
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 81, 25)            0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 77, 25)            3150      
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 1, 25)             0         
__________

In [7]:
# Compile and start training
model_k.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

# We feed the test data as the validation data in this Keras model to check out the overall accuracy
# Validation data are NOT used in training per Keras design

# We could use Keras to the save the best model using its checkpoint feature, which is not allowed on Kaggle
# checkpointer = ModelCheckpoint(filepath='/tmp/weights.hdf5', verbose=1, save_best_only=True)

model_k.fit(encoder_input_data[:num_split], amazon_train_labels[:num_split],
          batch_size=batch_size,
          epochs=1,
          # epochs = epochs,  
          validation_data=[encoder_input_data[num_split:], amazon_train_labels[num_split:]],
          # callback = [checkpointer]
           )

# Save the model (not allowed on Kaggle)
# model_k.save('/amazon_40k_sample_d' + str(vector_dim) + '.h5')

# Get the trained sentence vectors for sentimental analysis and tSNE
intermediate_layer_model = Model(inputs=model_k.input,
                                 outputs=model_k.get_layer("sent_embedding").output)

X = [encoder_input_data]
intermediate_output = intermediate_layer_model.predict(X)
# Save all the sentence vectors (not allowed on Kaggle)
# np.savetxt("data/amazon/amazon_embedding_40k_conv1d_d" +str(vector_dim)+ ".csv", intermediate_output, delimiter=",")
print(intermediate_output.shape)

Train on 40000 samples, validate on 10000 samples
Epoch 1/1
(50000, 25)


In [None]:
# Generate the 2D tSNE graph using the 10k sentence vectors of the test data/labels
from sklearn.manifold import TSNE
X_embedded = TSNE(n_components=2).fit_transform(intermediate_output[num_split:])
import matplotlib.pyplot as plt
plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=sentiment_train[num_split:])
plt.show()