# Bag of Words Meets Bags of Popcorn

In [24]:
import numpy as np
import pandas as pd

from emo_utils import *
from utils import *

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import nltk
from nltk.tokenize import TweetTokenizer

### GloVe

In [15]:
# Path of the pre-trained GloVe vectors to load; the commented-out lines are
# alternative embedding files that can be swapped in.
glove_filename = 'glove/glove.6B/glove.6B.50d.txt'
# glove_filename = 'glove/glove.6B/glove.6B.300d.txt'
# glove_filename = 'glove/glove.twitter.27B/glove.twitter.27B.50d.txt'
# glove_filename = 'glove/glove.twitter.27B/glove.twitter.27B.200d.txt'
# NOTE(review): read_glove_vecs is not defined in this notebook; presumably it comes
# from one of the star-imported helper modules (emo_utils / utils) — confirm.
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs(glove_filename)

# Infer the embedding dimensionality from the vector of one vocabulary entry.
# NOTE(review): assumes 'apple' is present in every candidate GloVe file — confirm.
word_vec_dims = word_to_vec_map['apple'].shape[0]
print('word vectors dimensions: ' + str(word_vec_dims))

word vectors dimensions: 50


In [16]:
# Spot-check the three GloVe lookups with a sample word and a sample index.
word = "cucumber"
index = 289846  # NOTE(review): hard-coded; presumably only meaningful for the currently loaded GloVe file — verify

# Only the last expression of a cell is displayed, so the first two lookups
# are evaluated but produce no visible output.
word_to_index[word]
index_to_word[index]
word_to_vec_map[word]

array([ 0.68224 , -0.31608 , -0.95201 ,  0.47108 ,  0.56571 ,  0.13151 ,
        0.22457 ,  0.094995, -1.3237  , -0.51545 , -0.39337 ,  0.88488 ,
        0.93826 ,  0.22931 ,  0.088624, -0.53908 ,  0.23396 ,  0.73245 ,
       -0.019123, -0.26552 , -0.40433 , -1.5832  ,  1.1316  ,  0.4419  ,
       -0.48218 ,  0.4828  ,  0.14938 ,  1.1245  ,  1.0159  , -0.50213 ,
        0.83831 , -0.31303 ,  0.083242,  1.7161  ,  0.15024 ,  1.0324  ,
       -1.5005  ,  0.62348 ,  0.54508 , -0.88484 ,  0.53279 , -0.085119,
        0.02141 , -0.56629 ,  1.1463  ,  0.6464  ,  0.78318 , -0.067662,
        0.22884 , -0.042453])

### Utils

In [29]:
# tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)

def split_sentence(sentence):
    """Tokenize ``sentence`` with NLTK's ``word_tokenize`` and return the tokens."""
    tokens = nltk.word_tokenize(sentence)
    return tokens

def sentence_len(sentence):
    """Return how many tokens ``split_sentence`` produces for ``sentence``."""
    tokens = split_sentence(sentence)
    return len(tokens)

### Data

In [19]:
# Load the labelled reviews from the tab-separated training file.
data = pd.read_csv('kg-data/labeledTrainData.tsv', sep='\t')
# data = shuffle(data)
# Show (rows, columns) as a quick sanity check of the load.
data.shape

(25000, 3)

In [20]:
# Preview the first rows to see which columns were loaded.
data.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [22]:
# Inspect one row picked at random. No random seed is set in the visible cells,
# so a different row is shown on every run.
idx = np.random.randint(len(data))
data.iloc[idx]

id                                                      4967_1
sentiment                                                    0
review       SciFi has been having some extremely bad luck ...
Name: 14752, dtype: object

In [23]:
# Separate the frame into model inputs (review text) and targets (sentiment label).
X = data['review'].values
Y = data['sentiment'].values

In [25]:
# Hold out 10% of the rows for testing. shuffle=False keeps the original row
# order, so the test set is taken from the end of the data.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, shuffle=False)

In [30]:
# Sequence length used for the model input. Despite the name, this is the
# *mean* token count of the training reviews (rounded up), not the maximum.
# Alternatives tried: a fixed 15, or the token count of the longest review.
max_len = int(np.ceil(np.mean(
    [len(split_sentence(review)) for review in X_train]
)))

print(f"max len: {max_len}")

max len: 285


### Model

In [31]:
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding

Using TensorFlow backend.


In [32]:
def sentences_to_indices(X, word_to_index, max_len):
    """Convert an array of sentences into a zero-padded matrix of word indices.

    Each sentence is lower-cased and tokenized with ``split_sentence``; at most
    ``max_len`` tokens are used. A token that is not in ``word_to_index`` leaves
    a 0 at its position, as do positions past the end of a short sentence.

    Arguments:
    X -- array of sentence strings (its first dimension is the sentence count)
    word_to_index -- mapping from a word to its index
    max_len -- number of columns in the returned matrix

    Returns:
    X_indices -- array of shape (number of sentences, max_len)
    """
    n_sentences = X.shape[0]
    X_indices = np.zeros((n_sentences, max_len))
    for row in range(n_sentences):
        tokens = split_sentence(X[row].lower())
        # zip() stops at whichever runs out first, truncating to max_len columns.
        for col, token in zip(range(max_len), tokens):
            if token in word_to_index:
                X_indices[row, col] = word_to_index[token]
    return X_indices

In [33]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """Build a frozen Keras Embedding layer initialised with pre-trained word vectors.

    Arguments:
    word_to_vec_map -- mapping from a word to its embedding vector
    word_to_index -- mapping from a word to its row index in the embedding matrix

    Returns:
    embedding_layer -- non-trainable Embedding layer whose weights are the matrix
                       assembled from ``word_to_vec_map``

    Relies on the notebook-level ``word_vec_dims`` for the vector dimensionality.
    """
    vocab_len = len(word_to_index) + 1                  # adding 1 to fit Keras embedding (requirement)
    emb_dim = word_vec_dims      # dimensionality of the GloVe word vectors
    emb_matrix = np.zeros((vocab_len, emb_dim))
    for word, index in word_to_index.items():
        try:
            emb_matrix[index, :] = word_to_vec_map[word]
        except (KeyError, ValueError, TypeError):
            # Missing word or malformed vector: report it and keep an all-zero row.
            # Only these lookup/assignment failures are handled; a bare ``except``
            # would also swallow KeyboardInterrupt and unrelated errors.
            print("bad vector at " + str(index) + ": " + word)
            emb_matrix[index, :] = np.zeros((emb_dim, ))

    # Create the layer, build it so its weight tensor exists, then load the matrix.
    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])
    return embedding_layer

In [37]:
def MyModel_v1(input_shape, word_to_vec_map, word_to_index):
    """Assemble the two-layer LSTM sentiment classifier.

    Arguments:
    input_shape -- shape of one input sample, e.g. (max_len,)
    word_to_vec_map -- mapping from a word to its embedding vector
    word_to_index -- mapping from a word to its index

    Returns:
    model -- uncompiled Keras Model mapping index sequences to one sigmoid output
    """
    sentence_indices = Input(shape=input_shape, dtype='int32')

    # Map every index in the sequence to its pre-trained vector.
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    embedded = embedding_layer(sentence_indices)

    # First LSTM returns the whole sequence so the second LSTM can consume it.
    hidden = LSTM(128, return_sequences=True)(embedded)
    hidden = Dropout(0.5)(hidden)

    # Second LSTM returns only its final output.
    hidden = LSTM(128)(hidden)
    hidden = Dropout(0.5)(hidden)

    # Single sigmoid unit for the binary sentiment label.
    prediction = Dense(1, activation='sigmoid', kernel_initializer='random_normal')(hidden)
    return Model(inputs=sentence_indices, outputs=prediction)

### Compile & fit

In [None]:
# Discard a model left over from an earlier run before building a new one.
# On a fresh kernel `model` is not defined yet, so tolerate its absence instead
# of failing with NameError during Restart & Run All.
try:
    del model
except NameError:
    pass

In [39]:
# Build the network for input sequences of max_len indices and print its layer summary.
model = MyModel_v1((max_len,), word_to_vec_map, word_to_index)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 285)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 285, 50)           20000050  
_________________________________________________________________
lstm_3 (LSTM)                (None, 285, 128)          91648     
_________________________________________________________________
dropout_3 (Dropout)          (None, 285, 128)          0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_4 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total para

In [43]:
# Configure training: binary cross-entropy loss, Adam optimizer, accuracy as the reported metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Encode the training reviews as index sequences, then train for 10 epochs with
# batches of 64, holding out 10% of the training data for validation.
X_train_indices = sentences_to_indices(X_train, word_to_index, max_len)
history = model.fit(X_train_indices, Y_train, epochs=10, batch_size=64, validation_split=0.1)

Train on 20250 samples, validate on 2250 samples
Epoch 1/10
 3008/20250 [===>..........................] - ETA: 3:03 - loss: 0.6935 - acc: 0.5113

In [None]:
# Plot the training curves from the fit history.
# NOTE(review): plot_model_accuracy / plot_model_loss are not defined in this notebook;
# presumably they come from the star-imported helper modules — confirm.
plot_model_accuracy(history)
plot_model_loss(history)

In [None]:
# Encode the held-out reviews the same way as the training set and evaluate the model on them.
X_test_indices = sentences_to_indices(X_test, word_to_index, max_len)
loss, acc = model.evaluate(X_test_indices, Y_test)
print("Test loss = ", loss)
print("Test accuracy = ", acc)

### Kaggle

In [None]:
# Download the competition files into kg-data/ using the Kaggle command-line tool.
# NOTE(review): the data-loading cell earlier in the notebook reads from kg-data/, so on a
# fresh checkout this command has to be run before that cell.
!kaggle competitions download -c word2vec-nlp-tutorial -p 'kg-data'