# Learning Embeddings from scratch

In [23]:
#importing libraries
import pandas as pd
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import *
from keras.layers import *
from keras.callbacks import *
from keras.models import load_model  # named explicitly because the model-loading cells rely on it

In [24]:
# Dataset: https://www.kaggle.com/columbine/imdb-dataset-sentiment-analysis-in-csv-format/
# Load the training and validation CSV files (in that order) into DataFrames
train, valid = (
    pd.read_csv(csv_path)
    for csv_path in ('../IMDB_Train.csv', '../IMDB_Valid.csv')
)

In [25]:
# Pull the 'text' and 'label' columns out of each frame via .values
# (no splitting happens here: train and validation come from separate files)
x_train = train['text'].values
y_train = train['label'].values
x_val = valid['text'].values
y_val = valid['label'].values

In [26]:
# Create the Keras text tokenizer with its default settings (no arguments are passed)
tokenizer = Tokenizer()

In [27]:
# Build the tokenizer's vocabulary from the training texts only (validation texts are not used here)
tokenizer.fit_on_texts(list(x_train))

In [28]:
# Convert every review into a sequence of integers using the fitted tokenizer,
# training texts first and validation texts second
x_train_seq, x_val_seq = (
    tokenizer.texts_to_sequences(texts)
    for texts in (x_train, x_val)
)

In [54]:
# NOTE(review): this cell's execution count (In [54]) is out of sequence, and the saved output is a
# 2-D array, i.e. it was run after the padding cell. On a fresh top-to-bottom run, x_train_seq at
# this point is still the unpadded result of tokenizer.texts_to_sequences.
x_train_seq

array([[  477,     5,    63, ...,  5988,     4,  5890],
       [  232,    25, 65084, ...,    20,  6238,   240],
       [38401,   669,   273, ...,    19,  2293,  5475],
       ...,
       [    1,  1376, 33722, ...,  4706,   785,    31],
       [  140,     5,   114, ...,     7,   733,   154],
       [  174,     5,   412, ...,    57,   943,  6779]])

In [55]:
# NOTE(review): this cell's execution count (In [55]) is out of sequence, and the saved output is a
# 2-D array, i.e. it was run after the padding cell. On a fresh top-to-bottom run, x_val_seq at
# this point is still the unpadded result of tokenizer.texts_to_sequences.
x_val_seq

array([[ 111,   93,   50, ...,   11,  339,  154],
       [1018,   11,  821, ...,  710,    4, 7856],
       [   0,    0,    0, ...,    8,   11,   19],
       ...,
       [8609,   42,    4, ...,  126,  104, 1493],
       [1313, 2044, 8480, ...,   63,    1,  182],
       [   3,  114,    4, ...,    2, 1297,  498]])

In [31]:
# Bring every sequence to exactly 100 entries so the batches are rectangular.
# 'pre' is spelled out for both options (it is what pad_sequences uses when they are omitted):
# shorter sequences get zeros added at the front, longer ones lose entries from the front.
x_train_seq = pad_sequences(x_train_seq, maxlen=100, padding='pre', truncating='pre')
x_val_seq = pad_sequences(x_val_seq, maxlen=100, padding='pre', truncating='pre')

In [32]:
# Inspect the padded training sequences (output of pad_sequences with maxlen=100)
x_train_seq

array([[  477,     5,    63, ...,  5988,     4,  5890],
       [  232,    25, 65084, ...,    20,  6238,   240],
       [38401,   669,   273, ...,    19,  2293,  5475],
       ...,
       [    1,  1376, 33722, ...,  4706,   785,    31],
       [  140,     5,   114, ...,     7,   733,   154],
       [  174,     5,   412, ...,    57,   943,  6779]])

In [33]:
# Inspect the padded validation sequences (output of pad_sequences with maxlen=100)
x_val_seq

array([[ 111,   93,   50, ...,   11,  339,  154],
       [1018,   11,  821, ...,  710,    4, 7856],
       [   0,    0,    0, ...,    8,   11,   19],
       ...,
       [8609,   42,    4, ...,  126,  104, 1493],
       [1313, 2044, 8480, ...,   63,    1,  182],
       [   3,  114,    4, ...,    2, 1297,  498]])

In [34]:
# Vocabulary size: one entry per word known to the tokenizer, +1 for padding
size_of_vocabulary = 1 + len(tokenizer.word_index)

# Start an empty sequential model
model = Sequential()

# First layer: a trainable embedding that maps each of the 100 input positions to a 300-d vector
model.add(
    Embedding(
        input_dim=size_of_vocabulary,
        output_dim=300,
        input_length=100,
        trainable=True,
    )
)

In [35]:
# Recurrent layer: 128 units, emitting an output at every timestep (return_sequences=True)
# so the pooling layer below has a sequence to reduce
model.add(LSTM(units=128, return_sequences=True, dropout=0.2))

# Collapse the time axis by taking the maximum of each feature across timesteps
model.add(GlobalMaxPooling1D())

# Classifier head: a 64-unit ReLU layer followed by a single sigmoid output
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))

# Binary cross-entropy loss, Adam optimizer, accuracy reported under the name "acc"
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=["acc"],
)

In [36]:
# Stop training once val_loss has gone 3 epochs without improving
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=3,
    verbose=1,
)

# Write the model to best_model.h5 only when val_acc reaches a new maximum
mc = ModelCheckpoint(
    filepath='best_model.h5',
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)

In [37]:
# Show the layer-by-layer summary of the model.
# summary() prints the table itself and returns None, so it is called directly;
# wrapping it in print() would append a stray "None" line to the output.
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 300)          33661200  
                                                                 
 lstm_1 (LSTM)               (None, 100, 128)          219648    
                                                                 
 global_max_pooling1d_1 (Glo  (None, 128)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_2 (Dense)             (None, 64)                8256      
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 33,889,169
Trainable params: 33,889,169
Non-trainable params: 0
__________________________________________

In [38]:
# Train for at most 10 epochs in mini-batches of 128, scoring the validation data after each epoch.
# The padded sequences and label arrays are passed as they are (the same way the evaluation cell
# passes them), instead of wrapping each one in np.array(), which only made an extra full copy.
# `es` may stop training early and `mc` checkpoints the best model; the returned History is kept.
history = model.fit(
    x_train_seq,
    y_train,
    batch_size=128,
    epochs=10,
    validation_data=(x_val_seq, y_val),
    verbose=1,
    callbacks=[es, mc],
)

Epoch 1/10
Epoch 00001: val_acc improved from -inf to 0.86420, saving model to best_model.h5
Epoch 2/10
Epoch 00002: val_acc did not improve from 0.86420
Epoch 3/10
Epoch 00003: val_acc did not improve from 0.86420
Epoch 4/10
Epoch 00004: val_acc did not improve from 0.86420
Epoch 00004: early stopping


In [39]:
# Replace the in-memory model with the checkpoint that ModelCheckpoint saved to best_model.h5
# (the epoch with the highest val_acc). load_model is provided by the notebook's import cell,
# so no import is repeated here.
model = load_model('best_model.h5')

In [41]:
# Score the reloaded model on the validation data.
# evaluate() returns the loss followed by the accuracy metric; the loss is bound to a real name
# rather than `_`, because assigning to `_` in a notebook overrides IPython's last-output variable.
val_loss, val_acc = model.evaluate(x_val_seq, y_val, batch_size=128)
print('Validation Accuracy: ', val_acc)

Validation Accuracy:  0.8641999959945679


# Using pretrained GloVe word embeddings (frozen, not learned)

In [46]:
# Parse the GloVe text file into a dict that maps each word to its float32 vector.
# Every line is split on whitespace: the first field is the word, the rest are its coefficients.
# download glove embeddings from nlp.stanford.edu/data/glove.6B.zip
embeddings_index = {}

# `with` guarantees the file handle is closed even if a line fails to parse
with open('../Glove/glove.6B.300d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        values = line.split()
        if not values:
            # tolerate blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [47]:
# Weight matrix for the embedding layer: row i holds the GloVe vector of the word whose
# tokenizer index is i. Rows stay all-zero for words that have no GloVe entry.
embedding_matrix = np.zeros((size_of_vocabulary, 300))

for token, row in tokenizer.word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        # word not present in the GloVe vocabulary: leave its row as zeros
        continue
    embedding_matrix[row] = glove_vector

In [48]:
# Same layer stack as the from-scratch model, except that the embedding layer starts from the
# GloVe weight matrix and is frozen (trainable=False), so only the layers after it are trained
model = Sequential([
    # embedding layer initialised with the pretrained vectors
    Embedding(
        input_dim=size_of_vocabulary,
        output_dim=300,
        weights=[embedding_matrix],
        input_length=100,
        trainable=False,
    ),
    # recurrent layer emitting an output at every timestep
    LSTM(units=128, return_sequences=True, dropout=0.2),
    # maximum of each feature across timesteps
    GlobalMaxPooling1D(),
    # classifier head
    Dense(units=64, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])

# Binary cross-entropy loss, Adam optimizer, accuracy reported under the name "acc"
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=["acc"],
)

In [49]:
# Callbacks for this run: stop once val_loss has gone 3 epochs without improving, and write the
# model to best_model_pretrain.h5 only when val_acc reaches a new maximum
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
mc = ModelCheckpoint('best_model_pretrain.h5', monitor='val_acc', mode='max', save_best_only=True, verbose=1)

# Show the layer-by-layer summary of the model.
# summary() prints the table itself and returns None, so it is called directly;
# wrapping it in print() would append a stray "None" line to the output.
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 300)          33661200  
                                                                 
 lstm_2 (LSTM)               (None, 100, 128)          219648    
                                                                 
 global_max_pooling1d_2 (Glo  (None, 128)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_4 (Dense)             (None, 64)                8256      
                                                                 
 dense_5 (Dense)             (None, 1)                 65        
                                                                 
Total params: 33,889,169
Trainable params: 227,969
Non-trainable params: 33,661,200
____________________________________

In [50]:
# Train for at most 10 epochs in mini-batches of 128, scoring the validation data after each epoch.
# The padded sequences and label arrays are passed as they are (the same way the evaluation cell
# passes them), instead of wrapping each one in np.array(), which only made an extra full copy.
# `es` may stop training early and `mc` checkpoints the best model; the returned History is kept.
history = model.fit(
    x_train_seq,
    y_train,
    batch_size=128,
    epochs=10,
    validation_data=(x_val_seq, y_val),
    verbose=1,
    callbacks=[es, mc],
)

Epoch 1/10
Epoch 00001: val_acc improved from -inf to 0.84880, saving model to best_model.h5
Epoch 2/10
Epoch 00002: val_acc improved from 0.84880 to 0.85900, saving model to best_model.h5
Epoch 3/10
Epoch 00003: val_acc improved from 0.85900 to 0.86380, saving model to best_model.h5
Epoch 4/10
Epoch 00004: val_acc improved from 0.86380 to 0.86760, saving model to best_model.h5
Epoch 5/10
Epoch 00005: val_acc improved from 0.86760 to 0.87780, saving model to best_model.h5
Epoch 6/10
Epoch 00006: val_acc did not improve from 0.87780
Epoch 7/10
Epoch 00007: val_acc did not improve from 0.87780
Epoch 8/10
Epoch 00008: val_acc did not improve from 0.87780
Epoch 00008: early stopping


In [52]:
# Replace the in-memory model with the checkpoint that ModelCheckpoint saved to
# best_model_pretrain.h5 (the epoch with the highest val_acc). load_model is provided by the
# notebook's import cell, so no import is repeated here.
model = load_model('best_model_pretrain.h5')

In [53]:
# Score the reloaded model on the validation data.
# evaluate() returns the loss followed by the accuracy metric; the loss is bound to a real name
# rather than `_`, because assigning to `_` in a notebook overrides IPython's last-output variable.
val_loss, val_acc = model.evaluate(x_val_seq, y_val, batch_size=128)
print('Validation Accuracy: ', val_acc)

Validation Accuracy:  0.8777999877929688
