# pre-Train word2vec Embedding with CNN model
word2vec algorithm is an approach to learning a word embedding from a text corpus in a standalone way. The benefit of this method is reduce high-quality word embeddings, in terms of space and time complexity.

word2vec algorithm processes document by sentences. Passing cleaned sentences from the training data with specify the size of the embedding vector space (vector_size=100), the number of words' correlation will take into account (window=5) maximizing, number of CPU cores (workers=4) or unsetting, and the minimun occurence count for word consider in the vocabulary (min_count=1) 

In [1]:
from string import punctuation
from os import listdir
from gensim.models import Word2Vec
# from nltk.corpus import stopwords
from nltk import sent_tokenize


def load_doc(fn):
    file = open(fn, 'r')
    doc = file.read()
    file.close()
    return doc


def clean_doc(doc, vocab):
    lines = list()
    sentences = sent_tokenize(doc)
    for sent in sentences:
        tokens = sent.split()
        table = str.maketrans('', '', punctuation)
        tokens = [w.translate(table) for w in tokens]
        tokens = [w for w in tokens if w in vocab]
        lines.append(tokens)
    return lines


def process_docs(directory, vocab, is_train):
    lines = list()
    for fn in listdir(directory):
        if is_train and fn.startswith('cv9'):
            continue
        if not is_train and not fn.startswith('cv9'):
            continue
        path = directory + '/' + fn
        doc = load_doc(path)
        tokens = clean_doc(doc, vocab)
        lines += tokens
    return lines

vocab = load_doc('pickled_data/vocab.txt')
vocab = set(vocab.split('\n'))

In [3]:
pos = process_docs('data/txt_sentoken/pos', vocab, True)
neg = process_docs('data/txt_sentoken/neg', vocab, True)
sentences = neg + pos
print(len(sentences), sentences[:5])

64190 [['dont', 'think', 'kevin', 'kline', 'drag', 'funny', 'wait'], ['til', 'see', 'smith', 'even', 'less', 'funny'], ['time', 'jim', 'west', 'smith', 'disguised', 'belly', 'dancer', 'bail', 'captured', 'comrade', 'artemus', 'gordon', 'kline', 'clutches', 'evil', 'dr'], ['loveless', 'branagh', 'unequivocally', 'bored', 'wild', 'wild', 'west', 'new', 'summer', 'blockbuster', 'men', 'black', 'director', 'barry', 'sonnenfeld'], ['old', 'west', 'really', 'breeding', 'ground', 'high', 'comedy', 'anyway']]


In [None]:
# training model
model = Word2Vec(sentences, vector_size=100, window=5, workers=3, min_count=1, epochs=50)
words = model.wv.key_to_index
print('vocab size= %d' % len(words))

In [None]:
## save model
# model.save('pickled_data/word2vec_embedding.mod')

In [None]:
## save wv in text
# model.wv.save_word2vec_format('embedding_w2v.txt', binary=False, write_header=False)

In [10]:
import numpy as np


# load and convert w2v vector spaces into embedding layer
def load_embedding(fn):
    file = open(fn, 'r')
    lines = file.readlines()
    file.close()
    embedding = dict()
    for line in lines:
        tokens = line.split()
        embedding[tokens[0]] = np.asarray(tokens[1:], dtype='float32')
        
    return embedding


# create a weight mextrix for embedding
def get_weight_matrix(embedding, vocab):
    vocab_size = len(vocab) + 1
    weight_matrix = np.zeros((vocab_size, 100))
    for word, i in vocab.items():
        # Tokenizer's integer mapping for get vocab
        weight_matrix[i] = embedding.get(word)
    return weight_matrix

embedding = load_embedding('embedding_w2v.txt')
display(len(embedding))

26896

In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Flatten, Embedding, Conv1D, MaxPooling1D

In [6]:
# max length of training docs
max_length = max([len(s) for s in sentences])
# convert tokens to lines for Tokenizer
sentences = [' '.join(sent) for sent in sentences]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

# encoding training docs
encoded_docs = tokenizer.texts_to_sequences(sentences)

# pad sequences
Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
ytrain = np.array([0 for _ in range(len(neg))] + [1 for _ in range(len(pos))])
display(Xtrain.shape, ytrain.shape, Xtrain[0])

(64190, 553)

(64190,)

array([11,  8,  6,  5,  5, 15,  2,  6, 22, 22,  1, 21,  2,  6, 22,  9,  2,
        6,  1, 11,  7,  3, 14, 18, 13,  6,  6, 17, 20,  3,  2,  5,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0

In [8]:
# loading test sets
pos = process_docs('data/txt_sentoken/pos', vocab, False)
neg = process_docs('data/txt_sentoken/neg', vocab, False)
sentences = neg + pos
sentences = [' '.join(sent) for sent in sentences]

# encoding test sets
encoded_docs = tokenizer.texts_to_sequences(sentences)
# pad sequences
Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
ytest = np.array([0 for _ in range(len(neg))] + [1 for _ in range(len(pos))])
display(Xtest.shape, ytest.shape, Xtest[0])

(7342, 553)

(7342,)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [13]:
vocab_size = len(tokenizer.word_index) + 1

# convert the embedding to vector and transform with a Embedding() layer
embedding_vectors = get_weight_matrix(embedding, tokenizer.word_index)
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_vectors], 
                            input_length=max_length, trainable=False)

## trainable=False to ensure network does not try to adap the pre-learned vectors as a part of training network

In [14]:
# define keras model
model = Sequential()
model.add(embedding_layer)
# kernel_size = window in w2v for neighbors of word
model.add(Conv1D(filters=128, kernel_size=5, activation='relu')) 
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 553, 100)          2700      
_________________________________________________________________
conv1d (Conv1D)              (None, 549, 128)          64128     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 274, 128)          0         
_________________________________________________________________
flatten (Flatten)            (None, 35072)             0         
_________________________________________________________________
dense (Dense)                (None, 1)                 35073     
Total params: 101,901
Trainable params: 99,201
Non-trainable params: 2,700
_________________________________________________________________


In [15]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# train model
model.fit(Xtrain, ytrain, epochs=5, verbose=2)

Epoch 1/5
2006/2006 - 152s - loss: nan - accuracy: 0.4966
Epoch 2/5
2006/2006 - 142s - loss: nan - accuracy: 0.4965
Epoch 3/5
2006/2006 - 142s - loss: nan - accuracy: 0.4965
Epoch 4/5
2006/2006 - 153s - loss: nan - accuracy: 0.4965
Epoch 5/5
2006/2006 - 154s - loss: nan - accuracy: 0.4965


<keras.callbacks.History at 0x7fc5878617d0>

In [16]:
model.evaluate(Xtest, ytest, verbose=0)

[nan, 0.4934622645378113]

In [17]:
# save model
model.save('pickled_data/embedding_cnn.h5') # indicate saved file under HDF5 optimizer

## for load model
# #import tensorflow as tf

# #new_model = tf.keras.models.load_model('pickled_data/embedding_cnn.h5')
# #new_model.summary()
# #loss, acc = new_model.evaluate(Xtesting, ytesting, verbose=0)

### Summarizing Results
Note: Your results may vary given the stochastic nature of the algorithm or evaluation procedure, or differences in numerical precision. Consider running the example a few times and compare the average outcome.

In fact, performance was a lot worse. The results show that the training dataset was learned successfully, but evaluation on the test dataset was very poor, at just above 50% accuracy.

The cause of the poor test performance may be because of the chosen word2vec configuration or the chosen neural network configuration.

The weights in the embedding layer can be used as a starting point for the network, and adapted during the training of the network. We can do this by setting ‘trainable=True‘ (the default) in the creation of the embedding layer.

Repeating the experiment with this change shows slightly better results, but still poor.

It is possible to use pre-trained word vectors prepared on very large corpora of text data.

For example, both Google and Stanford provide pre-trained word vectors that you can download, trained with the efficient word2vec and GloVe methods respectively.

We can download pre-trained GloVe vectors from the Stanford webpage. Specifically, vectors trained on Wikipedia data (glove.6B.zip (http://nlp.stanford.edu/data/glove.6B.zip)) --> glove.6B.txt

### Clearly understand further reading
Implementing a CNN for Text Classification in TensorFlow
http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/

RNNs in Tensorflow, a Practical Guide and Undocumented Features
http://www.wildml.com/2016/08/rnns-in-tensorflow-a-practical-guide-and-undocumented-features/


In [None]:
# create a weight matrix for the Embedding layer from a loaded embedding
def get_weight_matrix(embedding, vocab):
    # total vocabulary size plus 0 for unknown words
    vocab_size = len(vocab) + 1
    # define weight matrix dimensions with all 0
    weight_matrix = np.zeros((vocab_size, 100))
    # step vocab, store vectors using the Tokenizer's integer mapping
    for word, i in vocab.items():
    vector = embedding.get(word)
        if vector is not None:
            weight_matrix[i] = vector
    return weight_matrix

# # load embedding from file
# raw_embedding = load_embedding('glove.6B.100d.txt')
# # get vectors in the right order
# embedding_vectors = get_weight_matrix(raw_embedding, tokenizer.word_index)
# # create the embedding layer
# embedding_layer = Embedding(vocab_size, 100, weights=[embedding_vectors], 
#                             input_length=max_length, trainable=False)
 