## The CBOW Architecture

### Build the corpus vocabulary

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
# let pandas show up to 200 characters per column when displaying DataFrames
pd.options.display.max_colwidth = 200
%matplotlib inline

In [None]:
# Toy corpus: eight short documents; `labels` holds one category per document,
# in the same order.
corpus = ['The sky is blue and beautiful,',
         'Love this blue and beautiful sky!',
         'The quick brown fox jumps over the lazy dog.',
         "A king's breakfast has sausages, ham, bacon, eggs, toast, and beans",
         'I love green eggs, ham, sausages and bacon!',
         # "anf" was a typo for "and"; left as-is it survives stop-word
         # filtering and ends up as a bogus vocabulary token
         'The brown fox is quick and the blue dog is lazy!',
         'The sky is very blue and the sky is very beautiful today',
         'The dog is lazy but the brown fox is quick!']
labels = ['weather', 'weather', 'animals', 'food', 'food', 'animals', 'weather', 'animals']

# Keep the documents as a NumPy array so they can be fed to vectorized helpers.
corpus = np.array(corpus)
corpus_df = pd.DataFrame({'Document' : corpus,
                         'Category' : labels})
# Pin the column order explicitly.
corpus_df = corpus_df[['Document', 'Category']]
corpus_df

In [None]:
# shared tokenizer instance; normalize_document calls wpt.tokenize on each document
wpt = nltk.WordPunctTokenizer()

In [None]:
# English stop words from NLTK; normalize_document drops any token found in this collection
stop_words = nltk.corpus.stopwords.words('english')

In [None]:
def normalize_document(doc):
    """Normalize a single document for vocabulary building.

    Removes every character that is not an ASCII letter or whitespace,
    lower-cases and trims the text, tokenizes it with the module-level
    ``wpt`` tokenizer, drops tokens present in the module-level
    ``stop_words`` collection and joins the remaining tokens with spaces.

    Args:
        doc: The raw document text.

    Returns:
        The normalized document as one space-separated string (empty when
        no token survives).
    """
    # remove special characters; the flags MUST be passed by keyword because
    # the fourth positional parameter of re.sub is `count`, so a positional
    # re.I|re.A would silently cap the number of replacements instead
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I|re.A)
    # lower case and trim surrounding whitespace
    doc = doc.lower()
    doc = doc.strip()

    # tokenize document
    tokens = wpt.tokenize(doc)

    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)

    return doc

In [None]:
# element-wise wrapper so normalize_document can be applied to a whole array/list of documents
normalize_corpus = np.vectorize(normalize_document)

In [None]:
# normalize every document of the toy corpus and display the result
norm_corpus = normalize_corpus(corpus)
norm_corpus

In [None]:
from nltk.corpus import gutenberg
from string import punctuation

In [None]:
# sentences of 'bible-kjv.txt' from NLTK's Gutenberg corpus; each sentence is iterated token by token below
bible = gutenberg.sents('bible-kjv.txt')

In [None]:
# one string holding the punctuation characters followed by the ten digits
remove_terms = punctuation + '0123456789'

In [None]:
# lower-case each token, skipping tokens for which `word in remove_terms` holds
# (NOTE(review): remove_terms is a str, so this is a substring test — confirm that is intended)
norm_bible = [[word.lower() for word in sent if word not in remove_terms] for sent in bible]
# re-join each token list into a single sentence string
norm_bible = [' '.join(tok_sent) for tok_sent in norm_bible]
# normalize every sentence and lazily drop the falsy (empty) results
norm_bible = filter(None, normalize_corpus(norm_bible))
# materialize the filter, keeping only sentences with more than two words left
norm_bible = [tok_sent for tok_sent in norm_bible if len(tok_sent.split()) > 2]

# bible and norm_bible are not index-aligned after filtering, so the two
# sample lines below need not be the same sentence
print('Total lines:', len(bible))
print('\nSample line:', bible[10])
print('\nProcessed line:', norm_bible[10])

## Implementing the Continuous Bag of Words (CBOW) model

### Build the corpus vocabulary

In [None]:
from keras.preprocessing import text
from keras.utils import np_utils
from keras.preprocessing import sequence

In [None]:
# fit a Keras tokenizer on the normalized sentences and keep its word -> id mapping
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(norm_bible)
word2id = tokenizer.word_index

In [None]:
# build vocabulary of unique words
# add a 'PAD' entry mapped to id 0 (it is inserted last into the dict)
word2id['PAD'] = 0
# reverse lookup: id -> word
id2word = {v : k for k, v in word2id.items()}
# encode every sentence as a list of word ids
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in norm_bible]

# number of entries in word2id, including the 'PAD' entry added above
vocab_size = len(word2id)
# passed as output_dim of the Embedding layer (embedding vector size)
embed_size = 100
window_size = 2 # context window size

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])

### Build a CBOW (context, target) generator

In [None]:
def generate_context_word_pairs(corpus, window_size, vocab_size):
    """Yield one (context, target) training pair per word of the corpus.

    Args:
        corpus: Iterable of sentences, each a sequence of word ids.
        window_size: Number of neighbouring words taken on each side of
            the target word.
        vocab_size: Number of classes passed to ``to_categorical`` for the
            target word.

    Yields:
        Tuples ``(x, y)`` where ``x`` is the result of ``pad_sequences`` on
        the single list of neighbouring word ids (``maxlen`` is
        ``2 * window_size``) and ``y`` is the result of ``to_categorical``
        on the single target word id.
    """
    context_length = 2 * window_size
    for words in corpus:
        sentence_length = len(words)
        for index, word in enumerate(words):
            # clamp the window to the sentence bounds up front
            first = max(index - window_size, 0)
            stop = min(index + window_size + 1, sentence_length)
            neighbours = [words[pos] for pos in range(first, stop) if pos != index]

            x = sequence.pad_sequences([neighbours], maxlen = context_length)
            y = np_utils.to_categorical([word], vocab_size)
            yield (x, y)

In [None]:
# Preview a few (context, target) pairs, skipping contexts that contain id 0

pair_stream = generate_context_word_pairs(corpus = wids, window_size = window_size, vocab_size = vocab_size)
i = 0
for x, y in pair_stream:
    if 0 in x[0]:
        continue

    context = [id2word[w] for w in x[0]]
    target = id2word[np.argwhere(y[0])[0][0]]
    print('Context (X):', context, '-> Target (Y):', target)

    # stop once the counter reaches 10
    if i == 10:
        break
    i += 1

In [None]:
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda

In [None]:
# build CBOW architecture: embed the context ids, average the embeddings
# along axis 1, then apply a softmax layer with vocab_size outputs
cbow = Sequential([
    Embedding(input_dim = vocab_size, output_dim = embed_size, input_length = window_size * 2),
    Lambda(lambda ctx: K.mean(ctx, axis = 1), output_shape = (embed_size,)),
    Dense(vocab_size, activation = 'softmax'),
])
cbow.compile(loss = 'categorical_crossentropy', optimizer = 'rmsprop')

In [None]:
# view model summary
# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output
cbow.summary()

# visualize model structure
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

# render the layer graph top-to-bottom as an inline SVG
SVG(model_to_dot(cbow, show_shapes = True, show_layer_names = False,
                rankdir = 'TB').create(prog = 'dot', format = 'svg'))

### Train the model

In [None]:
# Train for five epochs, feeding the model one (context, target) pair per batch.
for epoch in range(1, 6):
    # loss accumulated over every pair seen in this epoch
    loss = 0.0
    i = 0
    for x, y in generate_context_word_pairs(corpus = wids, window_size = window_size, vocab_size = vocab_size):
        i += 1
        loss += cbow.train_on_batch(x, y)
        if i % 100000 == 0:
            print('Processed {} (context, word) pairs'.format(i))

    # report inside the epoch loop so every epoch's loss is printed, not just
    # the last one
    print('Epoch:', epoch, '\tLoss:', loss)
    print()

In [None]:
# first weight array of the model (the Embedding layer is added first): one row per word id
weights = cbow.get_weights()[0]
# drop row 0, which belongs to the 'PAD' id
weights = weights[1:]
print(weights.shape)

# Label the rows by id explicitly: after the slice, row r holds the vector of
# word id r + 1. Using list(id2word.values())[1:] relies on dict insertion
# order, and 'PAD' is inserted last, which shifts every label by one word.
pd.DataFrame(weights, index = [id2word[wid] for wid in range(1, len(weights) + 1)]).head()

In [None]:
from sklearn.metrics.pairwise import euclidean_distances

# pairwise Euclidean distances between all embedding rows
distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape)

# for each query word, collect the five closest words in embedding space
similar_words = {}
for search_term in ['god', 'jesus', 'noah', 'egypt', 'john', 'gospel', 'moses', 'famine']:
    # row r of the matrix belongs to word id r + 1
    distances = distance_matrix[word2id[search_term] - 1]
    # position 0 of the sorted order is skipped; "+ 1" maps rows back to ids
    nearest_ids = distances.argsort()[1 : 6] + 1
    similar_words[search_term] = [id2word[idx] for idx in nearest_ids]

similar_words

## The Skip Gram model

### Build the corpus vocabulary

In [None]:
from keras.preprocessing import text

# fit a fresh tokenizer so word2id starts without the 'PAD' entry added earlier
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(norm_bible)

# word -> id mapping and its inverse
word2id = tokenizer.word_index
id2word = {v : k for k, v in word2id.items()}

# one more than the number of indexed words
# NOTE(review): presumably leaves room for index 0, which word_index does not seem to assign — confirm in the Keras docs
vocab_size = len(word2id) + 1
embed_size = 100

# encode every sentence as a list of word ids
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in norm_bible]
print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])

In [None]:
from keras.preprocessing.sequence import skipgrams

# generate skip-grams: one skipgrams() result per encoded sentence
skip_grams = [skipgrams(wid, vocabulary_size = vocab_size, window_size = 10) for wid in wids]

# view the first ten sample skip-grams of the first sentence
first_sentence = skip_grams[0]
pairs, labels = first_sentence[0], first_sentence[1]
for i in range(10):
    target_id, context_id = pairs[i][0], pairs[i][1]
    print("({:s} ({:d}), {:s} ({:d})) -> {:d}".format(
        id2word[target_id], target_id,
        id2word[context_id], context_id,
        labels[i]))

### Build the skip-gram model architecture

In [None]:
from keras.layers import Concatenate
from keras.layers.core import Dense, Reshape
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras import Model

# build skip-gram architecture: two single-id embedding branches, one for the
# first element of each pair and one for the second
word_model = Sequential([
    Embedding(vocab_size, embed_size,
              embeddings_initializer = 'glorot_uniform',
              input_length = 1),
    Reshape((embed_size,)),
])

context_model = Sequential([
    Embedding(vocab_size, embed_size,
              embeddings_initializer = 'glorot_uniform',
              input_length = 1),
    Reshape((embed_size,)),
])

# concatenate both embeddings and score the pair with a single sigmoid unit
merged = Concatenate(axis = 1)([word_model.output, context_model.output])
relevance = Dense(1, kernel_initializer = 'glorot_uniform', activation = 'sigmoid')(merged)

model = Model([word_model.input, context_model.input], relevance)

model.compile(loss = 'mean_squared_error', optimizer = 'rmsprop')

# view model summary
# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output
model.summary()

# visualize model structure
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

# render the layer graph top-to-bottom as an inline SVG
SVG(model_to_dot(model, show_shapes = True, show_layer_names = False,
                rankdir = 'TB').create(prog = 'dot', format = 'svg'))

### Train the model

In [None]:
# Train for five epochs; the skip-grams of each sentence form one batch.
for epoch in range(1, 6):
    loss = 0
    for i, elem in enumerate(skip_grams):
        couples, relevance = elem[0], elem[1]
        # transpose the list of pairs once into (first elements, second elements)
        columns = list(zip(*couples))
        pair_first_elem = np.array(columns[0], dtype = 'int32')
        pair_second_elem = np.array(columns[1], dtype = 'int32')
        labels = np.array(relevance, dtype = 'int32')
        X = [pair_first_elem, pair_second_elem]
        Y = labels
        # progress report; i is the index of the current skip_grams element
        if i % 10000 == 0:
            print('Processed {} (skip_first, skip_second, relevance) pairs'.format(i))
        loss += model.train_on_batch(X, Y)

    print('Epoch:', epoch, 'Loss:', loss)