# Self Made Word2Vec Embeddings

In [1]:
import lib
import numpy as np
import spacy
import tensorflow as tf

from functional import seq
from importlib import reload
from joblib import Memory
from lib.data import load_data as load_data_lib
from lib.text_preprocessing import preprocess_tokens, lemmatize, remove_stopwords, to_lower, tokenize, words_only
from lib.vocabulary import Vocabulary
from tensorflow import keras
from tqdm import tqdm

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
# tf.debugging.set_log_device_placement(True)

reload(lib)

Num GPUs Available:  1


<module 'lib' (namespace)>

In [2]:
mem = Memory('./data/cache', verbose=0)


## Prepare Raw Data

In [3]:
preprocess_pipeline = [lemmatize, to_lower, words_only, remove_stopwords, tokenize]
lang = spacy.load('de_dep_news_trf')

@mem.cache
def load_data():
    res = load_data_lib()
    return res

data = load_data()

In [33]:
@mem.cache
def preprocess_data():
    return seq(tqdm(data['text'].to_numpy()))\
        .map(lambda text: preprocess_tokens(text, lang, preprocess_pipeline))\
        .to_list()

samples = preprocess_data()
len(samples)

20070

In [58]:
#preprocess_data.clear()
#samples[:3]
samples = samples[:10]

In [59]:
vocabulary = seq(tqdm(samples))\
    .fold_left(lib.vocabulary.Vocabulary.builder(), lambda voc, sentence: voc.add_sample(sentence))\
    .build()
vocabulary

100%|██████████| 10/10 [00:00<00:00, 1009.97it/s]


Vocabulary(name='default', size=1806, default_sample_length=2347)

In [63]:
list(vocabulary.token2index.items())[:10]

[('<EMPTY>', 0),
 ('the', 1),
 ('and', 2),
 ('we', 3),
 ('of', 4),
 ('to', 5),
 ('our', 6),
 ('i', 7),
 ('this', 8),
 ('for', 9)]

In [61]:
#
# Create model (traditional)
#

tm_model_in = keras.Input(shape=vocabulary.size, name='input')
tm_embedding_in = keras.layers.Dense(50, activation=keras.activations.relu, name='embedding')
tm_embedding_out = tm_embedding_in(tm_model_in)
tm_softmax_in = keras.layers.Dense(vocabulary.size, activation=keras.activations.softmax, name='softmax')
tm_softmax_out = tm_softmax_in(tm_embedding_out)

In [62]:
#
# ...the model to learn the embedding (traditional)
#

tm_train_model = keras.Model(inputs=[tm_model_in], outputs=[tm_softmax_out], name='word2vec_traditional')
tm_train_model.compile(
    loss=keras.losses.CategoricalCrossentropy(from_logits=True),
    optimizer=keras.optimizers.RMSprop(),
    metrics=["accuracy"])


tm_train_model.summary()

Model: "word2vec_traditional"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 1806)]            0         
_________________________________________________________________
embedding (Dense)            (None, 50)                90350     
_________________________________________________________________
softmax (Dense)              (None, 1806)              92106     
Total params: 182,456
Trainable params: 182,456
Non-trainable params: 0
_________________________________________________________________


In [39]:
#
# ...the model to calculate an embedding vector the tokens (taditional)
#

tm_calc_embedding = keras.Model(inputs=[tm_train_model.input], outputs=[tm_train_model.get_layer('embedding').output])

In [40]:
#
# Create model
#

dim = 50

word_input = keras.Input(shape=1)
context_word_input = keras.Input(shape=1)

embedding = keras.layers.Embedding(vocabulary.size, dim, input_length=1, name='embedding')
word = keras.layers.Reshape((dim, 1))(embedding(word_input))
context_word = keras.layers.Reshape((dim, 1))(embedding(context_word_input))
similarity = keras.layers.Dot(1, normalize=True, name='similarity')([word, context_word])

dot_product = keras.layers.Reshape((1,))(keras.layers.Dot(1, name='dot_product')([word, context_word]))
output = keras.layers.Dense(1, activation=keras.activations.sigmoid, name='output')(dot_product)

model = keras.Model(inputs=[word_input, context_word_input], outputs=output)
model.compile(loss=keras.losses.binary_crossentropy, optimizer=keras.optimizers.RMSprop())

validation_model = keras.Model(inputs=[word_input, context_word_input], outputs=similarity)

model.summary()

Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 50)        7293200     input_5[0][0]                    
                                                                 input_6[0][0]                    
__________________________________________________________________________________________________
reshape_6 (Reshape)             (None, 50, 1)        0           embedding[0][0]            

## Train the embedding

In [52]:
#
# Create a custom generator for generating Training samples from text.
#

from typing import List, Union
import math

class Word2VecDataGenerator(tf.keras.utils.Sequence):

    def __init__(self, text_samples: Union[str, List[str]], voc: Vocabulary, window_size: int = 4, group_size: int = 100):
        self._window_size = window_size
        self._samples_idx = voc.samples_to_indices(text_samples, include_oov=False)
        self._voc = voc
        self._group_size = group_size

    def __getitem__(self, index):
        start_idx = index * self._group_size
        end_idx = start_idx + self._group_size

        words = np.zeros([0], dtype='int32')
        context_words = np.zeros([0], dtype='int32')
        labels = np.zeros([0], dtype='int8')

        for i in range(start_idx, min(end_idx, self._samples_idx.shape[0])):
            inputs_i, labels_i = self.__get_minibatch__(i)
            words = np.concatenate([words, inputs_i[0]])
            context_words = np.concatenate([context_words, inputs_i[1]])
            labels = np.concatenate([labels, labels_i])

        return [words, context_words], labels

    def __get_minibatch__(self, index):
        sample = self._samples_idx[index]
        couples, labels = tf.keras.preprocessing.sequence.skipgrams(
            sample, self._voc.size, window_size=self._window_size)

        if len(couples) > 0:
            words, context_words = zip(*couples)
        else:
            words, context_words, labels = [[],[],[]]

        words = np.array(words, dtype='int32')
        context_words = np.array(context_words, dtype='int32')
        labels = np.array(labels, dtype='int8')

        return [words, context_words], labels


    def __len__(self):
        return math.ceil(self._samples_idx.shape[0] / self._group_size)

In [56]:
class SimilarityLogger(tf.keras.callbacks.Callback):

    def __init__(
            self, val_model: keras.Model, voc: Vocabulary, interval: int = 10, validation_size: int = 10, k: int = 5):
        super().__init__()
        self.vocabulary = voc
        self.interval = interval
        self.validation_size = validation_size
        self.validation_model = val_model
        self.k = k

    def on_epoch_begin(self, epoch, logs=None):
        if epoch == 0:
            self.print_similarities()

    def on_epoch_end(self, epoch: int, logs: dict = None) -> None:
        if (epoch + 1) % self.interval != 0:
            return
        else:
            self.print_similarities()


    def print_similarities(self) -> None:
        items = list(self.vocabulary.token2index.keys())[1:self.validation_size+1]
        for item in items:
            sim = self.get_similarity(item)
            nearest = (-sim).argsort()[1:self.k + 1]

            log = f"Nearest neighbors to `{item}`:"
            for n in nearest:
                n_word = self.vocabulary.index_to_token(n)
                log = f"{log} `{n_word}`"

            print(log)

    def get_similarity(self, token: str):
        sim = np.zeros(self.vocabulary.size)
        word_array = np.zeros((1,1))
        word_array[0,0] = self.vocabulary.token_to_index(token)
        context_array = np.zeros((1,1))

        for i in tqdm(range(1, self.vocabulary.size), desc=f"`{token}` similarities"):
            context_array[0,0] = i
            sim[i] = self.validation_model.predict_on_batch([word_array, context_array])

        return sim


In [57]:
sim_logger = SimilarityLogger(validation_model, vocabulary, interval=5)

history = model.fit(
    Word2VecDataGenerator(samples, vocabulary),
    epochs=20,
    callbacks=[sim_logger])

the similarities:   0%|          | 0/145863 [00:00<?, ?it/s]


Epoch 1/20


AttributeError: 'Functional' object has no attribute 'predict_function'