# Self Made Word2Vec Embeddings

In [1]:
import lib
import numpy as np
import spacy
import tensorflow as tf

from functional import seq
from importlib import reload
from joblib import Memory
from lib.data import load_data as load_data_lib
from lib.text_preprocessing import preprocess_text, lemmatize, remove_stopwords, to_lower, tokenize, words_only
from lib.vocabulary import Vocabulary, OOV_TOKEN, EMPTY_TOKEN
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow import keras
from tqdm import tqdm

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
# tf.debugging.set_log_device_placement(True)

reload(lib)

Num GPUs Available:  1


<module 'lib' (namespace)>

In [2]:
mem = Memory('./data/cache', verbose=0)


## Prepare Raw Data

In [3]:
preprocess_pipeline = [lemmatize, to_lower, words_only, remove_stopwords, tokenize]
lang = spacy.load('de_dep_news_trf')

@mem.cache
def load_data():
    res = load_data_lib()
    return res

data = load_data()

In [4]:
@mem.cache
def preprocess_data():
    return seq(tqdm(data['text'].to_numpy()))\
        .map(lambda text: preprocess_text(text, lang, preprocess_pipeline))\
        .to_list()

samples = preprocess_data()
len(samples)

20070

In [5]:
# preprocess_data.clear()

In [6]:
vocabulary = seq(tqdm(samples))\
    .fold_left(lib.vocabulary.Vocabulary.builder(), lambda voc, sentence: voc.add_sample(sentence))\
    .build()
vocabulary

100%|██████████| 20070/20070 [00:01<00:00, 17819.43it/s]


Vocabulary(name='default', size=145865, default_sample_length=9376)

In [7]:
#
# Create model
#

model_in = keras.Input(shape=vocabulary.size, name='input')
embedding_in = keras.layers.Dense(50, activation=keras.activations.relu, name='embedding')
embedding_out = embedding_in(model_in)
softmax_in = keras.layers.Dense(vocabulary.size, activation=keras.activations.softmax, name='softmax')
softmax_out = softmax_in(embedding_out)

In [8]:
model_in.dtype

tf.float32

In [9]:
#
# ...the model to learn the embedding
#

train_model = keras.Model(inputs=[model_in], outputs=[softmax_out], name='word2vec')
train_model.compile(
    loss=keras.losses.CategoricalCrossentropy(from_logits=True),
    optimizer=keras.optimizers.RMSprop(),
    metrics=["accuracy"])


train_model.summary()

Model: "word2vec"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 145865)]          0         
_________________________________________________________________
embedding (Dense)            (None, 50)                7293300   
_________________________________________________________________
softmax (Dense)              (None, 145865)            7439115   
Total params: 14,732,415
Trainable params: 14,732,415
Non-trainable params: 0
_________________________________________________________________


In [10]:
#
# ...the model to calculate an embedding vector the tokens
#

calc_embedding = keras.Model(inputs=[train_model.input], outputs=[train_model.get_layer('embedding').output])

## Train the embedding

In [14]:
#
# Create a custom generator for generating Training samples from text.
#

from typing import List
import math

class Word2VecDataGenerator(tf.keras.utils.Sequence):

    def __init__(self, text_samples: List[str], voc: Vocabulary, batch_size: int = 1, n: int = 2):
        self._batch_size = batch_size
        self._samples_idx = voc.samples_to_indices(text_samples)
        self._voc = voc
        self._n = n

    def __getitem__(self, index):
        window_size = 2*self._n+1

        start = index * self._batch_size
        end = start + self._batch_size
        batch = self._samples_idx[start:end]

        input_result = np.empty([0, self._voc.size])
        output_result = np.empty([0, self._voc.size])

        for sample in batch:
            windows = [sample[j:j+window_size] for j in range(0, len(sample) - window_size + 1)]
            windows = list(filter(lambda w: EMPTY_TOKEN not in w.tolist(), windows))

            sample_input_result = np.zeros([len(windows), self._voc.size])
            sample_output_result = np.zeros([len(windows), self._voc.size])

            for i in range(0, len(windows)):
                window = windows[i]
                sample_input_result[i, window[self._n]] = 1

                for j in range(1, self._n + 1):
                    sample_output_result[i, window[self._n - j]] = 1
                    sample_output_result[i, window[self._n + j]] = 1

            input_result = sample_input_result # np.concatenate([input_result, sample_input_result])
            output_result = sample_output_result # np.concatenate([output_result, sample_output_result])
            print("huhu")

        #print("test")
        print(len(windows))
        return tf.SparseTensor(indices=[[0,0]], values=[1], dense_shape=[len(windows), self._voc.size]), tf.SparseTensor(indices=[[0,0]], values=[1], dense_shape=[len(windows), self._voc.size])

    def __len__(self):
        return math.ceil(self._samples_idx.shape[0] / self._batch_size)

history = train_model.fit(
    Word2VecDataGenerator(samples, vocabulary),
    epochs=100)

huhu
9372
Epoch 1/100
huhu
9372
huhu
9372


InvalidArgumentError: 2 root error(s) found.
  (0) Invalid argument:  TypeError: `generator` yielded an element that could not be converted to the expected type. The expected type was int32, but the yielded element was SparseTensor(indices=tf.Tensor([[0 0]], shape=(1, 2), dtype=int64), values=tf.Tensor([1], shape=(1,), dtype=int32), dense_shape=tf.Tensor([  9372 145865], shape=(2,), dtype=int64)).
Traceback (most recent call last):

  File "/home/michael/Workspaces/thesis--news-preparation/env/lib/python3.8/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 910, in generator_py_func
    script_ops.FuncRegistry._convert(  # pylint: disable=protected-access

  File "/home/michael/Workspaces/thesis--news-preparation/env/lib/python3.8/site-packages/tensorflow/python/ops/script_ops.py", line 209, in _convert
    result = np.asarray(value, dtype=dtype, order="C")

  File "/home/michael/Workspaces/thesis--news-preparation/env/lib/python3.8/site-packages/numpy/core/_asarray.py", line 83, in asarray
    return array(a, dtype, copy=False, order=order)

TypeError: int() argument must be a string, a bytes-like object or a number, not 'SparseTensor'


During handling of the above exception, another exception occurred:


Traceback (most recent call last):

  File "/home/michael/Workspaces/thesis--news-preparation/env/lib/python3.8/site-packages/tensorflow/python/ops/script_ops.py", line 249, in __call__
    ret = func(*args)

  File "/home/michael/Workspaces/thesis--news-preparation/env/lib/python3.8/site-packages/tensorflow/python/autograph/impl/api.py", line 620, in wrapper
    return func(*args, **kwargs)

  File "/home/michael/Workspaces/thesis--news-preparation/env/lib/python3.8/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 914, in generator_py_func
    six.reraise(

  File "/home/michael/Workspaces/thesis--news-preparation/env/lib/python3.8/site-packages/six.py", line 702, in reraise
    raise value.with_traceback(tb)

  File "/home/michael/Workspaces/thesis--news-preparation/env/lib/python3.8/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 910, in generator_py_func
    script_ops.FuncRegistry._convert(  # pylint: disable=protected-access

  File "/home/michael/Workspaces/thesis--news-preparation/env/lib/python3.8/site-packages/tensorflow/python/ops/script_ops.py", line 209, in _convert
    result = np.asarray(value, dtype=dtype, order="C")

  File "/home/michael/Workspaces/thesis--news-preparation/env/lib/python3.8/site-packages/numpy/core/_asarray.py", line 83, in asarray
    return array(a, dtype, copy=False, order=order)

TypeError: `generator` yielded an element that could not be converted to the expected type. The expected type was int32, but the yielded element was SparseTensor(indices=tf.Tensor([[0 0]], shape=(1, 2), dtype=int64), values=tf.Tensor([1], shape=(1,), dtype=int32), dense_shape=tf.Tensor([  9372 145865], shape=(2,), dtype=int64)).


	 [[{{node PyFunc}}]]
	 [[IteratorGetNext]]
	 [[IteratorGetNext/_4]]
  (1) Invalid argument:  TypeError: `generator` yielded an element that could not be converted to the expected type. The expected type was int32, but the yielded element was SparseTensor(indices=tf.Tensor([[0 0]], shape=(1, 2), dtype=int64), values=tf.Tensor([1], shape=(1,), dtype=int32), dense_shape=tf.Tensor([  9372 145865], shape=(2,), dtype=int64)).
Traceback (most recent call last):

  File "/home/michael/Workspaces/thesis--news-preparation/env/lib/python3.8/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 910, in generator_py_func
    script_ops.FuncRegistry._convert(  # pylint: disable=protected-access

  File "/home/michael/Workspaces/thesis--news-preparation/env/lib/python3.8/site-packages/tensorflow/python/ops/script_ops.py", line 209, in _convert
    result = np.asarray(value, dtype=dtype, order="C")

  File "/home/michael/Workspaces/thesis--news-preparation/env/lib/python3.8/site-packages/numpy/core/_asarray.py", line 83, in asarray
    return array(a, dtype, copy=False, order=order)

TypeError: int() argument must be a string, a bytes-like object or a number, not 'SparseTensor'


During handling of the above exception, another exception occurred:


Traceback (most recent call last):

  File "/home/michael/Workspaces/thesis--news-preparation/env/lib/python3.8/site-packages/tensorflow/python/ops/script_ops.py", line 249, in __call__
    ret = func(*args)

  File "/home/michael/Workspaces/thesis--news-preparation/env/lib/python3.8/site-packages/tensorflow/python/autograph/impl/api.py", line 620, in wrapper
    return func(*args, **kwargs)

  File "/home/michael/Workspaces/thesis--news-preparation/env/lib/python3.8/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 914, in generator_py_func
    six.reraise(

  File "/home/michael/Workspaces/thesis--news-preparation/env/lib/python3.8/site-packages/six.py", line 702, in reraise
    raise value.with_traceback(tb)

  File "/home/michael/Workspaces/thesis--news-preparation/env/lib/python3.8/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 910, in generator_py_func
    script_ops.FuncRegistry._convert(  # pylint: disable=protected-access

  File "/home/michael/Workspaces/thesis--news-preparation/env/lib/python3.8/site-packages/tensorflow/python/ops/script_ops.py", line 209, in _convert
    result = np.asarray(value, dtype=dtype, order="C")

  File "/home/michael/Workspaces/thesis--news-preparation/env/lib/python3.8/site-packages/numpy/core/_asarray.py", line 83, in asarray
    return array(a, dtype, copy=False, order=order)

TypeError: `generator` yielded an element that could not be converted to the expected type. The expected type was int32, but the yielded element was SparseTensor(indices=tf.Tensor([[0 0]], shape=(1, 2), dtype=int64), values=tf.Tensor([1], shape=(1,), dtype=int32), dense_shape=tf.Tensor([  9372 145865], shape=(2,), dtype=int64)).


	 [[{{node PyFunc}}]]
	 [[IteratorGetNext]]
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_1238]

Function call stack:
train_function -> train_function


In [None]:
history = train_model.fit(
    Word2VecDataGenerator(samples, vocabulary),
    epochs=100)

In [None]:
#
# Calculate embedding vector for each token
#
embeddings_input = vocabulary.sample_to_binary_sparse(' '.join(vocabulary.token2index.keys()), max_len=0)
embeddings_input.shape

In [None]:
embeddings = calc_embedding(embeddings_input).numpy()
embeddings.shape

In [None]:
similarities = cosine_similarity(embeddings)
oov_idx = vocabulary.token_to_index(OOV_TOKEN)

def similar_words(word):
    word = preprocess_text(word, lang, preprocess_pipeline).strip()
    idx = vocabulary.token_to_index(word)

    if idx == oov_idx:
        print(f"Warn: `{word}` is out of index.")

    best_matches = np.flip(np.argsort(similarities[idx])[-11: -1])
    return seq(best_matches).map(vocabulary.index_to_token)

In [None]:
# Test Embedding
similar_words('gegessen')