# Self Made Word2Vec Embeddings

In [1]:
import lib
import numpy as np
import spacy
import tensorflow as tf

from functional import seq
from importlib import reload
from joblib import Memory
from lib.data import load_data as load_data_lib
from lib.text_preprocessing import preprocess_text, lemmatize, remove_stopwords, to_lower, tokenize, words_only
from lib.vocabulary import Vocabulary, OOV_TOKEN, EMPTY_TOKEN
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow import keras
from tqdm import tqdm

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
# tf.debugging.set_log_device_placement(True)

reload(lib)

Num GPUs Available:  1


<module 'lib' (namespace)>

In [2]:
mem = Memory('./data/cache', verbose=0)


## Prepare Raw Data

In [3]:
preprocess_pipeline = [lemmatize, to_lower, words_only, remove_stopwords, tokenize]
lang = spacy.load('de_dep_news_trf')

@mem.cache
def load_data():
    res = load_data_lib()
    return res

data = load_data()

In [4]:
@mem.cache
def preprocess_data():
    return seq(tqdm(data['text'].to_numpy()))\
        .map(lambda text: preprocess_text(text, lang, preprocess_pipeline))\
        .to_list()

samples = preprocess_data()
len(samples)

20070

In [5]:
# preprocess_data.clear()

In [6]:
vocabulary = seq(tqdm(samples))\
    .fold_left(lib.vocabulary.Vocabulary.builder('corpus'), lambda voc, sentence: voc.add_sample(sentence))\
    .build()
vocabulary

100%|██████████| 20070/20070 [00:01<00:00, 16027.28it/s]


Vocabulary(name='corpus', size=145865, default_sample_length=9376)

In [7]:
samples_indexed = vocabulary.samples_to_binary_sparse(samples)
samples_indexed.shape

100%|██████████| 20070/20070 [06:47<00:00, 49.26it/s]


TensorShape([20070, 9376, 145865])

In [8]:
#
# Create model
#

model_in = keras.Input(shape=vocabulary.size, name='input')
embedding_in = keras.layers.Dense(50, activation=keras.activations.relu, name='embedding')
embedding_out = embedding_in(model_in)
softmax_in = keras.layers.Dense(vocabulary.size, activation=keras.activations.softmax, name='softmax')
softmax_out = softmax_in(embedding_out)

In [9]:
model_in.shape

TensorShape([None, 145865])

In [10]:
#
# ...the model to learn the embedding
#

train_model = keras.Model(inputs=[model_in], outputs=[softmax_out], name='word2vec')
train_model.compile(
    loss=keras.losses.CategoricalCrossentropy(from_logits=True),
    optimizer=keras.optimizers.RMSprop(),
    metrics=["accuracy"])


train_model.summary()

Model: "word2vec"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 145865)]          0         
_________________________________________________________________
embedding (Dense)            (None, 50)                7293300   
_________________________________________________________________
softmax (Dense)              (None, 145865)            7439115   
Total params: 14,732,415
Trainable params: 14,732,415
Non-trainable params: 0
_________________________________________________________________


In [11]:
#
# ...the model to calculate an embedding vector the tokens
#

calc_embedding = keras.Model(inputs=[train_model.input], outputs=[train_model.get_layer('embedding').output])

## Prepare Training Data

In [18]:
#
# Prepare some helpers to generate pairs of [input, context_word] over the sentences.
#
n = 2
dummy_tokens = np.zeros([n, vocabulary.size])
dummy_tokens[:] = vocabulary.sample_to_binary_indices(EMPTY_TOKEN)[0]

base_window_pair_indices = list(range(0,n)) + list(range(n+1, 2*n+1))
window_pair_indices = seq(range(0, vocabulary.default_sample_length + 2*n))\
    .sliding(2 * n + 1)\
    .flat_map(lambda sentence_indices: seq(base_window_pair_indices).map(lambda base_idx: [sentence_indices[n], sentence_indices[base_idx]]))\
    .to_list()

In [19]:
#
# Create the pairs.
#
training_samples = np.array(seq(samples)\
    .map(lambda sentence: vocabulary.sample_to_binary_indices(sentence))\
    .map(lambda encoded: np.concatenate([dummy_tokens, encoded, dummy_tokens]))\
    .flat_map(lambda encoded: seq(window_pair_indices)\
              .filter(lambda pair: not np.all(encoded[pair[1]] == dummy_tokens[0]))\
              .filter(lambda pair: not np.all(encoded[pair[0]] == dummy_tokens[0]))\
              .map(lambda pair: np.stack([encoded[pair[0]], encoded[pair[1]]], axis=0)))\
    .to_list())
training_samples.shape

(18780, 2, 2275)

## Train the embedding

In [20]:
history = train_model.fit(
    training_samples[:,0,:],
    training_samples[:,1,:],
    batch_size=512,
    epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [21]:
#
# Calculate embedding vector for each token
#
embeddings_input = vocabulary.sample_to_binary_indices(' '.join(vocabulary.token2index.keys()), max_len=0)
embeddings_input.shape

(2275, 2275)

In [22]:
embeddings = calc_embedding(embeddings_input).numpy()
embeddings.shape

(2275, 50)

In [44]:
similarities = cosine_similarity(embeddings)
oov_idx = vocabulary.token_to_index(OOV_TOKEN)

def similar_words(word):
    word = preprocess_text(word, lang, preprocess_pipeline).strip()
    idx = vocabulary.token_to_index(word)

    if idx == oov_idx:
        print(f"Warn: `{word}` is out of index.")

    best_matches = np.flip(np.argsort(similarities[idx])[-11: -1])
    return seq(best_matches).map(vocabulary.index_to_token)

In [45]:
# Test Embedding
similar_words('gegessen')

Warn: `essen` is out of index.


['<EMPT>', 'mark', 'disunion', 'foes', 'gravest', 'resilience', 'guardrails', 'bottom', 'thrived', 'pledge']