# Self-Made Word2Vec Embeddings

In [6]:
import lib
import numpy as np
import spacy
import tensorflow as tf

from functional import seq
from importlib import reload
from joblib import Memory
from lib.data import load_data as load_data_lib
from lib.text_preprocessing import preprocess_tokens, lemmatize, remove_stopwords, to_lower, tokenize, words_only
from lib.vocabulary import Vocabulary
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow import keras
from tqdm import tqdm
from typing import Optional

# Report how many GPUs TensorFlow can see before any model is built.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))
# tf.debugging.set_log_device_placement(True)  # uncomment to log device placement

# Re-import the project package so that edits to it are picked up without a kernel restart.
# NOTE(review): importlib.reload re-executes only the module object it is given; the names
# imported from lib.* above are presumably not refreshed by this call — confirm.
reload(lib)

Num GPUs Available:  1


<module 'lib' (namespace)>

In [7]:
# Disk-backed joblib cache rooted at ./data/cache; it backs the @mem.cache-decorated
# functions in the cells below (verbose=0 keeps joblib's logging quiet).
mem = Memory('./data/cache', verbose=0)


## Prepare Raw Data

In [8]:
# Load the raw data set through the project loader and memoize the result on disk via `mem`.
# NOTE(review): the function takes no arguments, so the joblib cache presumably cannot notice
# a change in the underlying data; call `load_data.clear()` to force a reload.
# Comments are kept outside the function on purpose: joblib also tracks the function's
# source, so editing the body would presumably invalidate the existing cache entry.
@mem.cache
def load_data():
    res = load_data_lib()
    return res

# Materialize the (possibly cached) data and preview its first rows.
data = load_data()
data.head()

Unnamed: 0,crawled,hash,source,title,type,text,lang
0,2021-02-18T16:16:27.508472Z,43d003c39a936c6a25e3fc02f71df795dddc1a0a7aa75d...,https://www.spiegel.de/sport/fussball/juventus...,Zu früher Einsatz nach Krankheit Morata fällt ...,Spiegel Online,Das Achtelfinalhinspiel der Champions League g...,de
1,2020-11-19T08:00:00.329121Z,7367d742827140cec892ac6ec940e28160d0cf2bebf300...,https://www.ard-text.de/mobil/141,Ecuador: Illegale Mine eingestürzt,ARD Teletext,Beim Einsturz einer Mine in Ecuador sind mehre...,de
2,2020-12-19T10:40:16.731763Z,83cd29369d52c1d1e3f132aa672402567c15f66878c719...,https://www.spiegel.de/politik/ausland/kara-te...,Humanitäre Katastrophe in Griechenland »Babys ...,Spiegel Online,Entwicklungsminister Gerd Müller hat die katas...,de
3,2020-11-03T10:00:00.970729Z,466c496cfcbc46116e0251e1401a363617ea8b14535e85...,https://www.spiegel.de/politik/ausland/elfenbe...,Umstrittene dritte Amtszeit Präsident Ouattara...,Spiegel Online,Es ist vordergründig ein klarer Sieg für Präsi...,de
4,2021-01-22T15:00:00.475531Z,559b96358bbd2132e111507b33e61112d5fad1bbfe89db...,https://www.spiegel.de/geschichte/franzosenhas...,Franzosenhass im 19. Jahrhundert Wie deutsche ...,Spiegel Online,"Gefallene Franzosen seien es nicht wert, in de...",de


In [5]:
# Keep only the rows whose `lang` column is 'de'. Re-running this cell is harmless:
# filtering an already filtered frame yields the same rows.
data = data[data['lang'] == 'de']

# Show (rows, columns) as a sanity check; the recorded output of this cell is such a
# shape tuple, but the expression producing it was missing.
data.shape


(19975, 7)

In [30]:
# Run `preprocess_tokens` over every entry of data['text'] and memoize the resulting list
# on disk (the recorded run took about 1h40m, so the cache matters).
# NOTE(review): `lang` and `preprocess_pipeline` are not defined in any visible cell; they
# must already exist in the kernel. Confirm where they come from before a Restart & Run All.
# NOTE(review): the function takes no arguments, so the joblib cache presumably cannot detect
# changes to `data`, `lang` or `preprocess_pipeline`; call `preprocess_data.clear()` after
# changing any of them. The recorded sample count (20070) differs from the filtered row count
# shown earlier (19975), which suggests stale or unfiltered input — verify.
# Comments are kept outside the function on purpose: joblib also tracks the function's
# source, so editing the body would presumably invalidate the expensive cache entry.
@mem.cache
def preprocess_data():
    return seq(tqdm(data['text'].to_numpy()))\
        .map(lambda text: preprocess_tokens(text, lang, preprocess_pipeline))\
        .to_list()

# Materialize the (possibly cached) preprocessed samples and show how many there are.
samples = preprocess_data()
len(samples)

100%|██████████| 20070/20070 [1:41:49<00:00,  3.28it/s]  


20070

In [29]:
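# Uncomment to drop the cached preprocessing result and force a recomputation:
#preprocess_data.clear()
# Uncomment to peek at the first three preprocessed samples:
#samples[:3]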

In [31]:
# Feed every preprocessed sample into a vocabulary builder, then freeze it.
# `add_sample` hands back the builder to continue with, so the result is re-bound each step.
vocabulary_builder = lib.vocabulary.Vocabulary.builder()
for sentence in tqdm(samples):
    vocabulary_builder = vocabulary_builder.add_sample(sentence)

vocabulary = vocabulary_builder.build()
vocabulary

100%|██████████| 20070/20070 [00:01<00:00, 15620.37it/s]


Vocabulary(name='default', size=170933, default_sample_length=9375)

In [32]:
# Preview entries 1..19 of the token -> index mapping (entry 0 is skipped).
token_index_pairs = list(vocabulary.token2index.items())
token_index_pairs[1:20]

[('the', 1),
 ('menschen', 2),
 ('prozent', 3),
 ('deutschland', 4),
 ('millionen', 5),
 ('laut', 6),
 ('trump', 7),
 ('of', 8),
 ('euro', 9),
 ('to', 10),
 ('spiegel', 11),
 ('pandemie', 12),
 ('biden', 13),
 ('deutschen', 14),
 ('usa', 15),
 ('and', 16),
 ('land', 17),
 ('unternehmen', 18),
 ('mal', 19)]

In [33]:
#
# Create model
#
# Three Keras models are built on top of ONE shared embedding layer:
#   * model            - trainable: sigmoid(Dense(dot(word, context_word)))
#   * validation_model - normalized dot product (cosine similarity) of the two words
#   * embedding_model  - maps a word index to its embedding vector
#

dim = 50  # length of each embedding vector

# Each example is one (word, context word) pair of vocabulary indices. The shape is
# given as a tuple, as documented for keras.Input; a bare int is not a valid shape
# in every Keras version.
word_input = keras.Input(shape=(1,))
context_word_input = keras.Input(shape=(1,))

# The same embedding layer (and therefore the same weights) is applied to both inputs.
embedding = keras.layers.Embedding(vocabulary.size, dim, input_length=1, name='embedding')
word_embedding = embedding(word_input)
word = keras.layers.Reshape((dim, 1))(word_embedding)
context_word = keras.layers.Reshape((dim, 1))(embedding(context_word_input))

# Normalized dot product along the embedding axis; only used by `validation_model`.
similarity = keras.layers.Dot(axes=1, normalize=True, name='similarity')([word, context_word])

# Training head: plain dot product, flattened to one value per pair, then a
# single-unit sigmoid layer that is trained against the 0/1 labels.
dot_product = keras.layers.Reshape((1,))(keras.layers.Dot(axes=1, name='dot_product')([word, context_word]))
output = keras.layers.Dense(1, activation=keras.activations.sigmoid, name='output')(dot_product)

model = keras.Model(inputs=[word_input, context_word_input], outputs=output, name="training_model")
model.compile(loss=keras.losses.binary_crossentropy, optimizer=keras.optimizers.RMSprop())

validation_model = keras.Model(inputs=[word_input, context_word_input], outputs=similarity, name="validation_model")
embedding_model = keras.Model(inputs=word_input, outputs=word_embedding, name="embedding_model")

model.summary()

Model: "training_model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 50)        8546650     input_5[0][0]                    
                                                                 input_6[0][0]                    
__________________________________________________________________________________________________
reshape_6 (Reshape)             (None, 50, 1)        0           embedding[0][0]     

In [34]:
# Summarize the sub-model that maps a word index to its embedding vector.
embedding_model.summary()

Model: "embedding_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, 1)]               0         
_________________________________________________________________
embedding (Embedding)        (None, 1, 50)             8546650   
Total params: 8,546,650
Trainable params: 8,546,650
Non-trainable params: 0
_________________________________________________________________


## Train the embedding

In [35]:
#
# Create a custom generator for generating Training samples from text.
#

from typing import List, Union
import math

class Word2VecDataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` producing skip-gram training batches from text samples.

    The text samples are converted to vocabulary indices once, in the constructor.
    Batch ``i`` then covers the text samples ``i * group_size`` up to (excluding)
    ``(i + 1) * group_size`` and contains all skip-gram pairs generated for them.

    Args:
        text_samples: The samples to learn from; passed unchanged to
            ``voc.samples_to_indices``.
        voc: Vocabulary used for the index conversion and for the vocabulary size
            handed to the skip-gram sampler.
        window_size: Window size forwarded to ``skipgrams``.
        group_size: Number of text samples combined into one batch (at least 1).

    Raises:
        ValueError: If ``group_size`` is smaller than 1.
    """

    def __init__(self, text_samples: Union[str, List[str]], voc: Vocabulary, window_size: int = 4, group_size: int = 100):
        # Initialise the base class so that any state it sets up itself exists.
        super().__init__()
        if group_size < 1:
            # Otherwise __len__ would fail later with an unhelpful ZeroDivisionError.
            raise ValueError("group_size must be a positive integer")

        self._window_size = window_size
        self._samples_idx = voc.samples_to_indices(text_samples, include_oov=False)
        self._voc = voc
        self._group_size = group_size

    def __getitem__(self, index):
        """Return ``([words, context_words], labels)`` for batch number ``index``."""
        start_idx = index * self._group_size
        end_idx = min(start_idx + self._group_size, self._samples_idx.shape[0])

        # Collect the per-sample arrays and join them once at the end instead of
        # re-copying the growing result on every iteration. Each list starts with an
        # empty, typed array so the result keeps its dtype even for an empty group.
        word_parts = [np.zeros([0], dtype='int32')]
        context_word_parts = [np.zeros([0], dtype='int32')]
        label_parts = [np.zeros([0], dtype='int8')]

        for i in range(start_idx, end_idx):
            (words_i, context_words_i), labels_i = self.__get_minibatch__(i)
            word_parts.append(words_i)
            context_word_parts.append(context_words_i)
            label_parts.append(labels_i)

        words = np.concatenate(word_parts)
        context_words = np.concatenate(context_word_parts)
        labels = np.concatenate(label_parts)

        return [words, context_words], labels

    def __get_minibatch__(self, index):
        """Return the skip-gram pairs and their labels for the text sample at ``index``."""
        sample = self._samples_idx[index]
        couples, labels = tf.keras.preprocessing.sequence.skipgrams(
            sample, self._voc.size, window_size=self._window_size)

        if len(couples) > 0:
            # couples is a list of (word, context word) pairs; split it into two columns.
            words, context_words = zip(*couples)
        else:
            # No pair could be generated for this sample: contribute nothing to the batch.
            words, context_words, labels = [[],[],[]]

        words = np.array(words, dtype='int32')
        context_words = np.array(context_words, dtype='int32')
        labels = np.array(labels, dtype='int8')

        return [words, context_words], labels


    def __len__(self):
        """Return the number of batches; the last one may hold fewer text samples."""
        return math.ceil(self._samples_idx.shape[0] / self._group_size)

In [36]:
class SimilarityLogger(tf.keras.callbacks.Callback):
    """Keras callback that prints the nearest neighbours of a few tokens during training.

    The neighbours are determined by cosine similarity between the embedding of a
    token and the embeddings of all tokens known to the vocabulary.

    Args:
        emb_model: Model whose ``predict_on_batch`` maps word indices to embeddings.
        voc: Vocabulary used to translate between tokens and indices.
        interval: Similarities are printed after every ``interval``-th epoch
            (and once before the very first epoch).
        validation_size: Number of vocabulary tokens inspected when no explicit
            items are given.
        k: Number of neighbours printed per token.
    """

    def __init__(
            self, emb_model: keras.Model, voc: Vocabulary,
            interval: int = 10, validation_size: int = 10, k: int = 5):
        super().__init__()
        self.vocabulary = voc
        self.interval = interval
        self.validation_size = validation_size
        self.embedding_model = emb_model
        self.k = k

    def on_epoch_begin(self, epoch, logs=None):
        """Print a baseline of the similarities before the first epoch runs."""
        if epoch == 0:
            self.print_similarities()

    def on_epoch_end(self, epoch: int, logs: Optional[dict] = None) -> None:
        """Print the similarities after every ``interval``-th epoch."""
        if (epoch + 1) % self.interval == 0:
            self.print_similarities()


    def print_similarities(self, items: Optional[List[str]] = None) -> None:
        """Print the ``k`` nearest neighbours of each token in ``items``.

        Args:
            items: Tokens to inspect. Defaults to the vocabulary tokens at
                positions ``1 .. validation_size`` of ``token2index``.
        """
        if items is None:
            items = list(self.vocabulary.token2index.keys())[1:self.validation_size+1]
        if len(items) == 0:
            # Nothing to compare; avoid calling the model with an empty batch.
            return

        items_idx = np.array([self.vocabulary.token_to_index(item) for item in items])
        # Row r of `embeddings` below belongs to vocabulary index all_idx[r].
        all_idx = list(self.vocabulary.index2token.keys())
        print("Calculate embeddings ...")
        item_embeddings = self.get_embeddings(items_idx)
        embeddings = self.get_embeddings(all_idx)

        print("Calculate similarities ...")
        similarities = cosine_similarity(item_embeddings, embeddings)

        for item, item_idx, sim in zip(items, items_idx, similarities):
            # Take the k + 1 best candidates and drop the item itself explicitly,
            # rather than assuming it always ends up at rank 0 (ties can reorder it).
            ranked = (-sim).argsort()[:self.k + 1]
            nearest = [all_idx[pos] for pos in ranked if all_idx[pos] != item_idx][:self.k]

            log = f"Nearest neighbors to `{item}`:"
            for n in nearest:
                n_word = self.vocabulary.index_to_token(n)
                log = f"{log} `{n_word}`"

            print(log)

    def get_embeddings(self, tokens = None) -> np.ndarray:
        """Return the embeddings of the given word indices as a 2-D array.

        Args:
            tokens: Word indices to embed. Defaults to every index in the vocabulary.

        Returns:
            Array with one row per requested index. The result stays 2-D even for a
            single index, which ``cosine_similarity`` requires.
        """
        if tokens is None:
            tokens = list(self.vocabulary.index2token.keys())

        word_indices = np.array(tokens)
        embeddings = np.asarray(self.embedding_model.predict_on_batch(word_indices))
        # Flatten every per-index output to one row. Unlike np.squeeze this never
        # drops the leading axis when exactly one index was requested.
        return embeddings.reshape(len(word_indices), embeddings.shape[-1])

In [38]:
# Print nearest neighbours before training starts and after every 5th epoch.
sim_logger = SimilarityLogger(emb_model=embedding_model, voc=vocabulary, interval=5)

# Train on skip-gram batches generated on the fly from the preprocessed samples.
history = model.fit(
    x=Word2VecDataGenerator(text_samples=samples, voc=vocabulary),
    epochs=80,
    callbacks=[sim_logger],
)

Epoch 1/80
Calculate embeddings ...
Calculate similarities ...
Nearest neighbors to `the`: `frances` `subways` `wednesday` `rockets` `estimates`
Nearest neighbors to `menschen`: `zweitkleinsten` `israelis` `lachend` `fachzeitschrift` `seeleute`
Nearest neighbors to `prozent`: `sexpartner` `staatsgebiet` `hosni` `bannmeile` `quantifizierbarer`
Nearest neighbors to `deutschland`: `geändert` `einwohnern` `bezirke` `csse` `pflegerischen`
Nearest neighbors to `millionen`: `milliarden` `staatssekretärs` `subtil` `gedachte` `helikoptergeld`
Nearest neighbors to `laut`: `zufolge` `wonach` `geschätzte` `ergab` `zusammengebrochen`
Nearest neighbors to `trump`: `trumps` `dodon` `kronzeugen` `gefecht` `impfen`
Nearest neighbors to `of`: `spent` `officers` `prosecutor` `fiele` `behind`
Nearest neighbors to `euro`: `dollar` `unredlich` `milliardenausfälle` `reiseziele` `betrunkenen`
Nearest neighbors to `to`: `examining` `passierten` `built` `universal` `worn`
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 

In [21]:
# Persist the trainable model and the embedding lookup model, in that order.
for trained_model, target_dir in (
        (model, './data/model/training_model'),
        (embedding_model, './data/model/embedding_model')):
    trained_model.save(target_dir)


INFO:tensorflow:Assets written to: ./data/model/training_model/assets
INFO:tensorflow:Assets written to: ./data/model/embedding_model/assets


In [22]:
# List every vocabulary token that contains the substring "frank".
list(filter(lambda token: "frank" in token, vocabulary.token2index.keys()))

['frankreich',
 'frankfurt',
 'frankfurter',
 'frank',
 'franke',
 'oberfranken',
 'franklin',
 'frankenstein',
 'unterfranken',
 'mittelfranken',
 'südfrankreich',
 'frankel',
 'frankenstrat',
 'frankes',
 'frankopan',
 'unionsfranktion',
 'frankenderby',
 'frankieren',
 'frankokanadierin',
 'frankofonen',
 'frankly',
 'frankenberg',
 'frankeich',
 'franken',
 'frankowski',
 'franklins',
 'nordfrankreich',
 'kontinentalfrankreich']

In [1]:
# Print the nearest neighbours of two hand-picked tokens.
# NOTE(review): the recorded NameError shows this cell was executed in a fresh kernel
# (execution count 1) in which `sim_logger` did not exist yet; the cell that creates
# `sim_logger` has to run first.
sim_logger.print_similarities(items=['frankreich', 'frankfurt'])

NameError: name 'sim_logger' is not defined

In [136]:
# Debugging aid: print the kernel's working directory.
# NOTE(review): the recorded output exposes an absolute local path; consider clearing it before sharing.
!pwd

/home/michael/Workspaces/thesis--news-preparation


In [24]:
# Preview the tokens at positions 1..9 of the vocabulary (position 0 is skipped).
vocabulary_tokens = list(vocabulary.token2index.keys())
vocabulary_tokens[1:10]


['the',
 'mensch',
 'land',
 'prozent',
 'deutschland',
 'woche',
 'million',
 'stehen',
 'laut']