In [1]:
"""
@authors: faurand, chardes, ehagensieker
"""
import tensorflow_datasets as tfds
import tensorflow_text as tf_text
import tensorflow as tf
from tensorflow import keras
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import random
import os
import datetime
import tqdm
import re

%load_ext tensorboard

2023-02-04 17:52:48.402170: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-04 17:52:48.592541: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-02-04 17:52:48.592580: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-02-04 17:52:49.582721: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directo

# 2.1 The dataset

In [2]:
# Open the dataset
with open("bible.txt", "r") as f:
    txt = f.read()

# print a short example
print(txt[:500])

The First Book of Moses:  Called Genesis


1:1 In the beginning God created the heaven and the earth.

1:2 And the earth was without form, and void; and darkness was upon
the face of the deep. And the Spirit of God moved upon the face of the
waters.

1:3 And God said, Let there be light: and there was light.

1:4 And God saw the light, that it was good: and God divided the light
from the darkness.

1:5 And God called the light Day, and the darkness he called Night.
And the evening and the mornin


# 2.2 Word embeddings

In [3]:
# convert to lower case, remove \n and special characters
txt = txt.replace("\n", " ").lower()
txt = re.sub('[^A-Za-z]+', ' ', txt) 
print(f"converted: {txt[:100]} \n")

# tokenize the text
txt = tf_text.WhitespaceTokenizer().split(txt)
txt = list(txt.numpy().astype('U'))
print(f"tokenized: {txt[:50]}")

converted: the first book of moses called genesis in the beginning god created the heaven and the earth and the 



2023-02-04 17:52:52.554266: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-02-04 17:52:52.554300: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2023-02-04 17:52:52.554325: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (default): /proc/driver/nvidia/version does not exist
2023-02-04 17:52:52.554632: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


tokenized: ['the', 'first', 'book', 'of', 'moses', 'called', 'genesis', 'in', 'the', 'beginning', 'god', 'created', 'the', 'heaven', 'and', 'the', 'earth', 'and', 'the', 'earth', 'was', 'without', 'form', 'and', 'void', 'and', 'darkness', 'was', 'upon', 'the', 'face', 'of', 'the', 'deep', 'and', 'the', 'spirit', 'of', 'god', 'moved', 'upon', 'the', 'face', 'of', 'the', 'waters', 'and', 'god', 'said', 'let']


In [4]:
# counting all words from the corpus and get 10000 most frequent words
count = Counter(txt).most_common(10000)
words = [x[0] for x in count]
print("The 20 most common words: ", words[:50])

The 20 most common words:  ['the', 'and', 'of', 'to', 'that', 'in', 'he', 'shall', 'unto', 'for', 'i', 'his', 'a', 'lord', 'they', 'be', 'is', 'him', 'not', 'them', 'it', 'with', 'all', 'thou', 'thy', 'was', 'god', 'which', 'my', 'me', 'said', 'but', 'ye', 'their', 'have', 'will', 'thee', 'from', 'as', 'are', 'when', 'this', 'out', 'were', 'upon', 'man', 'by', 'you', 'israel', 'king']


In [5]:
# create a dictionary to save mappings from tokens to integer indices
vocab = {token: idx for idx, token in enumerate(words)}
# create another dictionary to store the inverse mapping
inverse_vocab = {idx: token for token, idx in vocab.items()}

# convert the words into integers
txt = [vocab[word] for word in txt if word in vocab.keys()]
print(txt[:50])
print(len(txt))

[0, 216, 401, 2, 132, 160, 8616, 5, 0, 679, 26, 1295, 0, 170, 1, 0, 111, 1, 0, 111, 25, 220, 1999, 1, 2000, 1, 497, 25, 44, 0, 227, 2, 0, 971, 1, 0, 189, 2, 26, 867, 44, 0, 227, 2, 0, 304, 1, 26, 30, 79]
789262


In [6]:
s = 0.001
word_counts = Counter(txt)
total_count = len(txt)

# calculating for each word the fraction of the total words being this specific word
freqs = {word: count/total_count for word, count in word_counts.items()}
# calculating for each word the probability of keeping this specific word
p_keep = {word: ((np.sqrt(freqs[word]/s) + 1)*s/freqs[word]) for word in word_counts}

# apply subsampling to discard words that appear very often
txt = [word for word in txt if random.random() < p_keep[word]]
print(txt[:50])
print(len(txt))

[216, 401, 2, 132, 160, 8616, 679, 26, 1295, 170, 111, 111, 220, 1999, 2000, 497, 25, 44, 227, 971, 189, 867, 44, 227, 304, 26, 30, 79, 52, 15, 326, 52, 25, 326, 26, 177, 326, 20, 147, 26, 930, 0, 326, 497, 160, 0, 326, 70, 497, 160]
526163


In [7]:
# generate the input-target pairs
pairs = []
for i in range(len(txt)):
    for j in [-2, -1, 1, 2]: # window size of 4
        if i + j >= 0 and i + j < len(txt):
            pairs.append((txt[i], txt[i + j]))

for pair in pairs[:15]:
    print(f"input word: {inverse_vocab[pair[0]]}, target word: {inverse_vocab[pair[1]]} \n")

input word: first, target word: book 

input word: first, target word: of 

input word: book, target word: first 

input word: book, target word: of 

input word: book, target word: moses 

input word: of, target word: first 

input word: of, target word: book 

input word: of, target word: moses 

input word: of, target word: called 

input word: moses, target word: book 

input word: moses, target word: of 

input word: moses, target word: called 

input word: moses, target word: genesis 

input word: called, target word: of 

input word: called, target word: moses 



In [8]:
# Create the data set
pairs = np.array(pairs)
ds = tf.data.Dataset.from_tensor_slices((pairs[:,0], pairs[:,1]))

# shuffle, batch, prefetch and cache
ds = ds.shuffle(1024).batch(128, drop_remainder=True).prefetch(tf.data.AUTOTUNE).cache()

# investigate the newly created data set
for batch in ds.take(1):
    tf.print(tf.shape(batch), "\n", batch)

[2 128] 
 ([0 44 150 ... 109 79 37], [930 111 2475 ... 70 1 326])


2023-02-04 17:52:57.223033: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


# 2.3 Model

In [9]:
 class SkipGramModel(tf.keras.layers.Layer):
    
    def __init__(self, optimizer, embedding_size, vocabulary_size, counts, num_negative_samples=64):
        super(SkipGramModel, self).__init__()
        self.optimizer = optimizer
        self.embedding_size = embedding_size
        self.vocabulary_size = vocabulary_size
        self.counts = counts
        self.num_negative_samples = num_negative_samples
        
        self.metrics_list = [tf.keras.metrics.Mean(name="loss")]
        
        # Neural network weights and biases
        self.nce_weights = tf.Variable(tf.random.truncated_normal([self.vocabulary_size, self.embedding_size], stddev=0.1 / np.sqrt(self.embedding_size)))
        self.nce_biases = tf.Variable(tf.zeros([self.vocabulary_size]))
        
        # Embedding layer
        self.embedding = tf.Variable(tf.random.uniform([self.vocabulary_size, self.embedding_size], -1.0, 1.0))

    def call(self, input_words):
        # Look up embeddings for a batch of inputs
        embed = tf.nn.embedding_lookup(self.embedding, input_words)
    
        return embed
    
    @property
    def metrics(self):
        return self.metrics_list

    def reset_metrics(self):
        for metric in self.metrics:
            metric.reset_states()
    
    def get_nce_loss(self, target_words, input_words, embed):
        # Sample a number of negative samples for given input words
        sampled_values = tf.random.fixed_unigram_candidate_sampler(true_classes=tf.reshape(input_words, (128,1)),
                                                                   num_true=1,
                                                                   num_sampled=self.num_negative_samples,
                                                                   unique=True,
                                                                   range_max=self.vocabulary_size,
                                                                   unigrams=self.counts,
                                                                   name="negative_sampling")
        
        # Compute the noise contrastive loss
        nce_loss = tf.nn.nce_loss(weights =self.nce_weights,
                                  biases=self.nce_biases,
                                  labels=target_words, 
                                  inputs=embed,  
                                  num_sampled=self.num_negative_samples, 
                                  num_classes=self.vocabulary_size,
                                  sampled_values=sampled_values)
        
        return tf.reduce_mean(nce_loss)
    
    @tf.function
    def train(self, data):
        input_words, target_words = data
      
        with tf.GradientTape() as tape:
            # get the embedding
            embed = self(input_words)
            # compute the loss
            loss = self.get_nce_loss(target_words, input_words, embed)
        
        # compute the gradients and update the network
        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        
        # update the metrics 
        self.metrics[0].update_state(loss)
        
        return {metric.name: metric.result() for metric in self.metrics}
    
    @tf.function
    def validate(self, val_ds):
        # Compute the cosine similarity between a few common words and all embeddings
        norm = tf.sqrt(tf.reduce_sum(tf.square(self.embedding), 1, keepdims=True))
        normalized_embedding = self.embedding / norm
        val_embedding = tf.nn.embedding_lookup(normalized_embedding, val_ds)
        sim = tf.matmul(val_embedding, tf.transpose(normalized_embedding))
        
        return sim

# 2.4 Training

In [10]:
def create_summary_writer(config_name):
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

    log_path = f"logs/skipgram/{config_name}/{current_time}"
    summary_writer = tf.summary.create_file_writer(log_path)

    return summary_writer

In [11]:
def training_loop(ds, model, epochs, val_ds, val_words, summary_writer): 
    # iterate over given amount of epochs
    for epoch in range(epochs):     
        print(f"Epoch {epoch}: ")

        for data in tqdm.tqdm(ds, position = 0, leave = True):
            # get the loss
            metrics = model.train(data)

            # keep track of the metrices
            with summary_writer.as_default():
                for metric in model.metrics: 
                    tf.summary.scalar(f"{metric.name}", metric.result(), step=epoch)

        # print the training metrics
        print([f"{key}: {value.numpy()}" for (key, value) in metrics.items()])
        # reset metrics 
        model.reset_metrics()
    
        # computing the cosine similarity to evaluate if our embedding table is grouping together words with similar semantic meanings
        sim = model.validate(val_ds).numpy()
        # evaluate the training by printing the closest words to our validation words
        for i, word in enumerate(val_words):
            top_k = 8 # number of nearest neighbors
            log = f"Nearest to {word}: "
            nearest = (-sim[i, :]).argsort()[1:top_k+1]
            for k in range(top_k):
                neighbour = inverse_vocab[nearest[k]]
                log += f" {neighbour},"
            print(log)

In [12]:
# Keep track of the counts
_, _, counts = map(lambda x:x.numpy(), tf.unique_with_counts(txt))
counts = list(counts.astype(float)/len(txt))

# create a tf data set with the indices of our validation words
val_words = ["holy", "father", "wine", "poison", "love", "strong", "day"]
val_words_idx = [vocab[val_words[i]] for i in range(len(val_words))]
val_ds = tf.constant(val_words_idx, dtype=tf.int32)

vocabulary_size = len(vocab)
embedding_size = 64
epochs = 15

optimizer = tf.optimizers.Adam(0.001) 
summary_writer = create_summary_writer(config_name = f'RUN')

model = SkipGramModel(optimizer = optimizer,
                      embedding_size = embedding_size,
                      vocabulary_size = vocabulary_size,
                      counts = counts)

training_loop(ds, model, epochs, val_ds, val_words, summary_writer)

Epoch 0: 


100%|██████████| 16442/16442 [02:33<00:00, 106.88it/s]


['loss: 31.33449363708496']
Nearest to holy:  righteousness, sent, god, so, now, light, whom, hath,
Nearest to father:  king, house, land, saying, day, lord, give, great,
Nearest to wine:  fled, shut, prepare, open, born, established, worshipped, turn,
Nearest to poison:  hundreds, devoured, confusion, suffice, frankincense, fishes, remove, sanctified,
Nearest to love:  would, see, should, cast, who, own, moreover, death,
Nearest to strong:  we, people, lord, let, stand, days, cast, because,
Nearest to day:  lord, what, as, but, chosen, death, yet, flesh,
Epoch 1: 


100%|██████████| 16442/16442 [02:31<00:00, 108.82it/s]


['loss: 8.095026016235352']
Nearest to holy:  righteousness, light, knoweth, crying, strengthen, giveth, obey, sign,
Nearest to father:  servant, wife, sake, chosen, king, house, throne, glory,
Nearest to wine:  fled, prepare, shut, confess, declare, write, standing, hid,
Nearest to poison:  hundreds, confusion, suffice, fishes, frankincense, devoured, remove, sanctified,
Nearest to love:  moreover, would, arise, see, lift, whether, fountain, sign,
Nearest to strong:  washed, blind, ground, sleep, live, seek, salvation, arise,
Nearest to day:  weeping, raised, killed, lord, shield, what, trespass, abode,
Epoch 2: 


100%|██████████| 16442/16442 [02:31<00:00, 108.67it/s]


['loss: 6.496453285217285']
Nearest to holy:  strengthen, remaineth, served, giveth, knoweth, light, righteousness, crying,
Nearest to father:  servant, sake, wife, oath, jonathan, glory, chosen, presence,
Nearest to wine:  confess, fled, prepare, business, falleth, write, shut, sickle,
Nearest to poison:  hundreds, suffice, confusion, fishes, devoured, remove, determined, repay,
Nearest to love:  seek, arise, ways, sign, lift, vineyards, ought, moreover,
Nearest to strong:  washed, strengthen, trust, blind, sleep, salvation, believe, seek,
Nearest to day:  balaam, raised, killed, time, beginning, weeping, light, abode,
Epoch 3: 


100%|██████████| 16442/16442 [02:30<00:00, 109.06it/s]


['loss: 6.070240497589111']
Nearest to holy:  served, strengthen, remaineth, giveth, beautiful, sign, knoweth, gentiles,
Nearest to father:  servant, sake, jonathan, presence, oath, womb, husband, shewed,
Nearest to wine:  confess, falleth, business, sickle, prepare, washed, continue, shut,
Nearest to poison:  hundreds, suffice, confusion, devoured, determined, scribe, remove, enchantments,
Nearest to love:  seek, ways, teach, understand, sign, forgive, ought, lamentation,
Nearest to strong:  washed, blind, trust, believe, strengthen, food, seek, sleep,
Nearest to day:  balaam, beginning, trouble, light, time, hearkened, month, killed,
Epoch 4: 


100%|██████████| 16442/16442 [02:30<00:00, 109.29it/s]


['loss: 6.046149253845215']
Nearest to holy:  beautiful, vale, giveth, where, least, sign, remaineth, served,
Nearest to father:  servant, sake, jonathan, husband, presence, youth, womb, shewed,
Nearest to wine:  falleth, business, confess, sickle, continue, copy, merry, food,
Nearest to poison:  hundreds, suffice, determined, devoured, confusion, remove, cometh, enchantments,
Nearest to love:  seek, lamentation, teach, understand, rejoice, desire, ways, ought,
Nearest to strong:  believe, washed, trust, strengthen, food, confounded, upright, blind,
Nearest to day:  balaam, beginning, hour, month, light, time, dwellings, trouble,
Epoch 5: 


100%|██████████| 16442/16442 [02:29<00:00, 109.69it/s]


['loss: 5.973578453063965']
Nearest to holy:  lamentation, beautiful, vale, touching, love, dealeth, high, remaineth,
Nearest to father:  servant, sake, jonathan, youth, camps, husband, womb, shewed,
Nearest to wine:  merry, oil, continue, business, falleth, sickle, copy, wheat,
Nearest to poison:  hundreds, devoured, odours, suffice, determined, enchantments, dungeon, acceptable,
Nearest to love:  seek, desire, rejoice, riding, lamentation, teach, understand, oblation,
Nearest to strong:  believe, confounded, trust, food, became, bestowed, washed, upright,
Nearest to day:  hour, dwellings, month, even, beginning, balaam, time, yet,
Epoch 6: 


100%|██████████| 16442/16442 [02:29<00:00, 109.85it/s]


['loss: 6.006770610809326']
Nearest to holy:  love, high, luz, dealeth, lamentation, beautiful, vale, where,
Nearest to father:  servant, jonathan, husband, youth, sake, camps, womb, promise,
Nearest to wine:  oil, merry, business, continue, wheat, falleth, copy, sickle,
Nearest to poison:  odours, devoured, dungeon, determined, oven, suffice, lean, hundreds,
Nearest to love:  seek, riding, rejoice, understand, desire, saidst, teach, prosper,
Nearest to strong:  became, confounded, trust, believe, food, upright, flaming, purge,
Nearest to day:  hour, month, even, time, dwellings, sabbath, yet, word,
Epoch 7: 


100%|██████████| 16442/16442 [02:29<00:00, 110.12it/s]


['loss: 6.036264419555664']
Nearest to holy:  luz, love, high, lamentation, dealeth, beautiful, touching, where,
Nearest to father:  servant, sake, jonathan, husband, youth, womb, performed, camps,
Nearest to wine:  oil, merry, vinegar, business, continue, fail, falleth, privately,
Nearest to poison:  odours, dungeon, devoured, lean, determined, oven, wells, possible,
Nearest to love:  seek, riding, desire, sakes, understand, rejoice, skill, prosper,
Nearest to strong:  became, flaming, confounded, believe, persuade, trust, frame, jew,
Nearest to day:  hour, even, month, yet, sabbath, pass, word, dwellings,
Epoch 8: 


100%|██████████| 16442/16442 [02:28<00:00, 110.62it/s]


['loss: 6.0014777183532715']
Nearest to holy:  high, love, luz, dealeth, lamentation, where, accord, true,
Nearest to father:  servant, sake, jonathan, master, husband, mother, performed, promise,
Nearest to wine:  oil, merry, vinegar, else, privately, fail, continue, answer,
Nearest to poison:  dungeon, odours, devoured, lean, oven, determined, possible, sober,
Nearest to love:  rejoice, seek, riding, believe, sakes, understand, desire, saidst,
Nearest to strong:  became, flaming, believe, frame, fool, jew, confounded, trust,
Nearest to day:  hour, even, month, yet, wherefore, commandment, finished, sabbath,
Epoch 9: 


100%|██████████| 16442/16442 [02:28<00:00, 110.77it/s]


['loss: 6.091377258300781']
Nearest to holy:  high, love, luz, dealeth, where, accord, touching, affrighted,
Nearest to father:  servant, master, jonathan, sake, husband, mother, damnation, promise,
Nearest to wine:  oil, merry, vinegar, fail, else, continue, bottles, corn,
Nearest to poison:  dungeon, devoured, odours, possible, pressed, oven, sober, repay,
Nearest to love:  rejoice, seek, believe, riding, sakes, desire, saidst, understand,
Nearest to strong:  became, flaming, believe, fool, frame, exalted, trust, confounded,
Nearest to day:  even, hour, month, commandment, pray, yet, wherefore, when,
Epoch 10: 


100%|██████████| 16442/16442 [02:27<00:00, 111.42it/s]


['loss: 6.153596878051758']
Nearest to holy:  high, love, luz, accord, where, lamentation, touching, dealeth,
Nearest to father:  servant, master, sake, health, jonathan, mother, husband, damnation,
Nearest to wine:  oil, merry, vinegar, answer, fail, else, yet, bottles,
Nearest to poison:  dungeon, devoured, pressed, oven, sober, possible, spoilers, hedge,
Nearest to love:  seek, desire, believe, perfect, rejoice, sakes, prosper, understand,
Nearest to strong:  became, fool, flaming, exalted, believe, cruel, trust, foolish,
Nearest to day:  even, hour, commandment, month, when, yet, wherefore, last,
Epoch 11: 


100%|██████████| 16442/16442 [02:27<00:00, 111.72it/s]


['loss: 6.139459609985352']
Nearest to holy:  high, love, accord, where, luz, persecutors, touching, oblation,
Nearest to father:  servant, health, master, mother, damnation, jonathan, husband, sake,
Nearest to wine:  oil, merry, answer, else, vinegar, yet, flesh, poured,
Nearest to poison:  dungeon, devoured, pressed, sober, dawning, spoilers, wells, possible,
Nearest to love:  seek, believe, desire, perfect, rejoice, prosper, skill, sakes,
Nearest to strong:  became, fool, flaming, cruel, ready, exalted, foolish, trust,
Nearest to day:  hour, even, commandment, yet, last, when, wherefore, month,
Epoch 12: 


100%|██████████| 16442/16442 [02:27<00:00, 111.38it/s]


['loss: 6.2075276374816895']
Nearest to holy:  high, accord, luz, love, where, oblation, doctrines, persecutors,
Nearest to father:  servant, health, master, jonathan, damnation, performed, changed, promise,
Nearest to wine:  oil, poured, vinegar, merry, answer, flesh, corn, bottles,
Nearest to poison:  pressed, dungeon, sober, spoilers, dawning, gileadite, devoured, reserved,
Nearest to love:  rejoice, seek, desire, sakes, skill, believe, understand, prosper,
Nearest to strong:  became, flaming, fool, cruel, ready, exalted, foolish, frame,
Nearest to day:  hour, even, last, commandment, month, yet, when, lord,
Epoch 13: 


100%|██████████| 16442/16442 [02:27<00:00, 111.42it/s]


['loss: 6.2622528076171875']
Nearest to holy:  high, luz, accord, doctrines, love, most, where, oblation,
Nearest to father:  servant, health, master, damnation, jonathan, performed, shoe, promise,
Nearest to wine:  oil, corn, poured, vinegar, fail, bottles, answer, merry,
Nearest to poison:  pressed, dungeon, dawning, gileadite, reserved, spoilers, sober, devoured,
Nearest to love:  seek, comforter, rejoice, believe, desire, skill, prosper, sakes,
Nearest to strong:  fool, cruel, became, exalted, ready, flaming, power, foolish,
Nearest to day:  hour, even, last, commandment, month, coming, sabbath, fourteenth,
Epoch 14: 


100%|██████████| 16442/16442 [02:27<00:00, 111.55it/s]


['loss: 6.316689968109131']
Nearest to holy:  luz, accord, love, doctrines, high, most, where, oblation,
Nearest to father:  servant, master, performed, shoe, health, damnation, bonds, changed,
Nearest to wine:  oil, breath, corn, merry, nails, vinegar, eaten, bottles,
Nearest to poison:  pressed, dawning, dungeon, reserved, gileadite, cieled, spoilers, sober,
Nearest to love:  comforter, rejoice, skill, sakes, desire, perfect, saidst, believe,
Nearest to strong:  cruel, fool, became, ready, exalted, flaming, foolish, power,
Nearest to day:  hour, even, last, fourteenth, commandment, coming, sabbath, month,
