In [1]:
"""
@authors: faurand, chardes, ehagensieker
"""
import tensorflow_datasets as tfds
import tensorflow_text as tf_text
import tensorflow as tf
from tensorflow import keras
from collections import Counter
from google.colab import drive
import numpy as np
import matplotlib.pyplot as plt
import random
import os
import datetime
import tqdm
import re

%load_ext tensorboard

# **2.1 The dataset**

In [2]:
# bash code to mount the drive
drive.mount("/content/drive")
os.chdir("drive/MyDrive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# 2.1 the dataset
with open("/content/bible.txt", "r") as f:
  txt = f.read()

# print a short example
print(txt[:500])

The First Book of Moses:  Called Genesis


1:1 In the beginning God created the heaven and the earth.

1:2 And the earth was without form, and void; and darkness was upon
the face of the deep. And the Spirit of God moved upon the face of the
waters.

1:3 And God said, Let there be light: and there was light.

1:4 And God saw the light, that it was good: and God divided the light
from the darkness.

1:5 And God called the light Day, and the darkness he called Night.
And the evening and the mornin


# **2.2 Word embeddings**

In [4]:
# convert to lower case, remove \n and special characters
txt = txt.replace("\n", " ").lower()
txt = re.sub('[^A-Za-z]+', ' ', txt) 
print(f"converted: {txt[:100]} \n")

# tokenize the text
txt = tf_text.WhitespaceTokenizer().split(txt)
txt = list(txt.numpy().astype('U'))
print(f"tokenized: {txt[:50]}")

converted: the first book of moses called genesis in the beginning god created the heaven and the earth and the 

tokenized: ['the', 'first', 'book', 'of', 'moses', 'called', 'genesis', 'in', 'the', 'beginning', 'god', 'created', 'the', 'heaven', 'and', 'the', 'earth', 'and', 'the', 'earth', 'was', 'without', 'form', 'and', 'void', 'and', 'darkness', 'was', 'upon', 'the', 'face', 'of', 'the', 'deep', 'and', 'the', 'spirit', 'of', 'god', 'moved', 'upon', 'the', 'face', 'of', 'the', 'waters', 'and', 'god', 'said', 'let']


In [5]:
# counting all words from the corpus and get 10000 most frequent words
count = Counter(txt).most_common(10000)
words = [x[0] for x in count]
print("The 20 most common words: ", words[:50])

The 20 most common words:  ['the', 'and', 'of', 'to', 'that', 'in', 'he', 'shall', 'unto', 'for', 'i', 'his', 'a', 'lord', 'they', 'be', 'is', 'him', 'not', 'them', 'it', 'with', 'all', 'thou', 'thy', 'was', 'god', 'which', 'my', 'me', 'said', 'but', 'ye', 'their', 'have', 'will', 'thee', 'from', 'as', 'are', 'when', 'this', 'out', 'were', 'upon', 'man', 'by', 'you', 'israel', 'king']


In [6]:
# create a dictionary to save mappings from tokens to integer indices
vocab = {token: idx for idx, token in enumerate(words)}
# create another dictionary to store the inverse mapping
inverse_vocab = {idx: token for token, idx in vocab.items()}

# convert the words into integers
txt = [vocab[word] for word in txt if word in vocab.keys()]
print(txt[:50])
print(len(txt))

[0, 216, 401, 2, 132, 160, 8616, 5, 0, 679, 26, 1295, 0, 170, 1, 0, 111, 1, 0, 111, 25, 220, 1999, 1, 2000, 1, 497, 25, 44, 0, 227, 2, 0, 971, 1, 0, 189, 2, 26, 867, 44, 0, 227, 2, 0, 304, 1, 26, 30, 79]
789262


In [7]:
s = 0.001
word_counts = Counter(txt)
total_count = len(txt)

# calculating for each word the fraction of the total words being this specific word
freqs = {word: count/total_count for word, count in word_counts.items()}
# calculating for each word the probability of keeping this specific word
p_keep = {word: ((np.sqrt(freqs[word]/s) + 1)*s/freqs[word]) for word in word_counts}

# apply subsampling to discard words that appear very often
txt = [word for word in txt if random.random() < p_keep[word]]
print(txt[:50])
print(len(txt))

[216, 401, 2, 132, 160, 8616, 679, 1295, 170, 111, 111, 220, 1999, 2000, 1, 497, 44, 227, 971, 189, 26, 867, 227, 304, 79, 52, 15, 326, 52, 25, 326, 177, 326, 4, 25, 147, 26, 930, 0, 326, 497, 26, 160, 326, 70, 497, 160, 286, 1039, 387]
526231


In [8]:
# generate the input-target pairs
pairs = []
for i in range(len(txt)):
    for j in [-2, -1, 1, 2]: # window size of 4
        if i + j >= 0 and i + j < len(txt):
            pairs.append((txt[i], txt[i + j]))

for pair in pairs[:15]:
    print(f"input word: {inverse_vocab[pair[0]]}, target word: {inverse_vocab[pair[1]]} \n")

input word: first, target word: book 

input word: first, target word: of 

input word: book, target word: first 

input word: book, target word: of 

input word: book, target word: moses 

input word: of, target word: first 

input word: of, target word: book 

input word: of, target word: moses 

input word: of, target word: called 

input word: moses, target word: book 

input word: moses, target word: of 

input word: moses, target word: called 

input word: moses, target word: genesis 

input word: called, target word: of 

input word: called, target word: moses 



In [9]:
# Create the data set
pairs = np.array(pairs)
ds = tf.data.Dataset.from_tensor_slices((pairs[:,0], pairs[:,1]))

# shuffle, batch, prefetch and cache
ds = ds.shuffle(1024).batch(128, drop_remainder=True).prefetch(tf.data.AUTOTUNE).cache()

# investigate the newly created data set
for batch in ds.take(1):
    tf.print(tf.shape(batch), "\n", batch)

[2 128] 
 ([160 3 111 ... 387 1167 1216], [71 956 170 ... 170 6 37])


# **2.3 Model**

In [10]:
class SkipGramModel(tf.keras.layers.Layer):
    
    def __init__(self, optimizer, embedding_size, vocabulary_size, counts, num_negative_samples=64):
        super(SkipGramModel, self).__init__()
        self.optimizer = optimizer
        self.embedding_size = embedding_size
        self.vocabulary_size = vocabulary_size
        self.counts = counts
        self.num_negative_samples = num_negative_samples
        
        self.metrics_list = [tf.keras.metrics.Mean(name="loss")]
        
        # Neural network weights and biases
        self.nce_weights = tf.Variable(tf.random.truncated_normal([self.vocabulary_size, self.embedding_size], stddev=0.1 / np.sqrt(self.embedding_size)))
        self.nce_biases = tf.Variable(tf.zeros([self.vocabulary_size]))
        
        # Embedding layer
        self.embedding = tf.Variable(tf.random.uniform([self.vocabulary_size, self.embedding_size], -1.0, 1.0))

    def call(self, input_words):
        # Look up embeddings for a batch of inputs
        embed = tf.nn.embedding_lookup(self.embedding, input_words)
    
        return embed
    
    @property
    def metrics(self):
        return self.metrics_list

    def reset_metrics(self):
        for metric in self.metrics:
            metric.reset_states()

    def get_nce_loss(self, target_words, input_words, embed):
        # Sample a number of negative samples for given input words
        sampled_values = tf.random.fixed_unigram_candidate_sampler(true_classes=tf.reshape(input_words, (128,1)),
                                                                   num_true=1,
                                                                   num_sampled=self.num_negative_samples,
                                                                   unique=True,
                                                                   range_max=self.vocabulary_size,
                                                                   unigrams=self.counts,
                                                                   name="negative_sampling")
        
        # Compute the noise contrastive loss
        nce_loss = tf.nn.nce_loss(weights =self.nce_weights,
                                  biases=self.nce_biases,
                                  labels=target_words, 
                                  inputs=embed,  
                                  num_sampled=self.num_negative_samples, 
                                  num_classes=self.vocabulary_size,
                                  sampled_values=sampled_values)
        
        return tf.reduce_mean(nce_loss)
    
    @tf.function
    def train(self, data):
        input_words, target_words = data
      
        with tf.GradientTape() as tape:
            # get the embedding
            embed = self(input_words)
            # compute the loss
            loss = self.get_nce_loss(target_words, input_words, embed)
        
        # compute the gradients and update the network
        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        
        # update the metrics 
        self.metrics[0].update_state(loss)
        
        return {metric.name: metric.result() for metric in self.metrics}

    @tf.function
    def validate(self, val_ds):
        # Compute the cosine similarity between a few common words and all embeddings
        norm = tf.sqrt(tf.reduce_sum(tf.square(self.embedding), 1, keepdims=True))
        normalized_embedding = self.embedding / norm
        val_embedding = tf.nn.embedding_lookup(normalized_embedding, val_ds)
        sim = tf.matmul(val_embedding, tf.transpose(normalized_embedding))
        
        return sim

# **2.4 Training**

In [11]:
def create_summary_writer(config_name):
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

    log_path = f"logs/skipgram/{config_name}/{current_time}"
    summary_writer = tf.summary.create_file_writer(log_path)

    return summary_writer

In [12]:
def training_loop(ds, model, epochs, val_ds, val_words, summary_writer): 
    # iterate over given amount of epochs
    for epoch in range(epochs):     
        print(f"Epoch {epoch}: ")

        for data in tqdm.tqdm(ds, position = 0, leave = True):
            # get the loss
            metrics = model.train(data)

            # keep track of the metrices
            with summary_writer.as_default():
                for metric in model.metrics: 
                    tf.summary.scalar(f"{metric.name}", metric.result(), step=epoch)

        # print the training metrics
        print([f"{key}: {value.numpy()}" for (key, value) in metrics.items()])
        # reset metrics 
        model.reset_metrics()
    
        # computing the cosine similarity to evaluate if our embedding table is grouping together words with similar semantic meanings
        sim = model.validate(val_ds).numpy()
        # evaluate the training by printing the closest words to our validation words
        for i, word in enumerate(val_words):
            top_k = 8 # number of nearest neighbors
            log = f"Nearest to {word}: "
            nearest = (-sim[i, :]).argsort()[1:top_k+1]
            for k in range(top_k):
                neighbour = inverse_vocab[nearest[k]]
                log += f" {neighbour},"
            print(log)

In [13]:
# Keep track of the counts
_, _, counts = map(lambda x:x.numpy(), tf.unique_with_counts(txt))
counts = list(counts.astype(float)/len(txt))

# create a tf data set with the indices of our validation words
val_words = ["holy", "father", "wine", "poison", "love", "strong", "day"]
val_words_idx = [vocab[val_words[i]] for i in range(len(val_words))]
val_ds = tf.constant(val_words_idx, dtype=tf.int32)

vocabulary_size = len(vocab)
embedding_size = 64
epochs = 15

optimizer = tf.optimizers.Adam(0.001) 
summary_writer = create_summary_writer(config_name = f'RUN')

model = SkipGramModel(optimizer = optimizer,
                      embedding_size = embedding_size,
                      vocabulary_size = vocabulary_size,
                      counts = counts)

training_loop(ds, model, epochs, val_ds, val_words, summary_writer)

Epoch 0: 


100%|██████████| 16444/16444 [07:21<00:00, 37.21it/s]


['loss: 31.258331298828125']
Nearest to holy:  before, deliver, stood, children, brother, peace, again, now,
Nearest to father:  fathers, men, mother, servant, this, say, said, us,
Nearest to wine:  they, bring, do, people, turn, away, seen, thee,
Nearest to poison:  branches, better, slew, magnified, man, brute, searched, truth,
Nearest to love:  life, which, done, seek, by, would, built, ready,
Nearest to strong:  teach, cause, time, is, ground, yea, cast, did,
Nearest to day:  know, say, do, hearken, sea, prophets, many, spirit,
Epoch 1: 


100%|██████████| 16444/16444 [06:21<00:00, 43.06it/s]


['loss: 8.107242584228516']
Nearest to holy:  else, worshipped, manner, joined, almighty, iniquities, habitation, whole,
Nearest to father:  mother, servant, solomon, confess, fathers, knowest, ears, flock,
Nearest to wine:  read, receive, sure, live, stronger, they, hired, feared,
Nearest to poison:  better, branches, searched, slew, magnified, truth, turneth, beat,
Nearest to love:  life, please, fallen, bow, remain, hated, truly, which,
Nearest to strong:  teach, thereon, ground, vain, cause, whereof, child, yea,
Nearest to day:  hill, loosed, hold, deeds, prophets, goeth, vain, understood,
Epoch 2: 


100%|██████████| 16444/16444 [06:21<00:00, 43.06it/s]


['loss: 6.477917671203613']
Nearest to holy:  joined, almighty, else, figs, manner, iniquities, habitation, service,
Nearest to father:  mother, servant, confess, solomon, jezreel, oath, follow, add,
Nearest to wine:  receive, read, stronger, hired, learn, behalf, churches, lives,
Nearest to poison:  better, searched, truth, slew, chasten, branches, turneth, magnified,
Nearest to love:  life, please, bow, fallen, remain, declare, remember, truly,
Nearest to strong:  teach, thereon, price, vain, bitter, root, pleasant, continue,
Nearest to day:  hill, first, loosed, seventh, fourteenth, whilst, understood, passover,
Epoch 3: 


100%|██████████| 16444/16444 [05:27<00:00, 50.26it/s]


['loss: 6.055075168609619']
Nearest to holy:  almighty, joined, habitation, figs, else, created, evermore, building,
Nearest to father:  mother, servant, confess, abraham, oath, wealth, jezreel, solomon,
Nearest to wine:  receive, eat, drink, behalf, hired, stronger, considered, learn,
Nearest to poison:  chasten, truth, better, searched, abel, turneth, please, slew,
Nearest to love:  life, declare, bow, perish, please, remember, judge, gifts,
Nearest to strong:  price, teach, root, bitter, continue, remembrance, soon, valleys,
Nearest to day:  fourteenth, seventh, first, passover, hill, night, understood, vision,
Epoch 4: 


100%|██████████| 16444/16444 [06:21<00:00, 43.06it/s]


['loss: 5.942498207092285']
Nearest to holy:  almighty, habitation, service, created, joined, figs, hosts, evermore,
Nearest to father:  oath, salvation, abraham, servant, confess, wealth, flock, tongue,
Nearest to wine:  drink, eat, receive, satisfied, rage, drank, holds, behalf,
Nearest to poison:  chasten, abel, better, truth, slew, please, need, searched,
Nearest to love:  life, declare, judge, bow, perish, would, slain, killed,
Nearest to strong:  root, price, bitter, remembrance, continue, pleasant, teach, soon,
Nearest to day:  fourteenth, seventh, passover, first, night, vision, understood, hosts,
Epoch 5: 


100%|██████████| 16444/16444 [06:21<00:00, 43.06it/s]


['loss: 5.925527095794678']
Nearest to holy:  almighty, redeemer, hosts, seemed, service, behold, house, subjection,
Nearest to father:  abraham, oath, salvation, confess, wealth, servant, rebellion, saul,
Nearest to wine:  drink, satisfied, eat, rage, holds, drank, behalf, strong,
Nearest to poison:  chasten, abel, better, slew, vessel, truth, refuge, makest,
Nearest to love:  life, declare, slain, perish, killed, why, please, wherefore,
Nearest to strong:  root, price, bitter, minds, remembrance, soon, scourge, misery,
Nearest to day:  passover, seventh, fourteenth, first, vision, night, hosts, morrow,
Epoch 6: 


100%|██████████| 16444/16444 [05:28<00:00, 50.00it/s]


['loss: 5.970157146453857']
Nearest to holy:  almighty, redeemer, behold, hosts, within, house, seemed, service,
Nearest to father:  abraham, oath, saul, salvation, beor, confess, wealth, jesus,
Nearest to wine:  drink, satisfied, rage, eat, holds, drive, drank, cup,
Nearest to poison:  abel, vessel, makest, following, chasten, refuge, please, anguish,
Nearest to love:  declare, life, slain, killed, why, seemeth, remember, wouldest,
Nearest to strong:  root, price, minds, bitter, scourge, misery, consent, mark,
Nearest to day:  passover, seventh, fourteenth, first, vision, night, cleansing, convenient,
Epoch 7: 


100%|██████████| 16444/16444 [06:21<00:00, 43.06it/s]


['loss: 6.106332778930664']
Nearest to holy:  behold, within, hosts, redeemer, seemed, house, almighty, service,
Nearest to father:  saul, abraham, oath, salvation, jesus, servant, beor, gift,
Nearest to wine:  drink, drive, rage, satisfied, eat, holds, cup, drank,
Nearest to poison:  vessel, abel, makest, oftentimes, anguish, chasten, following, ministers,
Nearest to love:  declare, choose, liveth, wouldest, seemeth, wrong, why, killed,
Nearest to strong:  root, minds, price, bitter, scourge, misery, consent, seeing,
Nearest to day:  seventh, passover, first, night, fourteenth, vision, likewise, convenient,
Epoch 8: 


100%|██████████| 16444/16444 [05:21<00:00, 51.08it/s]


['loss: 6.105348587036133']
Nearest to holy:  behold, within, hosts, seemed, house, zion, redeemer, adulteries,
Nearest to father:  saul, abraham, oath, jesus, salvation, servant, who, gift,
Nearest to wine:  drink, rage, drive, satisfied, eat, holds, morsel, fellows,
Nearest to poison:  abel, vessel, makest, oftentimes, ministers, repenteth, anguish, offence,
Nearest to love:  declare, choose, wouldest, why, killed, determined, likewise, slain,
Nearest to strong:  root, minds, bitter, price, scourge, seeing, soon, mark,
Nearest to day:  seventh, first, passover, convenient, likewise, fourteenth, night, even,
Epoch 9: 


100%|██████████| 16444/16444 [05:21<00:00, 51.08it/s]


['loss: 6.12803840637207']
Nearest to holy:  hosts, within, behold, seemed, love, adulteries, called, zion,
Nearest to father:  saul, abraham, oath, servants, jesus, faith, who, salvation,
Nearest to wine:  drink, drive, eat, rage, satisfied, morsel, holds, fellows,
Nearest to poison:  vessel, oftentimes, abel, makest, repenteth, offence, theirs, skill,
Nearest to love:  declare, choose, widows, wouldest, mayest, likewise, truth, slain,
Nearest to strong:  root, bitter, minds, scourge, soon, seeing, price, consent,
Nearest to day:  seventh, first, even, likewise, now, night, passover, convenient,
Epoch 10: 


100%|██████████| 16444/16444 [06:21<00:00, 43.06it/s]


['loss: 6.234816551208496']
Nearest to holy:  hosts, seemed, behold, love, truth, within, called, zion,
Nearest to father:  saul, abraham, oath, faith, master, servants, liar, jesus,
Nearest to wine:  drink, satisfied, eat, drive, rage, hindered, morsel, fellows,
Nearest to poison:  oftentimes, vessel, abel, makest, filthiness, threw, offence, repenteth,
Nearest to love:  choose, declare, loved, widows, truth, determined, wouldest, swallowed,
Nearest to strong:  root, bitter, minds, soon, seeing, price, consent, scourge,
Nearest to day:  seventh, first, now, even, convenient, likewise, passover, hosts,
Epoch 11: 


100%|██████████| 16444/16444 [05:25<00:00, 50.45it/s]


['loss: 6.25201940536499']
Nearest to holy:  hosts, behold, zion, seemed, within, love, called, truth,
Nearest to father:  abraham, saul, faith, oath, god, master, wealth, jesus,
Nearest to wine:  drink, eat, satisfied, hindered, drive, rage, morsel, fellows,
Nearest to poison:  vessel, filthiness, oftentimes, threw, makest, abel, offence, jahaziel,
Nearest to love:  choose, loved, widows, stay, declare, now, wrong, determined,
Nearest to strong:  root, bitter, seeing, minds, soon, price, water, consent,
Nearest to day:  seventh, first, convenient, now, even, hosts, likewise, fourteenth,
Epoch 12: 


100%|██████████| 16444/16444 [06:21<00:00, 43.06it/s]


['loss: 6.317963600158691']
Nearest to holy:  hosts, behold, seemed, zion, called, house, say, truth,
Nearest to father:  saul, abraham, master, faith, beor, kindred, liar, wealth,
Nearest to wine:  drink, eat, satisfied, hindered, drive, holds, increase, vinegar,
Nearest to poison:  vessel, threw, filthiness, oftentimes, jahaziel, ministers, offence, skill,
Nearest to love:  loved, choose, widows, stay, wrong, swallowed, declare, now,
Nearest to strong:  root, bitter, seeing, water, consent, soon, scourge, whether,
Nearest to day:  seventh, first, fourteenth, morrow, convenient, hour, hosts, sabbath,
Epoch 13: 


100%|██████████| 16444/16444 [05:20<00:00, 51.36it/s]


['loss: 6.318968772888184']
Nearest to holy:  hosts, behold, seemed, redeemer, called, desired, house, justified,
Nearest to father:  saul, jesus, abraham, kindred, oath, faith, wealth, who,
Nearest to wine:  drink, eat, satisfied, vinegar, cup, hindered, drive, increase,
Nearest to poison:  vessel, threw, filthiness, oftentimes, jahaziel, offence, firmament, natural,
Nearest to love:  widows, wrong, loved, swallowed, choose, now, stay, endured,
Nearest to strong:  root, bitter, seeing, water, gathering, consent, misery, price,
Nearest to day:  seventh, morrow, fourteenth, first, convenient, sabbath, passover, hour,
Epoch 14: 


100%|██████████| 16444/16444 [05:21<00:00, 51.08it/s]

['loss: 6.378677845001221']
Nearest to holy:  hosts, behold, called, seemed, redeemer, truth, healing, desired,
Nearest to father:  wealth, abraham, god, kindred, jesus, saul, dance, who,
Nearest to wine:  drink, eat, vinegar, satisfied, spirit, drive, loft, cup,
Nearest to poison:  vessel, threw, filthiness, oftentimes, jahaziel, calleth, ministers, offence,
Nearest to love:  wrong, choose, loved, widows, swallowed, stay, creep, now,
Nearest to strong:  seeing, root, gathering, water, famous, bitter, trench, deep,
Nearest to day:  seventh, morrow, fourteenth, sabbath, hour, passover, until, spring,



